diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
index 479ab75be60f..4f6d771cb93d 100644
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -1,1048 +1,1066 @@
 .\" Copyright (c) 2013 Peter Grehan
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
 .Dd May 5, 2023
 .Dt BHYVE 8
 .Os
 .Sh NAME
 .Nm bhyve
 .Nd "run a guest operating system inside a virtual machine"
 .Sh SYNOPSIS
 .Nm
 .Op Fl AaCDeHhPSuWwxY
 .Oo
 .Sm off
 .Fl c\~
 .Oo
 .Op Cm cpus=
 .Ar numcpus
 .Oc
 .Op Cm ,sockets= Ar n
 .Op Cm ,cores= Ar n
 .Op Cm ,threads= Ar n
 .Oc
 .Sm on
 .Oo Fl f
 .Sm off
 .Ar name Cm \&,
 .Oo
 .Cm string No | Cm file
 .Oc
 .Cm \&= Ar data
 .Sm on
 .Oc
 .Oo
 .Sm off
 .Fl G\~
 .Oo Ar w Oc
 .Oo Ar bind_address Cm \&: Oc
 .Ar port
 .Sm on
 .Oc
 .Op Fl k Ar config_file
 .Op Fl K Ar layout
 .Oo Fl l
 .Sm off
 .Ar lpcdev Op Cm \&, Ar conf
 .Sm on
 .Oc
 .Sm off
 .Oo Fl m\~
 .Ar memsize
 .Oo
 .Cm K | Cm k | Cm M | Cm m | Cm G | Cm g | Cm T | Cm t
 .Oc
 .Sm on
 .Oc
 .Op Fl o Ar var Ns Cm = Ns Ar value
 .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu
 .Op Fl r Ar file
 .Oo Fl R
 .Sm off
 .Ar host Op Cm \&: Ar port
 .Sm on
 .Oc
 .Sm off
 .Oo Fl s\~
 .Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf
 .Sm on
 .Oc
 .Op Fl U Ar uuid
 .Ar vmname
 .Nm
 .Fl l Cm help
 .Nm
 .Fl s Cm help
 .Sh DESCRIPTION
 .Nm
 is a hypervisor that runs guest operating systems inside a
 virtual machine.
 .Pp
 Parameters such as the number of virtual CPUs, amount of guest memory, and
 I/O connectivity can be specified with command-line parameters.
 .Pp
 If not using a boot ROM, the guest operating system must be loaded with
 .Xr bhyveload 8
 or a similar boot loader before running
 .Nm ,
 otherwise, it is enough to run
 .Nm
 with a boot ROM of choice.
 .Pp
 .Nm
 runs until the guest operating system reboots or an unhandled hypervisor
 exit is detected.
 .Sh OPTIONS
 .Bl -tag -width 10n
 .It Fl A
 Generate ACPI tables.
 Required for
 .Fx Ns /amd64
 guests.
 .It Fl a
 The guest's local APIC is configured in xAPIC mode.
 The xAPIC mode is the default setting so this option is redundant.
 It will be deprecated in a future version.
 .It Fl C
 Include guest memory in core file.
 .It Fl c Op Ar setting ...
 Number of guest virtual CPUs
 and/or the CPU topology.
 The default value for each of
 .Ar numcpus ,
 .Ar sockets ,
 .Ar cores ,
 and
 .Ar threads
 is 1.
 The current maximum number of guest virtual CPUs is 16.
 If
 .Ar numcpus
 is not specified then it will be calculated from the other arguments.
 The topology must be consistent in that the
 .Ar numcpus
 must equal the product of
 .Ar sockets ,
 .Ar cores ,
 and
 .Ar threads .
 If a
 .Ar setting
 is specified more than once the last one has precedence.
 .It Fl D
 Destroy the VM on guest initiated power-off.
 .It Fl e
 Force
 .Nm
 to exit when a guest issues an access to an I/O port that is not emulated.
 This is intended for debug purposes.
 .It Fl f Ar name Ns Cm \&, Ns Oo Cm string Ns No | Ns Cm file Ns Oc Ns Cm \&= Ns Ar data
 Add a fw_cfg file
 .Ar name
 to the fw_cfg interface.
 If a
 .Cm string
 is specified, the fw_cfg file contains the string as data.
 If a
 .Cm file
 is specified, bhyve reads the file and adds the file content as fw_cfg data.
 .It Fl G Xo
 .Sm off
 .Oo Ar w Oc
 .Oo Ar bind_address Cm \&: Oc
 .Ar port
 .Sm on
 .Xc
 Start a debug server that uses the GDB protocol to export guest state to a
 debugger.
 An IPv4 TCP socket will be bound to the supplied
 .Ar bind_address
 and
 .Ar port
 to listen for debugger connections.
 Only a single debugger may be attached to the debug server at a time.
 If the option begins with
 .Sq w ,
 .Nm
 will pause execution at the first instruction waiting for a debugger to attach.
 .It Fl H
 Yield the virtual CPU thread when a HLT instruction is detected.
 If this option is not specified, virtual CPUs will use 100% of a host CPU.
 .It Fl h
 Print help message and exit.
 .It Fl k Ar config_file
 Set configuration variables from a simple, key-value config file.
 Each line of the config file is expected to consist of a config variable
 name, an equals sign
 .It Fl K Ar layout
 Specify the keyboard layout.
 The value that can be specified sets the file name in
 .Ar /usr/share/bhyve/kbdlayout .
 This specification only works when loaded with UEFI mode for VNC.
 When using a VNC client that supports QEMU Extended Key Event Message (e.g. TigerVNC), this option isn't needed.
 When using a VNC client that doesn't support QEMU Extended Key Event Message (e.g. tightVNC), the layout defaults to the US keyboard unless specified otherwise.
 .Pq Sq = ,
 and a value.
 No spaces are permitted between the variable name, equals sign, or
 value.
 Blank lines and lines starting with
 .Sq #
 are ignored.
 See
 .Xr bhyve_config 5
 for more details.
 .It Fl l Cm help
 Print a list of supported LPC devices.
 .It Fl l Ar lpcdev Ns Op Cm \&, Ns Ar conf
 Allow devices behind the LPC PCI-ISA bridge to be configured.
 The only supported devices are the TTY-class devices
 .Cm com1 , com2 , com3 ,
 and
 .Cm com4 ,
 the boot ROM device
 .Cm bootrom ,
 the
 .Cm fwcfg
 type and the debug/test device
 .Cm pc-testdev .
 .Pp
 The possible values for the
 .Ar conf
 argument are listed in the
 .Fl s
 flag description.
 .It Xo
 .Fl m Ar memsize Ns Oo
 .Sm off
 .Cm K | k | M | m | G | g | T | t
 .Sm on
 .Oc
 .Xc
 Set the guest physical memory size
 This must be the same size that was given to
 .Xr bhyveload 8 .
 .Pp
 The size argument may be suffixed with one of
 .Cm K , M , G
 or
 .Cm T
 (either upper or lower case)
 to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes.
 If no suffix is given, the value is assumed to be in megabytes.
 .Pp
 The default is 256M.
 .It Fl o Ar var Ns Cm = Ns Ar value
 Set the configuration variable
 .Ar var
 to
 .Ar value .
 .It Fl P
 Force the guest virtual CPU to exit when a PAUSE instruction is detected.
 .It Fl p Ar vcpu Ns Cm \& : Ns Ar hostcpu
 Pin guest's virtual CPU
 .Em vcpu
 to
 .Em hostcpu .
 .It Fl r Ar file
 Resume a guest from a snapshot.
 The guest memory contents are restored from
 .Ar file ,
 and the guest device and vCPU state are restored from the file
 .Dq Ar file Ns .kern .
 .Pp
 Note that the current snapshot file format requires that the configuration of
 devices in the new VM match the VM from which the snapshot was taken by specifying the
 same
 .Fl s
 and
 .Fl l
 options.
 The count of vCPUs and memory configuration are read from the snapshot.
 .It Fl R Ar host Ns Op Cm \&: Ns Ar port
 Receive migration from a source guest.
 Await for a connection from
 .Ar host
 on the specified
 .Ar port
 and resume execution. The default migration port is 24983.
 .It Fl S
 Wire guest memory.
 .It Fl s Cm help
 Print a list of supported PCI devices.
 .It Fl s Ar slot Ns Cm \&, Ns Ar emulation Ns Op Cm \&, Ns Ar conf
 Configure a virtual PCI slot and function.
 .Pp
 .Nm
 provides PCI bus emulation and virtual devices that can be attached to
 slots on the bus.
 There are 32 available slots, with the option of providing up to 8 functions
 per slot.
 .Pp
 The
 .Ar slot
 can be specified in one of the following formats:
 .Pp
 .Bl -bullet -compact
 .It
 .Ar pcislot
 .It
 .Sm off
 .Ar pcislot Cm \&: Ar function
 .Sm on
 .It
 .Sm off
 .Ar bus Cm \&: Ar pcislot Cm \&: Ar function
 .Sm on
 .El
 .Pp
 The
 .Ar pcislot
 value is 0 to 31.
 The optional
 .Ar function
 value is 0 to 7.
 The optional
 .Ar bus
 value is 0 to 255.
 If not specified, the
 .Ar function
 value defaults to 0.
 If not specified, the
 .Ar bus
 value defaults to 0.
 .Pp
 The
 .Ar emulation
 argument
 can be one of the following:
 .Bl -tag -width "amd_hostbridge"
 .It Cm hostbridge
 A simple host bridge.
 This is usually configured at slot 0, and is required by most guest
 operating systems.
 .It Cm amd_hostbridge
 Emulation identical to
 .Cm hostbridge
 using a PCI vendor ID of AMD.
 .It Cm passthru
 PCI pass-through device.
 .It Cm virtio-net
 Virtio network interface.
 .It Cm virtio-blk
 Virtio block storage interface.
 .It Cm virtio-scsi
 Virtio SCSI interface.
 .It Cm virtio-9p
 Virtio 9p (VirtFS) interface.
 .It Cm virtio-rnd
 Virtio RNG interface.
 .It Cm virtio-console
 Virtio console interface, which exposes multiple ports
 to the guest in the form of simple char devices for simple IO
 between the guest and host userspaces.
 .It Cm virtio-input
 Virtio input interface.
 .It Cm ahci
 AHCI controller attached to arbitrary devices.
 .It Cm ahci-cd
 AHCI controller attached to an ATAPI CD/DVD.
 .It Cm ahci-hd
 AHCI controller attached to a SATA hard drive.
 .It Cm e1000
 Intel e82545 network interface.
 .It Cm uart
 PCI 16550 serial device.
 .It Cm lpc
 LPC PCI-ISA bridge with COM1, COM2, COM3, and COM4 16550 serial ports,
 a boot ROM, and,
 optionally, a fwcfg type and the debug/test device.
 The LPC bridge emulation can only be configured on bus 0.
 .It Cm fbuf
 Raw framebuffer device attached to VNC server.
 .It Cm xhci
 eXtensible Host Controller Interface (xHCI) USB controller.
 .It Cm nvme
 NVM Express (NVMe) controller.
 .It Cm hda
 High Definition Audio Controller.
 .El
 .Pp
 The optional parameter
 .Ar conf
 describes the backend for device emulations.
 If
 .Ar conf
 is not specified, the device emulation has no backend and can be
 considered unconnected.
 .Pp
 Network device backends:
 .Sm off
 .Bl -bullet
 .It
 .Xo
 .Cm tap Ar N
 .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx
 .Op Cm \&,mtu= Ar N
 .Xc
 .It
 .Xo
 .Cm vmnet Ar N
 .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx
 .Op Cm \&,mtu= Ar N
 .Xc
 .It
 .Xo
 .Cm netgraph,path= Ar ADDRESS Cm \&,peerhook= Ar HOOK
 .Op Cm \&,socket= Ar NAME
 .Op Cm \&,hook= Ar HOOK
 .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx
 .Op Cm \&,mtu= Ar N
 .Xc
 .El
 .Sm on
 .Pp
 If
 .Cm mac
 is not specified, the MAC address is derived from a fixed OUI and the
 remaining bytes from an MD5 hash of the slot and function numbers and
 the device name.
 .Pp
 The MAC address is an ASCII string in
 .Xr ethers 5
 format.
 .Pp
 With
 .Cm virtio-net
 devices, the
 .Cm mtu
 parameter can be specified to inform the guest about the largest MTU
 that should be allowed, expressed in bytes.
 .Pp
 With
 .Cm netgraph
 backend, the
 .Cm path
 and
 .Cm peerhook
 parameters must be specified to set the destination node and corresponding hook.
 The optional parameters
 .Cm socket
 and
 .Cm hook
 may be used to set the
 .Xr ng_socket 4
 node name and source hook.
 The
 .Ar ADDRESS ,
 .Ar HOOK ,
 and
 .Ar NAME
 must comply with
 .Xr netgraph 4
 addressing rules.
 .Pp
 Block storage device backends:
 .Sm off
 .Bl -bullet
 .It
 .Ar /filename Op Cm \&, Ar block-device-options
 .It
 .Ar /dev/xxx Op Cm \&, Ar block-device-options
 .El
 .Sm on
 .Pp
 The
 .Ar block-device-options
 are:
 .Bl -tag -width 10n
 .It Cm nocache
 Open the file with
 .Dv O_DIRECT .
 .It Cm direct
 Open the file using
 .Dv O_SYNC .
 .It Cm ro
 Force the file to be opened read-only.
 .It Cm sectorsize= Ns Ar logical Ns Oo Cm \&/ Ns Ar physical Oc
 Specify the logical and physical sector sizes of the emulated disk.
 The physical sector size is optional and is equal to the logical sector size
 if not explicitly specified.
 .It Cm nodelete
 Disable emulation of guest trim requests via
 .Dv DIOCGDELETE
 requests.
+.It Cm bootindex= Ns Ar index
+Add the device to the boot order at
+.Ar index .
+A fwcfg file is used to specify the boot order.
+The guest firmware may ignore this fwcfg file or may not support it.
+In that case, this feature does not work as expected.
 .El
 .Pp
 SCSI device backends:
 .Sm off
 .Bl -bullet
 .It
 .Pa /dev/cam/ctl Oo Ar pp Cm \&. Ar vp Oc Oo Cm \&, Ar scsi-device-options Oc
 .El
 .Sm on
 .Pp
 The
 .Ar scsi-device-options
 are:
 .Bl -tag -width 10n
 .It Cm iid= Ns Ar IID
 Initiator ID to use when sending requests to specified CTL port.
 The default value is 0.
+.It Cm bootindex= Ns Ar index
+Add the device to the boot order at
+.Ar index .
+A fwcfg file is used to specify the boot order.
+The guest firmware may ignore this fwcfg file or may not support it.
+In that case, this feature does not work as expected.
 .El
 .Pp
 9P device backends:
 .Sm off
 .Bl -bullet
 .It
 .Ar sharename Cm = Ar /path/to/share Op Cm \&, Ar 9p-device-options
 .El
 .Sm on
 .Pp
 The
 .Ar 9p-device-options
 are:
 .Bl -tag -width 10n
 .It Cm ro
 Expose the share in read-only mode.
 .El
 .Pp
 TTY device backends:
 .Bl -tag -width 10n
 .It Cm stdio
 Connect the serial port to the standard input and output of
 the
 .Nm
 process.
 .It Ar /dev/xxx
 Use the host TTY device for serial port I/O.
 .El
 .Pp
 Boot ROM device backends:
 .Bl -tag -width 10n
 .It Ar romfile Ns Op Cm \&, Ns Ar varfile
 Map
 .Ar romfile
 in the guest address space reserved for boot firmware.
 If
 .Ar varfile
 is provided, that file is also mapped in the boot firmware guest
 address space, and any modifications the guest makes will be saved
 to that file.
 .El
 .Pp
 Fwcfg types:
 .Bl -tag -width 10n
 .It Ar fwcfg
 The fwcfg interface is used to pass information such as the CPU count or ACPI tables to the guest firmware.
 Supported values are
 .Ql bhyve
 and
 .Ql qemu .
 Due to backward compatibility reasons,
 .Ql bhyve
 is the default option.
 When
 .Ql bhyve
 is used, bhyve's fwctl interface is used.
 It currently reports only the CPU count to the guest firmware.
 The
 .Ql qemu
 option uses QEMU's fwcfg interface.
 This interface is widely used and allows user-defined information to be passed to the guest.
 It is used for passing the CPU count, ACPI tables, a boot order and many other things to the guest.
 Some operating systems such as Fedora CoreOS can be configured by qemu's fwcfg interface as well.
 .El
 .Pp
 Pass-through device backends:
 .Sm off
 .Bl -bullet
 .It
 .Cm ppt Ar N Oo , Ar passthru-device-options Oc
 .It
 .Ns Ar bus Cm \&/ Ar slot Cm \&/ Ar function
 .Op , Ar passthru-device-options
 .It
 .Cm pci Ar bus Cm : Ar slot Cm : Ns Ar function
 .Op , Ar passthru-device-options
 .El
 .Sm on
 .Pp
 Connect to a PCI device on the host either named ppt
 .Ns Ar N
 or at the selector described by
 .Ar slot ,
 .Ar bus ,
 and
 .Ar function
 numbers.
 .Pp
 The
 .Ar passthru-device-options
 are:
 .Bl -tag -width 10n
 .It Cm rom= Ns Ar romfile
 Add
 .Ar romfile
 as option ROM to the PCI device.
 The ROM will be loaded by firmware and should be capable of initializing the device.
+.It Cm bootindex= Ns Ar index
+Add the device to the boot order at
+.Ar index .
+A fwcfg file is used to specify the boot order.
+The guest firmware may ignore this fwcfg file or may not support it.
+In that case, this feature does not work as expected.
 .El
 .Pp
 Guest memory must be wired using the
 .Fl S
 option when a pass-through device is configured.
 .Pp
 The host device must have been reserved at boot-time using the
 .Va pptdevs
 loader variable as described in
 .Xr vmm 4 .
 .Pp
 Virtio console device backends:
 .Bl -bullet
 .Sm off
 .It
 .Cm port1= Ns Ar /path/to/port1.sock Ns Op Cm ,port Ns Ar N Cm \&= Ns Ar /path/to/port2.sock No \~ Ar ...
 .Sm on
 .El
 .Pp
 A maximum of 16 ports per device can be created.
 Every port is named and corresponds to a Unix domain socket created by
 .Nm .
 .Nm
 accepts at most one connection per port at a time.
 .Pp
 Limitations:
 .Bl -bullet
 .It
 Due to lack of destructors in
 .Nm ,
 sockets on the filesystem must be cleaned up manually after
 .Nm
 exits.
 .It
 There is no way to use the
 .Dq console port
 feature, nor the console port
 resize at present.
 .It
 Emergency write is advertised, but no-op at present.
 .El
 .Pp
 Virtio input device backends:
 .Bl -tag -width 10n
 .It Ar /dev/input/eventX
 Send input events of
 .Ar /dev/input/eventX
 to guest by VirtIO Input Interface.
 .El
 .Pp
 Framebuffer devices backends:
 .Bl -bullet
 .Sm off
 .It
 .Op Cm rfb= Ar ip-and-port
 .Op Cm ,w= Ar width
 .Op Cm ,h= Ar height
 .Op Cm ,vga= Ar vgaconf
 .Op Cm ,wait
 .Op Cm ,password= Ar password
 .Sm on
 .El
 .Pp
 Configuration options are defined as follows:
 .Bl -tag -width 10n
 .It Cm rfb= Ns Ar ip-and-port Pq or Cm tcp= Ns Ar ip-and-port
 An IP address and a port VNC should listen on.
 There are two formats:
 .Pp
 .Bl -bullet -compact
 .It
 .Sm off
 .Op Ar IPv4 Cm \&:
 .Ar port
 .Sm on
 .It
 .Sm off
 .Cm \&[ Ar IPv6%zone Cm \&] Cm \&: Ar port
 .Sm on
 .El
 .Pp
 The default is to listen on localhost IPv4 address and default VNC port 5900.
 An IPv6 address must be enclosed in square brackets and may contain an
 optional zone identifier.
 .It Cm w= Ns Ar width No and Cm h= Ns Ar height
 A display resolution, width and height, respectively.
 If not specified, a default resolution of 1024x768 pixels will be used.
 Minimal supported resolution is 640x480 pixels,
 and maximum is 1920x1200 pixels.
 .It Cm vga= Ns Ar vgaconf
 Possible values for this option are
 .Cm io
 (default),
 .Cm on
 , and
 .Cm off .
 PCI graphics cards have a dual personality in that they are
 standard PCI devices with BAR addressing, but may also
 implicitly decode legacy VGA I/O space
 .Pq Ad 0x3c0-3df
 and memory space
 .Pq 64KB at Ad 0xA0000 .
 The default
 .Cm io
 option should be used for guests that attempt to issue BIOS calls which result
 in I/O port queries, and fail to boot if I/O decode is disabled.
 .Pp
 The
 .Cm on
 option should be used along with the CSM BIOS capability in UEFI
 to boot traditional BIOS guests that require the legacy VGA I/O and
 memory regions to be available.
 .Pp
 The
 .Cm off
 option should be used for the UEFI guests that assume that
 VGA adapter is present if they detect the I/O ports.
 An example of such a guest is
 .Ox
 in UEFI mode.
 .Pp
 Please refer to the
 .Nm
 .Fx
 wiki page
 .Pq Lk https://wiki.freebsd.org/bhyve
 for configuration notes of particular guests.
 .It Cm wait
 Instruct
 .Nm
 to only boot upon the initiation of a VNC connection, simplifying the
 installation of operating systems that require immediate keyboard input.
 This can be removed for post-installation use.
 .It Cm password= Ns Ar password
 This type of authentication is known to be cryptographically weak and is not
 intended for use on untrusted networks.
 Many implementations will want to use stronger security, such as running
 the session over an encrypted channel provided by IPsec or SSH.
 .El
 .Pp
 xHCI USB device backends:
 .Bl -tag -width 10n
 .It Cm tablet
 A USB tablet device which provides precise cursor synchronization
 when using VNC.
 .El
 .Pp
 NVMe device backends:
 .Bl -bullet
 .Sm off
 .It
 .Ar devpath
 .Op Cm ,maxq= Ar #
 .Op Cm ,qsz= Ar #
 .Op Cm ,ioslots= Ar #
 .Op Cm ,sectsz= Ar #
 .Op Cm ,ser= Ar #
 .Op Cm ,eui64= Ar #
 .Op Cm ,dsm= Ar opt
 .Sm on
 .El
 .Pp
 Configuration options are defined as follows:
 .Bl -tag -width 10n
 .It Ar devpath
 Accepted device paths are:
 .Ar /dev/blockdev
 or
 .Ar /path/to/image
 or
 .Cm ram= Ns Ar size_in_MiB .
 .It Cm maxq
 Max number of queues.
 .It Cm qsz
 Max elements in each queue.
 .It Cm ioslots
 Max number of concurrent I/O requests.
 .It Cm sectsz
 Sector size (defaults to blockif sector size).
 .It Cm ser
 Serial number with maximum 20 characters.
 .It Cm eui64
 IEEE Extended Unique Identifier (8 byte value).
 .It Cm dsm
 DataSet Management support.
 Supported values are:
 .Cm auto , enable ,
 and
 .Cm disable .
 .El
 .Pp
 AHCI device backends:
 .Bl -bullet
 .It
 .Sm off
 .Op Oo Cm hd\&: | cd\&: Oc Ar path
 .Op Cm ,nmrr= Ar nmrr
 .Op Cm ,ser= Ar #
 .Op Cm ,rev= Ar #
 .Op Cm ,model= Ar #
 .Sm on
 .El
 .Pp
 Configuration options are defined as follows:
 .Bl -tag -width 10n
 .It Cm nmrr
 Nominal Media Rotation Rate, known as RPM.
 Value 1 will indicate device as Solid State Disk.
 Default value is 0, not report.
 .It Cm ser
 Serial Number with maximum 20 characters.
 .It Cm rev
 Revision Number with maximum 8 characters.
 .It Cm model
 Model Number with maximum 40 characters.
 .El
 .Pp
 HD Audio device backends:
 .Bl -bullet
 .It
 .Sm off
 .Op Cm play= Ar playback
 .Op Cm ,rec= Ar recording
 .Sm on
 .El
 .Pp
 Configuration options are defined as follows:
 .Bl -tag -width 10n
 .It Cm play
 Playback device, typically
 .Ar /dev/dsp0 .
 .It Cm rec
 Recording device, typically
 .Ar /dev/dsp0 .
 .El
 .It Fl U Ar uuid
 Set the universally unique identifier
 .Pq UUID
 in the guest's System Management BIOS System Information structure.
 By default a UUID is generated from the host's hostname and
 .Ar vmname .
 .It Fl u
 RTC keeps UTC time.
 .It Fl W
 Force virtio PCI device emulations to use MSI interrupts instead of MSI-X
 interrupts.
 .It Fl w
 Ignore accesses to unimplemented Model Specific Registers (MSRs).
 This is intended for debug purposes.
 .It Fl x
 The guest's local APIC is configured in x2APIC mode.
 .It Fl Y
 Disable MPtable generation.
 .It Ar vmname
 Alphanumeric name of the guest.
 This should be the same as that created by
 .Xr bhyveload 8 .
 .El
 .Sh CONFIGURATION VARIABLES
 .Nm
 uses an internal tree of configuration variables to describe global and
 per-device settings.
 When
 .Nm
 starts,
 it parses command line options (including config files) in the order given
 on the command line.
 Each command line option sets one or more configuration variables.
 For example,
 the
 .Fl s
 option creates a new tree node for a PCI device and sets one or more variables
 under that node including the device model and device model-specific variables.
 Variables may be set multiple times during this parsing stage with the final
 value overriding previous values.
 .Pp
 Once all of the command line options have been processed,
 the configuration values are frozen.
 .Nm
 then uses the value of configuration values to initialize device models
 and global settings.
 .Pp
 More details on configuration variables can be found in
 .Xr bhyve_config 5 .
 .Sh DEBUG SERVER
 The current debug server provides limited support for debuggers.
 .Ss Registers
 Each virtual CPU is exposed to the debugger as a thread.
 .Pp
 General purpose registers can be queried for each virtual CPU, but other
 registers such as floating-point and system registers cannot be queried.
 .Ss Memory
 Memory (including memory mapped I/O regions) can be read and written by the debugger.
 Memory operations use virtual addresses that are resolved to physical addresses
 via the current virtual CPU's active address translation.
 .Ss Control
 The running guest can be interrupted by the debugger at any time
 .Pq for example, by pressing Ctrl-C in the debugger .
 .Pp
 Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit.
 .Pp
 Breakpoints are supported on Intel CPUs that support single stepping.
 Note that continuing from a breakpoint while interrupts are enabled in the
 guest may not work as expected due to timer interrupts firing while single
 stepping over the breakpoint.
 .Sh SIGNAL HANDLING
 .Nm
 deals with the following signals:
 .Pp
 .Bl -tag -width SIGTERM -compact
 .It SIGTERM
 Trigger ACPI poweroff for a VM
 .El
 .Sh EXIT STATUS
 Exit status indicates how the VM was terminated:
 .Pp
 .Bl -tag -width indent -compact
 .It 0
 rebooted
 .It 1
 powered off
 .It 2
 halted
 .It 3
 triple fault
 .It 4
 exited due to an error
 .El
 .Sh EXAMPLES
 If not using a boot ROM, the guest operating system must have been loaded with
 .Xr bhyveload 8
 or a similar boot loader before
 .Xr bhyve 4
 can be run.
 Otherwise, the boot loader is not needed.
 .Pp
 To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio
 block device backed by the
 .Pa /my/image
 filesystem image, and a serial port for the console:
 .Bd -literal -offset indent
 bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\
   -l com1,stdio -A -H -P -m 1G vm1
 .Ed
 .Pp
 Run a 24GB single-CPU virtual machine with three network ports, one of which
 has a MAC address specified:
 .Bd -literal -offset indent
 bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\
   -s 2:1,virtio-net,tap1 \\
   -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\
   -s 3,virtio-blk,/my/image -l com1,stdio \\
   -A -H -P -m 24G bigvm
 .Ed
 .Pp
 Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI
 CD-ROM, a single virtio network port, an AMD hostbridge, and the console
 port connected to an
 .Xr nmdm 4
 null-modem device.
 .Bd -literal -offset indent
 bhyve -c 4 \\
   -s 0,amd_hostbridge -s 1,lpc \\
   -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\
 hd:/images/disk.3,hd:/images/disk.4,\\
 hd:/images/disk.5,hd:/images/disk.6,\\
 hd:/images/disk.7,hd:/images/disk.8,\\
 cd:/images/install.iso \\
   -s 3,virtio-net,tap0 \\
   -l com1,/dev/nmdm0A \\
   -A -H -P -m 8G
 .Ed
 .Pp
 Run a UEFI virtual machine with a display resolution of 800 by 600 pixels
 that can be accessed via VNC at: 0.0.0.0:5900.
 .Bd -literal -offset indent
 bhyve -c 2 -m 4G -w -H \\
   -s 0,hostbridge \\
   -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\
   -s 4,ahci-hd,disk.img \\
   -s 5,virtio-net,tap0 \\
   -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\
   -s 30,xhci,tablet \\
   -s 31,lpc -l com1,stdio \\
   -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\
    uefivm
 .Ed
 .Pp
 Run a UEFI virtual machine with a VNC display that is bound to all IPv6
 addresses on port 5900.
 .Bd -literal -offset indent
 bhyve -c 2 -m 4G -w -H \\
   -s 0,hostbridge \\
   -s 4,ahci-hd,disk.img \\
   -s 5,virtio-net,tap0 \\
   -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\
   -s 30,xhci,tablet \\
   -s 31,lpc -l com1,stdio \\
   -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\
    uefivm
 .Ed
 .Pp
 Run a UEFI virtual machine with a VARS file to save EFI variables.
 Note that
 .Nm
 will write guest modifications to the given VARS file.
 Be sure to create a per-guest copy of the template VARS file from
 .Pa /usr .
 .Bd -literal -offset indent
 bhyve -c 2 -m 4g -w -H \\
   -s 0,hostbridge \\
   -s 31,lpc -l com1,stdio \\
   -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI_CODE.fd,BHYVE_UEFI_VARS.fd
    uefivm
 .Ed
 .Sh SEE ALSO
 .Xr bhyve 4 ,
 .Xr netgraph 4 ,
 .Xr ng_socket 4 ,
 .Xr nmdm 4 ,
 .Xr vmm 4 ,
 .Xr bhyve_config 5 ,
 .Xr ethers 5 ,
 .Xr bhyvectl 8 ,
 .Xr bhyveload 8
 .Pp
 .Rs
 .%A Intel
 .%B 64 and IA-32 Architectures Software Developer’s Manual
 .%V Volume 3
 .Re
 .Sh HISTORY
 .Nm
 first appeared in
 .Fx 10.0 .
 .Sh AUTHORS
 .An Neel Natu Aq Mt neel@freebsd.org
 .An Peter Grehan Aq Mt grehan@freebsd.org
diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c
index 42c1b5c27293..62f8a9f21661 100644
--- a/usr.sbin/bhyve/block_if.c
+++ b/usr.sbin/bhyve/block_if.c
@@ -1,1027 +1,1046 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
  * All rights reserved.
  * Copyright 2020 Joyent, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #ifndef WITHOUT_CAPSICUM
 #include <sys/capsicum.h>
 #endif
 #include <sys/queue.h>
 #include <sys/errno.h>
 #include <sys/stat.h>
 #include <sys/ioctl.h>
 #include <sys/disk.h>
 
 #include <assert.h>
 #ifndef WITHOUT_CAPSICUM
 #include <capsicum_helpers.h>
 #endif
 #include <err.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include <signal.h>
 #include <sysexits.h>
 #include <unistd.h>
 
 #include <machine/atomic.h>
 #include <machine/vmm_snapshot.h>
 
 #include "bhyverun.h"
 #include "config.h"
 #include "debug.h"
 #include "mevent.h"
 #include "pci_emul.h"
 #include "block_if.h"
 
 /* Magic value kept in bc_magic while a blockif context is open. */
 #define BLOCKIF_SIG	0xb109b109
 
 /* Number of worker threads blockif_open() creates for each context. */
 #define BLOCKIF_NUMTHR	8
 /* Number of request elements in each context's bc_reqs pool. */
 #define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
 
 /*
  * Operation carried by a request element (be_op); blockif_proc()
  * dispatches on it.
  */
 enum blockop {
 	BOP_READ,
 	BOP_WRITE,
 	BOP_FLUSH,
 	BOP_DELETE
 };
 
 /*
  * State of a request element (be_status):
  *   BST_FREE  - on the free queue, not bound to a request.
  *   BST_BLOCK - on the pending queue but not yet eligible to run, because
  *               a queued or busy element ends at this request's offset.
  *   BST_PEND  - on the pending queue and eligible for blockif_dequeue().
  *   BST_BUSY  - on the busy queue, being processed by a worker thread.
  *   BST_DONE  - blockif_proc() has finished; awaiting blockif_complete().
  */
 enum blockstat {
 	BST_FREE,
 	BST_BLOCK,
 	BST_PEND,
 	BST_BUSY,
 	BST_DONE
 };
 
 /*
  * Queue element that tracks one request through the free, pending and
  * busy queues of a context.
  *
  * be_tid is the worker thread that dequeued the element (reset to 0 on
  * completion).  be_block is the request's end offset (br_offset plus the
  * total iovec length) for read/write/delete and OFF_MAX for other
  * operations; it is compared against the start offset of other requests
  * to decide whether they must wait.
  */
 struct blockif_elem {
 	TAILQ_ENTRY(blockif_elem) be_link;
 	struct blockif_req  *be_req;
 	enum blockop	     be_op;
 	enum blockstat	     be_status;
 	pthread_t            be_tid;
 	off_t		     be_block;
 };
 
 /*
  * Per-backing-store context created by blockif_open().
  *
  * bc_magic        - BLOCKIF_SIG while open; asserted by the public API.
  * bc_fd           - descriptor of the backing file or device.
  * bc_ischr        - backing store is a character device.
  * bc_isgeom       - DIOCGPROVIDERNAME succeeded on the device; workers
  *                   then allocate a bounce buffer.
  * bc_candelete    - BOP_DELETE is supported.
  * bc_rdonly       - backing store was opened read-only.
  * bc_size         - media size as reported at open or on resize.
  * bc_sectsz       - logical sector size.
  * bc_psectsz/off  - physical sector size and offset.
  * bc_closing      - set by blockif_close(); tells workers to exit.
  * bc_paused       - set by blockif_pause(); new requests assert it is 0.
  * bc_mtx          - held around all queue manipulation.
  * bc_cond         - signalled when work is queued or on close.
  * bc_work_done_cond - broadcast by workers when no work is queued or busy.
  * bc_resize_*     - optional resize callback, its argument and its mevent.
  * bc_bootindex    - value of the "bootindex" config option, -1 if unset.
  */
 struct blockif_ctxt {
 	unsigned int		bc_magic;
 	int			bc_fd;
 	int			bc_ischr;
 	int			bc_isgeom;
 	int			bc_candelete;
 	int			bc_rdonly;
 	off_t			bc_size;
 	int			bc_sectsz;
 	int			bc_psectsz;
 	int			bc_psectoff;
 	int			bc_closing;
 	int			bc_paused;
 	pthread_t		bc_btid[BLOCKIF_NUMTHR];
 	pthread_mutex_t		bc_mtx;
 	pthread_cond_t		bc_cond;
 	pthread_cond_t		bc_work_done_cond;
 	blockif_resize_cb	*bc_resize_cb;
 	void			*bc_resize_cb_arg;
 	struct mevent		*bc_resize_event;
 
 	/* Request elements and free/pending/busy queues */
 	TAILQ_HEAD(, blockif_elem) bc_freeq;
 	TAILQ_HEAD(, blockif_elem) bc_pendq;
 	TAILQ_HEAD(, blockif_elem) bc_busyq;
 	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
+	int			bc_bootindex;
 };
 
 /* Ensures blockif_init() runs once, on the first blockif_open(). */
 static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
 
 /*
  * Entry on the list of cancellation requests waiting for the SIGCONT
  * handler.  blockif_cancel() pushes an entry with bse_pending set and
  * waits on bse_cond; blockif_sigcont_handler() pops it, clears
  * bse_pending and signals the condition.
  */
 struct blockif_sig_elem {
 	pthread_mutex_t			bse_mtx;
 	pthread_cond_t			bse_cond;
 	int				bse_pending;
 	struct blockif_sig_elem		*bse_next;
 };
 
 /* Head of the singly linked list above; updated with atomic_cmpset_ptr(). */
 static struct blockif_sig_elem *blockif_bse_head;
 
 static int
 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
 		enum blockop op)
 {
 	struct blockif_elem *be, *tbe;
 	off_t off;
 	int i;
 
 	be = TAILQ_FIRST(&bc->bc_freeq);
 	assert(be != NULL);
 	assert(be->be_status == BST_FREE);
 	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
 	be->be_req = breq;
 	be->be_op = op;
 	switch (op) {
 	case BOP_READ:
 	case BOP_WRITE:
 	case BOP_DELETE:
 		off = breq->br_offset;
 		for (i = 0; i < breq->br_iovcnt; i++)
 			off += breq->br_iov[i].iov_len;
 		break;
 	default:
 		off = OFF_MAX;
 	}
 	be->be_block = off;
 	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
 		if (tbe->be_block == breq->br_offset)
 			break;
 	}
 	if (tbe == NULL) {
 		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
 			if (tbe->be_block == breq->br_offset)
 				break;
 		}
 	}
 	if (tbe == NULL)
 		be->be_status = BST_PEND;
 	else
 		be->be_status = BST_BLOCK;
 	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
 	return (be->be_status == BST_PEND);
 }
 
 /*
  * Move the first BST_PEND element from the pending queue to the busy
  * queue on behalf of worker thread t.
  *
  * Returns 1 and stores the element in *bep on success; returns 0 and
  * leaves *bep untouched when no element is eligible.
  */
 static int
 blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
 {
 	struct blockif_elem *elem;
 
 	/* Skip over blocked elements; anything else on this queue is a bug. */
 	elem = TAILQ_FIRST(&bc->bc_pendq);
 	while (elem != NULL && elem->be_status != BST_PEND) {
 		assert(elem->be_status == BST_BLOCK);
 		elem = TAILQ_NEXT(elem, be_link);
 	}
 	if (elem == NULL)
 		return (0);
 
 	TAILQ_REMOVE(&bc->bc_pendq, elem, be_link);
 	elem->be_status = BST_BUSY;
 	elem->be_tid = t;
 	TAILQ_INSERT_TAIL(&bc->bc_busyq, elem, be_link);
 	*bep = elem;
 	return (1);
 }
 
 /*
  * Retire an element: unlink it from the queue it is on, release any
  * pending requests that start where it ended, and return it to the
  * free queue.
  */
 static void
 blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
 {
 	struct blockif_elem *waiter;
 
 	/* Busy and done elements live on the busy queue, the rest on pending. */
 	switch (be->be_status) {
 	case BST_DONE:
 	case BST_BUSY:
 		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
 		break;
 	default:
 		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
 		break;
 	}
 
 	/* Make every pending request that begins at our end offset runnable. */
 	TAILQ_FOREACH(waiter, &bc->bc_pendq, be_link) {
 		if (waiter->be_req->br_offset == be->be_block)
 			waiter->be_status = BST_PEND;
 	}
 
 	be->be_tid = 0;
 	be->be_status = BST_FREE;
 	be->be_req = NULL;
 	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
 }
 
 /*
  * Flush the backing store: DIOCGFLUSH for character devices, fsync()
  * otherwise.  Returns 0 on success or the errno of the failed call.
  */
 static int
 blockif_flush_bc(struct blockif_ctxt *bc)
 {
 	int rc;
 
 	if (bc->bc_ischr)
 		rc = ioctl(bc->bc_fd, DIOCGFLUSH);
 	else
 		rc = fsync(bc->bc_fd);
 
 	return (rc != 0 ? errno : 0);
 }
 
 /*
  * Execute one dequeued request against the backing descriptor and invoke
  * its completion callback.
  *
  * bc  - context owning the backing descriptor.
  * be  - element to process; be_op selects the operation.
  * buf - optional bounce buffer (workers allocate MAXPHYS bytes); it is
  *       ignored when the request has at most one iovec.
  *
  * Without a bounce buffer, reads and writes are a single preadv() or
  * pwritev().  With one, data moves through buf in chunks of at most
  * MAXPHYS bytes and is copied to or from the request's iovecs.
  * br_resid is reduced by the number of bytes transferred.  On return the
  * element is BST_DONE and the callback has been called with 0 or an
  * errno value.
  */
 static void
 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
 {
 	struct spacectl_range range;
 	struct blockif_req *br;
 	off_t arg[2];
 	ssize_t n;
 	size_t clen, len, off, boff, voff;
 	int i, err;
 
 	br = be->be_req;
 	assert(br->br_resid >= 0);
 
 	/* A single iovec never needs to be gathered through the bounce buffer. */
 	if (br->br_iovcnt <= 1)
 		buf = NULL;
 	err = 0;
 	switch (be->be_op) {
 	case BOP_READ:
 		if (buf == NULL) {
 			if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
 			    br->br_offset)) < 0)
 				err = errno;
 			else
 				br->br_resid -= n;
 			break;
 		}
 		/*
 		 * Bounce-buffer read: i/voff track the current iovec and the
 		 * offset within it, off the offset from br_offset.
 		 *
 		 * NOTE(review): a pread() that returns 0 leaves br_resid
 		 * unchanged, so this loop looks like it would not terminate
 		 * at end of file - confirm.
 		 */
 		i = 0;
 		off = voff = 0;
 		while (br->br_resid > 0) {
 			len = MIN(br->br_resid, MAXPHYS);
 			n = pread(bc->bc_fd, buf, len, br->br_offset + off);
 			if (n < 0) {
 				err = errno;
 				break;
 			}
 			len = (size_t)n;
 			boff = 0;
 			/* Scatter the bytes just read across the iovecs. */
 			do {
 				clen = MIN(len - boff, br->br_iov[i].iov_len -
 				    voff);
 				memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
 				    buf + boff, clen);
 				if (clen < br->br_iov[i].iov_len - voff)
 					voff += clen;
 				else {
 					i++;
 					voff = 0;
 				}
 				boff += clen;
 			} while (boff < len);
 			off += len;
 			br->br_resid -= len;
 		}
 		break;
 	case BOP_WRITE:
 		if (bc->bc_rdonly) {
 			err = EROFS;
 			break;
 		}
 		if (buf == NULL) {
 			if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
 			    br->br_offset)) < 0)
 				err = errno;
 			else
 				br->br_resid -= n;
 			break;
 		}
 		/* Bounce-buffer write: gather a chunk from the iovecs, then write it. */
 		i = 0;
 		off = voff = 0;
 		while (br->br_resid > 0) {
 			len = MIN(br->br_resid, MAXPHYS);
 			boff = 0;
 			do {
 				clen = MIN(len - boff, br->br_iov[i].iov_len -
 				    voff);
 				memcpy(buf + boff,
 				    (uint8_t *)br->br_iov[i].iov_base + voff,
 				    clen);
 				if (clen < br->br_iov[i].iov_len - voff)
 					voff += clen;
 				else {
 					i++;
 					voff = 0;
 				}
 				boff += clen;
 			} while (boff < len);
 
 			n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
 			if (n < 0) {
 				err = errno;
 				break;
 			}
 			off += n;
 			br->br_resid -= n;
 		}
 		break;
 	case BOP_FLUSH:
 		err = blockif_flush_bc(bc);
 		break;
 	case BOP_DELETE:
 		if (!bc->bc_candelete)
 			err = EOPNOTSUPP;
 		else if (bc->bc_rdonly)
 			err = EROFS;
 		else if (bc->bc_ischr) {
 			/* Character device: offset/length pair for DIOCGDELETE. */
 			arg[0] = br->br_offset;
 			arg[1] = br->br_resid;
 			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
 				err = errno;
 			else
 				br->br_resid = 0;
 		} else {
 			range.r_offset = br->br_offset;
 			range.r_len = br->br_resid;
 
 			/*
 			 * fspacectl() is given the same structure as input and
 			 * output, so the loop continues with whatever range it
 			 * reports back until r_len reaches zero.
 			 */
 			while (range.r_len > 0) {
 				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
 				    &range, 0, &range) != 0) {
 					err = errno;
 					break;
 				}
 			}
 			if (err == 0)
 				br->br_resid = 0;
 		}
 		break;
 	default:
 		err = EINVAL;
 		break;
 	}
 
 	be->be_status = BST_DONE;
 
 	(*br->br_callback)(br, err);
 }
 
 /* True when the context has neither pending nor busy requests. */
 static inline bool
 blockif_empty(const struct blockif_ctxt *bc)
 {
 	if (!TAILQ_EMPTY(&bc->bc_pendq))
 		return (false);
 	return (TAILQ_EMPTY(&bc->bc_busyq));
 }
 
 /*
  * Worker thread body; arg is the owning struct blockif_ctxt.
  *
  * The thread repeatedly drains runnable requests, processing each with
  * bc_mtx dropped, then sleeps on bc_cond until more work is queued.  It
  * exits once bc_closing is set and no runnable request remains.
  */
 static void *
 blockif_thr(void *arg)
 {
 	struct blockif_ctxt *bc = arg;
 	struct blockif_elem *elem;
 	pthread_t self;
 	uint8_t *bounce;
 
 	/* Only GEOM-backed contexts use a bounce buffer. */
 	bounce = bc->bc_isgeom ? malloc(MAXPHYS) : NULL;
 	self = pthread_self();
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	for (;;) {
 		while (blockif_dequeue(bc, self, &elem)) {
 			/* Perform the I/O without holding the queue lock. */
 			pthread_mutex_unlock(&bc->bc_mtx);
 			blockif_proc(bc, elem, bounce);
 			pthread_mutex_lock(&bc->bc_mtx);
 			blockif_complete(bc, elem);
 		}
 
 		/* Nothing queued or in flight: wake anyone waiting for idle. */
 		if (blockif_empty(bc))
 			pthread_cond_broadcast(&bc->bc_work_done_cond);
 
 		/* Exit was requested through blockif_close(). */
 		if (bc->bc_closing)
 			break;
 
 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
 	}
 	pthread_mutex_unlock(&bc->bc_mtx);
 
 	free(bounce);
 	pthread_exit(NULL);
 	return (NULL);
 }
 
 /*
  * mevent callback registered for SIGCONT by blockif_init().
  *
  * Pops every entry off the blockif_bse_head list and, for each one,
  * clears bse_pending and signals bse_cond so that the blockif_cancel()
  * caller waiting on it can continue.  Returns once the list is empty.
  * None of the arguments are used.
  */
 static void
 blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
     void *arg __unused)
 {
 	struct blockif_sig_elem *bse;
 
 	for (;;) {
 		/*
 		 * Process the entire list even if not intended for
 		 * this thread.
 		 */
 		/* Atomically detach the current head; retry if it changed. */
 		do {
 			bse = blockif_bse_head;
 			if (bse == NULL)
 				return;
 		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
 					    (uintptr_t)bse,
 					    (uintptr_t)bse->bse_next));
 
 		pthread_mutex_lock(&bse->bse_mtx);
 		bse->bse_pending = 0;
 		pthread_cond_signal(&bse->bse_cond);
 		pthread_mutex_unlock(&bse->bse_mtx);
 	}
 }
 
 /*
  * One-time setup, run through pthread_once() from blockif_open():
  * register blockif_sigcont_handler() as the mevent handler for SIGCONT
  * and set the signal's disposition to SIG_IGN.
  */
 static void
 blockif_init(void)
 {
 	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
 	(void) signal(SIGCONT, SIG_IGN);
 }
 
 /*
  * Translate a legacy "path[,option,...]" option string into config nodes.
  *
  * nvl  - config node to populate.
  * opts - legacy option string; NULL means there is nothing to do.
  *
  * Everything before the first comma is stored as "path"; the remainder,
  * if any, is handed to pci_parse_legacy_config().
  *
  * Returns 0 when opts is NULL or holds only a path, -1 if the path copy
  * cannot be allocated, and otherwise the result of
  * pci_parse_legacy_config().
  */
 int
 blockif_legacy_config(nvlist_t *nvl, const char *opts)
 {
 	char *cp, *path;
 
 	if (opts == NULL)
 		return (0);
 
 	cp = strchr(opts, ',');
 	if (cp == NULL) {
 		set_config_value_node(nvl, "path", opts);
 		return (0);
 	}
 	path = strndup(opts, cp - opts);
 	if (path == NULL) {
 		/* Do not store a NULL path on allocation failure. */
 		perror("strndup");
 		return (-1);
 	}
 	set_config_value_node(nvl, "path", path);
 	free(path);
 	return (pci_parse_legacy_config(nvl, cp + 1));
 }
 
+/*
+ * Register the PCI device pi as a boot device using the boot index
+ * stored in bc.
+ *
+ * A negative index (blockif_open() defaults to -1 when "bootindex" is
+ * not configured) registers nothing and returns 0; otherwise the result
+ * of pci_emul_add_boot_device() is returned.
+ */
+int
+blockif_add_boot_device(struct pci_devinst *const pi,
+    struct blockif_ctxt *const bc)
+{
+	const int bootindex = bc->bc_bootindex;
+
+	if (bootindex >= 0)
+		return (pci_emul_add_boot_device(pi, bootindex));
+
+	return (0);
+}
+
 /*
  * Open the backing file or device described by the config node nvl and
  * start the worker threads that service it.
  *
  * nvl   - config node; the keys read are "nocache", "nodelete", "sync",
  *         "direct", "ro", "sectorsize" ("logical" or "logical/physical"),
  *         "bootindex" and "path".
  * ident - identifier used in the worker thread names ("blk-<ident>-<n>").
  *
  * Returns the new context, or NULL after reporting the problem when the
  * configuration is invalid, the backing store cannot be opened or
  * queried, or memory cannot be allocated.  Failing to apply Capsicum
  * limits terminates the process through errx().
  */
 struct blockif_ctxt *
 blockif_open(nvlist_t *nvl, const char *ident)
 {
 	char tname[MAXCOMLEN + 1];
 	char name[MAXPATHLEN];
-	const char *path, *pssval, *ssval;
+	const char *path, *pssval, *ssval, *bootindex_val;
 	char *cp;
 	struct blockif_ctxt *bc;
 	struct stat sbuf;
 	struct diocgattr_arg arg;
 	off_t size, psectsz, psectoff;
 	int extra, fd, i, sectsz;
 	int ro, candelete, geom, ssopt, pssopt;
 	int nodelete;
+	int bootindex;
+	long bootindex_num;
 
 #ifndef WITHOUT_CAPSICUM
 	cap_rights_t rights;
 	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
 #endif
 
 	pthread_once(&blockif_once, blockif_init);
 
 	fd = -1;
 	extra = 0;
 	ssopt = 0;
 	ro = 0;
 	nodelete = 0;
+	bootindex = -1;
 
 	if (get_config_bool_node_default(nvl, "nocache", false))
 		extra |= O_DIRECT;
 	if (get_config_bool_node_default(nvl, "nodelete", false))
 		nodelete = 1;
 	if (get_config_bool_node_default(nvl, "sync", false) ||
 	    get_config_bool_node_default(nvl, "direct", false))
 		extra |= O_SYNC;
 	if (get_config_bool_node_default(nvl, "ro", false))
 		ro = 1;
 	/* "sectorsize" is either "logical" or "logical/physical". */
 	ssval = get_config_value_node(nvl, "sectorsize");
 	if (ssval != NULL) {
 		ssopt = strtol(ssval, &cp, 10);
 		if (cp == ssval) {
 			EPRINTLN("Invalid sector size \"%s\"", ssval);
 			goto err;
 		}
 		if (*cp == '\0') {
 			pssopt = ssopt;
 		} else if (*cp == '/') {
 			pssval = cp + 1;
 			pssopt = strtol(pssval, &cp, 10);
 			if (cp == pssval || *cp != '\0') {
 				EPRINTLN("Invalid sector size \"%s\"", ssval);
 				goto err;
 			}
 		} else {
 			EPRINTLN("Invalid sector size \"%s\"", ssval);
 			goto err;
 		}
 	}
 
+	bootindex_val = get_config_value_node(nvl, "bootindex");
+	if (bootindex_val != NULL) {
+		/*
+		 * Parse strictly: a malformed value must be rejected rather
+		 * than silently becoming boot index 0.
+		 */
+		errno = 0;
+		bootindex_num = strtol(bootindex_val, &cp, 10);
+		if (cp == bootindex_val || *cp != '\0' || errno == ERANGE ||
+		    bootindex_num < INT_MIN || bootindex_num > INT_MAX) {
+			EPRINTLN("Invalid bootindex \"%s\"", bootindex_val);
+			goto err;
+		}
+		bootindex = (int)bootindex_num;
+	}
+
 	path = get_config_value_node(nvl, "path");
 	if (path == NULL) {
 		EPRINTLN("Missing \"path\" for block device.");
 		goto err;
 	}
 
 	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
 	if (fd < 0 && !ro) {
 		/* The read/write open failed; retry read-only. */
 		fd = open(path, O_RDONLY | extra);
 		ro = 1;
 	}
 
 	if (fd < 0) {
 		warn("Could not open backing file: %s", path);
 		goto err;
 	}
 
 	if (fstat(fd, &sbuf) < 0) {
 		warn("Could not stat backing file %s", path);
 		goto err;
 	}
 
 #ifndef WITHOUT_CAPSICUM
 	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
 	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
 	if (ro)
 		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
 
 	if (caph_rights_limit(fd, &rights) == -1)
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 #endif
 
 	/*
 	 * Deal with raw devices
 	 */
 	size = sbuf.st_size;
 	sectsz = DEV_BSIZE;
 	psectsz = psectoff = 0;
 	candelete = geom = 0;
 	if (S_ISCHR(sbuf.st_mode)) {
 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
 			perror("Could not fetch dev blk/sector size");
 			goto err;
 		}
 		assert(size != 0);
 		assert(sectsz != 0);
 		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
 			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
 		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
 		arg.len = sizeof(arg.value.i);
 		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
 			candelete = arg.value.i;
 		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
 			geom = 1;
 	} else {
 		psectsz = sbuf.st_blksize;
 		/* Avoid fallback implementation */
 		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
 	}
 
 #ifndef WITHOUT_CAPSICUM
 	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 #endif
 
 	if (ssopt != 0) {
 		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
 		    ssopt > pssopt) {
 			EPRINTLN("Invalid sector size %d/%d",
 			    ssopt, pssopt);
 			goto err;
 		}
 
 		/*
 		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
 		 * size be a multiple of the device's sector size.
 		 *
 		 * Validate that the emulated sector size complies with this
 		 * requirement.
 		 */
 		if (S_ISCHR(sbuf.st_mode)) {
 			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
 				EPRINTLN("Sector size %d incompatible "
 				    "with underlying device sector size %d",
 				    ssopt, sectsz);
 				goto err;
 			}
 		}
 
 		sectsz = ssopt;
 		psectsz = pssopt;
 		psectoff = 0;
 	}
 
 	bc = calloc(1, sizeof(struct blockif_ctxt));
 	if (bc == NULL) {
 		perror("calloc");
 		goto err;
 	}
 
 	bc->bc_magic = BLOCKIF_SIG;
 	bc->bc_fd = fd;
 	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
 	bc->bc_isgeom = geom;
 	bc->bc_candelete = candelete;
 	bc->bc_rdonly = ro;
 	bc->bc_size = size;
 	bc->bc_sectsz = sectsz;
 	bc->bc_psectsz = psectsz;
 	bc->bc_psectoff = psectoff;
 	pthread_mutex_init(&bc->bc_mtx, NULL);
 	pthread_cond_init(&bc->bc_cond, NULL);
 	bc->bc_paused = 0;
 	pthread_cond_init(&bc->bc_work_done_cond, NULL);
 	TAILQ_INIT(&bc->bc_freeq);
 	TAILQ_INIT(&bc->bc_pendq);
 	TAILQ_INIT(&bc->bc_busyq);
+	bc->bc_bootindex = bootindex;
 	/* Every request element starts out on the free queue. */
 	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
 		bc->bc_reqs[i].be_status = BST_FREE;
 		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
 	}
 
 	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
 		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
 		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
 		pthread_set_name_np(bc->bc_btid[i], tname);
 	}
 
 	return (bc);
 err:
 	if (fd >= 0)
 		close(fd);
 	return (NULL);
 }
 
 /*
  * mevent callback for attribute changes on the backing descriptor; arg
  * is the struct blockif_ctxt.
  *
  * Re-reads the media size (DIOCGMEDIASIZE for character devices, st_size
  * otherwise) and, if it differs from bc_size, updates bc_size and calls
  * the registered resize callback with bc_mtx held.
  */
 static void
 blockif_resized(int fd, enum ev_type type __unused, void *arg)
 {
 	struct blockif_ctxt *bc = arg;
 	struct stat st;
 	off_t newsize;
 
 	if (fstat(fd, &st) != 0)
 		return;
 
 	if (!S_ISCHR(st.st_mode)) {
 		newsize = st.st_size;
 	} else if (ioctl(fd, DIOCGMEDIASIZE, &newsize) < 0) {
 		EPRINTLN("blockif_resized: get mediasize failed: %s",
 		    strerror(errno));
 		return;
 	}
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	if (bc->bc_size != newsize) {
 		bc->bc_size = newsize;
 		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
 	}
 	pthread_mutex_unlock(&bc->bc_mtx);
 }
 
 /*
  * Register cb (with argument cb_arg) to be called when the backing store
  * changes size.  Only one callback can be registered per context.
  *
  * Returns 0 on success, EINVAL if cb is NULL, EBUSY if a callback is
  * already registered, the errno from fstat() if the descriptor cannot be
  * queried, or ENXIO if the vnode event cannot be added.
  */
 int
 blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
     void *cb_arg)
 {
 	struct stat sb;
 	int error;
 
 	if (cb == NULL)
 		return (EINVAL);
 
 	error = 0;
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	if (bc->bc_resize_cb != NULL) {
 		error = EBUSY;
 	} else {
 		assert(bc->bc_closing == 0);
 
 		if (fstat(bc->bc_fd, &sb) != 0) {
 			error = errno;
 		} else {
 			bc->bc_resize_event = mevent_add_flags(bc->bc_fd,
 			    EVF_VNODE, EVFF_ATTRIB, blockif_resized, bc);
 			if (bc->bc_resize_event == NULL) {
 				error = ENXIO;
 			} else {
 				/* Publish the callback only once the event exists. */
 				bc->bc_resize_cb = cb;
 				bc->bc_resize_cb_arg = cb_arg;
 			}
 		}
 	}
 	pthread_mutex_unlock(&bc->bc_mtx);
 
 	return (error);
 }
 
 /*
  * Queue a request of type op on the context.
  *
  * Returns 0 when the request was queued, or E2BIG when no free request
  * element is left, i.e. the caller exceeded the blockif queue limit.
  * The context must not be paused.
  */
 static int
 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
 		enum blockop op)
 {
 	int error;
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	assert(!bc->bc_paused);
 	if (TAILQ_EMPTY(&bc->bc_freeq)) {
 		/* Callers may not queue more than the advertised limit. */
 		error = E2BIG;
 	} else {
 		error = 0;
 		/* Wake a worker only if the request can run right away. */
 		if (blockif_enqueue(bc, breq, op))
 			pthread_cond_signal(&bc->bc_cond);
 	}
 	pthread_mutex_unlock(&bc->bc_mtx);
 
 	return (error);
 }
 
 int
 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_READ));
 }
 
 int
 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_WRITE));
 }
 
 int
 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_FLUSH));
 }
 
 int
 blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_DELETE));
 }
 
 /*
  * Cancel a previously queued request.
  *
  * Returns 0 if the request was still on the pending queue and has been
  * retired without its callback being invoked, EINVAL if the request is
  * on neither the pending nor the busy queue, and EBUSY if it was being
  * processed: the worker was interrupted with SIGCONT, and the callback
  * may or may not have run already.
  */
 int
 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
 	struct blockif_elem *be;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	/* XXX: not waiting while paused */
 
 	/*
 	 * Check pending requests.
 	 */
 	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
 		if (be->be_req == breq)
 			break;
 	}
 	if (be != NULL) {
 		/*
 		 * Found it.
 		 */
 		blockif_complete(bc, be);
 		pthread_mutex_unlock(&bc->bc_mtx);
 
 		return (0);
 	}
 
 	/*
 	 * Check in-flight requests.
 	 */
 	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
 		if (be->be_req == breq)
 			break;
 	}
 	if (be == NULL) {
 		/*
 		 * Didn't find it.
 		 */
 		pthread_mutex_unlock(&bc->bc_mtx);
 		return (EINVAL);
 	}
 
 	/*
 	 * Interrupt the processing thread to force it to return
 	 * prematurely via its normal callback path.
 	 */
 	while (be->be_status == BST_BUSY) {
 		struct blockif_sig_elem bse, *old_head;
 
 		pthread_mutex_init(&bse.bse_mtx, NULL);
 		pthread_cond_init(&bse.bse_cond, NULL);
 
 		bse.bse_pending = 1;
 
 		/* Push this stack-allocated entry onto the global list. */
 		do {
 			old_head = blockif_bse_head;
 			bse.bse_next = old_head;
 		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
 					    (uintptr_t)old_head,
 					    (uintptr_t)&bse));
 
 		pthread_kill(be->be_tid, SIGCONT);
 
 		/* Wait until the SIGCONT handler has consumed the entry. */
 		pthread_mutex_lock(&bse.bse_mtx);
 		while (bse.bse_pending)
 			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
 		pthread_mutex_unlock(&bse.bse_mtx);
 	}
 
 	pthread_mutex_unlock(&bc->bc_mtx);
 
 	/*
 	 * The processing thread has been interrupted.  Since it's not
 	 * clear if the callback has been invoked yet, return EBUSY.
 	 */
 	return (EBUSY);
 }
 
 /*
  * Shut down a context: ask the worker threads to exit, wait for them,
  * then close the backing descriptor and free the context.  The context
  * must not be used afterwards.  Always returns 0.
  */
 int
 blockif_close(struct blockif_ctxt *bc)
 {
 	int thr;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 
 	/* Flag the shutdown and stop resize notifications. */
 	pthread_mutex_lock(&bc->bc_mtx);
 	bc->bc_closing = 1;
 	if (bc->bc_resize_event != NULL)
 		mevent_disable(bc->bc_resize_event);
 	pthread_mutex_unlock(&bc->bc_mtx);
 
 	/* Wake every worker so it notices bc_closing, then reap them all. */
 	pthread_cond_broadcast(&bc->bc_cond);
 	for (thr = 0; thr < BLOCKIF_NUMTHR; thr++)
 		pthread_join(bc->bc_btid[thr], NULL);
 
 	/* XXX Cancel queued i/o's ??? */
 
 	/* Invalidate the magic before the memory is released. */
 	bc->bc_magic = 0;
 	close(bc->bc_fd);
 	free(bc);
 
 	return (0);
 }
 
 /*
  * Compute a virtual cylinder/head/sector geometry for the block device,
  * following the algorithm outlined in the VHD specification.
  *
  * The results are stored in *c (cylinders), *h (heads) and *s (sectors
  * per track).
  */
 void
 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
 {
 	off_t nsect;		/* total sectors of the block dev */
 	off_t cylheads;		/* cylinders times heads */
 	uint16_t spt;		/* sectors per track */
 	uint8_t nheads;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 
 	nsect = bc->bc_size / bc->bc_sectsz;
 
 	/* Clamp the size to the largest possible with CHS */
 	if (nsect > 65535L * 16 * 255)
 		nsect = 65535L * 16 * 255;
 
 	if (nsect < 65536L * 16 * 63) {
 		/* Try 17, then 31, then 63 sectors per track. */
 		spt = 17;
 		cylheads = nsect / spt;
 		nheads = (cylheads + 1023) / 1024;
 
 		if (nheads < 4)
 			nheads = 4;
 
 		if (cylheads >= (nheads * 1024) || nheads > 16) {
 			spt = 31;
 			nheads = 16;
 			cylheads = nsect / spt;
 		}
 		if (cylheads >= (nheads * 1024)) {
 			spt = 63;
 			nheads = 16;
 			cylheads = nsect / spt;
 		}
 	} else {
 		spt = 255;
 		nheads = 16;
 		cylheads = nsect / spt;
 	}
 
 	*c = cylheads / nheads;
 	*h = nheads;
 	*s = spt;
 }
 
 /*
  * Accessors
  */
 /* Return the media size recorded in the context. */
 off_t
 blockif_size(struct blockif_ctxt *bc)
 {
 	off_t size;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	size = bc->bc_size;
 	return (size);
 }
 
 /* Return the logical sector size of the context. */
 int
 blockif_sectsz(struct blockif_ctxt *bc)
 {
 	int sectsz;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	sectsz = bc->bc_sectsz;
 	return (sectsz);
 }
 
 /*
  * Report the physical sector size in *size and the physical sector
  * offset in *off.
  */
 void
 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
 {
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	*size = bc->bc_psectsz;
 	*off = bc->bc_psectoff;
 }
 
 /*
  * Return the queue depth callers may use: one less than the number of
  * request elements in the context.
  */
 int
 blockif_queuesz(struct blockif_ctxt *bc)
 {
 	int depth;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	depth = BLOCKIF_MAXREQ - 1;
 	return (depth);
 }
 
 /* Return non-zero when the backing store was opened read-only. */
 int
 blockif_is_ro(struct blockif_ctxt *bc)
 {
 	int rdonly;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	rdonly = bc->bc_rdonly;
 	return (rdonly);
 }
 
 /* Return the context's delete capability as determined at open time. */
 int
 blockif_candelete(struct blockif_ctxt *bc)
 {
 	int candelete;
 
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	candelete = bc->bc_candelete;
 	return (candelete);
 }
 
 #ifdef BHYVE_SNAPSHOT
 /*
  * Pause the interface: mark the context paused, wait until no request
  * is pending or busy, then flush the backing store unless it is
  * read-only.  A failed flush is only reported on stderr.
  */
 void
 blockif_pause(struct blockif_ctxt *bc)
 {
 	assert(bc != NULL);
 	assert(bc->bc_magic == BLOCKIF_SIG);
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	bc->bc_paused = 1;
 
 	/* Workers broadcast bc_work_done_cond once the queues are empty. */
 	while (!blockif_empty(bc))
 		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
 	pthread_mutex_unlock(&bc->bc_mtx);
 
 	if (bc->bc_rdonly)
 		return;
 
 	if (blockif_flush_bc(bc) != 0)
 		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
 			__func__);
 }
 
 /*
  * Resume a context paused by blockif_pause() by clearing bc_paused, so
  * that requests can be queued again.
  */
 void
 blockif_resume(struct blockif_ctxt *bc)
 {
 	assert(bc != NULL);
 	assert(bc->bc_magic == BLOCKIF_SIG);
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	bc->bc_paused = 0;
 	pthread_mutex_unlock(&bc->bc_mtx);
 }
 #endif	/* BHYVE_SNAPSHOT */
diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h
index b36d0c367890..52ebd8634b8e 100644
--- a/usr.sbin/bhyve/block_if.h
+++ b/usr.sbin/bhyve/block_if.h
@@ -1,92 +1,94 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * The block API to be used by bhyve block-device emulations. The routines
  * are thread safe, with no assumptions about the context of the completion
  * callback - it may occur in the caller's context, or asynchronously in
  * another thread.
  */
 
 #ifndef _BLOCK_IF_H_
 #define _BLOCK_IF_H_
 
 #include <sys/nv.h>
 #include <sys/uio.h>
 #include <sys/unistd.h>
 
 struct vm_snapshot_meta;
 
 
 /*
  * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in
  * a single request.  BLOCKIF_RING_MAX is the maximum number of
  * pending requests that can be queued.
  */
 #define	BLOCKIF_IOV_MAX		128	/* not practical to be IOV_MAX */
 #define	BLOCKIF_RING_MAX	128
 
 /*
  * A block I/O request submitted by a device emulation.
  *
  * br_iovcnt   - number of valid entries in br_iov.
  * br_offset   - byte offset of the request on the backing store.
  * br_resid    - bytes still to be transferred; the block layer decreases
  *               it as data is moved.
  * br_callback - completion callback, called with the request and 0 or an
  *               errno value.
  * br_param    - presumably caller-private data for the callback; the
  *               block layer does not appear to touch it (confirm).
  * br_iov      - scatter/gather list of at most BLOCKIF_IOV_MAX entries.
  */
 struct blockif_req {
 	int		br_iovcnt;
 	off_t		br_offset;
 	ssize_t		br_resid;
 	void		(*br_callback)(struct blockif_req *req, int err);
 	void		*br_param;
 	struct iovec	br_iov[BLOCKIF_IOV_MAX];
 };
 
+struct pci_devinst;
 struct blockif_ctxt;
 
 /* Resize notification: context, registered cb_arg, new media size. */
 typedef void blockif_resize_cb(struct blockif_ctxt *, void *, size_t);
 
 /*
  * Configuration, open and query routines.  blockif_open() returns NULL
  * on failure; blockif_add_boot_device() registers the context's
  * "bootindex", if one was configured, for the given PCI device.
  */
 int	blockif_legacy_config(nvlist_t *nvl, const char *opts);
+int	blockif_add_boot_device(struct pci_devinst *const pi,
+    struct blockif_ctxt *const bc);
 struct blockif_ctxt *blockif_open(nvlist_t *nvl, const char *ident);
 int	blockif_register_resize_callback(struct blockif_ctxt *bc,
     blockif_resize_cb *cb, void *cb_arg);
 off_t	blockif_size(struct blockif_ctxt *bc);
 void	blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
     uint8_t *s);
 int	blockif_sectsz(struct blockif_ctxt *bc);
 void	blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off);
 int	blockif_queuesz(struct blockif_ctxt *bc);
 int	blockif_is_ro(struct blockif_ctxt *bc);
 int	blockif_candelete(struct blockif_ctxt *bc);
 /*
  * Request submission.  The read/write/flush/delete calls return 0 once
  * the request is queued or E2BIG when the queue limit is exceeded;
  * completion is reported through br_callback.
  */
 int	blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
 int	blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
 int	blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
 int	blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
 int	blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
 int	blockif_close(struct blockif_ctxt *bc);
 #ifdef BHYVE_SNAPSHOT
 void	blockif_pause(struct blockif_ctxt *bc);
 void	blockif_resume(struct blockif_ctxt *bc);
 #endif
 
 #endif /* _BLOCK_IF_H_ */
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index 2c36e1d5cbb9..a2731876bbea 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -1,2739 +1,2746 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2013  Zhixiang Yu <zcore@freebsd.org>
  * Copyright (c) 2015-2016 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
 #include <sys/ioctl.h>
 #include <sys/disk.h>
 #include <sys/ata.h>
 #include <sys/endian.h>
 
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <strings.h>
 #include <unistd.h>
 #include <assert.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include <inttypes.h>
 #include <md5.h>
 
 #include "bhyverun.h"
 #include "config.h"
 #include "debug.h"
 #include "pci_emul.h"
 #ifdef BHYVE_SNAPSHOT
 #include "snapshot.h"
 #endif
 #include "ahci.h"
 #include "block_if.h"
 
 #define	DEF_PORTS	6	/* Intel ICH8 AHCI supports 6 ports */
 #define	MAX_PORTS	32	/* AHCI supports 32 ports */
 
 #define	PxSIG_ATA	0x00000101 /* ATA drive */
 #define	PxSIG_ATAPI	0xeb140101 /* ATAPI drive */
 
 /*
  * FIS type codes.  The FIS builders in this file store the code in byte 0
  * of every FIS they construct.
  */
 enum sata_fis_type {
 	FIS_TYPE_REGH2D		= 0x27,	/* Register FIS - host to device */
 	FIS_TYPE_REGD2H		= 0x34,	/* Register FIS - device to host */
 	FIS_TYPE_DMAACT		= 0x39,	/* DMA activate FIS - device to host */
 	FIS_TYPE_DMASETUP	= 0x41,	/* DMA setup FIS - bidirectional */
 	FIS_TYPE_DATA		= 0x46,	/* Data FIS - bidirectional */
 	FIS_TYPE_BIST		= 0x58,	/* BIST activate FIS - bidirectional */
 	FIS_TYPE_PIOSETUP	= 0x5F,	/* PIO setup FIS - device to host */
 	FIS_TYPE_SETDEVBITS	= 0xA1,	/* Set dev bits FIS - device to host */
 };
 
 /*
  * SCSI opcodes
  */
 #define	TEST_UNIT_READY		0x00
 #define	REQUEST_SENSE		0x03
 #define	INQUIRY			0x12
 #define	START_STOP_UNIT		0x1B
 #define	PREVENT_ALLOW		0x1E
 #define	READ_CAPACITY		0x25
 #define	READ_10			0x28
 #define	POSITION_TO_ELEMENT	0x2B
 #define	READ_TOC		0x43
 #define	GET_EVENT_STATUS_NOTIFICATION 0x4A
 #define	MODE_SENSE_10		0x5A
 #define	REPORT_LUNS		0xA0
 #define	READ_12			0xA8
 #define	READ_CD			0xBE
 
 /*
  * SCSI mode page codes
  */
 #define	MODEPAGE_RW_ERROR_RECOVERY	0x01
 #define	MODEPAGE_CD_CAPABILITIES	0x2A
 
 /*
  * ATA commands
  */
 #define	ATA_SF_ENAB_SATA_SF		0x10
 #define	ATA_SATA_SF_AN			0x05
 #define	ATA_SF_DIS_SATA_SF		0x90
 
 /*
  * Debug printf
  */
 #ifdef AHCI_DEBUG
 static FILE *dbg;
 #define DPRINTF(format, arg...)	do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0)
 #else
 #define DPRINTF(format, arg...)
 #endif
 #define WPRINTF(format, arg...) printf(format, ##arg)
 
 /*
  * Parenthesized so that the macro expands to a single value (21) even when
  * it is used inside a larger expression such as 2 * AHCI_PORT_IDENT.
  */
 #define AHCI_PORT_IDENT (20 + 1)
 
 /*
  * Per-request bookkeeping that wraps a blockif request.  An entry lives
  * either on the port's free list (iofhd) or on its busy list (iobhd).
  */
 struct ahci_ioreq {
 	struct blockif_req io_req;	/* request handed to the blockif layer */
 	struct ahci_port *io_pr;	/* NOTE(review): presumably the owning port -- confirm */
 	STAILQ_ENTRY(ahci_ioreq) io_flist;	/* free-list linkage */
 	TAILQ_ENTRY(ahci_ioreq) io_blist;	/* busy-list linkage */
 	uint8_t *cfis;			/* command FIS of the command being served */
 	uint32_t len;			/* total length of the command, in bytes */
 	uint32_t done;			/* bytes of 'len' already covered */
 	int slot;			/* command slot number */
 	int more;			/* nonzero if the command needs further requests */
 	int readop;			/* 1 for a read, 0 for a write */
 };
 
 /*
  * Emulation state of a single port.
  */
 struct ahci_port {
 	struct blockif_ctxt *bctx;	/* backing device; NULL means no device attached */
 	struct pci_ahci_softc *pr_sc;	/* controller this port belongs to */
 	struct ata_params ata_ident;	/* IDENTIFY data, built by ata_identify_init() */
 	uint8_t *cmd_lst;		/* command list; a slot's header is at slot * AHCI_CL_SIZE */
 	uint8_t *rfis;			/* received-FIS area filled by ahci_write_fis() */
 	int port;			/* index of this port within the controller */
 	int atapi;			/* nonzero for an ATAPI device */
 	int reset;
 	int waitforclear;		/* set when a FIS carrying ATA_S_ERROR is posted */
 	int mult_sectors;
 	uint8_t xfermode;
 	uint8_t err_cfis[20];		/* error record served by the NCQ Command Error Log */
 	uint8_t sense_key;		/* ATAPI sense key recorded on a failed command */
 	uint8_t asc;			/* ATAPI additional sense code recorded with it */
 	u_int ccs;
 	uint32_t pending;		/* bitmask of slots with a request in flight */
 
 	/*
 	 * NOTE(review): the fields below look like shadows of the per-port
 	 * registers (command list base, FIS base, interrupt status/enable,
 	 * command, task file data, ...) -- confirm against the register
 	 * read/write handlers.
 	 */
 	uint32_t clb;
 	uint32_t clbu;
 	uint32_t fb;
 	uint32_t fbu;
 	uint32_t is;
 	uint32_t ie;
 	uint32_t cmd;
 	uint32_t unused0;
 	uint32_t tfd;
 	uint32_t sig;
 	uint32_t ssts;
 	uint32_t sctl;
 	uint32_t serr;
 	uint32_t sact;
 	uint32_t ci;
 	uint32_t sntf;
 	uint32_t fbs;
 
 	/*
 	 * i/o request info
 	 */
 	struct ahci_ioreq *ioreq;
 	int ioqsz;
 	STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd;	/* free requests */
 	TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd;	/* requests in flight */
 };
 
 /*
  * Command header as laid out in the port's command list; the header of a
  * slot is found at cmd_lst + slot * AHCI_CL_SIZE.
  */
 struct ahci_cmd_hdr {
 	uint16_t flags;
 	uint16_t prdtl;		/* number of PRDT entries that follow the command FIS */
 	uint32_t prdbc;		/* byte count transferred; set by write_prdt() */
 	uint64_t ctba;		/* NOTE(review): presumably the command table address -- confirm */
 	uint32_t reserved[4];
 };
 
 /*
  * One PRDT entry; the table starts 0x80 bytes after the command FIS.
  */
 struct ahci_prdt_entry {
 	uint64_t dba;		/* guest-physical address of the data buffer */
 	uint32_t reserved;
 #define	DBCMASK		0x3fffff
 	uint32_t dbc;		/* (dbc & DBCMASK) + 1 is the buffer size in bytes */
 };
 
 /*
  * Controller-wide state shared by all ports.
  */
 struct pci_ahci_softc {
 	struct pci_devinst *asc_pi;	/* PCI device instance used to raise interrupts */
 	pthread_mutex_t	mtx;		/* ahci_port_stop() asserts that this is held */
 	int ports;			/* number of port[] entries in use */
 	uint32_t cap;
 	uint32_t ghc;			/* AHCI_GHC_IE here gates interrupt delivery */
 	uint32_t is;			/* bit i set: port i has an enabled interrupt pending */
 	uint32_t pi;
 	uint32_t vs;
 	uint32_t ccc_ctl;
 	uint32_t ccc_pts;
 	uint32_t em_loc;
 	uint32_t em_ctl;
 	uint32_t cap2;
 	uint32_t bohc;
 	uint32_t lintr;			/* nonzero while the legacy interrupt is asserted */
 	struct ahci_port port[MAX_PORTS];
 };
 /* VM context of the controller's PCI device, as needed by paddr_guest2host(). */
 #define	ahci_ctx(sc)	((sc)->asc_pi->pi_vmctx)
 
 static void ahci_handle_port(struct ahci_port *p);
 
 /*
  * Store 'lba', biased by 150 frames, into buf[0..2] as minute, second and
  * frame, using 75 frames per second and 60 seconds per minute.
  */
 static inline void lba_to_msf(uint8_t *buf, int lba)
 {
 	int frames = lba + 150;
 	int seconds = frames / 75;
 
 	buf[2] = frames % 75;
 	buf[1] = seconds % 60;
 	buf[0] = seconds / 60;
 }
 
 /*
  * Generate HBA interrupts on global IS register write.
  *
  * First folds every port's enabled interrupt status (PxIS & PxIE) into the
  * global IS.  Then, depending on the result: deasserts the legacy interrupt
  * when nothing is pending or interrupts are globally disabled; asserts the
  * legacy interrupt when no MSI vectors are available; otherwise raises the
  * MSI vectors that correspond to bits set in 'mask'.
  *
  * All bit masks are built with unsigned shifts: with up to 32 ports or
  * vectors the index can reach 31, and shifting a signed 1 that far is
  * undefined behavior.
  */
 static void
 ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask)
 {
 	struct pci_devinst *pi = sc->asc_pi;
 	struct ahci_port *p;
 	int i, nmsg;
 	uint32_t mmask;
 
 	/* Update global IS from PxIS/PxIE. */
 	for (i = 0; i < sc->ports; i++) {
 		p = &sc->port[i];
 		if (p->is & p->ie)
 			sc->is |= (1U << i);
 	}
 	DPRINTF("%s(%08x) %08x", __func__, mask, sc->is);
 
 	/* If there is nothing enabled -- clear legacy interrupt and exit. */
 	if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) {
 		if (sc->lintr) {
 			pci_lintr_deassert(pi);
 			sc->lintr = 0;
 		}
 		return;
 	}
 
 	/* If there is anything and no MSI -- assert legacy interrupt. */
 	nmsg = pci_msi_maxmsgnum(pi);
 	if (nmsg == 0) {
 		if (!sc->lintr) {
 			sc->lintr = 1;
 			pci_lintr_assert(pi);
 		}
 		return;
 	}
 
 	/*
 	 * Assert respective MSIs for ports that were touched.  When there are
 	 * fewer vectors than ports, the last vector is shared by all the
 	 * remaining ports, hence the open-ended mask for it.
 	 */
 	for (i = 0; i < nmsg; i++) {
 		if (sc->ports <= nmsg || i < nmsg - 1)
 			mmask = 1U << i;
 		else
 			mmask = 0xffffffff << i;
 		if (sc->is & mask && mmask & mask)
 			pci_generate_msi(pi, i);
 	}
 }
 
 /*
  * Generate HBA interrupt on specific port event.
  *
  * Does nothing unless the port has an enabled interrupt pending
  * (PxIS & PxIE).  A port that owns a dedicated MSI vector always gets its
  * vector raised (if interrupts are globally enabled).  A port that shares
  * the last MSI vector or the legacy interrupt only raises it when its bit
  * in the global IS was not already set.
  *
  * The port's bit is built with an unsigned shift because the port index
  * may be 31, for which a signed "1 << port" is undefined behavior.
  */
 static void
 ahci_port_intr(struct ahci_port *p)
 {
 	struct pci_ahci_softc *sc = p->pr_sc;
 	struct pci_devinst *pi = sc->asc_pi;
 	const uint32_t portbit = 1U << p->port;
 	int nmsg;
 
 	DPRINTF("%s(%d) %08x/%08x %08x", __func__,
 	    p->port, p->is, p->ie, sc->is);
 
 	/* If there is nothing enabled -- we are done. */
 	if ((p->is & p->ie) == 0)
 		return;
 
 	/* In case of non-shared MSI always generate interrupt. */
 	nmsg = pci_msi_maxmsgnum(pi);
 	if (sc->ports <= nmsg || p->port < nmsg - 1) {
 		sc->is |= portbit;
 		if ((sc->ghc & AHCI_GHC_IE) == 0)
 			return;
 		pci_generate_msi(pi, p->port);
 		return;
 	}
 
 	/* If IS for this port is already set -- do nothing. */
 	if (sc->is & portbit)
 		return;
 
 	sc->is |= portbit;
 
 	/* If interrupts are enabled -- generate one. */
 	if ((sc->ghc & AHCI_GHC_IE) == 0)
 		return;
 	if (nmsg > 0) {
 		/* Shared case: use the last vector. */
 		pci_generate_msi(pi, nmsg - 1);
 	} else if (!sc->lintr) {
 		sc->lintr = 1;
 		pci_lintr_assert(pi);
 	}
 }
 
 /*
  * Copy a device-to-host FIS into the port's received-FIS area and raise
  * the matching port interrupt.
  *
  * Nothing happens when the port has no received-FIS area or FIS receive
  * (AHCI_P_CMD_FRE) is off.  Only register D2H, set-device-bits and PIO
  * setup FISes are handled; each has its own offset and length inside the
  * area and its own interrupt status bit, which is requested by bit 6 of
  * FIS byte 1.  A status byte (fis[2]) with ATA_S_ERROR additionally sets
  * waitforclear and requests the task-file-error interrupt.
  */
 static void
 ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis)
 {
 	int rfis_off, fis_len, irq_bit, irq;
 
 	if (p->rfis == NULL || (p->cmd & AHCI_P_CMD_FRE) == 0)
 		return;
 
 	if (ft == FIS_TYPE_REGD2H) {
 		rfis_off = 0x40;
 		fis_len = 20;
 		irq_bit = AHCI_P_IX_DHR;
 	} else if (ft == FIS_TYPE_SETDEVBITS) {
 		rfis_off = 0x58;
 		fis_len = 8;
 		irq_bit = AHCI_P_IX_SDB;
 	} else if (ft == FIS_TYPE_PIOSETUP) {
 		rfis_off = 0x20;
 		fis_len = 20;
 		irq_bit = AHCI_P_IX_PS;
 	} else {
 		WPRINTF("unsupported fis type %d", ft);
 		return;
 	}
 
 	/* The interrupt is only requested when the FIS asks for it. */
 	irq = (fis[1] & (1 << 6)) ? irq_bit : 0;
 	if (fis[2] & ATA_S_ERROR) {
 		p->waitforclear = 1;
 		irq |= AHCI_P_IX_TFE;
 	}
 
 	memcpy(p->rfis + rfis_off, fis, fis_len);
 
 	/* Signal only if at least one requested bit is not already pending. */
 	if ((~p->is & irq) != 0) {
 		p->is |= irq;
 		ahci_port_intr(p);
 	}
 }
 
 /*
  * Post a PIO setup FIS whose only non-zero byte is the FIS type.
  */
 static void
 ahci_write_fis_piosetup(struct ahci_port *p)
 {
 	uint8_t fis[20] = { [0] = FIS_TYPE_PIOSETUP };
 
 	ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis);
 }
 
 /*
  * Build and post a set-device-bits FIS for 'slot'.
  *
  * 'tfd' carries the status in bits 0-7 (only the bits in 0x77 are used)
  * and the error code in bits 8-15.  On error the slot, status, error and
  * bytes 4..19 of the command FIS are saved in p->err_cfis.  On success the
  * slot's bit is written to bytes 4..7 of the FIS and cleared from p->sact.
  * The status bits covered by 0x77 in p->tfd are replaced in both cases.
  */
 static void
 ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)
 {
 	uint8_t fis[8];
 	uint8_t error;
 	uint32_t slotbit;
 
 	error = (tfd >> 8) & 0xff;
 	tfd &= 0x77;
 	memset(fis, 0, sizeof(fis));
 	fis[0] = FIS_TYPE_SETDEVBITS;
 	fis[1] = (1 << 6);		/* request an interrupt */
 	fis[2] = tfd;
 	fis[3] = error;
 	if (fis[2] & ATA_S_ERROR) {
 		p->err_cfis[0] = slot;
 		p->err_cfis[2] = tfd;
 		p->err_cfis[3] = error;
 		memcpy(&p->err_cfis[4], cfis + 4, 16);
 	} else {
 		/*
 		 * Unsigned shift: slot may be 31.  The value is copied with
 		 * memcpy() rather than stored through a cast pointer, since
 		 * fis[] is a byte array with no alignment guarantee.  Byte
 		 * order is the host's, as before.
 		 */
 		slotbit = 1U << slot;
 		memcpy(fis + 4, &slotbit, sizeof(slotbit));
 		p->sact &= ~slotbit;
 	}
 	p->tfd &= ~0x77;
 	p->tfd |= tfd;
 	ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis);
 }
 
 /*
  * Build and post a register device-to-host FIS that completes 'slot'.
  *
  * The low byte of 'tfd' is the status and the next byte the error code;
  * bytes 4..13 are echoed from the command FIS.  On error the details are
  * saved in p->err_cfis (tagged 0x80 in byte 0) and the slot stays set in
  * p->ci; otherwise the slot's bit is cleared from p->ci.  p->tfd is
  * replaced by 'tfd' in both cases.
  */
 static void
 ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)
 {
 	uint8_t fis[20] = { 0 };
 	const uint8_t status = tfd & 0xff;
 	const uint8_t error = (tfd >> 8) & 0xff;
 
 	fis[0] = FIS_TYPE_REGD2H;
 	fis[1] = (1 << 6);
 	fis[2] = status;
 	fis[3] = error;
 	memcpy(&fis[4], &cfis[4], 10);
 
 	if (status & ATA_S_ERROR) {
 		p->err_cfis[0] = 0x80;
 		p->err_cfis[2] = status;
 		p->err_cfis[3] = error;
 		memcpy(&p->err_cfis[4], cfis + 4, 16);
 	} else {
 		p->ci &= ~(1 << slot);
 	}
 
 	p->tfd = tfd;
 	ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
 }
 
 /*
  * Post the register D2H FIS that accepts a queued command: status is
  * ready/seek-complete, no error and no interrupt request.  The slot's bit
  * is removed from p->ci.
  */
 static void
 ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot)
 {
 	uint8_t fis[20] = { 0 };	/* bytes 1 and 3 stay 0: no interrupt, no error */
 
 	p->tfd = ATA_S_READY | ATA_S_DSC;
 	p->ci &= ~(1 << slot);
 
 	fis[0] = FIS_TYPE_REGD2H;
 	fis[2] = p->tfd;		/* Status */
 	ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
 }
 
 /*
  * Post the register D2H FIS sent after a port reset.  Bytes 3, 4 and 12
  * are set to 1; an ATAPI device additionally gets 0x14/0xeb in bytes 5/6.
  */
 static void
 ahci_write_reset_fis_d2h(struct ahci_port *p)
 {
 	uint8_t fis[20] = {
 		[0] = FIS_TYPE_REGD2H,
 		[3] = 1,
 		[4] = 1,
 		[12] = 1,
 	};
 
 	if (p->atapi) {
 		fis[5] = 0x14;
 		fis[6] = 0xeb;
 	}
 	ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
 }
 
 /*
  * Finish stopping a port.  Once the start bit (AHCI_P_CMD_ST) is clear and
  * no request is in flight, drop the running bit and current-slot field
  * from the command register and clear the current slot, the command issue
  * and active masks and the wait-for-clear flag.
  */
 static void
 ahci_check_stopped(struct ahci_port *p)
 {
 	/* Still started, or something is still in flight: nothing to do. */
 	if ((p->cmd & AHCI_P_CMD_ST) != 0 || p->pending != 0)
 		return;
 
 	p->ccs = 0;
 	p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK);
 	p->ci = 0;
 	p->sact = 0;
 	p->waitforclear = 0;
 }
 
 /*
  * Stop a port: try to cancel every request on the busy list.
  *
  * A request that blockif_cancel() manages to cancel is retired here: its
  * slot is cleared from p->sact (queued commands) or p->ci (everything
  * else) and from p->pending, and the request moves from the busy list back
  * to the free list.  Requests that cannot be cancelled are left alone.
  * Finally ahci_check_stopped() is called to complete the stop if nothing
  * remains in flight.
  *
  * The controller mutex must be held by the caller.
  */
 static void
 ahci_port_stop(struct ahci_port *p)
 {
 	struct ahci_ioreq *aior, *aior_next;
 	uint8_t *cfis;
 	int slot;
 	int error;
 
 	assert(pthread_mutex_isowned_np(&p->pr_sc->mtx));
 
 	/*
 	 * Elements are unlinked from the list while it is being walked, so
 	 * the successor has to be fetched before the body runs; that is what
 	 * the _SAFE iterator is for.
 	 */
 	TAILQ_FOREACH_SAFE(aior, &p->iobhd, io_blist, aior_next) {
 		/*
 		 * Try to cancel the outstanding blockif request.
 		 */
 		error = blockif_cancel(p->bctx, &aior->io_req);
 		if (error != 0)
 			continue;
 
 		slot = aior->slot;
 		cfis = aior->cfis;
 		if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
 		    cfis[2] == ATA_READ_FPDMA_QUEUED ||
 		    cfis[2] == ATA_SEND_FPDMA_QUEUED)
 			p->sact &= ~(1U << slot);	/* NCQ */
 		else
 			p->ci &= ~(1U << slot);
 
 		/*
 		 * This command is now done.
 		 */
 		p->pending &= ~(1U << slot);
 
 		/*
 		 * Delete the blockif request from the busy list
 		 */
 		TAILQ_REMOVE(&p->iobhd, aior, io_blist);
 
 		/*
 		 * Move the blockif request back to the free list
 		 */
 		STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
 	}
 
 	ahci_check_stopped(p);
 }
 
 /*
  * Put a port into its post-reset state.
  *
  * Without a backing device the port reports "no device" and returns.
  * Otherwise it reports an active link at the speed selected in sctl
  * (generation 3 when none is selected), loads the ATA or ATAPI signature
  * and posts the reset register D2H FIS.
  */
 static void
 ahci_port_reset(struct ahci_port *pr)
 {
 	uint32_t speed;
 
 	pr->serr = 0;
 	pr->sact = 0;
 	pr->xfermode = ATA_UDMA6;
 	pr->mult_sectors = 128;
 
 	if (pr->bctx == NULL) {
 		pr->ssts = ATA_SS_DET_NO_DEVICE;
 		pr->sig = 0xFFFFFFFF;
 		pr->tfd = 0x7F;
 		return;
 	}
 
 	speed = pr->sctl & ATA_SC_SPD_MASK;
 	if (speed == 0)
 		speed = ATA_SS_SPD_GEN3;
 	pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE;
 	pr->ssts |= speed;
 
 	pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA;
 	if (pr->atapi) {
 		pr->sig = PxSIG_ATAPI;
 	} else {
 		pr->sig = PxSIG_ATA;
 		pr->tfd |= ATA_S_READY;
 	}
 	ahci_write_reset_fis_d2h(pr);
 }
 
 static void
 ahci_reset(struct pci_ahci_softc *sc)
 {
 	int i;
 
 	sc->ghc = AHCI_GHC_AE;
 	sc->is = 0;
 
 	if (sc->lintr) {
 		pci_lintr_deassert(sc->asc_pi);
 		sc->lintr = 0;
 	}
 
 	for (i = 0; i < sc->ports; i++) {
 		sc->port[i].ie = 0;
 		sc->port[i].is = 0;
 		sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD);
 		if (sc->port[i].bctx)
 			sc->port[i].cmd |= AHCI_P_CMD_CPS;
 		sc->port[i].sctl = 0;
 		ahci_port_reset(&sc->port[i]);
 	}
 }
 
 /*
  * Copy 'src' into the 'len'-byte field 'dest', padding with spaces once
  * the terminating NUL of 'src' is reached.  Each pair of adjacent bytes is
  * stored swapped (output index is the input index XOR 1).
  */
 static void
 ata_string(uint8_t *dest, const char *src, int len)
 {
 	int pos = 0;
 
 	while (pos < len) {
 		uint8_t ch = ' ';
 
 		if (*src != '\0')
 			ch = *src++;
 		dest[pos ^ 1] = ch;
 		pos++;
 	}
 }
 
 /*
  * Copy 'src' into the 'len'-byte field 'dest' in natural byte order,
  * padding with spaces once the terminating NUL of 'src' is reached.  The
  * result is not NUL-terminated.
  */
 static void
 atapi_string(uint8_t *dest, const char *src, int len)
 {
 	uint8_t *out = dest;
 	uint8_t *const end = dest + (len > 0 ? len : 0);
 
 	while (out < end) {
 		if (*src != '\0')
 			*out = *src++;
 		else
 			*out = ' ';
 		out++;
 	}
 }
 
 /*
  * Build up the iovec based on the PRDT, 'done' and 'len'.
  *
  * Fills aior->io_req with the guest buffers that cover the byte range
  * [aior->done, aior->len) of the command, using at most BLOCKIF_IOV_MAX
  * entries.  On return br_iovcnt/br_resid describe this request,
  * aior->done has advanced by the bytes covered, and aior->more tells
  * whether another request is needed to finish the command.
  */
 static void
 ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior,
     struct ahci_prdt_entry *prdt, uint16_t prdtl)
 {
 	struct blockif_req *breq = &aior->io_req;
 	uint32_t dbcsz, extra, left, skip, todo;
 	int i, j;
 
 	assert(aior->len >= aior->done);
 
 	/* Copy part of PRDT between 'done' and 'len' bytes into the iov. */
 	skip = aior->done;		/* bytes of the PRDT already consumed */
 	left = aior->len - aior->done;	/* bytes still to be mapped */
 	todo = 0;			/* bytes mapped by this request */
 	for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0;
 	    i++, prdt++) {
 		dbcsz = (prdt->dbc & DBCMASK) + 1;
 		/* Skip already done part of the PRDT */
 		if (dbcsz <= skip) {
 			skip -= dbcsz;
 			continue;
 		}
 		/* Partially consumed entry: start 'skip' bytes into it. */
 		dbcsz -= skip;
 		if (dbcsz > left)
 			dbcsz = left;
 		breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc),
 		    prdt->dba + skip, dbcsz);
 		breq->br_iov[j].iov_len = dbcsz;
 		todo += dbcsz;
 		left -= dbcsz;
 		skip = 0;
 		j++;
 	}
 
 	/* If we got limited by IOV length, round I/O down to sector size. */
 	if (j == BLOCKIF_IOV_MAX) {
 		extra = todo % blockif_sectsz(p->bctx);
 		todo -= extra;
 		assert(todo > 0);
 		/* Trim 'extra' bytes off the tail, dropping whole entries if needed. */
 		while (extra > 0) {
 			if (breq->br_iov[j - 1].iov_len > extra) {
 				breq->br_iov[j - 1].iov_len -= extra;
 				break;
 			}
 			extra -= breq->br_iov[j - 1].iov_len;
 			j--;
 		}
 	}
 
 	breq->br_iovcnt = j;
 	breq->br_resid = todo;
 	aior->done += todo;
 	/* More work remains only if bytes are left and PRDT entries were not exhausted. */
 	aior->more = (aior->done < aior->len && i < prdtl);
 }
 
 /*
  * Start (or continue, when 'done' is non-zero) a read or write command.
  *
  * The opcode in cfis[2] selects the direction and the layout of the LBA
  * and sector count in the command FIS: queued (NCQ), 48-bit or 28-bit.
  * A request is taken from the free list, its iovec is built from the PRDT
  * and it is submitted to the blockif layer.
  */
 static void
 ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
 {
 	struct ahci_ioreq *aior;
 	struct blockif_req *breq;
 	struct ahci_prdt_entry *prdt;
 	struct ahci_cmd_hdr *hdr;
 	uint64_t lba;
 	uint32_t len;
 	int err, first, ncq, readop;
 
 	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 	ncq = 0;
 	readop = 1;
 	first = (done == 0);	/* first request issued for this command */
 
 	if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 ||
 	    cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 ||
 	    cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 ||
 	    cfis[2] == ATA_WRITE_FPDMA_QUEUED)
 		readop = 0;
 
 	if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
 	    cfis[2] == ATA_READ_FPDMA_QUEUED) {
 		/* Queued: 48-bit LBA, count in bytes 11 and 3; 0 means 65536. */
 		lba = ((uint64_t)cfis[10] << 40) |
 			((uint64_t)cfis[9] << 32) |
 			((uint64_t)cfis[8] << 24) |
 			((uint64_t)cfis[6] << 16) |
 			((uint64_t)cfis[5] << 8) |
 			cfis[4];
 		len = cfis[11] << 8 | cfis[3];
 		if (!len)
 			len = 65536;
 		ncq = 1;
 	} else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 ||
 	    cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 ||
 	    cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) {
 		/* 48-bit: count in bytes 13 and 12; 0 means 65536. */
 		lba = ((uint64_t)cfis[10] << 40) |
 			((uint64_t)cfis[9] << 32) |
 			((uint64_t)cfis[8] << 24) |
 			((uint64_t)cfis[6] << 16) |
 			((uint64_t)cfis[5] << 8) |
 			cfis[4];
 		len = cfis[13] << 8 | cfis[12];
 		if (!len)
 			len = 65536;
 	} else {
 		/* 28-bit: top LBA nibble in byte 7, count in byte 12; 0 means 256. */
 		lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) |
 			(cfis[5] << 8) | cfis[4];
 		len = cfis[12];
 		if (!len)
 			len = 256;
 	}
 	/* Convert sectors to bytes. */
 	lba *= blockif_sectsz(p->bctx);
 	len *= blockif_sectsz(p->bctx);
 
 	/* Pull request off free list */
 	aior = STAILQ_FIRST(&p->iofhd);
 	assert(aior != NULL);
 	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
 
 	aior->cfis = cfis;
 	aior->slot = slot;
 	aior->len = len;
 	aior->done = done;
 	aior->readop = readop;
 	breq = &aior->io_req;
 	breq->br_offset = lba + done;
 	ahci_build_iov(p, aior, prdt, hdr->prdtl);
 
 	/* Mark this command in-flight. */
 	p->pending |= 1 << slot;
 
 	/* Stuff request onto busy list. */
 	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
 
 	/* A queued command is acknowledged once, before its first request. */
 	if (ncq && first)
 		ahci_write_fis_d2h_ncq(p, slot);
 
 	if (readop)
 		err = blockif_read(p->bctx, breq);
 	else
 		err = blockif_write(p->bctx, breq);
 	assert(err == 0);
 }
 
 /*
  * Submit a cache flush for 'slot' to the blockif layer.  The request
  * carries no data, so its length, progress and continuation flag are all
  * zero.
  */
 static void
 ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	struct ahci_ioreq *aior = STAILQ_FIRST(&p->iofhd);
 	int err;
 
 	/* Take a request off the free list. */
 	assert(aior != NULL);
 	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
 
 	aior->slot = slot;
 	aior->cfis = cfis;
 	aior->len = 0;
 	aior->done = 0;
 	aior->more = 0;
 
 	/* The command is now in flight and sits on the busy list. */
 	p->pending |= 1 << slot;
 	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
 
 	err = blockif_flush(p->bctx, &aior->io_req);
 	assert(err == 0);
 }
 
 /*
  * Gather up to 'size' bytes from the guest buffers described by the PRDT
  * of 'slot' into 'buf'.  Copying stops when 'size' bytes have been read or
  * the PRDT entries run out, whichever comes first.
  */
 static inline void
 read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf,
     unsigned int size)
 {
 	struct ahci_cmd_hdr *hdr =
 	    (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 	struct ahci_prdt_entry *table =
 	    (struct ahci_prdt_entry *)(cfis + 0x80);
 	uint8_t *dst = buf;
 	unsigned int remaining = size;
 	int idx;
 
 	for (idx = 0; idx < hdr->prdtl && remaining; idx++) {
 		struct ahci_prdt_entry *ent = &table[idx];
 		uint32_t seglen = (ent->dbc & DBCMASK) + 1;
 		uint8_t *src = paddr_guest2host(ahci_ctx(p->pr_sc), ent->dba,
 		    seglen);
 		unsigned int chunk = MIN(remaining, seglen);
 
 		memcpy(dst, src, chunk);
 		remaining -= chunk;
 		dst += chunk;
 	}
 }
 
 /*
  * Handle a TRIM request (DATA SET MANAGEMENT or its queued variant).
  *
  * The range list is read from the PRDT into a local 512-byte buffer.  Each
  * 8-byte entry holds a 48-bit LBA followed by a 16-bit sector count.
  * Entries with a zero count are skipped; the first non-zero entry at or
  * after byte offset 'done' is submitted as a blockif delete, and the
  * completion path calls back in with the updated 'done' until the list is
  * exhausted, at which point the command is completed here.
  *
  * Only one 512-byte block of ranges is ever buffered, so the length taken
  * from the command FIS is bounded to the buffer size and the buffer is
  * zeroed before it is filled.  Without that, a guest-supplied block count
  * other than one would make 'done' index past the end of buf[], and a
  * short PRDT would leave uninitialized stack bytes to be interpreted as
  * ranges to delete.
  */
 static void
 ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
 {
 	struct ahci_ioreq *aior;
 	struct blockif_req *breq;
 	uint8_t *entry;
 	uint64_t elba;
 	uint32_t len, elen;
 	int err, first, ncq;
 	uint8_t buf[512];
 
 	first = (done == 0);
 	if (cfis[2] == ATA_DATA_SET_MANAGEMENT) {
 		len = (uint16_t)cfis[13] << 8 | cfis[12];
 		len *= 512;
 		ncq = 0;
 	} else { /* ATA_SEND_FPDMA_QUEUED */
 		len = (uint16_t)cfis[11] << 8 | cfis[3];
 		len *= 512;
 		ncq = 1;
 	}
 
 	/*
 	 * Never walk beyond the single block that is buffered.  'done' is a
 	 * multiple of 8 that starts below 'len', so with 'len' bounded every
 	 * entry read below lies inside buf[].
 	 */
 	if (len == 0 || len > sizeof(buf))
 		len = sizeof(buf);
 
 	/* Entries the guest did not supply read back as empty ranges. */
 	memset(buf, 0, sizeof(buf));
 	read_prdt(p, slot, cfis, buf, sizeof(buf));
 
 next:
 	entry = &buf[done];
 	elba = ((uint64_t)entry[5] << 40) |
 		((uint64_t)entry[4] << 32) |
 		((uint64_t)entry[3] << 24) |
 		((uint64_t)entry[2] << 16) |
 		((uint64_t)entry[1] << 8) |
 		entry[0];
 	elen = (uint16_t)entry[7] << 8 | entry[6];
 	done += 8;
 	if (elen == 0) {
 		if (done >= len) {
 			/* End of the list: complete the command. */
 			if (ncq) {
 				if (first)
 					ahci_write_fis_d2h_ncq(p, slot);
 				ahci_write_fis_sdb(p, slot, cfis,
 				    ATA_S_READY | ATA_S_DSC);
 			} else {
 				ahci_write_fis_d2h(p, slot, cfis,
 				    ATA_S_READY | ATA_S_DSC);
 			}
 			p->pending &= ~(1U << slot);
 			ahci_check_stopped(p);
 			if (!first)
 				ahci_handle_port(p);
 			return;
 		}
 		goto next;
 	}
 
 	/*
 	 * Pull request off free list
 	 */
 	aior = STAILQ_FIRST(&p->iofhd);
 	assert(aior != NULL);
 	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
 	aior->cfis = cfis;
 	aior->slot = slot;
 	aior->len = len;
 	aior->done = done;
 	aior->more = (len != done);
 
 	breq = &aior->io_req;
 	breq->br_offset = elba * blockif_sectsz(p->bctx);
 	breq->br_resid = elen * blockif_sectsz(p->bctx);
 
 	/*
 	 * Mark this command in-flight.
 	 */
 	p->pending |= 1U << slot;
 
 	/*
 	 * Stuff request onto busy list
 	 */
 	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
 
 	if (ncq && first)
 		ahci_write_fis_d2h_ncq(p, slot);
 
 	err = blockif_delete(p->bctx, breq);
 	assert(err == 0);
 }
 
 /*
  * Scatter up to 'size' bytes from 'buf' into the guest buffers described
  * by the PRDT of 'slot'.  Copying stops when all bytes are written or the
  * PRDT entries run out; the number of bytes actually written is stored in
  * the command header's prdbc field.
  */
 static inline void
 write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf,
     unsigned int size)
 {
 	struct ahci_cmd_hdr *hdr =
 	    (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 	struct ahci_prdt_entry *table =
 	    (struct ahci_prdt_entry *)(cfis + 0x80);
 	uint8_t *src = buf;
 	unsigned int remaining = size;
 	int idx;
 
 	for (idx = 0; idx < hdr->prdtl && remaining; idx++) {
 		struct ahci_prdt_entry *ent = &table[idx];
 		uint32_t seglen = (ent->dbc & DBCMASK) + 1;
 		uint8_t *dst = paddr_guest2host(ahci_ctx(p->pr_sc), ent->dba,
 		    seglen);
 		int chunk = MIN(remaining, seglen);
 
 		memcpy(dst, src, chunk);
 		remaining -= chunk;
 		src += chunk;
 	}
 	hdr->prdbc = size - remaining;
 }
 
 /*
  * Store a checksum in the last byte of 'buf' so that the 8-bit sum of all
  * 'size' bytes becomes zero.
  */
 static void
 ahci_checksum(uint8_t *buf, int size)
 {
 	const int last = size - 1;
 	uint8_t total = 0;
 	int idx = 0;
 
 	while (idx < last)
 		total += buf[idx++];
 	buf[last] = 0x100 - total;
 }
 
 /*
  * Handle a read-log command.  One 512-byte page is built according to the
  * log address in cfis[4] and written to the guest through the PRDT.
  *
  * The command is aborted for ATAPI devices, when there is no PRDT, when
  * cfis[5]/cfis[9] are non-zero, when the count in cfis[12]/cfis[13] is not
  * exactly one, or when the log address is not one of 0x00, 0x10 or 0x13.
  */
 static void
 ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	struct ahci_cmd_hdr *hdr;
 	uint32_t buf[128];
 	/* Byte and 16-bit views of the same page. */
 	uint8_t *buf8 = (uint8_t *)buf;
 	uint16_t *buf16 = (uint16_t *)buf;
 
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 	if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 ||
 	    cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) {
 		ahci_write_fis_d2h(p, slot, cfis,
 		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 		return;
 	}
 
 	memset(buf, 0, sizeof(buf));
 	if (cfis[4] == 0x00) {	/* Log directory */
 		buf16[0x00] = 1; /* Version -- 1 */
 		buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */
 		buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */
 	} else if (cfis[4] == 0x10) {	/* NCQ Command Error Log */
 		/* Return the record saved by the last failed command, checksummed. */
 		memcpy(buf8, p->err_cfis, sizeof(p->err_cfis));
 		ahci_checksum(buf8, sizeof(buf));
 	} else if (cfis[4] == 0x13) {	/* SATA NCQ Send and Receive Log */
 		/* Advertise queued TRIM only for writable backends that can delete. */
 		if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) {
 			buf[0x00] = 1;	/* SFQ DSM supported */
 			buf[0x01] = 1;	/* SFQ DSM TRIM supported */
 		}
 	} else {
 		ahci_write_fis_d2h(p, slot, cfis,
 		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 		return;
 	}
 
 	/* Only ATA_READ_LOG_EXT is preceded by a PIO setup FIS. */
 	if (cfis[2] == ATA_READ_LOG_EXT)
 		ahci_write_fis_piosetup(p);
 	write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
 	ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
 }
 
 /*
  * Handle the ATA identify command: return the port's prepared IDENTIFY
  * data through the PRDT.  ATAPI devices, and commands without any PRDT
  * entry, get the command aborted instead.
  */
 static void
 handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	struct ahci_cmd_hdr *hdr =
 	    (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 
 	if (p->atapi || hdr->prdtl == 0) {
 		ahci_write_fis_d2h(p, slot, cfis,
 		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 		return;
 	}
 
 	ahci_write_fis_piosetup(p);
 	write_prdt(p, slot, cfis, (void *)&p->ata_ident,
 	    sizeof(p->ata_ident));
 	ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
 }
 
 /*
  * Populate p->ata_ident, the IDENTIFY data later returned to the guest.
  *
  * With 'atapi' non-zero the ATAPI CD-ROM variant is built from constants
  * and the port's transfer mode and link speed.  Otherwise the disk variant
  * is built, which additionally depends on the blockif backend: size,
  * geometry, logical/physical sector sizes, read-only state and delete
  * support.  Either way the structure is finished with ahci_checksum().
  */
 static void
 ata_identify_init(struct ahci_port* p, int atapi)
 {
 	struct ata_params* ata_ident = &p->ata_ident;
 
 	if (atapi) {
 		ata_ident->config = ATA_PROTO_ATAPI | ATA_ATAPI_TYPE_CDROM |
 		    ATA_ATAPI_REMOVABLE | ATA_DRQ_FAST;
 		ata_ident->capabilities1 = ATA_SUPPORT_LBA |
 			ATA_SUPPORT_DMA;
 		ata_ident->capabilities2 = (1 << 14 | 1);
 		ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88;
 		ata_ident->obsolete62 = 0x3f;
 		ata_ident->mwdmamodes = 7;
 		/* Mark the currently selected multiword DMA mode, if any. */
 		if (p->xfermode & ATA_WDMA0)
 			ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8));
 		ata_ident->apiomodes = 3;
 		ata_ident->mwdmamin = 0x0078;
 		ata_ident->mwdmarec = 0x0078;
 		ata_ident->pioblind = 0x0078;
 		ata_ident->pioiordy = 0x0078;
 		ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3);
 		ata_ident->satacapabilities2 = ((p->ssts & ATA_SS_SPD_MASK) >> 3);
 		ata_ident->satasupport = ATA_SUPPORT_NCQ_STREAM;
 		ata_ident->version_major = 0x3f0;
 		ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
 			ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
 		ata_ident->support.command2 = (1 << 14);
 		ata_ident->support.extension = (1 << 14);
 		ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
 			ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
 		ata_ident->enabled.extension = (1 << 14);
 		ata_ident->udmamodes = 0x7f;
 		/* Mark the currently selected ultra DMA mode, if any. */
 		if (p->xfermode & ATA_UDMA0)
 			ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8));
 		ata_ident->transport_major = 0x1020;
 		ata_ident->integrity = 0x00a5;
 	} else {
 		uint64_t sectors;
 		int sectsz, psectsz, psectoff, candelete, ro;
 		uint16_t cyl;
 		uint8_t sech, heads;
 
 		/* Collect the backend properties the data depends on. */
 		ro = blockif_is_ro(p->bctx);
 		candelete = blockif_candelete(p->bctx);
 		sectsz = blockif_sectsz(p->bctx);
 		sectors = blockif_size(p->bctx) / sectsz;
 		blockif_chs(p->bctx, &cyl, &heads, &sech);
 		blockif_psectsz(p->bctx, &psectsz, &psectoff);
 		ata_ident->config = ATA_DRQ_FAST;
 		ata_ident->cylinders = cyl;
 		ata_ident->heads = heads;
 		ata_ident->sectors = sech;
 
 		ata_ident->sectors_intr = (0x8000 | 128);
 		ata_ident->tcg = 0;
 
 		ata_ident->capabilities1 = ATA_SUPPORT_DMA |
 			ATA_SUPPORT_LBA | ATA_SUPPORT_IORDY;
 		ata_ident->capabilities2 = (1 << 14);
 		ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88;
 		if (p->mult_sectors)
 			ata_ident->multi = (ATA_MULTI_VALID | p->mult_sectors);
 		/* 28-bit capacity: saturate when the disk is larger than 28 bits can express. */
 		if (sectors <= 0x0fffffff) {
 			ata_ident->lba_size_1 = sectors;
 			ata_ident->lba_size_2 = (sectors >> 16);
 		} else {
 			ata_ident->lba_size_1 = 0xffff;
 			ata_ident->lba_size_2 = 0x0fff;
 		}
 		ata_ident->mwdmamodes = 0x7;
 		if (p->xfermode & ATA_WDMA0)
 			ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8));
 		ata_ident->apiomodes = 0x3;
 		ata_ident->mwdmamin = 0x0078;
 		ata_ident->mwdmarec = 0x0078;
 		ata_ident->pioblind = 0x0078;
 		ata_ident->pioiordy = 0x0078;
 		ata_ident->support3 = 0;
 		ata_ident->queue = 31;
 		ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 |
 			ATA_SUPPORT_NCQ);
 		ata_ident->satacapabilities2 = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED |
 			(p->ssts & ATA_SS_SPD_MASK) >> 3);
 		ata_ident->version_major = 0x3f0;
 		ata_ident->version_minor = 0x28;
 		ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE |
 			ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP);
 		ata_ident->support.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE |
 			ATA_SUPPORT_FLUSHCACHE48 | 1 << 14);
 		ata_ident->support.extension = (1 << 14);
 		ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE |
 			ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP);
 		ata_ident->enabled.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE |
 			ATA_SUPPORT_FLUSHCACHE48 | 1 << 15);
 		ata_ident->enabled.extension = (1 << 14);
 		ata_ident->udmamodes = 0x7f;
 		if (p->xfermode & ATA_UDMA0)
 			ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8));
 		/* 48-bit capacity, split into four 16-bit words. */
 		ata_ident->lba_size48_1 = sectors;
 		ata_ident->lba_size48_2 = (sectors >> 16);
 		ata_ident->lba_size48_3 = (sectors >> 32);
 		ata_ident->lba_size48_4 = (sectors >> 48);
 
 		/* TRIM is advertised only for writable backends that can delete. */
 		if (candelete && !ro) {
 			ata_ident->support3 |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT;
 			ata_ident->max_dsm_blocks = 1;
 			ata_ident->support_dsm = ATA_SUPPORT_DSM_TRIM;
 		}
 		ata_ident->pss = ATA_PSS_VALID_VALUE;
 		ata_ident->lsalign = 0x4000;
 		/* Physical sector larger than logical: report the ratio (as a power of two) and the alignment offset. */
 		if (psectsz > sectsz) {
 			ata_ident->pss |= ATA_PSS_MULTLS;
 			ata_ident->pss |= ffsl(psectsz / sectsz) - 1;
 			ata_ident->lsalign |= (psectoff / sectsz);
 		}
 		/* Logical sector above 512 bytes: report its size in 16-bit words. */
 		if (sectsz > 512) {
 			ata_ident->pss |= ATA_PSS_LSSABOVE512;
 			ata_ident->lss_1 = sectsz / 2;
 			ata_ident->lss_2 = ((sectsz / 2) >> 16);
 		}
 		ata_ident->support2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
 		ata_ident->enabled2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
 		ata_ident->transport_major = 0x1020;
 		ata_ident->integrity = 0x00a5;
 	}
 	/* Last byte becomes the checksum over the whole structure. */
 	ahci_checksum((uint8_t*)ata_ident, sizeof(struct ata_params));
 }
 
 /*
  * Handle the ATAPI identify command: return the port's prepared IDENTIFY
  * data through the PRDT.  The command is aborted on non-ATAPI devices.
  */
 static void
 handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	if (p->atapi) {
 		ahci_write_fis_piosetup(p);
 		write_prdt(p, slot, cfis, (void *)&p->ata_ident,
 		    sizeof(p->ata_ident));
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
 		return;
 	}
 
 	ahci_write_fis_d2h(p, slot, cfis,
 	    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 }
 
 /*
  * Handle the INQUIRY packet command.  The packet is located 0x40 bytes
  * into the command table.
  *
  * When bit 0 of packet byte 1 is set (VPD), only page 0 is answered; any
  * other page fails with an illegal-request sense key and asc 0x24.
  * Otherwise the 36-byte standard data is returned.  The reply is cut to
  * the length the guest asked for in packet byte 4.
  */
 static void
 atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t buf[36];
 	uint8_t *acmd;
 	unsigned int len;
 	uint32_t tfd;
 
 	acmd = cfis + 0x40;
 
 	if (acmd[1] & 1) {		/* VPD */
 		if (acmd[2] == 0) {	/* Supported VPD pages */
 			buf[0] = 0x05;
 			buf[1] = 0;
 			buf[2] = 0;
 			buf[3] = 1;
 			buf[4] = 0;
 			len = 4 + buf[3];
 		} else {
 			p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 			p->asc = 0x24;
 			/* The sense key goes into the top nibble of the error byte. */
 			tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 			cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 			ahci_write_fis_d2h(p, slot, cfis, tfd);
 			return;
 		}
 	} else {
 		buf[0] = 0x05;
 		buf[1] = 0x80;
 		buf[2] = 0x00;
 		buf[3] = 0x21;
 		buf[4] = 31;
 		buf[5] = 0;
 		buf[6] = 0;
 		buf[7] = 0;
 		/* Space-padded identification strings at offsets 8, 16 and 32. */
 		atapi_string(buf + 8, "BHYVE", 8);
 		atapi_string(buf + 16, "BHYVE DVD-ROM", 16);
 		atapi_string(buf + 32, "001", 4);
 		len = sizeof(buf);
 	}
 
 	/* Never return more than the guest's allocation length. */
 	if (len > acmd[4])
 		len = acmd[4];
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	write_prdt(p, slot, cfis, buf, len);
 	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 }
 
 /*
  * Handle the READ CAPACITY packet command.  The 8-byte reply holds, both
  * big-endian, the number of the last 2048-byte block followed by the
  * block size (2048).
  */
 static void
 atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t capdata[8];
 	uint64_t nblocks = blockif_size(p->bctx) / 2048;
 
 	be32enc(&capdata[0], nblocks - 1);
 	be32enc(&capdata[4], 2048);
 
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	write_prdt(p, slot, cfis, capdata, sizeof(capdata));
 	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 }
 
 /*
  * Handle the READ TOC packet command.  The packet is located 0x40 bytes
  * into the command table; its bytes 7-8 hold the big-endian allocation
  * length and the top two bits of byte 9 select the reply format.
  *
  * Formats 0, 1 and 2 are answered from fixed templates describing a single
  * track; the end address is derived from the backend size.  Any other
  * format, or a format-0 request for a starting track other than 0, 1 or
  * 0xaa, fails with an illegal-request sense key and asc 0x24.  Every reply
  * is cut to the allocation length.
  */
 static void
 atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t *acmd;
 	uint8_t format;
 	unsigned int len;
 
 	acmd = cfis + 0x40;
 
 	len = be16dec(acmd + 7);
 	format = acmd[9] >> 6;
 	switch (format) {
 	case 0:
 	{
 		/* Header, optional descriptor for track 1, descriptor for track 0xaa. */
 		size_t size;
 		int msf;
 		uint64_t sectors;
 		uint8_t start_track, buf[20], *bp;
 
 		/* Bit 1 of packet byte 1 selects minute/second/frame addresses. */
 		msf = (acmd[1] >> 1) & 1;
 		start_track = acmd[6];
 		if (start_track > 1 && start_track != 0xaa) {
 			uint32_t tfd;
 			p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 			p->asc = 0x24;
 			tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 			cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 			ahci_write_fis_d2h(p, slot, cfis, tfd);
 			return;
 		}
 		/* Bytes 0-1 (the data length) are filled in at the end. */
 		bp = buf + 2;
 		*bp++ = 1;
 		*bp++ = 1;
 		if (start_track <= 1) {
 			*bp++ = 0;
 			*bp++ = 0x14;
 			*bp++ = 1;
 			*bp++ = 0;
 			if (msf) {
 				*bp++ = 0;
 				lba_to_msf(bp, 0);
 				bp += 3;
 			} else {
 				*bp++ = 0;
 				*bp++ = 0;
 				*bp++ = 0;
 				*bp++ = 0;
 			}
 		}
 		*bp++ = 0;
 		*bp++ = 0x14;
 		*bp++ = 0xaa;
 		*bp++ = 0;
 		/* Backend sectors divided by four. NOTE(review): presumably converts 512-byte sectors to 2048-byte blocks -- confirm. */
 		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
 		sectors >>= 2;
 		if (msf) {
 			*bp++ = 0;
 			lba_to_msf(bp, sectors);
 			bp += 3;
 		} else {
 			be32enc(bp, sectors);
 			bp += 4;
 		}
 		size = bp - buf;
 		/* The length field does not count itself. */
 		be16enc(buf, size - 2);
 		if (len > size)
 			len = size;
 		write_prdt(p, slot, cfis, buf, len);
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	}
 	case 1:
 	{
 		/* Fixed 12-byte reply: length 0xa, first and last number 1, rest zero. */
 		uint8_t buf[12];
 
 		memset(buf, 0, sizeof(buf));
 		buf[1] = 0xa;
 		buf[2] = 0x1;
 		buf[3] = 0x1;
 		if (len > sizeof(buf))
 			len = sizeof(buf);
 		write_prdt(p, slot, cfis, buf, len);
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	}
 	case 2:
 	{
 		/* Header followed by four 11-byte descriptors: points 0xa0, 0xa1, 0xa2 and 1. */
 		size_t size;
 		int msf;
 		uint64_t sectors;
 		uint8_t *bp, buf[50];
 
 		msf = (acmd[1] >> 1) & 1;
 		bp = buf + 2;
 		*bp++ = 1;
 		*bp++ = 1;
 
 		/* Point 0xa0 */
 		*bp++ = 1;
 		*bp++ = 0x14;
 		*bp++ = 0;
 		*bp++ = 0xa0;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 1;
 		*bp++ = 0;
 		*bp++ = 0;
 
 		/* Point 0xa1 */
 		*bp++ = 1;
 		*bp++ = 0x14;
 		*bp++ = 0;
 		*bp++ = 0xa1;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 1;
 		*bp++ = 0;
 		*bp++ = 0;
 
 		/* Point 0xa2: carries the end address derived from the backend size. */
 		*bp++ = 1;
 		*bp++ = 0x14;
 		*bp++ = 0;
 		*bp++ = 0xa2;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
 		sectors >>= 2;
 		if (msf) {
 			*bp++ = 0;
 			lba_to_msf(bp, sectors);
 			bp += 3;
 		} else {
 			be32enc(bp, sectors);
 			bp += 4;
 		}
 
 		/* Point 1: starts at address 0. */
 		*bp++ = 1;
 		*bp++ = 0x14;
 		*bp++ = 0;
 		*bp++ = 1;
 		*bp++ = 0;
 		*bp++ = 0;
 		*bp++ = 0;
 		if (msf) {
 			*bp++ = 0;
 			lba_to_msf(bp, 0);
 			bp += 3;
 		} else {
 			*bp++ = 0;
 			*bp++ = 0;
 			*bp++ = 0;
 			*bp++ = 0;
 		}
 
 		size = bp - buf;
 		be16enc(buf, size - 2);
 		if (len > size)
 			len = size;
 		write_prdt(p, slot, cfis, buf, len);
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	}
 	default:
 	{
 		/* Unsupported format: fail with illegal request. */
 		uint32_t tfd;
 
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x24;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, tfd);
 		break;
 	}
 	}
 }
 
 /*
  * Answer the ATAPI REPORT LUNS packet with a fixed 16-byte reply that is
  * all zeroes except for byte 3, which is set to 8.
  */
 static void
 atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t lun_list[16] = { [3] = 8 };
 
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	write_prdt(p, slot, cfis, lun_list, sizeof(lun_list));
 	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 }
 
 /*
  * Start (or continue) an ATAPI READ(10)/READ(12) transfer.
  *
  * The starting block is the big-endian 32-bit value at acmd[2]; the block
  * count is 16 bits at acmd[7] for READ_10 and 32 bits at acmd[6] otherwise.
  * Both are scaled by the 2048-byte block size.  'done' is the number of
  * bytes already transferred by earlier passes over the same command (0 on
  * the first call); it is added to the byte offset and stored in the
  * request.
  *
  * A request is taken from the port's free list, filled in, moved to the
  * busy list and handed to blockif_read(); completion is reported later
  * through the request's callback.
  */
 static void
 atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
 {
 	struct ahci_ioreq *aior;
 	struct ahci_cmd_hdr *hdr;
 	struct ahci_prdt_entry *prdt;
 	struct blockif_req *breq;
 	uint8_t *acmd;
 	uint64_t lba;
 	uint32_t len;
 	int err;
 
 	acmd = cfis + 0x40;
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
 
 	lba = be32dec(acmd + 2);
 	if (acmd[0] == READ_10)
 		len = be16dec(acmd + 7);
 	else
 		len = be32dec(acmd + 6);
 	if (len == 0) {
 		/*
 		 * Nothing to transfer: the command is complete as soon as
 		 * the D2H FIS is posted.  Return here so that no empty
 		 * blockif request is queued, which would complete the same
 		 * slot a second time from the I/O callback.
 		 */
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		return;
 	}
 	lba *= 2048;
 	len *= 2048;
 
 	/*
 	 * Pull request off free list
 	 */
 	aior = STAILQ_FIRST(&p->iofhd);
 	assert(aior != NULL);
 	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
 	aior->cfis = cfis;
 	aior->slot = slot;
 	aior->len = len;
 	aior->done = done;
 	aior->readop = 1;
 	breq = &aior->io_req;
 	breq->br_offset = lba + done;
 	ahci_build_iov(p, aior, prdt, hdr->prdtl);
 
 	/* Mark this command in-flight. */
 	p->pending |= 1 << slot;
 
 	/* Stuff request onto busy list. */
 	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
 
 	err = blockif_read(p->bctx, breq);
 	assert(err == 0);
 }
 
 /*
  * Answer the ATAPI REQUEST SENSE packet from the sense key and ASC saved
  * on the port.  The transfer length is acmd[4], clamped to the size of
  * the local reply buffer.
  */
 static void
 atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t sense[64];
 	const uint8_t *acmd = cfis + 0x40;
 	unsigned int xfer;
 
 	xfer = (acmd[4] > sizeof(sense)) ? sizeof(sense) : acmd[4];
 
 	/* Only the part that will be transferred is cleared. */
 	memset(sense, 0, xfer);
 	sense[0] = (1 << 7) | 0x70;
 	sense[2] = p->sense_key;
 	sense[7] = 10;
 	sense[12] = p->asc;
 
 	write_prdt(p, slot, cfis, sense, xfer);
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 }
 
 /*
  * Handle the ATAPI START STOP UNIT packet.  The low two bits of acmd[4]
  * select the action: value 2 is rejected with ILLEGAL REQUEST (asc 0x53),
  * every other value completes successfully without doing anything.
  */
 static void
 atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	const uint8_t action = cfis[0x40 + 4] & 3;
 	uint32_t tfd;
 
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 
 	if (action == 2) {
 		/* TODO eject media */
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x53;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 	} else {
 		tfd = ATA_S_READY | ATA_S_DSC;
 	}
 
 	ahci_write_fis_d2h(p, slot, cfis, tfd);
 }
 
 /*
  * Handle the ATAPI MODE SENSE(10) packet.
  *
  * acmd[7..8] is the big-endian maximum transfer length; the top two bits
  * of acmd[2] are the page control field and the low six bits the page
  * code.  Only page control 0 with the R/W error recovery or CD
  * capabilities page returns data.  Page control 3 fails with asc 0x39;
  * everything else fails with asc 0x24.  The slot is always completed with
  * a D2H FIS at the end.
  */
 static void
 atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t *acmd;
 	uint32_t tfd;
 	uint8_t pc, code;
 	unsigned int len;
 
 	acmd = cfis + 0x40;
 	len = be16dec(acmd + 7);
 	pc = acmd[2] >> 6;
 	code = acmd[2] & 0x3f;
 
 	switch (pc) {
 	case 0:
 		switch (code) {
 		case MODEPAGE_RW_ERROR_RECOVERY:
 		{
 			/* 8 header bytes followed by an 8-byte page (code 0x01). */
 			uint8_t buf[16];
 
 			if (len > sizeof(buf))
 				len = sizeof(buf);
 
 			memset(buf, 0, sizeof(buf));
 			/* Length field excludes its own two bytes. */
 			be16enc(buf, 16 - 2);
 			buf[2] = 0x70;
 			buf[8] = 0x01;
 			/* Page length: bytes following the two page header bytes. */
 			buf[9] = 16 - 10;
 			buf[11] = 0x05;
 			write_prdt(p, slot, cfis, buf, len);
 			tfd = ATA_S_READY | ATA_S_DSC;
 			break;
 		}
 		case MODEPAGE_CD_CAPABILITIES:
 		{
 			/* 8 header bytes followed by a 22-byte page (code 0x2A). */
 			uint8_t buf[30];
 
 			if (len > sizeof(buf))
 				len = sizeof(buf);
 
 			memset(buf, 0, sizeof(buf));
 			be16enc(buf, 30 - 2);
 			buf[2] = 0x70;
 			buf[8] = 0x2A;
 			buf[9] = 30 - 10;
 			buf[10] = 0x08;
 			buf[12] = 0x71;
 			be16enc(&buf[18], 2);
 			be16enc(&buf[20], 512);
 			write_prdt(p, slot, cfis, buf, len);
 			tfd = ATA_S_READY | ATA_S_DSC;
 			break;
 		}
 		default:
 			/* Unsupported page code: share the asc 0x24 error path. */
 			goto error;
 			break;
 		}
 		break;
 	case 3:
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x39;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 		break;
 	/* Reached by the goto above as well as by page control 1 and 2. */
 error:
 	case 1:
 	case 2:
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x24;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 		break;
 	}
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	ahci_write_fis_d2h(p, slot, cfis, tfd);
 }
 
 /*
  * Handle the ATAPI GET EVENT STATUS NOTIFICATION packet.  Requests with
  * bit 0 of acmd[1] clear are rejected with ILLEGAL REQUEST (asc 0x24);
  * otherwise a fixed 8-byte reply is returned, truncated to the length
  * given at acmd[7..8].
  */
 static void
 atapi_get_event_status_notification(struct ahci_port *p, int slot,
     uint8_t *cfis)
 {
 	uint8_t *acmd = cfis + 0x40;
 	uint32_t tfd;
 
 	if ((acmd[1] & 1) != 0) {
 		uint8_t event[8] = { 0 };
 		unsigned int xfer = be16dec(acmd + 7);
 
 		if (xfer > sizeof(event))
 			xfer = sizeof(event);
 
 		be16enc(event, 8 - 2);
 		event[2] = 0x04;
 		event[3] = 0x10;
 		event[5] = 0x02;
 		write_prdt(p, slot, cfis, event, xfer);
 		tfd = ATA_S_READY | ATA_S_DSC;
 	} else {
 		/* we don't support asynchronous operation */
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x24;
 		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 	}
 
 	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	ahci_write_fis_d2h(p, slot, cfis, tfd);
 }
 
 /*
  * Dispatch an ATAPI packet by its opcode (first byte of the command
  * packet at cfis + 0x40).  Opcodes without a handler are completed with
  * ILLEGAL REQUEST sense and asc 0x20.
  */
 static void
 handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 	uint8_t *acmd = cfis + 0x40;
 
 #ifdef AHCI_DEBUG
 	{
 		int idx;
 
 		DPRINTF("ACMD:");
 		for (idx = 0; idx < 16; idx++)
 			DPRINTF("%02x ", acmd[idx]);
 		DPRINTF("");
 	}
 #endif
 
 	switch (acmd[0]) {
 	case TEST_UNIT_READY:
 	case PREVENT_ALLOW:	/* TODO */
 		/* Nothing to do: report success straight away. */
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	case INQUIRY:
 		atapi_inquiry(p, slot, cfis);
 		break;
 	case READ_CAPACITY:
 		atapi_read_capacity(p, slot, cfis);
 		break;
 	case READ_TOC:
 		atapi_read_toc(p, slot, cfis);
 		break;
 	case REPORT_LUNS:
 		atapi_report_luns(p, slot, cfis);
 		break;
 	case READ_10:
 	case READ_12:
 		atapi_read(p, slot, cfis, 0);
 		break;
 	case REQUEST_SENSE:
 		atapi_request_sense(p, slot, cfis);
 		break;
 	case START_STOP_UNIT:
 		atapi_start_stop_unit(p, slot, cfis);
 		break;
 	case MODE_SENSE_10:
 		atapi_mode_sense(p, slot, cfis);
 		break;
 	case GET_EVENT_STATUS_NOTIFICATION:
 		atapi_get_event_status_notification(p, slot, cfis);
 		break;
 	default:
 		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		p->asc = 0x20;
 		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 		ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) |
 				ATA_S_READY | ATA_S_ERROR);
 		break;
 	}
 }
 
 /*
  * Execute the ATA command carried by a register host-to-device FIS.
  *
  * The port is first marked busy, then the command byte (cfis[2]) selects
  * the handler.  Commands that finish immediately post a D2H FIS here;
  * read/write, flush and TRIM are handed to helpers.
  */
 static void
 ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
 {
 
 	p->tfd |= ATA_S_BUSY;
 	switch (cfis[2]) {
 	case ATA_ATA_IDENTIFY:
 		handle_identify(p, slot, cfis);
 		break;
 	case ATA_SETFEATURES:
 	{
 		/* cfis[3] holds the SET FEATURES subcommand. */
 		switch (cfis[3]) {
 		case ATA_SF_ENAB_SATA_SF:
 			/* Only the ATA_SATA_SF_AN feature (cfis[12]) is accepted. */
 			switch (cfis[12]) {
 			case ATA_SATA_SF_AN:
 				p->tfd = ATA_S_DSC | ATA_S_READY;
 				break;
 			default:
 				p->tfd = ATA_S_ERROR | ATA_S_READY;
 				p->tfd |= (ATA_ERROR_ABORT << 8);
 				break;
 			}
 			break;
 		case ATA_SF_ENAB_WCACHE:
 		case ATA_SF_DIS_WCACHE:
 		case ATA_SF_ENAB_RCACHE:
 		case ATA_SF_DIS_RCACHE:
 			/* Cache controls are accepted without any effect. */
 			p->tfd = ATA_S_DSC | ATA_S_READY;
 			break;
 		case ATA_SF_SETXFER:
 		{
 			/*
 			 * Upper five bits of cfis[12] select the transfer type;
 			 * the low three bits are recorded only for the DMA
 			 * types.
 			 */
 			switch (cfis[12] & 0xf8) {
 			case ATA_PIO:
 			case ATA_PIO0:
 				break;
 			case ATA_WDMA0:
 			case ATA_UDMA0:
 				p->xfermode = (cfis[12] & 0x7);
 				break;
 			}
 			p->tfd = ATA_S_DSC | ATA_S_READY;
 			break;
 		}
 		default:
 			p->tfd = ATA_S_ERROR | ATA_S_READY;
 			p->tfd |= (ATA_ERROR_ABORT << 8);
 			break;
 		}
 		ahci_write_fis_d2h(p, slot, cfis, p->tfd);
 		break;
 	}
 	case ATA_SET_MULTI:
 		/* Accept 0, or a power of two no larger than 128. */
 		if (cfis[12] != 0 &&
 			(cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) {
 			p->tfd = ATA_S_ERROR | ATA_S_READY;
 			p->tfd |= (ATA_ERROR_ABORT << 8);
 		} else {
 			p->mult_sectors = cfis[12];
 			p->tfd = ATA_S_DSC | ATA_S_READY;
 		}
 		ahci_write_fis_d2h(p, slot, cfis, p->tfd);
 		break;
 	case ATA_READ:
 	case ATA_WRITE:
 	case ATA_READ48:
 	case ATA_WRITE48:
 	case ATA_READ_MUL:
 	case ATA_WRITE_MUL:
 	case ATA_READ_MUL48:
 	case ATA_WRITE_MUL48:
 	case ATA_READ_DMA:
 	case ATA_WRITE_DMA:
 	case ATA_READ_DMA48:
 	case ATA_WRITE_DMA48:
 	case ATA_READ_FPDMA_QUEUED:
 	case ATA_WRITE_FPDMA_QUEUED:
 		/* All data transfers start with zero bytes already done. */
 		ahci_handle_rw(p, slot, cfis, 0);
 		break;
 	case ATA_FLUSHCACHE:
 	case ATA_FLUSHCACHE48:
 		ahci_handle_flush(p, slot, cfis);
 		break;
 	case ATA_DATA_SET_MANAGEMENT:
 		/* Only one specific TRIM register encoding is supported. */
 		if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM &&
 		    cfis[13] == 0 && cfis[12] == 1) {
 			ahci_handle_dsm_trim(p, slot, cfis, 0);
 			break;
 		}
 		ahci_write_fis_d2h(p, slot, cfis,
 		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 		break;
 	case ATA_SEND_FPDMA_QUEUED:
 		/* Queued variant: only the DSM/TRIM subcommand is supported. */
 		if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM &&
 		    cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM &&
 		    cfis[11] == 0 && cfis[3] == 1) {
 			ahci_handle_dsm_trim(p, slot, cfis, 0);
 			break;
 		}
 		ahci_write_fis_d2h(p, slot, cfis,
 		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 		break;
 	case ATA_READ_LOG_EXT:
 	case ATA_READ_LOG_DMA_EXT:
 		ahci_handle_read_log(p, slot, cfis);
 		break;
 	case ATA_SECURITY_FREEZE_LOCK:
 	case ATA_SMART_CMD:
 	case ATA_NOP:
 		/* Recognised but always aborted. */
 		ahci_write_fis_d2h(p, slot, cfis,
 		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 		break;
 	case ATA_CHECK_POWER_MODE:
 		cfis[12] = 0xff;	/* always on */
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	case ATA_STANDBY_CMD:
 	case ATA_STANDBY_IMMEDIATE:
 	case ATA_IDLE_CMD:
 	case ATA_IDLE_IMMEDIATE:
 	case ATA_SLEEP:
 	case ATA_READ_VERIFY:
 	case ATA_READ_VERIFY48:
 		/* Accepted as no-ops. */
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
 		break;
 	case ATA_ATAPI_IDENTIFY:
 		handle_atapi_identify(p, slot, cfis);
 		break;
 	case ATA_PACKET_CMD:
 		/* PACKET is only valid on ports configured as ATAPI. */
 		if (!p->atapi) {
 			ahci_write_fis_d2h(p, slot, cfis,
 			    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 		} else
 			handle_packet_cmd(p, slot, cfis);
 		break;
 	default:
 		WPRINTF("Unsupported cmd:%02x", cfis[2]);
 		ahci_write_fis_d2h(p, slot, cfis,
 		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 		break;
 	}
 }
 
 /*
  * Process the command in one command-list slot.
  *
  * The slot's command header gives the guest address of the command table
  * (ctba) and the number of PRDT entries; the table is mapped and its
  * command FIS examined.  A FIS with bit 7 of byte 1 set is executed as a
  * command.  Otherwise bit 2 of byte 15 is tracked in p->reset: the port is
  * reset once the bit is seen cleared after having been set, and the slot's
  * issue bit is cleared.
  */
 static void
 ahci_handle_slot(struct ahci_port *p, int slot)
 {
 	struct ahci_cmd_hdr *hdr;
 #ifdef AHCI_DEBUG
 	struct ahci_prdt_entry *prdt;
 #endif
 	struct pci_ahci_softc *sc;
 	uint8_t *cfis;
 #ifdef AHCI_DEBUG
 	int cfl, i;
 #endif
 
 	sc = p->pr_sc;
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 #ifdef AHCI_DEBUG
 	/* Low five bits of flags: FIS length in 32-bit words. */
 	cfl = (hdr->flags & 0x1f) * 4;
 #endif
 	/*
 	 * Map the 0x80-byte table header plus all PRDT entries.
 	 * NOTE(review): the result is dereferenced below without a NULL
 	 * check -- confirm paddr_guest2host() cannot fail for a
 	 * guest-supplied ctba.
 	 */
 	cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba,
 			0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry));
 #ifdef AHCI_DEBUG
 	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
 
 	DPRINTF("cfis:");
 	for (i = 0; i < cfl; i++) {
 		if (i % 10 == 0)
 			DPRINTF("");
 		DPRINTF("%02x ", cfis[i]);
 	}
 	DPRINTF("");
 
 	for (i = 0; i < hdr->prdtl; i++) {
 		DPRINTF("%d@%08"PRIx64"", prdt->dbc & 0x3fffff, prdt->dba);
 		prdt++;
 	}
 #endif
 
 	/*
 	 * NOTE(review): this return leaves the slot's bit set in p->ci and
 	 * clear in p->pending -- confirm the loop in ahci_handle_port()
 	 * cannot spin on such a slot.
 	 */
 	if (cfis[0] != FIS_TYPE_REGH2D) {
 		WPRINTF("Not a H2D FIS:%02x", cfis[0]);
 		return;
 	}
 
 	if (cfis[1] & 0x80) {
 		ahci_handle_cmd(p, slot, cfis);
 	} else {
 		/* Control update: reset the port when the bit goes 1 -> 0. */
 		if (cfis[15] & (1 << 2))
 			p->reset = 1;
 		else if (p->reset) {
 			p->reset = 0;
 			ahci_port_reset(p);
 		}
 		p->ci &= ~(1 << slot);
 	}
 }
 
 /*
  * Issue newly posted commands on a started port.
  *
  * Walks the 32 slots round-robin from p->ccs for as long as some slot is
  * set in p->ci but not in p->pending.  The walk stops early when the
  * device is busy or has data pending, or when p->waitforclear is set.
  */
 static void
 ahci_handle_port(struct ahci_port *p)
 {
 
 	if ((p->cmd & AHCI_P_CMD_ST) == 0)
 		return;
 
 	while ((p->ci & ~p->pending) != 0) {
 		if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0 ||
 		    p->waitforclear)
 			break;
 
 		if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) {
 			/* Publish the current slot number in PxCMD. */
 			p->cmd &= ~AHCI_P_CMD_CCS_MASK;
 			p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT;
 			ahci_handle_slot(p, p->ccs);
 		}
 
 		p->ccs = ((p->ccs + 1) & 31);
 	}
 }
 
 /*
  * blockif callback routine - this runs in the context of the blockif
  * i/o thread, so the mutex needs to be acquired.
  *
  * Completion handler for ATA (non-ATAPI) requests.  The request is moved
  * from the busy list back to the free list.  On success with more data
  * outstanding the transfer is continued; otherwise the slot is completed
  * (SDB FIS for NCQ commands, D2H FIS for the rest), its pending bit is
  * cleared and the port is scanned for further commands.
  */
 static void
 ata_ioreq_cb(struct blockif_req *br, int err)
 {
 	struct ahci_cmd_hdr *hdr;
 	struct ahci_ioreq *aior;
 	struct ahci_port *p;
 	struct pci_ahci_softc *sc;
 	uint32_t tfd;
 	uint8_t *cfis;
 	int slot, ncq, dsm;
 
 	DPRINTF("%s %d", __func__, err);
 
 	ncq = dsm = 0;
 	aior = br->br_param;
 	p = aior->io_pr;
 	cfis = aior->cfis;
 	slot = aior->slot;
 	sc = p->pr_sc;
 	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
 
 	/* Classify the command: NCQ decides the FIS type, DSM the restart. */
 	if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
 	    cfis[2] == ATA_READ_FPDMA_QUEUED ||
 	    cfis[2] == ATA_SEND_FPDMA_QUEUED)
 		ncq = 1;
 	if (cfis[2] == ATA_DATA_SET_MANAGEMENT ||
 	    (cfis[2] == ATA_SEND_FPDMA_QUEUED &&
 	     (cfis[13] & 0x1f) == ATA_SFPDMA_DSM))
 		dsm = 1;
 
 	pthread_mutex_lock(&sc->mtx);
 
 	/*
 	 * Delete the blockif request from the busy list
 	 */
 	TAILQ_REMOVE(&p->iobhd, aior, io_blist);
 
 	/*
 	 * Move the blockif request back to the free list
 	 */
 	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
 
 	/* Report the byte count transferred so far in the command header. */
 	if (!err)
 		hdr->prdbc = aior->done;
 
 	/* More data outstanding: issue the next chunk, slot stays pending. */
 	if (!err && aior->more) {
 		if (dsm)
 			ahci_handle_dsm_trim(p, slot, cfis, aior->done);
 		else
 			ahci_handle_rw(p, slot, cfis, aior->done);
 		goto out;
 	}
 
 	if (!err)
 		tfd = ATA_S_READY | ATA_S_DSC;
 	else
 		tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
 	if (ncq)
 		ahci_write_fis_sdb(p, slot, cfis, tfd);
 	else
 		ahci_write_fis_d2h(p, slot, cfis, tfd);
 
 	/*
 	 * This command is now complete.
 	 */
 	p->pending &= ~(1 << slot);
 
 	ahci_check_stopped(p);
 	ahci_handle_port(p);
 out:
 	pthread_mutex_unlock(&sc->mtx);
 	DPRINTF("%s exit", __func__);
 }
 
 /*
  * blockif completion callback for ATAPI reads.  It takes the controller
  * mutex, recycles the request, then either continues a partially
  * completed read or finishes the slot with a D2H FIS.  A failed read is
  * reported as ILLEGAL REQUEST with asc 0x21.
  */
 static void
 atapi_ioreq_cb(struct blockif_req *br, int err)
 {
 	struct ahci_ioreq *req;
 	struct ahci_cmd_hdr *cmdh;
 	struct pci_ahci_softc *ctl;
 	struct ahci_port *prt;
 	uint8_t *fis;
 	uint32_t status;
 	int s;
 
 	DPRINTF("%s %d", __func__, err);
 
 	req = br->br_param;
 	prt = req->io_pr;
 	ctl = prt->pr_sc;
 	fis = req->cfis;
 	s = req->slot;
 	cmdh = (struct ahci_cmd_hdr *)(prt->cmd_lst + req->slot * AHCI_CL_SIZE);
 
 	pthread_mutex_lock(&ctl->mtx);
 
 	/* Busy list -> free list. */
 	TAILQ_REMOVE(&prt->iobhd, req, io_blist);
 	STAILQ_INSERT_TAIL(&prt->iofhd, req, io_flist);
 
 	if (err == 0) {
 		cmdh->prdbc = req->done;
 		if (req->more) {
 			/* Not finished yet: queue the remainder. */
 			atapi_read(prt, s, fis, req->done);
 			goto out;
 		}
 		status = ATA_S_READY | ATA_S_DSC;
 	} else {
 		prt->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
 		prt->asc = 0x21;
 		status = (prt->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
 	}
 
 	fis[4] = (fis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
 	ahci_write_fis_d2h(prt, s, fis, status);
 
 	/* The slot is finished; look for more work on the port. */
 	prt->pending &= ~(1 << s);
 	ahci_check_stopped(prt);
 	ahci_handle_port(prt);
 out:
 	pthread_mutex_unlock(&ctl->mtx);
 	DPRINTF("%s exit", __func__);
 }
 
 /*
  * Allocate the per-port array of I/O request structures, sized by the
  * blockif queue depth, and place every entry on the port's free list.
  * The completion callback is chosen by whether the port is ATAPI.
  */
 static void
 pci_ahci_ioreq_init(struct ahci_port *pr)
 {
 	void (*done_cb)(struct blockif_req *, int);
 	struct ahci_ioreq *ent;
 	int n;
 
 	done_cb = pr->atapi ? atapi_ioreq_cb : ata_ioreq_cb;
 
 	pr->ioqsz = blockif_queuesz(pr->bctx);
 	pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq));
 	STAILQ_INIT(&pr->iofhd);
 	TAILQ_INIT(&pr->iobhd);
 
 	/* Every request starts out on the free queue. */
 	for (n = 0; n < pr->ioqsz; n++) {
 		ent = &pr->ioreq[n];
 		ent->io_pr = pr;
 		ent->io_req.br_callback = done_cb;
 		ent->io_req.br_param = ent;
 		STAILQ_INSERT_TAIL(&pr->iofhd, ent, io_flist);
 	}
 }
 
 /*
  * Handle a guest write to a per-port register.
  *
  * 'offset' is the BAR offset; it is split into a port index and a
  * register offset within that port's AHCI_STEP-sized window.  The caller
  * (pci_ahci_write) holds sc->mtx and has checked the offset against
  * sc->ports.
  */
 static void
 pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
 {
 	int port = (offset - AHCI_OFFSET) / AHCI_STEP;
 	offset = (offset - AHCI_OFFSET) % AHCI_STEP;
 	struct ahci_port *p = &sc->port[port];
 
 	DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"",
 		port, offset, value);
 
 	switch (offset) {
 	case AHCI_P_CLB:
 		p->clb = value;
 		break;
 	case AHCI_P_CLBU:
 		p->clbu = value;
 		break;
 	case AHCI_P_FB:
 		p->fb = value;
 		break;
 	case AHCI_P_FBU:
 		p->fbu = value;
 		break;
 	case AHCI_P_IS:
 		/* Bits written as 1 are cleared. */
 		p->is &= ~value;
 		ahci_port_intr(p);
 		break;
 	case AHCI_P_IE:
 		/* Only the bits in the 0xFDC000FF mask are kept. */
 		p->ie = value & 0xFDC000FF;
 		ahci_port_intr(p);
 		break;
 	case AHCI_P_CMD:
 	{
 		/* Replace the guest-writable bits with those from 'value'. */
 		p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD |
 		    AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE |
 		    AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE |
 		    AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK);
 		p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD |
 		    AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE |
 		    AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE |
 		    AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value;
 
 		if (!(value & AHCI_P_CMD_ST)) {
 			ahci_port_stop(p);
 		} else {
 			uint64_t clb;
 
 			/* Started: map the command list from CLBU:CLB. */
 			p->cmd |= AHCI_P_CMD_CR;
 			clb = (uint64_t)p->clbu << 32 | p->clb;
 			p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb,
 					AHCI_CL_SIZE * AHCI_MAX_SLOTS);
 		}
 
 		if (value & AHCI_P_CMD_FRE) {
 			uint64_t fb;
 
 			/* FIS receive enabled: map the area from FBU:FB. */
 			p->cmd |= AHCI_P_CMD_FR;
 			fb = (uint64_t)p->fbu << 32 | p->fb;
 			/* we don't support FBSCP, so rfis size is 256Bytes */
 			p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256);
 		} else {
 			p->cmd &= ~AHCI_P_CMD_FR;
 		}
 
 		/* CLO clears BSY and DRQ and then reads back as zero. */
 		if (value & AHCI_P_CMD_CLO) {
 			p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ);
 			p->cmd &= ~AHCI_P_CMD_CLO;
 		}
 
 		/* Any ICC request is cleared again immediately. */
 		if (value & AHCI_P_CMD_ICC_MASK) {
 			p->cmd &= ~AHCI_P_CMD_ICC_MASK;
 		}
 
 		ahci_handle_port(p);
 		break;
 	}
 	case AHCI_P_TFD:
 	case AHCI_P_SIG:
 	case AHCI_P_SSTS:
 		WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"", offset);
 		break;
 	case AHCI_P_SCTL:
 		p->sctl = value;
 		/* A reset request is honoured only while the port is stopped. */
 		if (!(p->cmd & AHCI_P_CMD_ST)) {
 			if (value & ATA_SC_DET_RESET)
 				ahci_port_reset(p);
 		}
 		break;
 	case AHCI_P_SERR:
 		/* Bits written as 1 are cleared. */
 		p->serr &= ~value;
 		break;
 	case AHCI_P_SACT:
 		/* Bits can only be set by the guest here. */
 		p->sact |= value;
 		break;
 	case AHCI_P_CI:
 		/* New command(s) issued: merge the bits and run the port. */
 		p->ci |= value;
 		ahci_handle_port(p);
 		break;
 	case AHCI_P_SNTF:
 	case AHCI_P_FBS:
 	default:
 		/* Writes to these registers are ignored. */
 		break;
 	}
 }
 
 /*
  * Handle a guest write to a controller-global (host) register.  Only GHC
  * and IS have any effect; CAP, PI, VS and CAP2 are logged as read-only
  * and every other offset is silently ignored.
  */
 static void
 pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
 {
 	DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"",
 		offset, value);
 
 	switch (offset) {
 	case AHCI_GHC:
 		if ((value & AHCI_GHC_HR) != 0) {
 			/* A host reset takes priority over everything else. */
 			ahci_reset(sc);
 			return;
 		}
 		if ((value & AHCI_GHC_IE) == 0)
 			sc->ghc &= ~AHCI_GHC_IE;
 		else
 			sc->ghc |= AHCI_GHC_IE;
 		ahci_generate_intr(sc, 0xffffffff);
 		return;
 	case AHCI_IS:
 		/* Bits written as 1 are cleared. */
 		sc->is &= ~value;
 		ahci_generate_intr(sc, value);
 		return;
 	case AHCI_CAP:
 	case AHCI_PI:
 	case AHCI_VS:
 	case AHCI_CAP2:
 		DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"", offset);
 		return;
 	default:
 		return;
 	}
 }
 
 /*
  * BAR write entry point.  Only aligned 32-bit writes to BAR 5 are
  * accepted (asserted).  Under the controller mutex the write is routed
  * to the host registers or to the port selected by the offset; offsets
  * beyond the last port are logged and dropped.
  */
 static void
 pci_ahci_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
     uint64_t value)
 {
 	struct pci_ahci_softc *sc = pi->pi_arg;
 
 	assert(baridx == 5);
 	assert((offset % 4) == 0 && size == 4);
 
 	pthread_mutex_lock(&sc->mtx);
 
 	if (offset >= AHCI_OFFSET) {
 		if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP)
 			pci_ahci_port_write(sc, offset, value);
 		else
 			WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"", offset);
 	} else {
 		pci_ahci_host_write(sc, offset, value);
 	}
 
 	pthread_mutex_unlock(&sc->mtx);
 }
 
 /*
  * Read a controller-global (host) register.  The known registers are
  * fetched by indexing 32-bit words starting at sc->cap; any other offset
  * reads as zero.
  */
 static uint64_t
 pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset)
 {
 	const uint32_t *regs;
 	uint32_t value = 0;
 
 	switch (offset) {
 	case AHCI_CAP:
 	case AHCI_GHC:
 	case AHCI_IS:
 	case AHCI_PI:
 	case AHCI_VS:
 	case AHCI_CCCC:
 	case AHCI_CCCP:
 	case AHCI_EM_LOC:
 	case AHCI_EM_CTL:
 	case AHCI_CAP2:
 		regs = &sc->cap;
 		value = regs[(offset - AHCI_CAP) / sizeof(uint32_t)];
 		break;
 	default:
 		break;
 	}
 
 	DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x",
 		offset, value);
 
 	return (value);
 }
 
 /*
  * Read a per-port register.  The BAR offset is split into a port index
  * and a register offset.  Known registers are fetched by indexing 32-bit
  * words starting at the port's clb field; any other offset reads as zero.
  */
 static uint64_t
 pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset)
 {
 	const uint64_t rel = offset - AHCI_OFFSET;
 	const uint32_t *regs;
 	uint32_t value = 0;
 	int port = rel / AHCI_STEP;
 
 	offset = rel % AHCI_STEP;
 
 	switch (offset) {
 	case AHCI_P_CLB:
 	case AHCI_P_CLBU:
 	case AHCI_P_FB:
 	case AHCI_P_FBU:
 	case AHCI_P_IS:
 	case AHCI_P_IE:
 	case AHCI_P_CMD:
 	case AHCI_P_TFD:
 	case AHCI_P_SIG:
 	case AHCI_P_SSTS:
 	case AHCI_P_SCTL:
 	case AHCI_P_SERR:
 	case AHCI_P_SACT:
 	case AHCI_P_CI:
 	case AHCI_P_SNTF:
 	case AHCI_P_FBS:
 		regs = &sc->port[port].clb;
 		value = regs[(offset - AHCI_P_CLB) / sizeof(uint32_t)];
 		break;
 	default:
 		break;
 	}
 
 	DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x",
 		port, offset, value);
 
 	return value;
 }
 
 /*
  * BAR read entry point.  Naturally aligned 1, 2 and 4 byte reads of BAR 5
  * are accepted (asserted).  The containing 32-bit register is read under
  * the controller mutex and shifted so that the requested byte lane ends
  * up in the low bits of the result.
  */
 static uint64_t
 pci_ahci_read(struct pci_devinst *pi, int baridx, uint64_t regoff, int size)
 {
 	struct pci_ahci_softc *sc = pi->pi_arg;
 	const uint64_t lane = regoff & 0x3;
 	const uint64_t aligned = regoff - lane;	/* multiple of 4 bytes */
 	uint32_t value;
 
 	assert(baridx == 5);
 	assert(size == 1 || size == 2 || size == 4);
 	assert((regoff & (size - 1)) == 0);
 
 	pthread_mutex_lock(&sc->mtx);
 
 	if (aligned < AHCI_OFFSET) {
 		value = pci_ahci_host_read(sc, aligned);
 	} else if (aligned < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) {
 		value = pci_ahci_port_read(sc, aligned);
 	} else {
 		value = 0;
 		WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"",
 		    regoff);
 	}
 	value >>= 8 * lane;
 
 	pthread_mutex_unlock(&sc->mtx);
 
 	return (value);
 }
 
 /*
  * Each AHCI controller has a "port" node which contains nodes for
  * each port named after the decimal number of the port (no leading
  * zeroes).  Port nodes contain a "type" ("hd" or "cd"), as well as
  * options for blockif.  For example:
  *
  * pci.0.1.0
  *          .device="ahci"
  *          .port
  *               .0
  *                 .type="hd"
  *                 .path="/path/to/image"
  */
 /*
  * Create the config node for one port (named after the decimal port
  * number) below 'nvl', record its "type", and let blockif parse the
  * remaining legacy option string.  Returns blockif_legacy_config()'s
  * result.
  */
 static int
 pci_ahci_legacy_config_port(nvlist_t *nvl, int port, const char *type,
     const char *opts)
 {
 	char name[sizeof("XX")];
 	nvlist_t *node;
 
 	snprintf(name, sizeof(name), "%d", port);
 
 	node = create_relative_config_node(nvl, name);
 	set_config_value_node(node, "type", type);
 
 	return (blockif_legacy_config(node, opts));
 }
 
 /*
  * Parse the legacy "ahci" option string: a sequence of per-port option
  * groups, each introduced by "hd:" or "cd:" and separated from the next
  * by ",hd:" or ",cd:".  Each non-empty group becomes a port node under
  * the "port" node; empty groups still consume a port number.
  *
  * Returns 0 on success (including a NULL option string), non-zero if a
  * group has no valid type prefix or blockif rejects its options.
  */
 static int
 pci_ahci_legacy_config(nvlist_t *nvl, const char *opts)
 {
 	nvlist_t *ports_nvl;
 	const char *kind;
 	char *copy, *cur, *rest, *hd_sep, *cd_sep;
 	int idx, rc;
 
 	if (opts == NULL)
 		return (0);
 
 	ports_nvl = create_relative_config_node(nvl, "port");
 	rc = 1;
 	copy = cur = strdup(opts);
 	for (idx = 0; idx < MAX_PORTS && cur != NULL; idx++, cur = rest) {
 		/* Strip the type prefix of this port, if there is one. */
 		kind = NULL;
 		if (strncmp(cur, "hd:", 3) == 0)
 			kind = "hd";
 		else if (strncmp(cur, "cd:", 3) == 0)
 			kind = "cd";
 		if (kind != NULL)
 			cur += 3;
 
 		/* The next group starts at whichever separator comes first. */
 		hd_sep = strstr(cur, ",hd:");
 		cd_sep = strstr(cur, ",cd:");
 		if (hd_sep != NULL && (cd_sep == NULL || hd_sep <= cd_sep))
 			rest = hd_sep;
 		else
 			rest = cd_sep;
 		if (rest != NULL)
 			*rest++ = 0;
 
 		if (cur[0] == 0)
 			continue;
 
 		if (kind == NULL) {
 			EPRINTLN("Missing or invalid type for port %d: \"%s\"",
 			    idx, cur);
 			goto out;
 		}
 
 		if (pci_ahci_legacy_config_port(ports_nvl, idx, kind, cur) != 0)
 			goto out;
 	}
 	rc = 0;
 out:
 	free(copy);
 	return (rc);
 }
 
 /*
  * Legacy option parser for "ahci-cd": the whole option string describes
  * a single "cd" device on port 0.
  */
 static int
 pci_ahci_cd_legacy_config(nvlist_t *nvl, const char *opts)
 {
 
 	return (pci_ahci_legacy_config_port(
 	    create_relative_config_node(nvl, "port"), 0, "cd", opts));
 }
 
 /*
  * Legacy option parser for "ahci-hd": the whole option string describes
  * a single "hd" device on port 0.
  */
 static int
 pci_ahci_hd_legacy_config(nvlist_t *nvl, const char *opts)
 {
 
 	return (pci_ahci_legacy_config_port(
 	    create_relative_config_node(nvl, "port"), 0, "hd", opts));
 }
 
 /*
  * Create an emulated AHCI controller from its configuration node.
  *
  * For every configured port below the "port" node the backing store is
  * opened through blockif, registered as a potential boot device, and the
  * port's ATA identify data and I/O request pool are set up.  Afterwards
  * the controller capabilities, PCI config space, MSI capability and BAR 5
  * are initialised.
  *
  * Returns 0 on success.  On failure every backing store opened so far is
  * closed, the softc is freed and a non-zero value is returned.
  */
 static int
 pci_ahci_init(struct pci_devinst *pi, nvlist_t *nvl)
 {
 	char bident[sizeof("XXX:XXX:XXX")];
 	char node_name[sizeof("XX")];
 	struct blockif_ctxt *bctxt;
 	struct pci_ahci_softc *sc;
 	int atapi, ret, slots, p;
 	MD5_CTX mdctx;
 	u_char digest[16];
 	const char *path, *type, *value;
 	nvlist_t *ports_nvl, *port_nvl;
 
 	ret = 0;
 
 #ifdef AHCI_DEBUG
 	dbg = fopen("/tmp/log", "w+");
 #endif
 
 	sc = calloc(1, sizeof(struct pci_ahci_softc));
 	pi->pi_arg = sc;
 	sc->asc_pi = pi;
 	pthread_mutex_init(&sc->mtx, NULL);
 	sc->ports = 0;
 	sc->pi = 0;
 	slots = 32;
 
 	ports_nvl = find_relative_config_node(nvl, "port");
 	for (p = 0; ports_nvl != NULL && p < MAX_PORTS; p++) {
 		struct ata_params *ata_ident = &sc->port[p].ata_ident;
 		char ident[AHCI_PORT_IDENT];
 
 		/* Ports without a node or without a "type" stay empty. */
 		snprintf(node_name, sizeof(node_name), "%d", p);
 		port_nvl = find_relative_config_node(ports_nvl, node_name);
 		if (port_nvl == NULL)
 			continue;
 
 		type = get_config_value_node(port_nvl, "type");
 		if (type == NULL)
 			continue;
 
 		/* Anything other than "hd" is treated as an ATAPI device. */
 		if (strcmp(type, "hd") == 0)
 			atapi = 0;
 		else
 			atapi = 1;
 
 		/*
 		 * Attempt to open the backing image. Use the PCI slot/func
 		 * and the port number for the identifier string.
 		 */
 		snprintf(bident, sizeof(bident), "%u:%u:%u", pi->pi_slot,
 		    pi->pi_func, p);
 
 		bctxt = blockif_open(port_nvl, bident);
 		if (bctxt == NULL) {
 			sc->ports = p;
 			ret = 1;
 			goto open_fail;
 		}
+
+		ret = blockif_add_boot_device(pi, bctxt);
+		if (ret) {
+			/*
+			 * bctxt is not stored in sc->port[p] yet, so the
+			 * cleanup loop at open_fail would not close it.
+			 */
+			blockif_close(bctxt);
+			sc->ports = p;
+			goto open_fail;
+		}
+
 		sc->port[p].bctx = bctxt;
 		sc->port[p].pr_sc = sc;
 		sc->port[p].port = p;
 		sc->port[p].atapi = atapi;
 
 		/*
 		 * Create an identifier for the backing file.
 		 * Use parts of the md5 sum of the filename
 		 */
 		path = get_config_value_node(port_nvl, "path");
 		MD5Init(&mdctx);
 		MD5Update(&mdctx, path, strlen(path));
 		MD5Final(digest, &mdctx);
 		snprintf(ident, AHCI_PORT_IDENT,
 			"BHYVE-%02X%02X-%02X%02X-%02X%02X",
 			digest[0], digest[1], digest[2], digest[3], digest[4],
 			digest[5]);
 
 		/* Defaults first; "nmrr", "ser", "rev", "model" override. */
 		memset(ata_ident, 0, sizeof(struct ata_params));
 		ata_string((uint8_t*)&ata_ident->serial, ident, 20);
 		ata_string((uint8_t*)&ata_ident->revision, "001", 8);
 		if (atapi)
 			ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DVD ROM", 40);
 		else
 			ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DISK", 40);
 		value = get_config_value_node(port_nvl, "nmrr");
 		if (value != NULL)
 			ata_ident->media_rotation_rate = atoi(value);
 		value = get_config_value_node(port_nvl, "ser");
 		if (value != NULL)
 			ata_string((uint8_t*)(&ata_ident->serial), value, 20);
 		value = get_config_value_node(port_nvl, "rev");
 		if (value != NULL)
 			ata_string((uint8_t*)(&ata_ident->revision), value, 8);
 		value = get_config_value_node(port_nvl, "model");
 		if (value != NULL)
 			ata_string((uint8_t*)(&ata_ident->model), value, 40);
 		ata_identify_init(&sc->port[p], atapi);
 
 		/*
 		 * Allocate blockif request structures and add them
 		 * to the free list
 		 */
 		pci_ahci_ioreq_init(&sc->port[p]);
 
 		/* Mark the port implemented; slots = smallest queue depth. */
 		sc->pi |= (1 << p);
 		if (sc->port[p].ioqsz < slots)
 			slots = sc->port[p].ioqsz;
 	}
 	sc->ports = p;
 
 	/* Intel ICH8 AHCI */
 	--slots;
 	if (sc->ports < DEF_PORTS)
 		sc->ports = DEF_PORTS;
 	sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF |
 	    AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP |
 	    AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)|
 	    AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC |
 	    (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1);
 
 	sc->vs = 0x10300;
 	sc->cap2 = AHCI_CAP2_APST;
 	ahci_reset(sc);
 
 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA);
 	pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0);
 	/* MSI count: ports (capped at 16) rounded up to a power of two. */
 	p = MIN(sc->ports, 16);
 	p = flsl(p) - ((p & (p - 1)) ? 0 : 1);
 	pci_emul_add_msicap(pi, 1 << p);
 	pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32,
 	    AHCI_OFFSET + sc->ports * AHCI_STEP);
 
 	pci_lintr_request(pi);
 
 	/* Also reached on success; cleanup below runs only if ret != 0. */
 open_fail:
 	if (ret) {
 		for (p = 0; p < sc->ports; p++) {
 			if (sc->port[p].bctx != NULL)
 				blockif_close(sc->port[p].bctx);
 		}
 		free(sc);
 	}
 
 	return (ret);
 }
 
 #ifdef BHYVE_SNAPSHOT
 /*
  * Save or restore the controller state for a VM snapshot, depending on
  * meta->op.  Controller-global registers are processed first, then every
  * port slot in order.  For each port the presence of a backing context
  * and the port number are checked against the running configuration
  * before the port state is processed.
  *
  * Returns 0 on success, EINVAL on a port mismatch, or the error produced
  * by one of the SNAPSHOT_* macros.
  */
 static int
 pci_ahci_snapshot(struct vm_snapshot_meta *meta)
 {
 	int i, ret;
 	void *bctx;
 	struct pci_devinst *pi;
 	struct pci_ahci_softc *sc;
 	struct ahci_port *port;
 
 	pi = meta->dev_data;
 	sc = pi->pi_arg;
 
 	/* TODO: add mtx lock/unlock */
 
 	SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done);
 	SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done);
 
 	for (i = 0; i < MAX_PORTS; i++) {
 		port = &sc->port[i];
 
 		/*
 		 * The pointer value is only used as a "port present" marker.
 		 * On restore it is presumably filled in by the macro below.
 		 */
 		if (meta->op == VM_SNAPSHOT_SAVE)
 			bctx = port->bctx;
 
 		SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done);
 
 		/* Mostly for restore; save is ensured by the lines above. */
 		if (((bctx == NULL) && (port->bctx != NULL)) ||
 		    ((bctx != NULL) && (port->bctx == NULL))) {
 			fprintf(stderr, "%s: ports not matching\r\n", __func__);
 			ret = EINVAL;
 			goto done;
 		}
 
 		/* Unconfigured ports carry no further state. */
 		if (port->bctx == NULL)
 			continue;
 
 		if (port->port != i) {
 			fprintf(stderr, "%s: ports not matching: "
 					"actual: %d expected: %d\r\n",
 					__func__, port->port, i);
 			ret = EINVAL;
 			goto done;
 		}
 
 		/* Pointers into guest memory go through the address macro. */
 		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, port->cmd_lst,
 			AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done);
 		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, port->rfis, 256,
 		    false, meta, ret, done);
 
 		SNAPSHOT_VAR_OR_LEAVE(port->ata_ident, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done);
 
 		/* Guest-visible port registers. */
 		SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done);
 
 		/* No request may be in flight while the state is captured. */
 		assert(TAILQ_EMPTY(&port->iobhd));
 	}
 
 done:
 	return (ret);
 }
 
 /*
  * Pause the block backend of every configured port.  Always returns 0.
  */
 static int
 pci_ahci_pause(struct pci_devinst *pi)
 {
 	struct pci_ahci_softc *sc = pi->pi_arg;
 	int n;
 
 	for (n = 0; n < MAX_PORTS; n++) {
 		if (sc->port[n].bctx != NULL)
 			blockif_pause(sc->port[n].bctx);
 	}
 
 	return (0);
 }
 
 /*
  * Resume the block backend of every configured port.  Always returns 0.
  */
 static int
 pci_ahci_resume(struct pci_devinst *pi)
 {
 	struct pci_ahci_softc *sc = pi->pi_arg;
 	int n;
 
 	for (n = 0; n < MAX_PORTS; n++) {
 		if (sc->port[n].bctx != NULL)
 			blockif_resume(sc->port[n].bctx);
 	}
 
 	return (0);
 }
 #endif	/* BHYVE_SNAPSHOT */
 
 /*
  * Use separate emulation names to distinguish drive and atapi devices
  *
  * "ahci" is the controller emulation itself.  It supplies the init, BAR
  * read/write and (with BHYVE_SNAPSHOT) snapshot/pause/resume handlers,
  * and its legacy option parser accepts a list of "hd:"/"cd:" port groups.
  */
 static const struct pci_devemu pci_de_ahci = {
 	.pe_emu =	"ahci",
 	.pe_init =	pci_ahci_init,
 	.pe_legacy_config = pci_ahci_legacy_config,
 	.pe_barwrite =	pci_ahci_write,
 	.pe_barread =	pci_ahci_read,
 #ifdef BHYVE_SNAPSHOT
 	.pe_snapshot =	pci_ahci_snapshot,
 	.pe_pause =	pci_ahci_pause,
 	.pe_resume =	pci_ahci_resume,
 #endif
 };
 PCI_EMUL_SET(pci_de_ahci);
 
 /*
  * "ahci-hd": alias of "ahci" whose legacy option string describes a
  * single "hd" device on port 0.
  */
 static const struct pci_devemu pci_de_ahci_hd = {
 	.pe_emu =	"ahci-hd",
 	.pe_legacy_config = pci_ahci_hd_legacy_config,
 	.pe_alias =	"ahci",
 };
 PCI_EMUL_SET(pci_de_ahci_hd);
 
 /*
  * "ahci-cd": alias of "ahci" whose legacy option string describes a
  * single "cd" device on port 0.
  */
 static const struct pci_devemu pci_de_ahci_cd = {
 	.pe_emu =	"ahci-cd",
 	.pe_legacy_config = pci_ahci_cd_legacy_config,
 	.pe_alias =	"ahci",
 };
 PCI_EMUL_SET(pci_de_ahci_cd);
diff --git a/usr.sbin/bhyve/pci_nvme.c b/usr.sbin/bhyve/pci_nvme.c
index de5865220155..a18413a50367 100644
--- a/usr.sbin/bhyve/pci_nvme.c
+++ b/usr.sbin/bhyve/pci_nvme.c
@@ -1,3342 +1,3350 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2017 Shunsuke Mie
  * Copyright (c) 2018 Leon Dang
  * Copyright (c) 2020 Chuck Tuffli
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * bhyve PCIe-NVMe device emulation.
  *
  * options:
  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
  *
  *  accepted devpath:
  *    /dev/blockdev
  *    /path/to/image
  *    ram=size_in_MiB
  *
  *  maxq    = max number of queues
  *  qsz     = max elements in each queue
  *  ioslots = max number of concurrent io requests
  *  sectsz  = sector size (defaults to blockif sector size)
  *  ser     = serial number (20-chars max)
  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
  *  dsm     = DataSet Management support. Option is one of auto, enable, disable
  *
  */
 
 /* TODO:
     - create async event for smart and log
     - intr coalesce
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/errno.h>
 #include <sys/types.h>
 #include <sys/crc16.h>
 #include <net/ieee_oui.h>
 
 #include <assert.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include <semaphore.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include <machine/atomic.h>
 #include <machine/vmm.h>
 #include <vmmapi.h>
 
 #include <dev/nvme/nvme.h>
 
 #include "bhyverun.h"
 #include "block_if.h"
 #include "config.h"
 #include "debug.h"
 #include "pci_emul.h"
 
 
 /* Set to non-zero to enable DPRINTF() debug tracing */
 static int nvme_debug = 0;
 /*
  * Debug trace, emitted only when nvme_debug is non-zero. The body is wrapped
  * in do { } while (0) so the macro expands to exactly one statement; a bare
  * "if" would capture an "else" that follows the macro invocation.
  */
 #define	DPRINTF(fmt, args...) \
 	do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
 /* Warning message, always emitted */
 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
 
 /* defaults; can be overridden */
 #define	NVME_MSIX_BAR		4
 
 #define	NVME_IOSLOTS		8
 
 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
 #define NVME_MMIO_SPACE_MIN	(1 << 14)
 
 #define	NVME_QUEUES		16
 #define	NVME_MAX_QENTRIES	2048
 /* Memory Page size Minimum reported in CAP register */
 #define	NVME_MPSMIN		0
 /* MPSMIN converted to bytes */
 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
 
 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
 #define	NVME_MDTS		9
 /* Note the + 1 allows for the initial descriptor to not be page aligned */
 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
 
 /* This is a synthetic status code to indicate there is no status */
 #define NVME_NO_STATUS		0xffff
 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
 
 /* Reported temperature in Kelvin (i.e. room temperature) */
 #define NVME_TEMPERATURE 296
 
 /* helpers */
 
 /* Convert a zero-based value into a one-based value */
 #define ONE_BASED(zero)		((zero) + 1)
 /* Convert a one-based value into a zero-based value */
 #define ZERO_BASED(one)		((one)  - 1)
 
 /*
  * Encode number of SQ's and CQ's for Set/Get Features
  *
  * Bits 15:0 hold the zero-based SQ count and bits 31:16 the zero-based CQ
  * count. The whole expansion is parenthesized so the macro can be combined
  * with other operators (e.g. ">> 16" or "& mask") without mis-parsing.
  */
 #define NVME_FEATURE_NUM_QUEUES(sc) \
 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
 	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
 
 /* Byte offset of the first doorbell register within struct nvme_registers */
 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
 
 /*
  * Byte offsets of the controller registers. The 64-bit CAP, ASQ and ACQ
  * registers each have separate LOW/HI 32-bit halves.
  * NOTE(review): presumably consumed by the BAR0 read/write handlers, which
  * are outside this view.
  */
 enum nvme_controller_register_offsets {
 	NVME_CR_CAP_LOW = 0x00,
 	NVME_CR_CAP_HI  = 0x04,
 	NVME_CR_VS      = 0x08,
 	NVME_CR_INTMS   = 0x0c,
 	NVME_CR_INTMC   = 0x10,
 	NVME_CR_CC      = 0x14,
 	NVME_CR_CSTS    = 0x1c,
 	NVME_CR_NSSR    = 0x20,
 	NVME_CR_AQA     = 0x24,
 	NVME_CR_ASQ_LOW = 0x28,
 	NVME_CR_ASQ_HI  = 0x2c,
 	NVME_CR_ACQ_LOW = 0x30,
 	NVME_CR_ACQ_HI  = 0x34,
 };
 
 /*
  * Bit fields of command dword 11 for the Create IO Queue commands.
  * NVME_CMD_CDW11_PC is tested by nvme_opc_create_io_sq() before a queue is
  * accepted. NOTE(review): by their names IEN and IV are the interrupt-enable
  * bit and interrupt-vector field; their users are outside this view.
  */
 enum nvme_cmd_cdw11 {
 	NVME_CMD_CDW11_PC  = 0x0001,
 	NVME_CMD_CDW11_IEN = 0x0002,
 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
 };
 
 /* Direction argument for nvme_prp_memcpy() */
 enum nvme_copy_dir {
 	NVME_COPY_TO_PRP,	/* host buffer -> guest memory described by the PRPs */
 	NVME_COPY_FROM_PRP,	/* guest memory described by the PRPs -> host buffer */
 };
 
 /*
  * Flag values stored in nvme_completion_queue.intr_en (the Admin CQ is set
  * to NVME_CQ_INTEN when the controller is initialized).
  */
 #define	NVME_CQ_INTEN	0x01
 #define	NVME_CQ_INTCOAL	0x02
 
 /* Host-side state of one Completion Queue (Admin queue is index 0) */
 struct nvme_completion_queue {
 	struct nvme_completion *qbase;	/* mapped queue entries; NULL after reset */
 	pthread_mutex_t	mtx;		/* held while an entry is written and tail advances */
 	uint32_t	size;		/* number of entries; tail wraps to 0 at this value */
 	uint16_t	tail; /* nvme progress */
 	uint16_t	head; /* guest progress */
 	uint16_t	intr_vec;
 	uint32_t	intr_en;
 };
 
 /* Host-side state of one Submission Queue (Admin queue is index 0) */
 struct nvme_submission_queue {
 	struct nvme_command *qbase;	/* mapped queue entries; NULL when the queue does not exist */
 	pthread_mutex_t	mtx;
 	uint32_t	size;		/* number of entries */
 	uint16_t	head; /* nvme progress */
 	uint16_t	tail; /* guest progress */
 	uint16_t	cqid; /* completion queue id */
 	int		qpriority;	/* NOTE(review): not referenced in this view */
 };
 
 enum nvme_storage_type {
 	NVME_STOR_BLOCKIF = 0,
 	NVME_STOR_RAM = 1,
 };
 
 /* Description of the namespace's backing store */
 struct pci_nvme_blockstore {
 	enum nvme_storage_type type;
 	void		*ctx;		/* backend handle; passed to blockif_candelete() for BLOCKIF */
 	uint64_t	size;		/* capacity; size / sectsz gives the namespace size in blocks */
 	uint32_t	sectsz;		/* sector size */
 	uint32_t	sectsz_bits;	/* reported as LBADS, i.e. LBA data size = 2^sectsz_bits */
 	uint64_t	eui64;		/* 0 means "generate one" in pci_nvme_init_nsdata() */
 	uint32_t	deallocate:1;	/* set from blockif_candelete() for BLOCKIF stores */
 };
 
 /*
  * Calculate the number of additional page descriptors for guest IO requests
  * based on the advertised Max Data Transfer (MDTS) and given the number of
  * default iovec's in a struct blockif_req.
  *
  * Evaluates to 0 when BLOCKIF_IOV_MAX already covers NVME_MAX_IOVEC.
  */
 #define MDTS_PAD_SIZE \
 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
 	  0 )
 
 /*
  * Per-request IO context. Instances are linked on the softc's free list
  * through 'link'.
  */
 struct pci_nvme_ioreq {
 	struct pci_nvme_softc *sc;
 	STAILQ_ENTRY(pci_nvme_ioreq) link;
 	struct nvme_submission_queue *nvme_sq;
 	uint16_t	sqid;
 
 	/* command information */
 	uint16_t	opc;
 	uint16_t	cid;
 	uint32_t	nsid;
 
 	uint64_t	prev_gpaddr;
 	size_t		prev_size;
 	size_t		bytes;
 
 	struct blockif_req io_req;
 
 	/*
 	 * Extra iovec entries placed directly after io_req.
 	 * NOTE(review): presumably relied on to extend io_req's own iovec
 	 * array up to NVME_MAX_IOVEC entries; confirm against struct blockif_req.
 	 */
 	struct iovec	iovpadding[MDTS_PAD_SIZE];
 };
 
 /*
  * Policy for advertising Dataset Management in ONCS (the "dsm" option);
  * applied in pci_nvme_init_ctrldata().
  */
 enum nvme_dsm_type {
 	/* Dataset Management bit in ONCS reflects backing storage capability */
 	NVME_DATASET_MANAGEMENT_AUTO,
 	/* Unconditionally set Dataset Management bit in ONCS */
 	NVME_DATASET_MANAGEMENT_ENABLE,
 	/* Unconditionally clear Dataset Management bit in ONCS */
 	NVME_DATASET_MANAGEMENT_DISABLE,
 };
 
 struct pci_nvme_softc;
 struct nvme_feature_obj;
 
 /*
  * Handler for a Set Features or Get Features command on one feature.
  * Receives the controller, the feature's table entry, the submitted command
  * and the completion to fill in.
  */
 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
     struct nvme_feature_obj *,
     struct nvme_command *,
     struct nvme_completion *);
 
 /* Per-feature state and handlers; the softc holds one entry per Feature ID */
 struct nvme_feature_obj {
 	uint32_t	cdw11;		/* stored feature value (e.g. the AEN enable mask) */
 	nvme_feature_cb	set;		/* optional Set Features handler */
 	nvme_feature_cb	get;		/* optional Get Features handler */
 	bool namespace_specific;
 };
 
 /* Size of the feature table: one past the highest Feature ID handled */
 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
 
 /*
  * Asynchronous Event types. The value indexes the softc's aen[] array and
  * is placed in the low bits of the completion's cdw0 by
  * pci_nvme_aen_process().
  */
 typedef enum {
 	PCI_NVME_AE_TYPE_ERROR = 0,
 	PCI_NVME_AE_TYPE_SMART,
 	PCI_NVME_AE_TYPE_NOTICE,
 	PCI_NVME_AE_TYPE_IO_CMD = 6,
 	PCI_NVME_AE_TYPE_VENDOR = 7,
 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
 } pci_nvme_async_type;
 
 /*
  * Asynchronous Event Requests
  *
  * One entry per AER command the host has submitted and that has not been
  * completed yet; kept in submission order on the softc's aer_list.
  */
 struct pci_nvme_aer {
 	STAILQ_ENTRY(pci_nvme_aer) link;
 	uint16_t	cid;	/* Command ID of the submitted AER */
 };
 
 /**
  * Asynchronous Event Information - Notice
  *
  * Carried in pci_nvme_aen.event_data for PCI_NVME_AE_TYPE_NOTICE events;
  * pci_nvme_aen_process() maps each value to the log page the host should read.
  */
 typedef enum {
 	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
 	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
 	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
 	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
 	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
 	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
 	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
 	PCI_NVME_AEI_NOTICE_MAX,	/* number of notice values; not a valid notice */
 } pci_nvme_async_event_info_notice;
 
 /*
  * Notice events occupy the bits starting at PCI_NVME_AEI_NOTICE_SHIFT of the
  * Asynchronous Event Configuration value; PCI_NVME_AEI_NOTICE_MASK() yields
  * the enable bit for one notice value. The argument is parenthesized so that
  * expressions such as "x & 3" or "a ? b : c" are evaluated before the shift
  * amount is computed.
  */
 #define PCI_NVME_AEI_NOTICE_SHIFT		8
 #define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << ((event) + PCI_NVME_AEI_NOTICE_SHIFT))
 
 /*
  * Asynchronous Event Notifications
  *
  * One slot per event type; at most one event of a type can be pending.
  */
 struct pci_nvme_aen {
 	pci_nvme_async_type atype;	/* event type; equals the slot's index in aen[] */
 	uint32_t	event_data;	/* event information supplied to pci_nvme_aen_post() */
 	bool		posted;		/* true while the event waits to be delivered */
 };
 
 /*
  * By default, enable all Asynchronous Event Notifications:
  *     SMART / Health Critical Warnings (bits 4:0)
  *     Namespace Attribute Notices (bit 8)
  */
 #define PCI_NVME_AEN_DEFAULT_MASK	0x11f
 
 /*
  * Controller type values for the Identify Controller CNTRLTYPE field;
  * this emulation reports NVME_CNTRLTYPE_IO.
  */
 typedef enum {
 	NVME_CNTRLTYPE_IO = 1,
 	NVME_CNTRLTYPE_DISCOVERY = 2,
 	NVME_CNTRLTYPE_ADMIN = 3,
 } pci_nvme_cntrl_type;
 
 /* Per-device state of one emulated NVMe controller */
 struct pci_nvme_softc {
 	struct pci_devinst *nsc_pi;	/* owning PCI device instance */
 
 	pthread_mutex_t	mtx;		/* held across controller reset (see pci_nvme_reset()) */
 
 	struct nvme_registers regs;	/* controller register values */
 
 	/* Identify data and log pages reported to the guest */
 	struct nvme_namespace_data  nsdata;
 	struct nvme_controller_data ctrldata;
 	struct nvme_error_information_entry err_log;
 	struct nvme_health_information_page health_log;
 	struct nvme_firmware_page fw_log;
 	struct nvme_ns_list ns_log;
 
 	struct pci_nvme_blockstore nvstore;
 
 	uint16_t	max_qentries;	/* max entries per queue */
 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
 	uint32_t	num_cqueues;	/* IO CQ count; the arrays below hold one more (Admin) */
 	uint32_t	num_squeues;	/* IO SQ count; the arrays below hold one more (Admin) */
 	bool		num_q_is_set; /* Has host set Number of Queues */
 
 	struct pci_nvme_ioreq *ioreqs;
 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
 	uint32_t	pending_ios;
 	uint32_t	ioslots;
 	sem_t		iosemlock;
 
 	/*
 	 * Memory mapped Submission and Completion queues
 	 * Each array includes both Admin and IO queues
 	 */
 	struct nvme_completion_queue *compl_queues;
 	struct nvme_submission_queue *submit_queues;
 
 	/* Feature table, indexed by Feature ID */
 	struct nvme_feature_obj feat[NVME_FID_MAX];
 
 	enum nvme_dsm_type dataset_management;
 
 	/* Accounting for SMART data */
 	__uint128_t	read_data_units;
 	__uint128_t	write_data_units;
 	__uint128_t	read_commands;
 	__uint128_t	write_commands;
 	uint32_t	read_dunits_remainder;
 	uint32_t	write_dunits_remainder;
 
 	/* Outstanding Asynchronous Event Requests, in submission order */
 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
 	pthread_mutex_t	aer_mtx;	/* protects aer_list and aer_count updates */
 	uint32_t	aer_count;	/* number of entries on aer_list */
 	/* Pending Asynchronous Event Notifications, one slot per event type */
 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
 	pthread_t	aen_tid;	/* thread running aen_thr() */
 	pthread_mutex_t	aen_mtx;	/* held by the AEN thread while it processes events */
 	pthread_cond_t	aen_cond;	/* signalled by pci_nvme_aen_notify() */
 };
 
 
 /* Forward declarations for functions used before their definition */
 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
     struct nvme_completion_queue *cq,
     uint32_t cdw0,
     uint16_t cid,
     uint16_t sqid,
     uint16_t status);
 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
 static void pci_nvme_io_done(struct blockif_req *, int);
 
 /* Controller Configuration utils */
 /* Each NVME_CC_GET_*() extracts one field from a CC register value */
 #define	NVME_CC_GET_EN(cc) \
 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
 #define	NVME_CC_GET_CSS(cc) \
 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
 #define	NVME_CC_GET_SHN(cc) \
 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
 #define	NVME_CC_GET_IOSQES(cc) \
 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
 #define	NVME_CC_GET_IOCQES(cc) \
 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
 
 /* CC bits covering the EN, IOSQES and IOCQES fields */
 #define	NVME_CC_WRITE_MASK \
 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
 
 /*
  * CC bits covering the CSS, MPS and AMS fields.
  * NOTE(review): "NEN" presumably means these are writable only while the
  * controller is not enabled; the code applying the mask is outside this view.
  */
 #define	NVME_CC_NEN_WRITE_MASK \
 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
 
 /* Controller Status utils */
 #define	NVME_CSTS_GET_RDY(sts) \
 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
 
 /* CSTS Ready and Controller Fatal Status bits, in register position */
 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
 #define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)
 
 /* Completion Queue status word utils */
 /* Phase bit, and the combined Status Code Type / Status Code field mask */
 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
 #define	NVME_STATUS_MASK \
 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
 
 /* Dataset Management bit of the Identify Controller ONCS field */
 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
 
 /*
  * Forward declarations of the Set/Get Features handlers installed by
  * pci_nvme_init_features(); all share the nvme_feature_cb signature.
  */
 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
     struct nvme_feature_obj *,
     struct nvme_command *,
     struct nvme_completion *);
 static void nvme_feature_temperature(struct pci_nvme_softc *,
     struct nvme_feature_obj *,
     struct nvme_command *,
     struct nvme_completion *);
 static void nvme_feature_num_queues(struct pci_nvme_softc *,
     struct nvme_feature_obj *,
     struct nvme_command *,
     struct nvme_completion *);
 static void nvme_feature_iv_config(struct pci_nvme_softc *,
     struct nvme_feature_obj *,
     struct nvme_command *,
     struct nvme_completion *);
 static void nvme_feature_async_event(struct pci_nvme_softc *,
     struct nvme_feature_obj *,
     struct nvme_command *,
     struct nvme_completion *);
 
 /* Entry point of the Asynchronous Event Notification thread */
 static void *aen_thr(void *arg);
 
 /*
  * Fill all dst_size bytes of dst with the pad character, then overlay the
  * leading bytes of src (up to its NUL, at most dst_size bytes). The result
  * is a fixed-width, pad-filled field and is not NUL-terminated.
  */
 static __inline void
 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
 {
 	/* Measure first so no more than dst_size bytes of src are examined */
 	const size_t src_len = strnlen(src, dst_size);
 
 	memset(dst, pad, dst_size);
 	memcpy(dst, src, src_len);
 }
 
 /*
  * Replace the Status Code Type and Status Code fields of a completion status
  * word with 'type' and 'code'; bits outside NVME_STATUS_MASK are preserved.
  */
 static __inline void
 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
 {
 	const uint16_t sct_bits =
 	    (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT;
 	const uint16_t sc_bits =
 	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
 
 	*status = (*status & ~NVME_STATUS_MASK) | sct_bits | sc_bits;
 }
 
 /*
  * Set a status word to the given Status Code using the Generic Command
  * Status type (NVME_SCT_GENERIC).
  */
 static __inline void
 pci_nvme_status_genc(uint16_t *status, uint16_t code)
 {
 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
 }
 
 /*
  * Initialize the requested number of IO Submission and Completion Queues.
  * Admin queues are allocated implicitly.
  *
  * Both counts are clamped to NVME_QUEUES. Each array is allocated with one
  * extra slot (index 0, the Admin queue) and every slot's mutex is
  * initialized. If an allocation fails, a warning is printed and the
  * corresponding queue count is set to 0.
  */
 static void
 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
 {
 	uint32_t qid;
 
 	/* Submission Queues */
 	if (nsq > NVME_QUEUES) {
 		WPRINTF("%s: clamping number of SQ from %u to %u",
 		    __func__, nsq, NVME_QUEUES);
 		nsq = NVME_QUEUES;
 	}
 	sc->num_squeues = nsq;
 
 	sc->submit_queues = calloc(sc->num_squeues + 1,
 	    sizeof(struct nvme_submission_queue));
 	if (sc->submit_queues != NULL) {
 		for (qid = 0; qid <= sc->num_squeues; qid++)
 			pthread_mutex_init(&sc->submit_queues[qid].mtx, NULL);
 	} else {
 		WPRINTF("%s: SQ allocation failed", __func__);
 		sc->num_squeues = 0;
 	}
 
 	/* Completion Queues */
 	if (ncq > NVME_QUEUES) {
 		WPRINTF("%s: clamping number of CQ from %u to %u",
 		    __func__, ncq, NVME_QUEUES);
 		ncq = NVME_QUEUES;
 	}
 	sc->num_cqueues = ncq;
 
 	sc->compl_queues = calloc(sc->num_cqueues + 1,
 	    sizeof(struct nvme_completion_queue));
 	if (sc->compl_queues != NULL) {
 		for (qid = 0; qid <= sc->num_cqueues; qid++)
 			pthread_mutex_init(&sc->compl_queues[qid].mtx, NULL);
 	} else {
 		WPRINTF("%s: CQ allocation failed", __func__);
 		sc->num_cqueues = 0;
 	}
 }
 
 static void
 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
 {
 	struct nvme_controller_data *cd = &sc->ctrldata;
 
 	cd->vid = 0xFB5D;
 	cd->ssvid = 0x0000;
 
 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
 
 	/* Num of submission commands that we can handle at a time (2^rab) */
 	cd->rab   = 4;
 
 	/* FreeBSD OUI */
 	cd->ieee[0] = 0x58;
 	cd->ieee[1] = 0x9c;
 	cd->ieee[2] = 0xfc;
 
 	cd->mic = 0;
 
 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
 
 	cd->ver = NVME_REV(1,4);
 
 	cd->cntrltype = NVME_CNTRLTYPE_IO;
 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
 	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
 	cd->acl = 2;
 	cd->aerl = 4;
 
 	/* Advertise 1, Read-only firmware slot */
 	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
 	cd->elpe = 0;	/* max error log page entries */
 	/*
 	 * Report a single power state (zero-based value)
 	 * power_state[] values are left as zero to indicate "Not reported"
 	 */
 	cd->npss = 0;
 
 	/* Warning Composite Temperature Threshold */
 	cd->wctemp = 0x0157;
 	cd->cctemp = 0x0157;
 
 	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
 	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
 			NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);
 
 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
 	cd->nn = 1;	/* number of namespaces */
 
 	cd->oncs = 0;
 	switch (sc->dataset_management) {
 	case NVME_DATASET_MANAGEMENT_AUTO:
 		if (sc->nvstore.deallocate)
 			cd->oncs |= NVME_ONCS_DSM;
 		break;
 	case NVME_DATASET_MANAGEMENT_ENABLE:
 		cd->oncs |= NVME_ONCS_DSM;
 		break;
 	default:
 		break;
 	}
 
 	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
 	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
 
 	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
 }
 
 /*
  * Set the namespace size, capacity and utilization (all in logical blocks)
  * from the backing store: each is the store size divided by its sector size.
  */
 static void
 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
     struct nvme_namespace_data *nd)
 {
 	/* Capacity and utilization mirror the namespace size */
 	nd->nuse = nd->ncap = nd->nsze = nvstore->size / nvstore->sectsz;
 }
 
 static void
 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
     struct nvme_namespace_data *nd, uint32_t nsid,
     struct pci_nvme_blockstore *nvstore)
 {
 
 	pci_nvme_init_nsdata_size(nvstore, nd);
 
 	if (nvstore->type == NVME_STOR_BLOCKIF)
 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
 
 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
 	nd->flbas = 0;
 
 	/* Create an EUI-64 if user did not provide one */
 	if (nvstore->eui64 == 0) {
 		char *data = NULL;
 		uint64_t eui64 = nvstore->eui64;
 
 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
 		    sc->nsc_pi->pi_func);
 
 		if (data != NULL) {
 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
 			free(data);
 		}
 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
 	}
 	be64enc(nd->eui64, nvstore->eui64);
 
 	/* LBA data-sz = 2^lbads */
 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
 }
 
 /*
  * Reset the error, health, firmware and changed-namespace log pages to
  * zero and fill in the nominal values reported to the guest.
  */
 static void
 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
 {
 	const __uint128_t initial_power_cycles = 1;
 
 	memset(&sc->err_log, 0, sizeof(sc->err_log));
 	memset(&sc->health_log, 0, sizeof(sc->health_log));
 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
 
 	/* Set nominal Health values checked by implementations */
 	sc->health_log.temperature = NVME_TEMPERATURE;
 	sc->health_log.available_spare = 100;
 	sc->health_log.available_spare_threshold = 10;
 	memcpy(&sc->health_log.power_cycles, &initial_power_cycles,
 	    sizeof(sc->health_log.power_cycles));
 
 	/* Set read/write remainder to round up according to spec */
 	sc->read_dunits_remainder = 999;
 	sc->write_dunits_remainder = 999;
 
 	/* Set Active Firmware Info to slot 1, reporting the controller's FR */
 	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
 	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
 	    sizeof(sc->fw_log.revision[0]));
 }
 
 static void
 pci_nvme_init_features(struct pci_nvme_softc *sc)
 {
 	enum nvme_feature	fid;
 
 	for (fid = 0; fid < NVME_FID_MAX; fid++) {
 		switch (fid) {
 		case NVME_FEAT_ARBITRATION:
 		case NVME_FEAT_POWER_MANAGEMENT:
 		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
 		case NVME_FEAT_WRITE_ATOMICITY:
 			/* Mandatory but no special handling required */
 		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
 		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
 		//		  this returns a data buffer
 			break;
 		case NVME_FEAT_TEMPERATURE_THRESHOLD:
 			sc->feat[fid].set = nvme_feature_temperature;
 			break;
 		case NVME_FEAT_ERROR_RECOVERY:
 			sc->feat[fid].namespace_specific = true;
 			break;
 		case NVME_FEAT_NUMBER_OF_QUEUES:
 			sc->feat[fid].set = nvme_feature_num_queues;
 			break;
 		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
 			sc->feat[fid].set = nvme_feature_iv_config;
 			break;
 		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
 			sc->feat[fid].set = nvme_feature_async_event;
 			/* Enable all AENs by default */
 			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
 			break;
 		default:
 			sc->feat[fid].set = nvme_feature_invalid_cb;
 			sc->feat[fid].get = nvme_feature_invalid_cb;
 		}
 	}
 }
 
 /*
  * Mark the Asynchronous Event Request list as empty. Does not free entries
  * and does not take aer_mtx.
  */
 static void
 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
 {
 	sc->aer_count = 0;
 	STAILQ_INIT(&sc->aer_list);
 }
 
 /*
  * One-time setup of the Asynchronous Event Request state: create aer_mtx
  * and start with an empty request list.
  */
 static void
 pci_nvme_aer_init(struct pci_nvme_softc *sc)
 {
 	pthread_mutex_init(&sc->aer_mtx, NULL);
 
 	/* No AERs are outstanding yet */
 	pci_nvme_aer_reset(sc);
 }
 
 /*
  * Free every outstanding Asynchronous Event Request and leave the list
  * empty with a zero count.
  *
  * The list head and counter are re-initialized while aer_mtx is still held,
  * so a concurrent pci_nvme_aer_add() or pci_nvme_aer_get() never observes
  * the list being reset underneath it. pci_nvme_aer_reset() does not take
  * the mutex itself, so calling it here cannot self-deadlock.
  */
 static void
 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
 {
 	struct pci_nvme_aer *aer;
 
 	pthread_mutex_lock(&sc->aer_mtx);
 	while ((aer = STAILQ_FIRST(&sc->aer_list)) != NULL) {
 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
 		free(aer);
 	}
 	pci_nvme_aer_reset(sc);
 	pthread_mutex_unlock(&sc->aer_mtx);
 }
 
 /*
  * Report whether the host has at least one Asynchronous Event Request
  * outstanding. Reads aer_count without taking aer_mtx.
  */
 static bool
 pci_nvme_aer_available(struct pci_nvme_softc *sc)
 {
 	return (sc->aer_count > 0);
 }
 
 /*
  * Report whether the number of outstanding Asynchronous Event Requests
  * equals the limit advertised in the Identify Controller AERL field.
  */
 static bool
 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
 {
 	/* AERL is a zero-based value while aer_count is one-based */
 	const uint32_t limit = sc->ctrldata.aerl + 1U;
 
 	return (sc->aer_count == limit);
 }
 
 /*
  * Add an Async Event Request
  *
  * Queues the Command ID of a host-submitted AER so it can be completed
  * later, when the Controller has an event to report. The NVMe spec does not
  * require AER's to be returned in order, but this implementation appends to
  * the tail and therefore preserves submission order.
  *
  * Returns 0 on success or -1 if the entry cannot be allocated.
  */
 static int
 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
 {
 	struct pci_nvme_aer *entry;
 
 	entry = calloc(1, sizeof(*entry));
 	if (entry == NULL)
 		return (-1);
 
 	/* Save the Command ID for use in the completion message */
 	entry->cid = cid;
 
 	pthread_mutex_lock(&sc->aer_mtx);
 	STAILQ_INSERT_TAIL(&sc->aer_list, entry, link);
 	sc->aer_count++;
 	pthread_mutex_unlock(&sc->aer_mtx);
 
 	return (0);
 }
 
 /*
  * Get an Async Event Request structure
  *
  * Removes and returns the oldest AER submitted by the host, or returns NULL
  * when none are queued. The caller owns the returned struct and must free it.
  */
 static struct pci_nvme_aer *
 pci_nvme_aer_get(struct pci_nvme_softc *sc)
 {
 	struct pci_nvme_aer *oldest;
 
 	pthread_mutex_lock(&sc->aer_mtx);
 	oldest = STAILQ_FIRST(&sc->aer_list);
 	if (oldest != NULL) {
 		sc->aer_count--;
 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
 	}
 	pthread_mutex_unlock(&sc->aer_mtx);
 
 	return (oldest);
 }
 
 /*
  * Clear every Asynchronous Event Notification slot (nothing posted, no event
  * data) and stamp each slot with its own event type. Does not take aen_mtx.
  */
 static void
 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
 {
 	uint32_t	slot;
 
 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
 
 	/* The slot index doubles as the event type */
 	for (slot = 0; slot < PCI_NVME_AE_TYPE_MAX; slot++)
 		sc->aen[slot].atype = slot;
 }
 
 static void
 pci_nvme_aen_init(struct pci_nvme_softc *sc)
 {
 	char nstr[80];
 
 	pci_nvme_aen_reset(sc);
 
 	pthread_mutex_init(&sc->aen_mtx, NULL);
 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
 	    sc->nsc_pi->pi_func);
 	pthread_set_name_np(sc->aen_tid, nstr);
 }
 
 /*
  * Discard all pending Asynchronous Event Notifications. The AEN thread,
  * mutex and condition variable are left in place.
  */
 static void
 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
 {
 	pci_nvme_aen_reset(sc);
 }
 
 /*
  * Wake the AEN thread, which waits on aen_cond between processing passes,
  * so that it looks for posted events.
  */
 static void
 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
 {
 	pthread_cond_signal(&sc->aen_cond);
 }
 
 /*
  * Post an Asynchronous Event Notification
  *
  * Records 'event_data' in the slot for 'atype' and wakes the AEN thread.
  * Returns 0 on success, EINVAL if 'atype' is out of range, or EALREADY if an
  * event of that type is already pending (the pending event is left as is and
  * the thread is not woken).
  */
 static int32_t
 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
 		uint32_t event_data)
 {
 	struct pci_nvme_aen *slot;
 	int32_t error = 0;
 
 	if (atype >= PCI_NVME_AE_TYPE_MAX)
 		return (EINVAL);
 
 	pthread_mutex_lock(&sc->aen_mtx);
 	slot = &sc->aen[atype];
 	if (slot->posted) {
 		/* Has the controller already posted an event of this type? */
 		error = EALREADY;
 	} else {
 		slot->event_data = event_data;
 		slot->posted = true;
 	}
 	pthread_mutex_unlock(&sc->aen_mtx);
 
 	if (error == 0)
 		pci_nvme_aen_notify(sc);
 
 	return (error);
 }
 
 /*
  * Deliver posted Asynchronous Event Notifications to the host.
  *
  * Must be called with aen_mtx held. Walks the event slots in type order and,
  * for each posted and unmasked event, consumes one outstanding AER, writes
  * an Admin Completion Queue entry for it and raises MSI-X vector 0. Stops
  * early once no AER is left.
  *
  * The completion's cdw0 is (log page id << 16) | (event_data << 8) | atype.
  */
 static void
 pci_nvme_aen_process(struct pci_nvme_softc *sc)
 {
 	struct pci_nvme_aer *aer;
 	struct pci_nvme_aen *aen;
 	pci_nvme_async_type atype;
 	uint32_t mask;
 	uint16_t status;
 	uint8_t lid;
 
 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
 		aen = &sc->aen[atype];
 		/* Previous iterations may have depleted the available AER's */
 		if (!pci_nvme_aer_available(sc)) {
 			DPRINTF("%s: no AER", __func__);
 			break;
 		}
 
 		if (!aen->posted) {
 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
 			continue;
 		}
 
 		status = NVME_SC_SUCCESS;
 
 		/* Is the event masked? */
 		mask =
 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
 
 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
 		/*
 		 * Pick the log page to report and apply the per-type mask.
 		 * A masked event is skipped with "continue", which leaves the
 		 * slot marked as posted.
 		 */
 		switch (atype) {
 		case PCI_NVME_AE_TYPE_ERROR:
 			lid = NVME_LOG_ERROR;
 			break;
 		case PCI_NVME_AE_TYPE_SMART:
 			/* Only the low byte of the mask applies to SMART events */
 			mask &= 0xff;
 			if ((mask & aen->event_data) == 0)
 				continue;
 			lid = NVME_LOG_HEALTH_INFORMATION;
 			break;
 		case PCI_NVME_AE_TYPE_NOTICE:
 			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
 				EPRINTLN("%s unknown AEN notice type %u",
 				    __func__, aen->event_data);
 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
 				lid = 0;
 				/* Leaves the switch; the AER is completed with the error */
 				break;
 			}
 			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
 				continue;
 			switch (aen->event_data) {
 			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
 				lid = NVME_LOG_CHANGED_NAMESPACE;
 				break;
 			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
 				lid = NVME_LOG_FIRMWARE_SLOT;
 				break;
 			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
 				break;
 			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
 				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
 				break;
 			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
 				break;
 			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
 				break;
 			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
 				break;
 			default:
 				lid = 0;
 			}
 			break;
 		default:
 			/* bad type?!? */
 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
 			lid = 0;
 			break;
 		}
 
 		/* An AER was reported available above, so one must be returned */
 		aer = pci_nvme_aer_get(sc);
 		assert(aer != NULL);
 
 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
 		    aer->cid,
 		    0,		/* SQID */
 		    status);
 
 		/* The slot is free for the next event of this type */
 		aen->event_data = 0;
 		aen->posted = false;
 
 		/*
 		 * NOTE(review): pci_nvme_aer_get() documents that the caller
 		 * frees the returned AER, but 'aer' is not freed in this
 		 * function - confirm whether this leaks.
 		 */
 		pci_generate_msix(sc->nsc_pi, 0);
 	}
 }
 
 /*
  * Body of the Asynchronous Event Notification thread. 'arg' is the
  * controller softc. Holds aen_mtx whenever it is not blocked in
  * pthread_cond_wait(), and runs one processing pass before each wait.
  * The loop never terminates.
  */
 static void *
 aen_thr(void *arg)
 {
 	struct pci_nvme_softc *sc = arg;
 
 	pthread_mutex_lock(&sc->aen_mtx);
 	while (true) {
 		pci_nvme_aen_process(sc);
 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
 	}
 	/* Not reached: the loop above has no exit */
 	pthread_mutex_unlock(&sc->aen_mtx);
 
 	pthread_exit(NULL);
 	return (NULL);
 }
 
 static void
 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
 {
 	uint32_t i;
 
 	DPRINTF("%s", __func__);
 
 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
 
 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
 
 	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */
 
 	sc->regs.cc = 0;
 
 	assert(sc->submit_queues != NULL);
 
 	for (i = 0; i < sc->num_squeues + 1; i++) {
 		sc->submit_queues[i].qbase = NULL;
 		sc->submit_queues[i].size = 0;
 		sc->submit_queues[i].cqid = 0;
 		sc->submit_queues[i].tail = 0;
 		sc->submit_queues[i].head = 0;
 	}
 
 	assert(sc->compl_queues != NULL);
 
 	for (i = 0; i < sc->num_cqueues + 1; i++) {
 		sc->compl_queues[i].qbase = NULL;
 		sc->compl_queues[i].size = 0;
 		sc->compl_queues[i].tail = 0;
 		sc->compl_queues[i].head = 0;
 	}
 
 	sc->num_q_is_set = false;
 
 	pci_nvme_aer_destroy(sc);
 	pci_nvme_aen_destroy(sc);
 
 	/*
 	 * Clear CSTS.RDY last to prevent the host from enabling Controller
 	 * before cleanup completes
 	 */
 	sc->regs.csts = 0;
 }
 
 /* Reset the controller, taking sc->mtx around pci_nvme_reset_locked() */
 static void
 pci_nvme_reset(struct pci_nvme_softc *sc)
 {
 
 	pthread_mutex_lock(&sc->mtx);
 	pci_nvme_reset_locked(sc);
 	pthread_mutex_unlock(&sc->mtx);
 }
 
 /*
  * Set up the Admin Submission and Completion Queues from the AQA, ASQ and
  * ACQ registers.
  *
  * Returns 0 on success. Returns -1 and sets CSTS.CFS if either queue size
  * decodes to fewer than two entries or if a queue's guest memory cannot be
  * mapped. On failure, fields assigned before the failing step (for example
  * the Admin SQ size and mapping) keep their new values.
  */
 static int
 pci_nvme_init_controller(struct pci_nvme_softc *sc)
 {
 	uint16_t acqs, asqs;
 
 	DPRINTF("%s", __func__);
 
 	/*
 	 * NVMe 2.0 states that "enabling a controller while this field is
 	 * cleared to 0h produces undefined results" for both ACQS and
 	 * ASQS. If zero, set CFS and do not become ready.
 	 */
 	/* AQA holds zero-based sizes; convert to an entry count */
 	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
 	if (asqs < 2) {
 		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
 		    asqs - 1, sc->regs.aqa);
 		sc->regs.csts |= NVME_CSTS_CFS;
 		return (-1);
 	}
 	sc->submit_queues[0].size = asqs;
 	/* Map the whole Admin SQ from guest memory */
 	sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
 	    sc->regs.asq, sizeof(struct nvme_command) * asqs);
 	if (sc->submit_queues[0].qbase == NULL) {
 		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
 		    sc->regs.asq);
 		sc->regs.csts |= NVME_CSTS_CFS;
 		return (-1);
 	}
 
 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
 
 	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
 	    NVME_AQA_REG_ACQS_MASK);
 	if (acqs < 2) {
 		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
 		    acqs - 1, sc->regs.aqa);
 		sc->regs.csts |= NVME_CSTS_CFS;
 		return (-1);
 	}
 	sc->compl_queues[0].size = acqs;
 	/* Map the whole Admin CQ from guest memory */
 	sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
 	    sc->regs.acq, sizeof(struct nvme_completion) * acqs);
 	if (sc->compl_queues[0].qbase == NULL) {
 		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
 		    sc->regs.acq);
 		sc->regs.csts |= NVME_CSTS_CFS;
 		return (-1);
 	}
 	/* Interrupts are enabled on the Admin CQ */
 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
 
 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
 
 	return (0);
 }
 
 /*
  * Copy up to two pages between a host buffer and guest memory described by
  * a PRP1/PRP2 pair.
  *
  * The first chunk runs from prp1 to the end of its page (or 'len' bytes if
  * that is less); whatever remains is copied at prp2, capped at PAGE_SIZE.
  * 'dir' selects the direction: NVME_COPY_TO_PRP writes 'b' into guest
  * memory, anything else reads guest memory into 'b'.
  *
  * Returns 0 on success, or -1 if 'len' exceeds 8 KiB or a guest address
  * cannot be mapped. Note that a remainder larger than PAGE_SIZE is truncated
  * to PAGE_SIZE and still reported as success.
  */
 static int
 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
 	size_t len, enum nvme_copy_dir dir)
 {
 	uint8_t *guest;
 	size_t chunk;
 
 	if (len > (8 * 1024))
 		return (-1);
 
 	/* Copy from the start of prp1 to the end of the physical page */
 	chunk = PAGE_SIZE - (prp1 & PAGE_MASK);
 	chunk = MIN(chunk, len);
 
 	guest = vm_map_gpa(ctx, prp1, chunk);
 	if (guest == NULL)
 		return (-1);
 
 	if (dir != NVME_COPY_TO_PRP)
 		memcpy(b, guest, chunk);
 	else
 		memcpy(guest, b, chunk);
 
 	b += chunk;
 	len -= chunk;
 	if (len == 0)
 		return (0);
 
 	/* Remainder lives at prp2; at most one page of it is copied */
 	len = MIN(len, PAGE_SIZE);
 
 	guest = vm_map_gpa(ctx, prp2, len);
 	if (guest == NULL)
 		return (-1);
 
 	if (dir != NVME_COPY_TO_PRP)
 		memcpy(b, guest, len);
 	else
 		memcpy(guest, b, len);
 
 	return (0);
 }
 
 /*
  * Write a Completion Queue Entry update
  *
  * Fills the entry at the queue's tail with the command result, giving it the
  * opposite phase bit from the value currently stored there, and advances the
  * tail with wrap-around. The entry's SQ head pointer is taken from the
  * Submission Queue identified by 'sqid'. The queue's mutex is held for the
  * duration; the queue must be mapped (qbase != NULL).
  */
 static void
 pci_nvme_cq_update(struct pci_nvme_softc *sc,
 		struct nvme_completion_queue *cq,
 		uint32_t cdw0,
 		uint16_t cid,
 		uint16_t sqid,
 		uint16_t status)
 {
 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
 	struct nvme_completion *entry;
 
 	assert(cq->qbase != NULL);
 
 	pthread_mutex_lock(&cq->mtx);
 
 	entry = &cq->qbase[cq->tail];
 
 	/* Flip the phase bit */
 	status |= (entry->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
 
 	entry->cdw0 = cdw0;
 	entry->sqhd = sq->head;
 	entry->sqid = sqid;
 	entry->cid = cid;
 	entry->status = status;
 
 	/* Advance the tail, wrapping at the end of the queue */
 	if (++cq->tail >= cq->size)
 		cq->tail = 0;
 
 	pthread_mutex_unlock(&cq->mtx);
 }
 
 static int
 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
 	struct nvme_completion* compl)
 {
 	uint16_t qid = command->cdw10 & 0xffff;
 
 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
 	if (qid == 0 || qid > sc->num_squeues ||
 	    (sc->submit_queues[qid].qbase == NULL)) {
 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
 		        __func__, qid, sc->num_squeues);
 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
 		return (1);
 	}
 
 	sc->submit_queues[qid].qbase = NULL;
 	sc->submit_queues[qid].cqid = 0;
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 	return (1);
 }
 
 static int
 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
 	struct nvme_completion* compl)
 {
 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
 		uint16_t qid = command->cdw10 & 0xffff;
 		struct nvme_submission_queue *nsq;
 
 		if ((qid == 0) || (qid > sc->num_squeues) ||
 		    (sc->submit_queues[qid].qbase != NULL)) {
 			WPRINTF("%s queue index %u > num_squeues %u",
 			        __func__, qid, sc->num_squeues);
 			pci_nvme_status_tc(&compl->status,
 			    NVME_SCT_COMMAND_SPECIFIC,
 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
 			return (1);
 		}
 
 		nsq = &sc->submit_queues[qid];
 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
 			/*
 			 * Queues must specify at least two entries
 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
 			 */
 			pci_nvme_status_tc(&compl->status,
 			    NVME_SCT_COMMAND_SPECIFIC,
 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
 			return (1);
 		}
 		nsq->head = nsq->tail = 0;
 
 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
 			pci_nvme_status_tc(&compl->status,
 			    NVME_SCT_COMMAND_SPECIFIC,
 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
 			return (1);
 		}
 
 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
 			pci_nvme_status_tc(&compl->status,
 			    NVME_SCT_COMMAND_SPECIFIC,
 			    NVME_SC_COMPLETION_QUEUE_INVALID);
 			return (1);
 		}
 
 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
 
 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
 		              sizeof(struct nvme_command) * (size_t)nsq->size);
 
 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
 		        qid, nsq->size, nsq->qbase, nsq->cqid);
 
 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 
 		DPRINTF("%s completed creating IOSQ qid %u",
 		         __func__, qid);
 	} else {
 		/*
 		 * Guest sent non-cont submission queue request.
 		 * This setting is unsupported by this emulation.
 		 */
 		WPRINTF("%s unsupported non-contig (list-based) "
 		         "create i/o submission queue", __func__);
 
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 	}
 	return (1);
 }
 
 /*
  * Delete I/O Completion Queue admin command
  *
  * The queue ID (low 16 bits of CDW10) must be non-zero, within the
  * configured number of Completion Queues, and refer to a queue that has a
  * base address. Deletion is refused while any Submission Queue slot still
  * names this Completion Queue.
  *
  * Always returns 1; the outcome is reported through compl->status.
  */
 static int
 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
 	struct nvme_completion* compl)
 {
 	uint16_t qid = command->cdw10 & 0xffff;
 	uint16_t sq_idx;
 	bool exists;
 
 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
 
 	/* Short-circuit evaluation keeps the array index in range */
 	exists = (qid != 0) && (qid <= sc->num_cqueues) &&
 	    (sc->compl_queues[qid].qbase != NULL);
 	if (!exists) {
 		WPRINTF("%s queue index %u / num_cqueues %u",
 		        __func__, qid, sc->num_cqueues);
 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
 		return (1);
 	}
 
 	/* A Completion Queue still referenced by an SQ cannot be deleted */
 	sq_idx = 1;
 	while (sq_idx < sc->num_squeues + 1) {
 		if (sc->submit_queues[sq_idx].cqid == qid) {
 			pci_nvme_status_tc(&compl->status,
 			    NVME_SCT_COMMAND_SPECIFIC,
 			    NVME_SC_INVALID_QUEUE_DELETION);
 			return (1);
 		}
 		sq_idx++;
 	}
 
 	sc->compl_queues[qid].qbase = NULL;
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 	return (1);
 }
 
 static int
 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
 	struct nvme_completion* compl)
 {
 	struct nvme_completion_queue *ncq;
 	uint16_t qid = command->cdw10 & 0xffff;
 
 	/* Only support Physically Contiguous queues */
 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
 		WPRINTF("%s unsupported non-contig (list-based) "
 		         "create i/o completion queue",
 		         __func__);
 
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 		return (1);
 	}
 
 	if ((qid == 0) || (qid > sc->num_cqueues) ||
 	    (sc->compl_queues[qid].qbase != NULL)) {
 		WPRINTF("%s queue index %u > num_cqueues %u",
 			__func__, qid, sc->num_cqueues);
 		pci_nvme_status_tc(&compl->status,
 		    NVME_SCT_COMMAND_SPECIFIC,
 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
 		return (1);
  	}
 
 	ncq = &sc->compl_queues[qid];
 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
 	if (ncq->intr_vec > (sc->max_queues + 1)) {
 		pci_nvme_status_tc(&compl->status,
 		    NVME_SCT_COMMAND_SPECIFIC,
 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
 		return (1);
 	}
 
 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
 		/*
 		 * Queues must specify at least two entries
 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
 		 */
 		pci_nvme_status_tc(&compl->status,
 		    NVME_SCT_COMMAND_SPECIFIC,
 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
 		return (1);
 	}
 	ncq->head = ncq->tail = 0;
 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
 		     command->prp1,
 		     sizeof(struct nvme_command) * (size_t)ncq->size);
 
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 
 
 	return (1);
 }
 
 /*
  * Get Log Page admin command
  *
  * The log identifier is the low byte of CDW10. The requested length is the
  * zero-based dword count split across CDW10 (upper 16 bits) and CDW11, and
  * the byte offset into the log is taken from CDW12/CDW13.
  *
  * For each supported log, the offset must fall inside the log structure and
  * the copy is limited to the smaller of the requested length and the bytes
  * remaining after the offset, so the copy never reads beyond the end of the
  * log. The Changed Namespace log is cleared after it has been read.
  *
  * Always returns 1; the outcome is reported through compl->status.
  */
 static int
 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
 	struct nvme_completion* compl)
 {
 	uint64_t logoff;
 	uint32_t logsize;
 	uint8_t logpage;
 
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 
 	/*
 	 * Command specifies the number of dwords to return in fields NUMDU
 	 * and NUMDL. This is a zero-based value.
 	 */
 	logpage = command->cdw10 & 0xFF;
 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
 	logsize *= sizeof(uint32_t);
 	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
 
 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
 
 	switch (logpage) {
 	case NVME_LOG_ERROR:
 		if (logoff >= sizeof(sc->err_log)) {
 			pci_nvme_status_genc(&compl->status,
 			    NVME_SC_INVALID_FIELD);
 			break;
 		}
 
 		/* logoff < sizeof(log) here, so the subtraction cannot wrap */
 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
 		    command->prp2, (uint8_t *)&sc->err_log + logoff,
 		    MIN(logsize, sizeof(sc->err_log) - logoff),
 		    NVME_COPY_TO_PRP);
 		break;
 	case NVME_LOG_HEALTH_INFORMATION:
 		if (logoff >= sizeof(sc->health_log)) {
 			pci_nvme_status_genc(&compl->status,
 			    NVME_SC_INVALID_FIELD);
 			break;
 		}
 
 		/* Refresh the I/O statistics in the log before returning it */
 		pthread_mutex_lock(&sc->mtx);
 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
 		    sizeof(sc->health_log.data_units_read));
 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
 		    sizeof(sc->health_log.data_units_written));
 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
 		    sizeof(sc->health_log.host_read_commands));
 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
 		    sizeof(sc->health_log.host_write_commands));
 		pthread_mutex_unlock(&sc->mtx);
 
 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
 		    command->prp2, (uint8_t *)&sc->health_log + logoff,
 		    MIN(logsize, sizeof(sc->health_log) - logoff),
 		    NVME_COPY_TO_PRP);
 		break;
 	case NVME_LOG_FIRMWARE_SLOT:
 		if (logoff >= sizeof(sc->fw_log)) {
 			pci_nvme_status_genc(&compl->status,
 			    NVME_SC_INVALID_FIELD);
 			break;
 		}
 
 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
 		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
 		    MIN(logsize, sizeof(sc->fw_log) - logoff),
 		    NVME_COPY_TO_PRP);
 		break;
 	case NVME_LOG_CHANGED_NAMESPACE:
 		if (logoff >= sizeof(sc->ns_log)) {
 			pci_nvme_status_genc(&compl->status,
 			    NVME_SC_INVALID_FIELD);
 			break;
 		}
 
 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
 		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
 		    MIN(logsize, sizeof(sc->ns_log) - logoff),
 		    NVME_COPY_TO_PRP);
 		/* Reading this log resets it */
 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
 		break;
 	default:
 		DPRINTF("%s get log page %x command not supported",
 		        __func__, logpage);
 
 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
 		    NVME_SC_INVALID_LOG_PAGE);
 	}
 
 	return (1);
 }
 
 /*
  * Identify admin command
  *
  * The low byte of CDW10 selects the structure to return. Namespace and
  * Controller data are copied through the PRP pair; the list-style
  * structures are built directly in guest memory mapped at PRP1.
  *
  * PRP1 is supplied by the guest, so every mapping is checked before it is
  * written. A mapping failure is reported as Invalid Field instead of
  * dereferencing a NULL pointer.
  *
  * Always returns 1; the outcome is reported through compl->status.
  */
 static int
 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
 	struct nvme_completion* compl)
 {
 	void *dest;
 	uint16_t status;
 
 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
 	        command->cdw10 & 0xFF, command->nsid);
 
 	status = 0;
 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
 
 	switch (command->cdw10 & 0xFF) {
 	case 0x00: /* return Identify Namespace data structure */
 		/* Global NS only valid with NS Management */
 		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
 			pci_nvme_status_genc(&status,
 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
 			break;
 		}
 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
 		    NVME_COPY_TO_PRP);
 		break;
 	case 0x01: /* return Identify Controller data structure */
 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
 		    command->prp2, (uint8_t *)&sc->ctrldata,
 		    sizeof(sc->ctrldata),
 		    NVME_COPY_TO_PRP);
 		break;
 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
 		                  sizeof(uint32_t) * 1024);
 		if (dest == NULL) {
 			pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
 			break;
 		}
 		/* All unused entries shall be zero */
 		memset(dest, 0, sizeof(uint32_t) * 1024);
 		((uint32_t *)dest)[0] = 1;
 		break;
 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
 		if (command->nsid != 1) {
 			pci_nvme_status_genc(&status,
 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
 			break;
 		}
 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
 		                  sizeof(uint32_t) * 1024);
 		if (dest == NULL) {
 			pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
 			break;
 		}
 		/* All bytes after the descriptor shall be zero */
 		memset(dest, 0, sizeof(uint32_t) * 1024);
 
 		/* Return NIDT=1 (i.e. EUI64) descriptor */
 		((uint8_t *)dest)[0] = 1;
 		((uint8_t *)dest)[1] = sizeof(uint64_t);
 		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
 		break;
 	case 0x13:
 		/*
 		 * Controller list is optional but used by UNH tests. Return
 		 * a valid but empty list.
 		 */
 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
 		                  sizeof(uint16_t) * 2048);
 		if (dest == NULL) {
 			pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
 			break;
 		}
 		memset(dest, 0, sizeof(uint16_t) * 2048);
 		break;
 	default:
 		DPRINTF("%s unsupported identify command requested 0x%x",
 		         __func__, command->cdw10 & 0xFF);
 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
 		break;
 	}
 
 	compl->status = status;
 	return (1);
 }
 
 /*
  * Translate a Feature Identifier into a printable name
  *
  * Returns a pointer to a string literal; identifiers without an entry here
  * map to "Unknown".
  */
 static const char *
 nvme_fid_to_name(uint8_t fid)
 {
 	switch (fid) {
 	case NVME_FEAT_ARBITRATION:
 		return ("Arbitration");
 	case NVME_FEAT_POWER_MANAGEMENT:
 		return ("Power Management");
 	case NVME_FEAT_LBA_RANGE_TYPE:
 		return ("LBA Range Type");
 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
 		return ("Temperature Threshold");
 	case NVME_FEAT_ERROR_RECOVERY:
 		return ("Error Recovery");
 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
 		return ("Volatile Write Cache");
 	case NVME_FEAT_NUMBER_OF_QUEUES:
 		return ("Number of Queues");
 	case NVME_FEAT_INTERRUPT_COALESCING:
 		return ("Interrupt Coalescing");
 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
 		return ("Interrupt Vector Configuration");
 	case NVME_FEAT_WRITE_ATOMICITY:
 		return ("Write Atomicity Normal");
 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
 		return ("Asynchronous Event Configuration");
 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
 		return ("Autonomous Power State Transition");
 	case NVME_FEAT_HOST_MEMORY_BUFFER:
 		return ("Host Memory Buffer");
 	case NVME_FEAT_TIMESTAMP:
 		return ("Timestamp");
 	case NVME_FEAT_KEEP_ALIVE_TIMER:
 		return ("Keep Alive Timer");
 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
 		return ("Host Controlled Thermal Management");
 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
 		return ("Non-Operation Power State Config");
 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
 		return ("Read Recovery Level Config");
 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
 		return ("Predictable Latency Mode Config");
 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
 		return ("Predictable Latency Mode Window");
 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
 		return ("LBA Status Information Report Interval");
 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
 		return ("Host Behavior Support");
 	case NVME_FEAT_SANITIZE_CONFIG:
 		return ("Sanitize Config");
 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
 		return ("Endurance Group Event Configuration");
 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
 		return ("Software Progress Marker");
 	case NVME_FEAT_HOST_IDENTIFIER:
 		return ("Host Identifier");
 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
 		return ("Reservation Notification Mask");
 	case NVME_FEAT_RESERVATION_PERSISTENCE:
 		return ("Reservation Persistence");
 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
 		return ("Namespace Write Protection Config");
 	default:
 		return ("Unknown");
 	}
 }
 
 /*
  * Feature callback that rejects the request
  *
  * Ignores the controller, feature and command, and marks the completion
  * with the generic Invalid Field status.
  */
 static void
 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
     struct nvme_feature_obj *feat __unused,
     struct nvme_command *command __unused,
     struct nvme_completion *compl)
 {
 	uint16_t *result = &compl->status;
 
 	pci_nvme_status_genc(result, NVME_SC_INVALID_FIELD);
 }
 
 /*
  * Set handler for the Interrupt Vector Configuration feature
  *
  * CDW11 carries the interrupt vector in its low 16 bits and the Coalescing
  * Disable flag in bit 16. The completion starts out as Invalid Field and is
  * switched to Success only when the vector passes the range check, is not
  * the admin vector with coalescing left enabled, and is in use by one of
  * the Completion Queues.
  */
 static void
 nvme_feature_iv_config(struct pci_nvme_softc *sc,
     struct nvme_feature_obj *feat __unused,
     struct nvme_command *command,
     struct nvme_completion *compl)
 {
 	uint32_t cdw11 = command->cdw11;
 	uint32_t qidx;
 	uint16_t iv = cdw11 & 0xffff;
 	bool cd = cdw11 & (1 << 16);
 
 	/* Pessimistic default; overwritten below when a match is found */
 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 
 	if (iv > (sc->max_queues + 1))
 		return;
 
 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
 	if (!cd && (iv == 0))
 		return;
 
 	/* Requested Interrupt Vector must be used by a CQ */
 	for (qidx = 0; qidx < sc->num_cqueues + 1; qidx++) {
 		if (sc->compl_queues[qidx].intr_vec != iv)
 			continue;
 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 	}
 }
 
 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP		(0x4000)
 /*
  * Set handler for the Asynchronous Event Configuration feature
  *
  * Rejects, with Invalid Field, any request whose CDW11 has the Endurance
  * Group bit set. All other requests leave the completion untouched.
  */
 static void
 nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
     struct nvme_feature_obj *feat __unused,
     struct nvme_command *command,
     struct nvme_completion *compl)
 {
 	if ((command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) == 0)
 		return;
 
 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 }
 
 #define NVME_TEMP_THRESH_OVER	0
 #define NVME_TEMP_THRESH_UNDER	1
 /*
  * Set handler for the Temperature Threshold feature
  *
  * CDW11 supplies the threshold (bits 15:0), the temperature select
  * (bits 19:16) and the threshold type (bits 21:20). Unsupported selector
  * values are rejected with Invalid Field. Otherwise the temperature
  * critical-warning bit in the health log is set or cleared according to
  * how NVME_TEMPERATURE compares with the threshold, and an asynchronous
  * event is posted when the bit was set and that warning is enabled in the
  * Asynchronous Event Configuration feature.
  */
 static void
 nvme_feature_temperature(struct pci_nvme_softc *sc,
     struct nvme_feature_obj *feat __unused,
     struct nvme_command *command,
     struct nvme_completion *compl)
 {
 	uint16_t	tmpth;	/* Temperature Threshold */
 	uint8_t		tmpsel; /* Threshold Temperature Select */
 	uint8_t		thsel;  /* Threshold Type Select */
 	bool		set_crit;
 	bool		report_crit;
 
 	tmpth  = command->cdw11 & 0xffff;
 	tmpsel = (command->cdw11 >> 16) & 0xf;
 	thsel  = (command->cdw11 >> 20) & 0x3;
 
 	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
 
 	/* Only selector values 0 and 0xf are accepted */
 	if ((tmpsel != 0) && (tmpsel != 0xf)) {
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 		return;
 	}
 	/* Only the over and under threshold types are accepted */
 	if (thsel > NVME_TEMP_THRESH_UNDER) {
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 		return;
 	}
 
 	if (thsel == NVME_TEMP_THRESH_OVER)
 		set_crit = (NVME_TEMPERATURE >= tmpth);
 	else if (thsel == NVME_TEMP_THRESH_UNDER)
 		set_crit = (NVME_TEMPERATURE <= tmpth);
 	else
 		set_crit = false;
 
 	pthread_mutex_lock(&sc->mtx);
 	if (!set_crit)
 		sc->health_log.critical_warning &=
 		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
 	else
 		sc->health_log.critical_warning |=
 		    NVME_CRIT_WARN_ST_TEMPERATURE;
 	pthread_mutex_unlock(&sc->mtx);
 
 	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
 	    NVME_CRIT_WARN_ST_TEMPERATURE;
 
 	if (set_crit && report_crit)
 		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
 		    sc->health_log.critical_warning);
 
 	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
 }
 
 /*
  * Set handler for the Number of Queues feature
  *
  * The feature may be set only once; a second attempt is rejected with
  * Command Sequence Error. CDW11 carries the zero-based number of
  * Submission Queues requested in its low half and of Completion Queues in
  * its high half; 0xffff is illegal for either.
  *
  * Both requested values are validated before the controller state is
  * touched, so a rejected command leaves the queue counts unchanged.
  * Accepted values are clamped to the controller maximum and the resulting
  * allocation is reported in the completion and written back into the
  * command.
  */
 static void
 nvme_feature_num_queues(struct pci_nvme_softc *sc,
     struct nvme_feature_obj *feat __unused,
     struct nvme_command *command,
     struct nvme_completion *compl)
 {
 	uint16_t nsqr;	/* Number of Submission Queues Requested */
 	uint16_t ncqr;	/* Number of Completion Queues Requested */
 
 	if (sc->num_q_is_set) {
 		WPRINTF("%s: Number of Queues already set", __func__);
 		pci_nvme_status_genc(&compl->status,
 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
 		return;
 	}
 
 	nsqr = command->cdw11 & 0xFFFF;
 	if (nsqr == 0xffff) {
 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nsqr);
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 		return;
 	}
 
 	ncqr = (command->cdw11 >> 16) & 0xFFFF;
 	if (ncqr == 0xffff) {
 		WPRINTF("%s: Illegal NCQR value %#x", __func__, ncqr);
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 		return;
 	}
 
 	/* Both values are legal; now commit them, clamped to the maximum */
 	sc->num_squeues = ONE_BASED(nsqr);
 	if (sc->num_squeues > sc->max_queues) {
 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
 					sc->max_queues);
 		sc->num_squeues = sc->max_queues;
 	}
 
 	sc->num_cqueues = ONE_BASED(ncqr);
 	if (sc->num_cqueues > sc->max_queues) {
 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
 					sc->max_queues);
 		sc->num_cqueues = sc->max_queues;
 	}
 
 	/* Patch the command value which will be saved on callback's return */
 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
 
 	sc->num_q_is_set = true;
 }
 
 static int
 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
 	struct nvme_completion *compl)
 {
 	struct nvme_feature_obj *feat;
 	uint32_t nsid = command->nsid;
 	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
 	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);
 
 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
 
 	if (fid >= NVME_FID_MAX) {
 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 		return (1);
 	}
 
 	if (sv) {
 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
 		    NVME_SC_FEATURE_NOT_SAVEABLE);
 		return (1);
 	}
 
 	feat = &sc->feat[fid];
 
 	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 		return (1);
 	}
 
 	if (!feat->namespace_specific &&
 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
 		return (1);
 	}
 
 	compl->cdw0 = 0;
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 
 	if (feat->set)
 		feat->set(sc, feat, command, compl);
 	else {
 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
 		    NVME_SC_FEATURE_NOT_CHANGEABLE);
 		return (1);
 	}
 
 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
 	if (compl->status == NVME_SC_SUCCESS) {
 		feat->cdw11 = command->cdw11;
 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
 		    (command->cdw11 != 0))
 			pci_nvme_aen_notify(sc);
 	}
 
 	return (0);
 }
 
 #define NVME_FEATURES_SEL_SUPPORTED	0x3
 #define NVME_FEATURES_NS_SPECIFIC	(1 << 1)
 
 /*
  * Get Features admin command
  *
  * The feature ID is the low byte of CDW10 and the select field is bits
  * 10:8. Feature IDs at or beyond NVME_FID_MAX are rejected. The feature's
  * get handler, when present, runs first; if the status is still success
  * afterwards, CDW0 of the completion is set either to the namespace-
  * specific capability bit (select == supported capabilities on a
  * namespace-specific feature) or to the feature's stored CDW11 value.
  *
  * Returns 1 for an invalid feature ID and 0 otherwise.
  */
 static int
 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
 	struct nvme_completion* compl)
 {
 	struct nvme_feature_obj *feat;
 	uint8_t fid = command->cdw10 & 0xFF;
 	uint8_t sel = (command->cdw10 >> 8) & 0x7;
 
 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
 
 	if (fid >= NVME_FID_MAX) {
 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 		return (1);
 	}
 
 	compl->cdw0 = 0;
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 
 	feat = &sc->feat[fid];
 	if (feat->get != NULL)
 		feat->get(sc, feat, command, compl);
 
 	/* A handler that reported an error leaves CDW0 as it set it */
 	if (compl->status != NVME_SC_SUCCESS)
 		return (0);
 
 	if (feat->namespace_specific && (sel == NVME_FEATURES_SEL_SUPPORTED))
 		compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
 	else
 		compl->cdw0 = feat->cdw11;
 
 	return (0);
 }
 
 /*
  * Format NVM admin command
  *
  * Accepts only Secure Erase Settings 0 and 1, LBA Format 0, and no
  * Protection Information. A RAM-backed store is replaced with a freshly
  * zeroed allocation; a failed allocation is reported as Internal Device
  * Error rather than as success. For any other store type a delete covering
  * the whole store is submitted to the block layer, and the completion is
  * left without status so that it is posted when the I/O finishes.
  *
  * Always returns 1.
  */
 static int
 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
 	struct nvme_completion* compl)
 {
 	uint8_t	ses, lbaf, pi;
 
 	/* Only supports Secure Erase Setting - User Data Erase */
 	ses = (command->cdw10 >> 9) & 0x7;
 	if (ses > 0x1) {
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 		return (1);
 	}
 
 	/* Only supports a single LBA Format */
 	lbaf = command->cdw10 & 0xf;
 	if (lbaf != 0) {
 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
 		    NVME_SC_INVALID_FORMAT);
 		return (1);
 	}
 
 	/* Doesn't support Protection Information */
 	pi = (command->cdw10 >> 5) & 0x7;
 	if (pi != 0) {
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 		return (1);
 	}
 
 	if (sc->nvstore.type == NVME_STOR_RAM) {
 		/* free(NULL) is a no-op, so no guard is required */
 		free(sc->nvstore.ctx);
 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
 		if (sc->nvstore.ctx == NULL) {
 			WPRINTF("%s: unable to allocate RAM store", __func__);
 			pci_nvme_status_genc(&compl->status,
 			    NVME_SC_INTERNAL_DEVICE_ERROR);
 			return (1);
 		}
 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 	} else {
 		struct pci_nvme_ioreq *req;
 		int err;
 
 		req = pci_nvme_get_ioreq(sc);
 		if (req == NULL) {
 			pci_nvme_status_genc(&compl->status,
 			    NVME_SC_INTERNAL_DEVICE_ERROR);
 			WPRINTF("%s: unable to allocate IO req", __func__);
 			return (1);
 		}
 		/* The completion is posted on the admin queue pair */
 		req->nvme_sq = &sc->submit_queues[0];
 		req->sqid = 0;
 		req->opc = command->opc;
 		req->cid = command->cid;
 		req->nsid = command->nsid;
 
 		/* Delete the entire backing store */
 		req->io_req.br_offset = 0;
 		req->io_req.br_resid = sc->nvstore.size;
 		req->io_req.br_callback = pci_nvme_io_done;
 
 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
 		if (err) {
 			pci_nvme_status_genc(&compl->status,
 			    NVME_SC_INTERNAL_DEVICE_ERROR);
 			pci_nvme_release_ioreq(sc, req);
 		} else
 			compl->status = NVME_NO_STATUS;
 	}
 
 	return (1);
 }
 
 /*
  * Abort admin command
  *
  * No command is actually looked up or aborted. The completion reports
  * success with CDW0 set to 1.
  *
  * Always returns 1.
  */
 static int
 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
     struct nvme_completion *compl)
 {
 	uint32_t target_sqid = command->cdw10 & 0xFFFF;
 	uint32_t target_cid = (command->cdw10 >> 16) & 0xFFFF;
 
 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
 	        target_sqid, target_cid);
 
 	/* TODO: search for the command ID and abort it */
 
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 	compl->cdw0 = 1;
 
 	return (1);
 }
 
 /*
  * Asynchronous Event Request admin command
  *
  * Fails the request if the outstanding-request limit has been reached or
  * the request cannot be recorded. Otherwise the command ID is queued, the
  * completion is marked as having no status yet, and the event machinery is
  * notified.
  *
  * Returns 1 on the failure paths and 0 when the request was queued.
  */
 static int
 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
 	struct nvme_command* command, struct nvme_completion* compl)
 {
 	uint16_t *result = &compl->status;
 
 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
 
 	/* Don't exceed the Async Event Request Limit (AERL). */
 	if (pci_nvme_aer_limit_reached(sc)) {
 		pci_nvme_status_tc(result, NVME_SCT_COMMAND_SPECIFIC,
 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
 		return (1);
 	}
 
 	if (pci_nvme_aer_add(sc, command->cid)) {
 		pci_nvme_status_tc(result, NVME_SCT_GENERIC,
 				NVME_SC_INTERNAL_DEVICE_ERROR);
 		return (1);
 	}
 
 	/*
 	 * The request stays outstanding until a matching event occurs, as
 	 * configured through Set Features, so no completion status is
 	 * produced at this point.
 	 */
 	*result = NVME_NO_STATUS;
 	pci_nvme_aen_notify(sc);
 
 	return (0);
 }
 
 /*
  * Process the commands pending on the Admin Submission Queue
  *
  * Walks queue 0 from its current head up to the tail, dispatching each
  * command by opcode to the matching handler. A completion is posted to
  * Completion Queue 0 for every command whose handler left a valid status.
  * When the walk ends, the new head is stored and an MSI-X interrupt on
  * vector 0 is generated if the Completion Queue is not empty. The
  * Submission Queue mutex is held for the whole walk.
  *
  * value is only used for debug output.
  */
 static void
 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
 {
 	struct nvme_completion compl;
 	struct nvme_command *cmd;
 	struct nvme_submission_queue *sq;
 	struct nvme_completion_queue *cq;
 	uint16_t sqhead;
 
 	DPRINTF("%s index %u", __func__, (uint32_t)value);
 
 	/* Queue pair 0 is the admin queue pair */
 	sq = &sc->submit_queues[0];
 	cq = &sc->compl_queues[0];
 
 	pthread_mutex_lock(&sq->mtx);
 
 	sqhead = sq->head;
 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
 
 	/* The tail is re-read on every iteration */
 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
 		cmd = &(sq->qbase)[sqhead];
 		/* Start every command from a clean completion */
 		compl.cdw0 = 0;
 		compl.status = 0;
 
 		/* Handler return values are not used; results are in compl */
 		switch (cmd->opc) {
 		case NVME_OPC_DELETE_IO_SQ:
 			DPRINTF("%s command DELETE_IO_SQ", __func__);
 			nvme_opc_delete_io_sq(sc, cmd, &compl);
 			break;
 		case NVME_OPC_CREATE_IO_SQ:
 			DPRINTF("%s command CREATE_IO_SQ", __func__);
 			nvme_opc_create_io_sq(sc, cmd, &compl);
 			break;
 		case NVME_OPC_DELETE_IO_CQ:
 			DPRINTF("%s command DELETE_IO_CQ", __func__);
 			nvme_opc_delete_io_cq(sc, cmd, &compl);
 			break;
 		case NVME_OPC_CREATE_IO_CQ:
 			DPRINTF("%s command CREATE_IO_CQ", __func__);
 			nvme_opc_create_io_cq(sc, cmd, &compl);
 			break;
 		case NVME_OPC_GET_LOG_PAGE:
 			DPRINTF("%s command GET_LOG_PAGE", __func__);
 			nvme_opc_get_log_page(sc, cmd, &compl);
 			break;
 		case NVME_OPC_IDENTIFY:
 			DPRINTF("%s command IDENTIFY", __func__);
 			nvme_opc_identify(sc, cmd, &compl);
 			break;
 		case NVME_OPC_ABORT:
 			DPRINTF("%s command ABORT", __func__);
 			nvme_opc_abort(sc, cmd, &compl);
 			break;
 		case NVME_OPC_SET_FEATURES:
 			DPRINTF("%s command SET_FEATURES", __func__);
 			nvme_opc_set_features(sc, cmd, &compl);
 			break;
 		case NVME_OPC_GET_FEATURES:
 			DPRINTF("%s command GET_FEATURES", __func__);
 			nvme_opc_get_features(sc, cmd, &compl);
 			break;
 		case NVME_OPC_FIRMWARE_ACTIVATE:
 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
 			pci_nvme_status_tc(&compl.status,
 			    NVME_SCT_COMMAND_SPECIFIC,
 			    NVME_SC_INVALID_FIRMWARE_SLOT);
 			break;
 		case NVME_OPC_ASYNC_EVENT_REQUEST:
 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
 			nvme_opc_async_event_req(sc, cmd, &compl);
 			break;
 		case NVME_OPC_FORMAT_NVM:
 			DPRINTF("%s command FORMAT_NVM", __func__);
 			/* Reject Format NVM unless the OACS format bit is set */
 			if ((sc->ctrldata.oacs &
 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
 				break;
 			}
 			nvme_opc_format_nvm(sc, cmd, &compl);
 			break;
 		case NVME_OPC_SECURITY_SEND:
 		case NVME_OPC_SECURITY_RECEIVE:
 		case NVME_OPC_SANITIZE:
 		case NVME_OPC_GET_LBA_STATUS:
 			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
 			    cmd->opc);
 			/* Valid but unsupported opcodes */
 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
 			break;
 		default:
 			DPRINTF("%s command OPC=%#X (not implemented)",
 			    __func__,
 			    cmd->opc);
 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
 		}
 		/* Advance the local head, wrapping at the queue size */
 		sqhead = (sqhead + 1) % sq->size;
 
 		/*
 		 * Handlers that defer their completion (e.g. by storing
 		 * NVME_NO_STATUS) are skipped here.
 		 */
 		if (NVME_COMPLETION_VALID(compl)) {
 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
 			    compl.cdw0,
 			    cmd->cid,
 			    0,		/* SQID */
 			    compl.status);
 		}
 	}
 
 	/* Publish the new head only after the whole batch is processed */
 	DPRINTF("setting sqhead %u", sqhead);
 	sq->head = sqhead;
 
 	/* Interrupt the guest if completions are waiting to be consumed */
 	if (cq->head != cq->tail)
 		pci_generate_msix(sc->nsc_pi, 0);
 
 	pthread_mutex_unlock(&sq->mtx);
 }
 
 /*
  * Update the Write and Read statistics reported in SMART data
  *
  * NVMe defines a "data unit" as one thousand 512 byte blocks, rounded up.
  * E.g. 1 data unit is 1 - 1,000 512 byte blocks and 3 data units are
  * 2,001 - 3,000 512 byte blocks. Rounding up is achieved by initializing
  * the remainder to 999.
  *
  * The command counter is bumped for every Read or Write; the data unit
  * counters are only advanced for commands that completed successfully.
  * All counters are updated under the controller mutex.
  */
 static void
 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
     size_t bytes, uint16_t status)
 {
 
 	pthread_mutex_lock(&sc->mtx);
 	if (opc == NVME_OPC_WRITE) {
 		sc->write_commands++;
 		if (status == NVME_SC_SUCCESS) {
 			sc->write_dunits_remainder += (bytes / 512);
 			/* Carry whole thousands of blocks into data units */
 			while (sc->write_dunits_remainder >= 1000) {
 				sc->write_data_units++;
 				sc->write_dunits_remainder -= 1000;
 			}
 		}
 	} else if (opc == NVME_OPC_READ) {
 		sc->read_commands++;
 		if (status == NVME_SC_SUCCESS) {
 			sc->read_dunits_remainder += (bytes / 512);
 			/* Carry whole thousands of blocks into data units */
 			while (sc->read_dunits_remainder >= 1000) {
 				sc->read_data_units++;
 				sc->read_dunits_remainder -= 1000;
 			}
 		}
 	} else {
 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
 	}
 	pthread_mutex_unlock(&sc->mtx);
 }
 
 /*
  * Check if the combination of Starting LBA (slba) and number of blocks
  * exceeds the range of the underlying storage.
  *
  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
  * overflow. Both byte quantities are therefore computed in 64 bits; in
  * particular the 32-bit block count is widened before it is shifted, so a
  * large count cannot wrap to a small byte length and slip past the check.
  *
  * Returns true when the request does not fit inside the store.
  */
 static bool
 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
     uint32_t nblocks)
 {
 	uint64_t	offset, bytes;
 
 	/* Overflow check of multiplying Starting LBA by the sector size */
 	if (slba >> (64 - nvstore->sectsz_bits))
 		return (true);
 
 	offset = slba << nvstore->sectsz_bits;
 	bytes = (uint64_t)nblocks << nvstore->sectsz_bits;
 
 	/* Overflow check of Number of Logical Blocks */
 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
 		return (true);
 
 	return (false);
 }
 
 /*
  * Add a guest memory range to an I/O request's scatter/gather list
  *
  * A range that starts exactly where the previous one ended is merged into
  * the last IOV entry; anything else takes a new entry. For the first
  * entry, the request's block offset is set from offset and the residual
  * count is reset.
  *
  * When merging, the whole combined range (the previous bytes plus the new
  * ones) is translated again, so that the enlarged IOV is known to be backed
  * by guest memory from start to end rather than only its first size bytes.
  *
  * Returns 0 on success, or -1 if req is NULL, the IOV list is full, or the
  * guest range cannot be translated.
  */
 static int
 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused,
     struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset)
 {
 	void *base;
 	int iovidx;
 	bool range_is_contiguous;
 
 	if (req == NULL)
 		return (-1);
 
 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
 		return (-1);
 	}
 
 	/*
 	 * Minimize the number of IOVs by concatenating contiguous address
 	 * ranges. If the IOV count is zero, there is no previous range to
 	 * concatenate.
 	 */
 	if (req->io_req.br_iovcnt == 0)
 		range_is_contiguous = false;
 	else
 		range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;
 
 	if (range_is_contiguous) {
 		iovidx = req->io_req.br_iovcnt - 1;
 
 		/* Validate the full merged range, not just the new piece */
 		base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
 		    req->prev_gpaddr, req->prev_size + size);
 		if (base == NULL)
 			return (-1);
 		req->io_req.br_iov[iovidx].iov_base = base;
 
 		req->prev_size += size;
 		req->io_req.br_resid += size;
 
 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
 	} else {
 		iovidx = req->io_req.br_iovcnt;
 		if (iovidx == 0) {
 			req->io_req.br_offset = offset;
 			req->io_req.br_resid = 0;
 			req->io_req.br_param = req;
 		}
 
 		base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
 		    gpaddr, size);
 		if (base == NULL)
 			return (-1);
 		req->io_req.br_iov[iovidx].iov_base = base;
 
 		req->io_req.br_iov[iovidx].iov_len = size;
 
 		/* Remember this range so the next one can be merged into it */
 		req->prev_gpaddr = gpaddr;
 		req->prev_size = size;
 		req->io_req.br_resid += size;
 
 		req->io_req.br_iovcnt++;
 	}
 
 	return (0);
 }
 
 /*
  * Post a completion for an I/O command and notify the guest
  *
  * The completion is written to the Completion Queue associated with sq.
  * If that queue is then non-empty, an MSI-X interrupt is raised on the
  * queue's vector, provided interrupts are enabled for it.
  */
 static void
 pci_nvme_set_completion(struct pci_nvme_softc *sc,
     struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status)
 {
 	struct nvme_completion_queue *cq;
 
 	cq = &sc->compl_queues[sq->cqid];
 
 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
 		 NVME_STATUS_GET_SC(status));
 
 	pci_nvme_cq_update(sc, cq, 0, cid, sqid, status);
 
 	/* Nothing to signal if the guest has consumed every entry */
 	if (cq->head == cq->tail)
 		return;
 
 	if ((cq->intr_en & NVME_CQ_INTEN) == 0) {
 		DPRINTF("%s: CQ%u interrupt disabled",
 					__func__, sq->cqid);
 		return;
 	}
 
 	pci_generate_msix(sc->nsc_pi, cq->intr_vec);
 }
 
 /*
  * Return an I/O request to the free list
  *
  * Clears the request's back references, puts it at the tail of the free
  * list and decrements the pending I/O count under the controller mutex,
  * then posts the I/O semaphore. If this was the last pending I/O while
  * the controller is enabled but not yet ready, the ready bit is set.
  */
 static void
 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
 {
 	req->sqid = 0;
 	req->nvme_sq = NULL;
 	req->sc = NULL;
 
 	pthread_mutex_lock(&sc->mtx);
 
 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
 	sc->pending_ios--;
 
 	/* when no more IO pending, can set to ready if device reset/enabled */
 	if (sc->pending_ios == 0) {
 		if (NVME_CC_GET_EN(sc->regs.cc) &&
 		    !(NVME_CSTS_GET_RDY(sc->regs.csts)))
 			sc->regs.csts |= NVME_CSTS_RDY;
 	}
 
 	pthread_mutex_unlock(&sc->mtx);
 
 	sem_post(&sc->iosemlock);
 }
 
 /*
  * Take an I/O request from the free list
  *
  * Waits on the I/O semaphore, then removes the first free request and
  * increments the pending I/O count under the controller mutex. The
  * request's block-layer fields and its previous-range tracking are reset
  * before it is returned.
  */
 static struct pci_nvme_ioreq *
 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
 {
 	struct pci_nvme_ioreq *ioreq;
 
 	sem_wait(&sc->iosemlock);
 
 	pthread_mutex_lock(&sc->mtx);
 	ioreq = STAILQ_FIRST(&sc->ioreqs_free);
 	assert(ioreq != NULL);
 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
 	ioreq->sc = sc;
 	sc->pending_ios++;
 	pthread_mutex_unlock(&sc->mtx);
 
 	/* Start from an empty scatter/gather list */
 	ioreq->prev_size = 0;
 	ioreq->prev_gpaddr = 0;
 	ioreq->io_req.br_param = ioreq;
 	ioreq->io_req.br_resid = 0;
 	ioreq->io_req.br_offset = 0;
 	ioreq->io_req.br_iovcnt = 0;
 
 	return (ioreq);
 }
 
 /*
  * blockif completion callback for reads, writes, flushes and single-range
  * deallocates: map the backend result to an NVMe status, post the
  * completion, update the read/write statistics and recycle the request.
  */
 static void
 pci_nvme_io_done(struct blockif_req *br, int err)
 {
 	struct pci_nvme_ioreq *ioreq = br->br_param;
 	struct pci_nvme_softc *nvme = ioreq->sc;
 	uint16_t sc_code;
 	uint16_t status = 0;
 
 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
 
 	/* TODO return correct error */
 	if (err != 0)
 		sc_code = NVME_SC_DATA_TRANSFER_ERROR;
 	else
 		sc_code = NVME_SC_SUCCESS;
 	pci_nvme_status_genc(&status, sc_code);
 
 	pci_nvme_set_completion(nvme, ioreq->nvme_sq, ioreq->sqid, ioreq->cid,
 	    status);
 	pci_nvme_stats_write_read_update(nvme, ioreq->opc, ioreq->bytes,
 	    status);
 	pci_nvme_release_ioreq(nvme, ioreq);
 }
 
 /*
  * Flush command handler.
  *
  * The description of the Volatile Write Cache (VWC) field of the Identify
  * Controller data states:
  *    If a volatile write cache is not present, Flush commands complete
  *    successfully and have no effect
  * Therefore report Success whenever the backing store cannot flush, i.e.
  * for RAM-backed storage or when blockif answers EOPNOTSUPP.
  *
  * Returns true when blockif accepted the flush (pci_nvme_io_done is
  * installed as its callback); otherwise returns false with the final
  * status stored in *status.
  */
 static bool
 nvme_opc_flush(struct pci_nvme_softc *sc __unused,
     struct nvme_command *cmd __unused,
     struct pci_nvme_blockstore *nvstore,
     struct pci_nvme_ioreq *req,
     uint16_t *status)
 {
 	int error;
 
 	if (nvstore->type == NVME_STOR_RAM) {
 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
 		return (false);
 	}
 
 	req->io_req.br_callback = pci_nvme_io_done;
 
 	error = blockif_flush(nvstore->ctx, &req->io_req);
 	if (error == 0)
 		return (true);
 
 	if (error == EOPNOTSUPP)
 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
 	else
 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
 
 	return (false);
 }
 
 /*
  * Service a Read or Write against a RAM-backed namespace by copying
  * between the guest pages described by prp1/prp2 and the backing buffer
  * at byte `offset`.
  *
  * NVME_COPY_FROM_PRP moves data out of the guest's PRP pages into the
  * supplied buffer (nvme_opc_dataset_mgmt() uses it to fetch the range
  * list), so a Write, which stores guest data into the namespace, must
  * copy FROM the PRPs, and a Read, which fills the guest's pages, must
  * copy TO them. The directions were previously swapped.
  *
  * Returns the NVMe status word: Success, or Data Transfer Error when the
  * PRP copy fails.
  */
 static uint16_t
 nvme_write_read_ram(struct pci_nvme_softc *sc,
     struct pci_nvme_blockstore *nvstore,
     uint64_t prp1, uint64_t prp2,
     size_t offset, uint64_t bytes,
     bool is_write)
 {
 	uint8_t *buf = nvstore->ctx;
 	enum nvme_copy_dir dir;
 	uint16_t status;
 
 	if (is_write)
 		dir = NVME_COPY_FROM_PRP;	/* guest memory -> RAM disk */
 	else
 		dir = NVME_COPY_TO_PRP;		/* RAM disk -> guest memory */
 
 	status = 0;
 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
 	    buf + offset, bytes, dir))
 		pci_nvme_status_genc(&status,
 		    NVME_SC_DATA_TRANSFER_ERROR);
 	else
 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
 
 	return (status);
 }
 
 /*
  * Build the blockif scatter/gather list for a Read or Write from the
  * command's PRP entries and submit it to the backend.
  *
  * Returns NVME_NO_STATUS when blockif accepted the request (the completion
  * is then produced by pci_nvme_io_done), or a Data Transfer Error status
  * when a guest address could not be appended/mapped or the submission
  * itself failed.
  */
 static uint16_t
 nvme_write_read_blockif(struct pci_nvme_softc *sc,
     struct pci_nvme_blockstore *nvstore,
     struct pci_nvme_ioreq *req,
     uint64_t prp1, uint64_t prp2,
     size_t offset, uint64_t bytes,
     bool is_write)
 {
 	uint64_t size;
 	int err;
 	uint16_t status = NVME_NO_STATUS;
 
 	/* PRP1 covers from its offset to the end of that page, at most */
 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
 	if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) {
 		err = -1;
 		goto out;
 	}
 
 	offset += size;
 	bytes  -= size;
 
 	if (bytes == 0) {
 		;
 	} else if (bytes <= PAGE_SIZE) {
 		/* Remainder fits in one page: PRP2 is used as a data pointer */
 		size = bytes;
 		if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) {
 			err = -1;
 			goto out;
 		}
 	} else {
 		void *vmctx = sc->nsc_pi->pi_vmctx;
 		/*
 		 * Start with prp_list == last so the first loop iteration
 		 * treats PRP2 itself as the pointer to the first list page.
 		 */
 		uint64_t *prp_list = &prp2;
 		uint64_t *last = prp_list;
 
 		/* PRP2 is pointer to a physical region page list */
 		while (bytes) {
 			/* Last entry in list points to the next list */
 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
 				uint64_t prp = *prp_list;
 
 				prp_list = paddr_guest2host(vmctx, prp,
 				    PAGE_SIZE - (prp % PAGE_SIZE));
 				if (prp_list == NULL) {
 					err = -1;
 					goto out;
 				}
 				last = prp_list + (NVME_PRP2_ITEMS - 1);
 			}
 
 			size = MIN(bytes, PAGE_SIZE);
 
 			if (pci_nvme_append_iov_req(sc, req, *prp_list, size,
 			    offset)) {
 				err = -1;
 				goto out;
 			}
 
 			offset += size;
 			bytes  -= size;
 
 			prp_list++;
 		}
 	}
 	req->io_req.br_callback = pci_nvme_io_done;
 	if (is_write)
 		err = blockif_write(nvstore->ctx, &req->io_req);
 	else
 		err = blockif_read(nvstore->ctx, &req->io_req);
 out:
 	/* status stays NVME_NO_STATUS unless something above failed */
 	if (err)
 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
 
 	return (status);
 }
 
 /*
  * Read/Write command handler: decode the starting LBA and block count,
  * validate them against the transfer limit and the namespace size, then
  * dispatch to the RAM or blockif backend.
  *
  * Returns true when the blockif backend left the status at NVME_NO_STATUS
  * (completion is reported later); otherwise *status is final and the
  * read/write statistics are updated here.
  */
 static bool
 nvme_opc_write_read(struct pci_nvme_softc *sc,
     struct nvme_command *cmd,
     struct pci_nvme_blockstore *nvstore,
     struct pci_nvme_ioreq *req,
     uint16_t *status)
 {
 	const bool is_write = (cmd->opc == NVME_OPC_WRITE);
 	uint64_t slba, nlb, bytes;
 	size_t offset;
 	bool pending = false;
 
 	slba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
 	/* The block count in CDW12 is zero based */
 	nlb = (cmd->cdw12 & 0xFFFF) + 1;
 	bytes = nlb << nvstore->sectsz_bits;
 
 	if (bytes > NVME_MAX_DATA_SIZE) {
 		WPRINTF("%s command would exceed MDTS", __func__);
 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
 	} else if (pci_nvme_out_of_range(nvstore, slba, nlb)) {
 		WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
 		    __func__, slba, nlb);
 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
 	} else {
 		offset = slba << nvstore->sectsz_bits;
 
 		req->bytes = bytes;
 		req->io_req.br_offset = slba;
 
 		/* PRP bits 1:0 must be zero */
 		cmd->prp1 &= ~0x3UL;
 		cmd->prp2 &= ~0x3UL;
 
 		if (nvstore->type == NVME_STOR_RAM) {
 			*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
 			    cmd->prp2, offset, bytes, is_write);
 		} else {
 			*status = nvme_write_read_blockif(sc, nvstore, req,
 			    cmd->prp1, cmd->prp2, offset, bytes, is_write);
 			pending = (*status == NVME_NO_STATUS);
 		}
 	}
 
 	if (!pending)
 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
 
 	return (pending);
 }
 
 /*
  * blockif callback driving a multi-range deallocate. The ranges were
  * stashed in br_iov; prev_gpaddr indexes the range that just finished and
  * prev_size holds the number of ranges. Each invocation either submits
  * the next range or, on error or after the last range, posts the
  * completion and releases the request.
  */
 static void
 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
 {
 	struct pci_nvme_ioreq *ioreq = br->br_param;
 	struct pci_nvme_softc *sc = ioreq->sc;
 	uint16_t status = 0;
 
 	if (err != 0) {
 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
 	} else if (ioreq->prev_gpaddr + 1 == ioreq->prev_size) {
 		/* That was the final range */
 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
 	} else {
 		struct iovec *next;
 
 		ioreq->prev_gpaddr++;
 		next = &ioreq->io_req.br_iov[ioreq->prev_gpaddr];
 
 		/* The iov_* values already include the sector size */
 		ioreq->io_req.br_offset = (off_t)next->iov_base;
 		ioreq->io_req.br_resid = next->iov_len;
 		if (blockif_delete(sc->nvstore.ctx, &ioreq->io_req) == 0) {
 			/* Submitted; br_callback is left pointing here */
 			return;
 		}
 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
 	}
 
 	pci_nvme_set_completion(sc, ioreq->nvme_sq, ioreq->sqid, ioreq->cid,
 	    status);
 	pci_nvme_release_ioreq(sc, ioreq);
 }
 
 static bool
 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
     struct nvme_command *cmd,
     struct pci_nvme_blockstore *nvstore,
     struct pci_nvme_ioreq *req,
     uint16_t *status)
 {
 	struct nvme_dsm_range *range = NULL;
 	uint32_t nr, r, non_zero, dr;
 	int err;
 	bool pending = false;
 
 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
 		goto out;
 	}
 
 	nr = cmd->cdw10 & 0xff;
 
 	/* copy locally because a range entry could straddle PRPs */
 	range = calloc(1, NVME_MAX_DSM_TRIM);
 	if (range == NULL) {
 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
 		goto out;
 	}
 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
 
 	/* Check for invalid ranges and the number of non-zero lengths */
 	non_zero = 0;
 	for (r = 0; r <= nr; r++) {
 		if (pci_nvme_out_of_range(nvstore,
 		    range[r].starting_lba, range[r].length)) {
 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
 			goto out;
 		}
 		if (range[r].length != 0)
 			non_zero++;
 	}
 
 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
 		size_t offset, bytes;
 		int sectsz_bits = sc->nvstore.sectsz_bits;
 
 		/*
 		 * DSM calls are advisory only, and compliant controllers
 		 * may choose to take no actions (i.e. return Success).
 		 */
 		if (!nvstore->deallocate) {
 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
 			goto out;
 		}
 
 		/* If all ranges have a zero length, return Success */
 		if (non_zero == 0) {
 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
 			goto out;
 		}
 
 		if (req == NULL) {
 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
 			goto out;
 		}
 
 		offset = range[0].starting_lba << sectsz_bits;
 		bytes = range[0].length << sectsz_bits;
 
 		/*
 		 * If the request is for more than a single range, store
 		 * the ranges in the br_iov. Optimize for the common case
 		 * of a single range.
 		 *
 		 * Note that NVMe Number of Ranges is a zero based value
 		 */
 		req->io_req.br_iovcnt = 0;
 		req->io_req.br_offset = offset;
 		req->io_req.br_resid = bytes;
 
 		if (nr == 0) {
 			req->io_req.br_callback = pci_nvme_io_done;
 		} else {
 			struct iovec *iov = req->io_req.br_iov;
 
 			for (r = 0, dr = 0; r <= nr; r++) {
 				offset = range[r].starting_lba << sectsz_bits;
 				bytes = range[r].length << sectsz_bits;
 				if (bytes == 0)
 					continue;
 
 				if ((nvstore->size - offset) < bytes) {
 					pci_nvme_status_genc(status,
 					    NVME_SC_LBA_OUT_OF_RANGE);
 					goto out;
 				}
 				iov[dr].iov_base = (void *)offset;
 				iov[dr].iov_len = bytes;
 				dr++;
 			}
 			req->io_req.br_callback = pci_nvme_dealloc_sm;
 
 			/*
 			 * Use prev_gpaddr to track the current entry and
 			 * prev_size to track the number of entries
 			 */
 			req->prev_gpaddr = 0;
 			req->prev_size = dr;
 		}
 
 		err = blockif_delete(nvstore->ctx, &req->io_req);
 		if (err)
 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
 		else
 			pending = true;
 	}
 out:
 	free(range);
 	return (pending);
 }
 
 /*
  * Process I/O submission queue `idx`: walk every entry from the queue's
  * current head up to the tail written by the guest, dispatch each command
  * by opcode and post a completion right away for commands that did not
  * go pending. The queue mutex is held for the whole walk and the new head
  * is stored before it is released.
  */
 static void
 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
 {
 	struct nvme_submission_queue *sq;
 	uint16_t status;
 	uint16_t sqhead;
 
 	/* handle all submissions up to sq->tail index */
 	sq = &sc->submit_queues[idx];
 
 	pthread_mutex_lock(&sq->mtx);
 
 	sqhead = sq->head;
 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
 	         idx, sqhead, sq->tail, sq->qbase);
 
 	/* The tail is re-read on each pass since the doorbell may move it */
 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
 		struct nvme_command *cmd;
 		struct pci_nvme_ioreq *req;
 		uint32_t nsid;
 		bool pending;
 
 		pending = false;
 		req = NULL;
 		status = 0;
 
 		cmd = &sq->qbase[sqhead];
 		sqhead = (sqhead + 1) % sq->size;
 
 		/* Reject namespace IDs outside 1..nn without retry (DNR) */
 		nsid = le32toh(cmd->nsid);
 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
 			pci_nvme_status_genc(&status,
 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
 			status |=
 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
 			goto complete;
  		}
 
 		req = pci_nvme_get_ioreq(sc);
 		if (req == NULL) {
 			pci_nvme_status_genc(&status,
 			    NVME_SC_INTERNAL_DEVICE_ERROR);
 			WPRINTF("%s: unable to allocate IO req", __func__);
 			goto complete;
 		}
 		/* Remember what the completion path needs to identify the command */
 		req->nvme_sq = sq;
 		req->sqid = idx;
 		req->opc = cmd->opc;
 		req->cid = cmd->cid;
 		req->nsid = cmd->nsid;
 
 		switch (cmd->opc) {
 		case NVME_OPC_FLUSH:
 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
 			    req, &status);
  			break;
 		case NVME_OPC_WRITE:
 		case NVME_OPC_READ:
 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
 			    req, &status);
 			break;
 		case NVME_OPC_WRITE_ZEROES:
 			/* TODO: write zeroes
 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
 			break;
 		case NVME_OPC_DATASET_MANAGEMENT:
  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
 			    req, &status);
 			break;
  		default:
  			WPRINTF("%s unhandled io command 0x%x",
 			    __func__, cmd->opc);
 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
 		}
 complete:
 		/*
 		 * Commands that did not go pending are finished here; pending
 		 * ones are left for the handler's blockif callback.
 		 */
 		if (!pending) {
 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, status);
 			if (req != NULL)
 				pci_nvme_release_ioreq(sc, req);
 		}
 	}
 
 	sq->head = sqhead;
 
 	pthread_mutex_unlock(&sq->mtx);
 }
 
 /*
  * Act on a doorbell write for queue `idx`.
  *
  * A submission queue doorbell stores the new tail and then processes the
  * queue: index 0 is handled as the admin queue, any other index as an I/O
  * queue. A completion queue doorbell only stores the new head. Indexes
  * beyond the configured queue counts are logged and ignored.
  */
 static void
 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc,
 	uint64_t idx, int is_sq, uint64_t value)
 {
 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
 
 	if (is_sq) {
 		if (idx > sc->num_squeues) {
 			WPRINTF("%s queue index %lu overflow from "
 			         "guest (max %u)",
 			         __func__, idx, sc->num_squeues);
 			return;
 		}
 
 		atomic_store_short(&sc->submit_queues[idx].tail,
 		                   (uint16_t)value);
 
 		/*
 		 * idx was range checked above, so the I/O path needs no
 		 * second bounds test.
 		 */
 		if (idx == 0) {
 			pci_nvme_handle_admin_cmd(sc, value);
 		} else {
 			/* submission queue; handle new entries in SQ */
 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
 		}
 	} else {
 		if (idx > sc->num_cqueues) {
 			WPRINTF("%s queue index %lu overflow from "
 			         "guest (max %u)",
 			         __func__, idx, sc->num_cqueues);
 			return;
 		}
 
 		atomic_store_short(&sc->compl_queues[idx].head,
 				(uint16_t)value);
 	}
 }
 
 /*
  * Debug helper: log which BAR 0 controller register is being read or
  * written on behalf of `func`.
  */
 static void
 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
 {
 	const char *access;
 
 	if (iswrite)
 		access = "WRITE";
 	else
 		access = "READ";
 
 	switch (offset) {
 	case NVME_CR_CAP_LOW:
 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, access);
 		break;
 	case NVME_CR_CAP_HI:
 		DPRINTF("%s %s NVME_CR_CAP_HI", func, access);
 		break;
 	case NVME_CR_VS:
 		DPRINTF("%s %s NVME_CR_VS", func, access);
 		break;
 	case NVME_CR_INTMS:
 		DPRINTF("%s %s NVME_CR_INTMS", func, access);
 		break;
 	case NVME_CR_INTMC:
 		DPRINTF("%s %s NVME_CR_INTMC", func, access);
 		break;
 	case NVME_CR_CC:
 		DPRINTF("%s %s NVME_CR_CC", func, access);
 		break;
 	case NVME_CR_CSTS:
 		DPRINTF("%s %s NVME_CR_CSTS", func, access);
 		break;
 	case NVME_CR_NSSR:
 		DPRINTF("%s %s NVME_CR_NSSR", func, access);
 		break;
 	case NVME_CR_AQA:
 		DPRINTF("%s %s NVME_CR_AQA", func, access);
 		break;
 	case NVME_CR_ASQ_LOW:
 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, access);
 		break;
 	case NVME_CR_ASQ_HI:
 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, access);
 		break;
 	case NVME_CR_ACQ_LOW:
 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, access);
 		break;
 	case NVME_CR_ACQ_HI:
 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, access);
 		break;
 	default:
 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
 		break;
 	}
 }
 
 /*
  * Handle a guest write to BAR 0.
  *
  * Offsets at or above NVME_DOORBELL_OFFSET are doorbell writes: they are
  * dropped until the controller reports ready, bounds checked against
  * max_queues, and forwarded to pci_nvme_handle_doorbell() when the target
  * queue has been created. All other offsets are controller registers,
  * which must be written with 4-byte accesses and are updated under
  * sc->mtx.
  */
 static void
 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size,
     uint64_t value)
 {
 	uint32_t ccreg;
 
 	if (offset >= NVME_DOORBELL_OFFSET) {
 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
 		/* First 4 bytes of each pair are the SQ tail, the rest the CQ head */
 		int is_sq = (belloffset % 8) < 4;
 
 		if ((sc->regs.csts & NVME_CSTS_RDY) == 0) {
 			WPRINTF("doorbell write prior to RDY (offset=%#lx)\n",
 			    offset);
 			return;
 		}
 
 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
 			WPRINTF("guest attempted an overflow write offset "
 			         "0x%lx, val 0x%lx in %s",
 			         offset, value, __func__);
 			return;
 		}
 
 		/* Ignore doorbells for queues without a base address */
 		if (is_sq) {
 			if (sc->submit_queues[idx].qbase == NULL)
 				return;
 		} else if (sc->compl_queues[idx].qbase == NULL)
 			return;
 
 		pci_nvme_handle_doorbell(sc, idx, is_sq, value);
 		return;
 	}
 
 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
 	        offset, size, value);
 
 	if (size != 4) {
 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
 		         "val 0x%lx) to bar0 in %s",
 		         size, offset, value, __func__);
 		/* TODO: shutdown device */
 		return;
 	}
 
 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
 
 	pthread_mutex_lock(&sc->mtx);
 
 	switch (offset) {
 	case NVME_CR_CAP_LOW:
 	case NVME_CR_CAP_HI:
 		/* readonly */
 		break;
 	case NVME_CR_VS:
 		/* readonly */
 		break;
 	case NVME_CR_INTMS:
 		/* MSI-X, so ignore */
 		break;
 	case NVME_CR_INTMC:
 		/* MSI-X, so ignore */
 		break;
 	case NVME_CR_CC:
 		ccreg = (uint32_t)value;
 
 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
 		         "iocqes %u",
 		        __func__,
 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
 			 NVME_CC_GET_IOCQES(ccreg));
 
 		if (NVME_CC_GET_SHN(ccreg)) {
 			/* perform shutdown - flush out data to backend */
 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
 			    NVME_CSTS_REG_SHST_SHIFT);
 			sc->regs.csts |= NVME_SHST_COMPLETE <<
 			    NVME_CSTS_REG_SHST_SHIFT;
 		}
 		/* Only act when the enable bit actually changes */
 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
 			if (NVME_CC_GET_EN(ccreg) == 0)
 				/* transition 1->0 causes controller reset */
 				pci_nvme_reset_locked(sc);
 			else
 				pci_nvme_init_controller(sc);
 		}
 
 		/* Insert the iocqes, iosqes and en bits from the write */
 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
 		if (NVME_CC_GET_EN(ccreg) == 0) {
 			/* Insert the ams, mps and css bit fields */
 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
 			sc->regs.csts &= ~NVME_CSTS_RDY;
 		} else if ((sc->pending_ios == 0) &&
 		    !(sc->regs.csts & NVME_CSTS_CFS)) {
 			/*
 			 * Enabled, idle and not in a fatal state: report
 			 * ready now. With I/O still pending, RDY is set by
 			 * pci_nvme_release_ioreq() once it drains.
 			 */
 			sc->regs.csts |= NVME_CSTS_RDY;
 		}
 		break;
 	case NVME_CR_CSTS:
 		break;
 	case NVME_CR_NSSR:
 		/* ignore writes; don't support subsystem reset */
 		break;
 	case NVME_CR_AQA:
 		sc->regs.aqa = (uint32_t)value;
 		break;
 	case NVME_CR_ASQ_LOW:
 		/* Keep the high half; the low 12 bits of the address are masked off */
 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
 		               (0xFFFFF000 & value);
 		break;
 	case NVME_CR_ASQ_HI:
 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
 		               (value << 32);
 		break;
 	case NVME_CR_ACQ_LOW:
 		/* Keep the high half; the low 12 bits of the address are masked off */
 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
 		               (0xFFFFF000 & value);
 		break;
 	case NVME_CR_ACQ_HI:
 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
 		               (value << 32);
 		break;
 	default:
 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
 		         __func__, offset, value, size);
 	}
 	pthread_mutex_unlock(&sc->mtx);
 }
 
 /*
  * BAR write hook: route accesses to the MSI-X table/PBA BAR to the MSI-X
  * emulation, BAR 0 to the controller register handler, and log anything
  * else.
  */
 static void
 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
     uint64_t value)
 {
 	struct pci_nvme_softc *nvme = pi->pi_arg;
 
 	/* The MSI-X test comes first, exactly as for reads */
 	if (baridx == pci_msix_table_bar(pi) ||
 	    baridx == pci_msix_pba_bar(pi)) {
 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
 		         " value 0x%lx", baridx, offset, size, value);
 
 		pci_emul_msix_twrite(pi, offset, size, value);
 	} else if (baridx == 0) {
 		pci_nvme_write_bar_0(nvme, offset, size, value);
 	} else {
 		DPRINTF("%s unknown baridx %d, val 0x%lx",
 		         __func__, baridx, value);
 	}
 }
 
 /*
  * Read `size` bytes of controller register state at `offset` in BAR 0.
  * Offsets at or beyond the doorbell region are logged and read as zero.
  */
 static uint64_t
 pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
 	uint64_t offset, int size)
 {
 	uint64_t value;
 
 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
 
 	if (offset >= NVME_DOORBELL_OFFSET) {
 		value = 0;
 		WPRINTF("pci_nvme: read invalid offset %ld", offset);
 	} else {
 		const uint8_t *regbase = (const uint8_t *)&sc->regs;
 
 		/* Copy the register bytes while holding sc->mtx */
 		pthread_mutex_lock(&sc->mtx);
 		memcpy(&value, regbase + offset, size);
 		pthread_mutex_unlock(&sc->mtx);
 	}
 
 	/* Keep only the bytes that were requested */
 	if (size == 1)
 		value &= 0xFF;
 	else if (size == 2)
 		value &= 0xFFFF;
 	else if (size == 4)
 		value &= 0xFFFFFFFF;
 
 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
 	         offset, size, (uint32_t)value);
 
 	return (value);
 }
 
 
 
 /*
  * BAR read hook: route accesses to the MSI-X table/PBA BAR to the MSI-X
  * emulation and BAR 0 to the controller register reader; any other BAR
  * is logged and reads as zero.
  */
 static uint64_t
 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size)
 {
 	struct pci_nvme_softc *nvme = pi->pi_arg;
 
 	if (baridx == pci_msix_table_bar(pi) ||
 	    baridx == pci_msix_pba_bar(pi)) {
 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
 		        baridx, offset, size);
 
 		return (pci_emul_msix_tread(pi, offset, size));
 	}
 
 	if (baridx == 0)
 		return (pci_nvme_read_bar_0(nvme, offset, size));
 
 	DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
 
 	return (0);
 }
 
 /*
  * Populate the softc from the device's configuration node.
  *
  * Defaults are installed first, then the optional "maxq", "qsz",
  * "ioslots", "sectsz", "ser", "eui64", "dsm" and "bootindex" values are
  * applied, and finally the backing store is set up: a zeroed RAM buffer
  * when "ram" (size in MiB) is given, otherwise a blockif context.
  *
  * Returns 0 on success and -1 when an option is rejected or the backing
  * store cannot be created.
  */
 static int
 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
 {
 	char bident[sizeof("XXX:XXX")];
 	const char *value;
 	uint32_t sectsz;
 
 	sc->max_queues = NVME_QUEUES;
 	sc->max_qentries = NVME_MAX_QENTRIES;
 	sc->ioslots = NVME_IOSLOTS;
 	sc->num_squeues = sc->max_queues;
 	sc->num_cqueues = sc->max_queues;
 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
 	sectsz = 0;
 	/* Default serial number, replaced below if "ser" is configured */
 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
 
 	/* maxq is range checked at the end of this function */
 	value = get_config_value_node(nvl, "maxq");
 	if (value != NULL)
 		sc->max_queues = atoi(value);
 	value = get_config_value_node(nvl, "qsz");
 	if (value != NULL) {
 		sc->max_qentries = atoi(value);
 		if (sc->max_qentries <= 0) {
 			EPRINTLN("nvme: Invalid qsz option %d",
 			    sc->max_qentries);
 			return (-1);
 		}
 	}
 	value = get_config_value_node(nvl, "ioslots");
 	if (value != NULL) {
 		sc->ioslots = atoi(value);
 		if (sc->ioslots <= 0) {
 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
 			return (-1);
 		}
 	}
 	/* Only 512, 4096 and 8192 are honoured; see the check further down */
 	value = get_config_value_node(nvl, "sectsz");
 	if (value != NULL)
 		sectsz = atoi(value);
 	value = get_config_value_node(nvl, "ser");
 	if (value != NULL) {
 		/*
 		 * This field indicates the Product Serial Number in
 		 * 7-bit ASCII, unused bytes should be space characters.
 		 * Ref: NVMe v1.3c.
 		 */
 		cpywithpad((char *)sc->ctrldata.sn,
 		    sizeof(sc->ctrldata.sn), value, ' ');
 	}
 	value = get_config_value_node(nvl, "eui64");
 	if (value != NULL)
 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
 	/* Unrecognised "dsm" values leave the AUTO default in place */
 	value = get_config_value_node(nvl, "dsm");
 	if (value != NULL) {
 		if (strcmp(value, "auto") == 0)
 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
 		else if (strcmp(value, "enable") == 0)
 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
 		else if (strcmp(value, "disable") == 0)
 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
 	}
 
+	value = get_config_value_node(nvl, "bootindex");
+	if (value != NULL) {
+		if (pci_emul_add_boot_device(sc->nsc_pi, atoi(value))) {
+			EPRINTLN("Invalid bootindex %d", atoi(value));
+			return (-1);
+		}
+	}
+
 	value = get_config_value_node(nvl, "ram");
 	if (value != NULL) {
 		/* "ram" is a size in MiB */
 		uint64_t sz = strtoull(value, NULL, 10);
 
 		sc->nvstore.type = NVME_STOR_RAM;
 		sc->nvstore.size = sz * 1024 * 1024;
 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
 		sc->nvstore.sectsz = 4096;
 		sc->nvstore.sectsz_bits = 12;
 		if (sc->nvstore.ctx == NULL) {
 			EPRINTLN("nvme: Unable to allocate RAM");
 			return (-1);
 		}
 	} else {
 		/* blockif identifies the device by its "slot:func" string */
 		snprintf(bident, sizeof(bident), "%u:%u",
 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
 		sc->nvstore.ctx = blockif_open(nvl, bident);
 		if (sc->nvstore.ctx == NULL) {
 			EPRINTLN("nvme: Could not open backing file: %s",
 			    strerror(errno));
 			return (-1);
 		}
 		sc->nvstore.type = NVME_STOR_BLOCKIF;
 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
 	}
 
 	/*
 	 * A valid explicit sectsz wins; otherwise blockif supplies the
 	 * sector size and RAM keeps the 4096 set above.
 	 */
 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
 		sc->nvstore.sectsz = sectsz;
 	else if (sc->nvstore.type != NVME_STOR_RAM)
 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
 	/* Find the smallest shift, starting at 9, that covers sectsz */
 	for (sc->nvstore.sectsz_bits = 9;
 	     (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
 	     sc->nvstore.sectsz_bits++);
 
 	/* Fall back to the default when maxq is out of range */
 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
 		sc->max_queues = NVME_QUEUES;
 
 	return (0);
 }
 
 /*
  * blockif resize callback: record the new backing store size, refresh
  * the namespace size data and post a "namespace attribute changed"
  * asynchronous event for NSID 1.
  */
 static void
 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg,
     size_t new_size)
 {
 	struct pci_nvme_softc *sc = arg;
 
 	sc->nvstore.size = new_size;
 	pci_nvme_init_nsdata_size(&sc->nvstore, &sc->nsdata);
 
 	/* Add changed NSID to list */
 	sc->ns_log.ns[0] = 1;
 	sc->ns_log.ns[1] = 0;
 
 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
 	    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
 }
 
 /*
  * Device model init hook: allocate the controller state, parse its
  * configuration, set up PCI config space, BAR 0, MSI-X and the PCI
  * Express capability, then initialise queues, identify data, log pages,
  * features and asynchronous events before resetting the controller.
  *
  * Returns 0 on success and a non-zero value on failure.
  */
 static int
 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl)
 {
 	struct pci_nvme_softc *sc;
 	uint32_t pci_membar_sz;
 	int	error;
 
 	sc = calloc(1, sizeof(struct pci_nvme_softc));
 	if (sc == NULL) {
 		EPRINTLN("nvme: Unable to allocate controller state");
 		return (-1);
 	}
 	pi->pi_arg = sc;
 	sc->nsc_pi = pi;
 
 	error = pci_nvme_parse_config(sc, nvl);
 	if (error < 0)
 		goto done;
 	else
 		error = 0;
 
 	STAILQ_INIT(&sc->ioreqs_free);
 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
 	if (sc->ioreqs == NULL) {
 		EPRINTLN("nvme: Unable to allocate IO requests");
 		error = -1;
 		goto done;
 	}
 	for (uint32_t i = 0; i < sc->ioslots; i++) {
 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
 	}
 
 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
 	pci_set_cfgdata8(pi, PCIR_PROGIF,
 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
 
 	/*
 	 * Allocate size of NVMe registers + doorbell space for all queues.
 	 *
 	 * The specification requires a minimum memory I/O window size of 16K.
 	 * The Windows driver will refuse to start a device with a smaller
 	 * window.
 	 */
 	pci_membar_sz = sizeof(struct nvme_registers) +
 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
 
 	DPRINTF("nvme membar size: %u", pci_membar_sz);
 
 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
 	if (error) {
 		WPRINTF("%s pci alloc mem bar failed", __func__);
 		goto done;
 	}
 
 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
 	if (error) {
 		WPRINTF("%s pci add msixcap failed", __func__);
 		goto done;
 	}
 
 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
 	if (error) {
 		WPRINTF("%s pci add Express capability failed", __func__);
 		goto done;
 	}
 
 	pthread_mutex_init(&sc->mtx, NULL);
 	sem_init(&sc->iosemlock, 0, sc->ioslots);
 	/*
 	 * Only a blockif backend has a blockif context to register with; for
 	 * RAM storage nvstore.ctx is the raw data buffer and must not be
 	 * handed to blockif.
 	 */
 	if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
 		blockif_register_resize_callback(sc->nvstore.ctx,
 		    pci_nvme_resized, sc);
 	}
 
 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
 	/*
 	 * Controller data depends on Namespace data so initialize Namespace
 	 * data first.
 	 */
 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
 	pci_nvme_init_ctrldata(sc);
 	pci_nvme_init_logpages(sc);
 	pci_nvme_init_features(sc);
 
 	pci_nvme_aer_init(sc);
 	pci_nvme_aen_init(sc);
 
 	pci_nvme_reset(sc);
 
 	pci_lintr_request(pi);
 
 done:
 	return (error);
 }
 
 /*
  * Translate a legacy option string into configuration nodes. A string
  * starting with "ram=" stores the size under "ram" and passes anything
  * after the first comma to the generic PCI legacy parser; every other
  * string is handed to the blockif legacy parser.
  */
 static int
 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
 {
 	const char *ramval;
 	char *comma, *ramsz;
 
 	if (opts == NULL)
 		return (0);
 
 	if (strncmp(opts, "ram=", 4) != 0)
 		return (blockif_legacy_config(nvl, opts));
 
 	ramval = opts + 4;
 	comma = strchr(opts, ',');
 	if (comma == NULL) {
 		/* "ram=<size>" with no further options */
 		set_config_value_node(nvl, "ram", ramval);
 		return (0);
 	}
 
 	/* Store only the text between "ram=" and the comma */
 	ramsz = strndup(ramval, comma - ramval);
 	set_config_value_node(nvl, "ram", ramsz);
 	free(ramsz);
 
 	return (pci_parse_legacy_config(nvl, comma + 1));
 }
 
 /*
  * Device emulation descriptor for the "nvme" device model, tying the
  * name to the init, legacy-config and BAR read/write handlers above.
  * PCI_EMUL_SET presumably adds it to the set of available PCI device
  * models -- confirm against the macro's definition.
  */
 static const struct pci_devemu pci_de_nvme = {
 	.pe_emu =	"nvme",
 	.pe_init =	pci_nvme_init,
 	.pe_legacy_config = pci_nvme_legacy_config,
 	.pe_barwrite =	pci_nvme_write,
 	.pe_barread =	pci_nvme_read
 };
 PCI_EMUL_SET(pci_de_nvme);
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
index 9fd6db41dba8..c8ec62a66793 100644
--- a/usr.sbin/bhyve/pci_virtio_block.c
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -1,601 +1,606 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  * Copyright 2020-2021 Joyent, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
 #include <sys/ioctl.h>
 #include <sys/disk.h>
 
 #include <machine/vmm_snapshot.h>
 
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <strings.h>
 #include <unistd.h>
 #include <assert.h>
 #include <pthread.h>
 #include <md5.h>
 
 #include "bhyverun.h"
 #include "config.h"
 #include "debug.h"
 #include "pci_emul.h"
 #include "virtio.h"
 #include "block_if.h"
 
 #define	VTBLK_BSIZE	512
 #define	VTBLK_RINGSZ	128
 
 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request");
 
 #define	VTBLK_S_OK	0
 #define	VTBLK_S_IOERR	1
 #define	VTBLK_S_UNSUPP	2
 
 /*
  * Size of the identifier buffer (vbsc_ident).  The replacement list is
  * parenthesized so the macro expands correctly inside larger expressions.
  */
 #define	VTBLK_BLK_ID_BYTES	(20 + 1)
 
 /* Capability bits */
 #define	VTBLK_F_BARRIER		(1 << 0)	/* Does host support barriers? */
 #define	VTBLK_F_SIZE_MAX	(1 << 1)	/* Indicates maximum segment size */
 #define	VTBLK_F_SEG_MAX		(1 << 2)	/* Indicates maximum # of segments */
 #define	VTBLK_F_GEOMETRY	(1 << 4)	/* Legacy geometry available  */
 #define	VTBLK_F_RO		(1 << 5)	/* Disk is read-only */
 #define	VTBLK_F_BLK_SIZE	(1 << 6)	/* Block size of disk is available*/
 #define	VTBLK_F_SCSI		(1 << 7)	/* Supports scsi command passthru */
 #define	VTBLK_F_FLUSH		(1 << 9)	/* Writeback mode enabled after reset */
 #define	VTBLK_F_WCE		(1 << 9)	/* Legacy alias for FLUSH */
 #define	VTBLK_F_TOPOLOGY	(1 << 10)	/* Topology information is available */
 #define	VTBLK_F_CONFIG_WCE	(1 << 11)	/* Writeback mode available in config */
 #define	VTBLK_F_MQ		(1 << 12)	/* Multi-Queue */
 #define	VTBLK_F_DISCARD		(1 << 13)	/* Trim blocks */
 #define	VTBLK_F_WRITE_ZEROES	(1 << 14)	/* Write zeros */
 
 /*
  * Host capabilities
  */
 #define	VTBLK_S_HOSTCAPS      \
   ( VTBLK_F_SEG_MAX  |						    \
     VTBLK_F_BLK_SIZE |						    \
     VTBLK_F_FLUSH    |						    \
     VTBLK_F_TOPOLOGY |						    \
     VIRTIO_RING_F_INDIRECT_DESC )	/* indirect descriptors */
 
 /*
  * The current blockif_delete() interface only allows a single delete
  * request at a time.
  */
 #define	VTBLK_MAX_DISCARD_SEG	1
 
 /*
  * An arbitrary limit to prevent excessive latency due to large
  * delete requests.
  */
 #define	VTBLK_MAX_DISCARD_SECT	((16 << 20) / VTBLK_BSIZE)	/* 16 MiB */
 
 /*
  * Config space "registers"
  */
 /*
  * Layout of the device-specific configuration area.  pci_vtblk_init() fills
  * it in and pci_vtblk_cfgread() copies bytes out of it at the requested
  * offset; the struct is packed, so field offsets are exactly as declared.
  */
 struct vtblk_config {
 	uint64_t	vbc_capacity;	/* device size in 512-byte units */
 	uint32_t	vbc_size_max;	/* set to 0 by init ("not negotiated") */
 	uint32_t	vbc_seg_max;	/* max data segments per request */
 	struct {
 		uint16_t cylinders;
 		uint8_t heads;
 		uint8_t sectors;
 	} vbc_geometry;			/* all zero: no geometry reported */
 	uint32_t	vbc_blk_size;	/* sector size reported by blockif */
 	struct {
 		uint8_t physical_block_exp;
 		uint8_t alignment_offset;
 		uint16_t min_io_size;
 		uint32_t opt_io_size;
 	} vbc_topology;
 	uint8_t		vbc_writeback;
 	uint8_t		unused0[1];
 	uint16_t	num_queues;
 	uint32_t	max_discard_sectors;	/* VTBLK_MAX_DISCARD_SECT */
 	uint32_t	max_discard_seg;	/* VTBLK_MAX_DISCARD_SEG */
 	uint32_t	discard_sector_alignment;
 	uint32_t	max_write_zeroes_sectors;
 	uint32_t	max_write_zeroes_seg;
 	uint8_t		write_zeroes_may_unmap;
 	uint8_t		unused1[3];
 } __packed;
 
 /*
  * Fixed-size block header
  */
 /*
  * Request header found in the first iovec of every descriptor chain handled
  * by pci_vtblk_proc().  The VBH_OP_* values are the request types carried
  * in vbh_type.
  */
 struct virtio_blk_hdr {
 #define	VBH_OP_READ		0
 #define	VBH_OP_WRITE		1
 #define	VBH_OP_SCSI_CMD		2
 #define	VBH_OP_SCSI_CMD_OUT	3
 #define	VBH_OP_FLUSH		4
 #define	VBH_OP_FLUSH_OUT	5
 #define	VBH_OP_IDENT		8
 #define	VBH_OP_DISCARD		11
 #define	VBH_OP_WRITE_ZEROES	13
 
 #define	VBH_FLAG_BARRIER	0x80000000	/* OR'ed into vbh_type */
 	uint32_t	vbh_type;	/* VBH_OP_*, masked with ~VBH_FLAG_BARRIER */
 	uint32_t	vbh_ioprio;	/* not read by the code in this file */
 	uint64_t	vbh_sector;	/* start, in VTBLK_BSIZE-byte sectors */
 } __packed;
 
 /*
  * Debug printf
  */
 static int pci_vtblk_debug;
 /*
  * DPRINTF expands to PRINTLN only when pci_vtblk_debug is non-zero; WPRINTF
  * always expands to PRINTLN.  Both take a parenthesized PRINTLN argument
  * list.  DPRINTF is wrapped in do/while (0) so it behaves as a single
  * statement and cannot capture an "else" that follows it.
  */
 #define	DPRINTF(params) do {						\
 	if (pci_vtblk_debug)						\
 		PRINTLN params;						\
 } while (0)
 #define	WPRINTF(params) PRINTLN params
 
 /*
  * Per-ring-slot request state; pci_vtblk_softc holds VTBLK_RINGSZ of these
  * and pci_vtblk_proc() selects one by the chain's descriptor index.
  */
 struct pci_vtblk_ioreq {
 	struct blockif_req		io_req;		/* request handed to blockif_*() */
 	struct pci_vtblk_softc		*io_sc;		/* owning device */
 	uint8_t				*io_status;	/* status byte in the chain's last iovec */
 	uint16_t			io_idx;		/* index passed to vq_relchain() */
 };
 
 /*
  * Payload of a VBH_OP_DISCARD request as read by pci_vtblk_proc(): the data
  * iovec must be exactly sizeof() this structure.
  */
 struct virtio_blk_discard_write_zeroes {
 	uint64_t	sector;		/* start, in VTBLK_BSIZE-byte sectors */
 	uint32_t	num_sectors;	/* length, in VTBLK_BSIZE-byte sectors */
 	struct {
 		uint32_t unmap:1;	/* must be 0 for discard */
 		uint32_t reserved:31;	/* must be 0 */
 	} flags;
 };
 
 /*
  * Per-device softc
  */
 /*
  * State for one virtio block device instance, allocated in pci_vtblk_init().
  */
 struct pci_vtblk_softc {
 	struct virtio_softc vbsc_vs;		/* generic virtio state */
 	pthread_mutex_t vsc_mtx;		/* installed as vbsc_vs.vs_mtx */
 	struct vqueue_info vbsc_vq;		/* the single request queue */
 	struct vtblk_config vbsc_cfg;		/* config space contents */
 	struct virtio_consts vbsc_consts;	/* per-device copy of vtblk_vi_consts */
 	struct blockif_ctxt *bc;		/* backing block interface */
 	char vbsc_ident[VTBLK_BLK_ID_BYTES];	/* string returned for VBH_OP_IDENT */
 	struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ];
 };
 
 static void pci_vtblk_reset(void *);
 static void pci_vtblk_notify(void *, struct vqueue_info *);
 static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
 static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
 #ifdef BHYVE_SNAPSHOT
 static void pci_vtblk_pause(void *);
 static void pci_vtblk_resume(void *);
 static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *);
 #endif
 
 /*
  * Template of virtio constants and callbacks for the block device.
  * pci_vtblk_init() copies it into each softc so that vc_hv_caps can be
  * extended per device (VTBLK_F_DISCARD).
  */
 static struct virtio_consts vtblk_vi_consts = {
 	.vc_name =	"vtblk",
 	.vc_nvq =	1,
 	.vc_cfgsize =	sizeof(struct vtblk_config),
 	.vc_reset =	pci_vtblk_reset,
 	.vc_qnotify =	pci_vtblk_notify,
 	.vc_cfgread =	pci_vtblk_cfgread,
 	.vc_cfgwrite =	pci_vtblk_cfgwrite,
 	.vc_apply_features = NULL,
 	.vc_hv_caps =	VTBLK_S_HOSTCAPS,
 #ifdef BHYVE_SNAPSHOT
 	.vc_pause =	pci_vtblk_pause,
 	.vc_resume =	pci_vtblk_resume,
 	.vc_snapshot =	pci_vtblk_snapshot,
 #endif
 };
 
 /*
  * vc_reset callback: emit a debug message and call vi_reset_dev() on the
  * device's embedded virtio softc.
  */
 static void
 pci_vtblk_reset(void *vsc)
 {
 	struct pci_vtblk_softc *blk;
 
 	blk = vsc;
 	DPRINTF(("vtblk: device reset requested !"));
 	vi_reset_dev(&blk->vbsc_vs);
 }
 
 /*
  * Complete a request: translate the errno-style result into a virtio block
  * status byte, store it through io_status, and release the descriptor chain
  * on the device's virtqueue.  pci_vtblk_done() takes vsc_mtx before calling
  * here.
  */
 static void
 pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err)
 {
 	struct pci_vtblk_softc *blk = io->io_sc;
 	uint8_t status;
 
 	switch (err) {
 	case 0:
 		status = VTBLK_S_OK;
 		break;
 	case EOPNOTSUPP:
 	case ENOSYS:
 		status = VTBLK_S_UNSUPP;
 		break;
 	default:
 		status = VTBLK_S_IOERR;
 		break;
 	}
 	*io->io_status = status;
 
 	/* Hand the chain back; the status byte is the only data written. */
 	vq_relchain(&blk->vbsc_vq, io->io_idx, 1);
 	vq_endchains(&blk->vbsc_vq, 0);
 }
 
 #ifdef BHYVE_SNAPSHOT
 /*
  * vc_pause callback: emit a debug message and pause the backing block
  * interface.
  */
 static void
 pci_vtblk_pause(void *vsc)
 {
 	struct pci_vtblk_softc *blk;
 
 	blk = vsc;
 	DPRINTF(("vtblk: device pause requested !\n"));
 	blockif_pause(blk->bc);
 }
 
 /*
  * vc_resume callback: emit a debug message and resume the backing block
  * interface.
  */
 static void
 pci_vtblk_resume(void *vsc)
 {
 	struct pci_vtblk_softc *blk;
 
 	blk = vsc;
 	DPRINTF(("vtblk: device resume requested !\n"));
 	blockif_resume(blk->bc);
 }
 
 /*
  * vc_snapshot callback: passes the config space contents and the identifier
  * string to the snapshot macros.  The macros receive "ret" and the "done"
  * label, so the value returned is whatever they leave in "ret".
  */
 static int
 pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta)
 {
 	int ret;
 	struct pci_vtblk_softc *sc = vsc;
 
 	SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done);
 	SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident),
 			      meta, ret, done);
 
 done:
 	return (ret);
 }
 #endif
 
 /*
  * Completion callback installed as br_callback on every request slot.
  * Recovers the request from br_param and finishes it while holding the
  * device mutex.
  */
 static void
 pci_vtblk_done(struct blockif_req *br, int err)
 {
 	struct pci_vtblk_ioreq *slot;
 	pthread_mutex_t *lock;
 
 	slot = br->br_param;
 	lock = &slot->io_sc->vsc_mtx;
 
 	pthread_mutex_lock(lock);
 	pci_vtblk_done_locked(slot, err);
 	pthread_mutex_unlock(lock);
 }
 
 /*
  * Fetch one descriptor chain from the virtqueue and start (or immediately
  * complete) the request it describes.
  *
  * The chain is expected to consist of the fixed header, zero or more data
  * iovecs and a final one-byte status iovec.  Read, write, discard and flush
  * requests are passed to the blockif layer, which reports completion through
  * the slot's br_callback (pci_vtblk_done, installed in pci_vtblk_init()).
  * Ident requests, unsupported types and rejected discards are completed here
  * via pci_vtblk_done_locked().
  *
  * NOTE(review): the chain layout is enforced with assert(), so a chain that
  * does not match these expectations aborts the process; the descriptors
  * presumably come from the guest -- confirm this is acceptable.
  */
 static void
 pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
 {
 	struct virtio_blk_hdr *vbh;
 	struct pci_vtblk_ioreq *io;
 	int i, n;
 	int err;
 	ssize_t iolen;
 	int writeop, type;
 	struct vi_req req;
 	struct iovec iov[BLOCKIF_IOV_MAX + 2];
 	struct virtio_blk_discard_write_zeroes *discard;
 
 	n = vq_getchain(vq, iov, BLOCKIF_IOV_MAX + 2, &req);
 
 	/*
 	 * The first descriptor will be the read-only fixed header,
 	 * and the last is for status (hence +2 above and below).
 	 * The remaining iov's are the actual data I/O vectors.
 	 *
 	 * XXX - note - this fails on crash dump, which does a
 	 * VIRTIO_BLK_T_FLUSH with a zero transfer length
 	 */
 	assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2);
 
 	/* The request slot is selected by the chain's descriptor index. */
 	io = &sc->vbsc_ios[req.idx];
 	assert(req.readable != 0);
 	assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
 	vbh = (struct virtio_blk_hdr *)iov[0].iov_base;
 	/* Copy only the data iovecs (everything between header and status). */
 	memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
 	io->io_req.br_iovcnt = n - 2;
 	io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE;
 	/* From here on n excludes the status iovec, which is now iov[n]. */
 	io->io_status = (uint8_t *)iov[--n].iov_base;
 	assert(req.writable != 0);
 	assert(iov[n].iov_len == 1);
 
 	/*
 	 * XXX
 	 * The guest should not be setting the BARRIER flag because
 	 * we don't advertise the capability.
 	 */
 	type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
 	writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD);
 	/*
 	 * - Write op implies read-only descriptor
 	 * - Read/ident op implies write-only descriptor
 	 *
 	 * By taking away either the read-only fixed header or the write-only
 	 * status iovec, the following condition should hold true.
 	 */
 	assert(n == (writeop ? req.readable : req.writable));
 
 	/* Total payload length across the data iovecs. */
 	iolen = 0;
 	for (i = 1; i < n; i++) {
 		iolen += iov[i].iov_len;
 	}
 	io->io_req.br_resid = iolen;
 
 	DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld",
 		 writeop ? "write/discard" : "read/ident", iolen, i - 1,
 		 io->io_req.br_offset));
 
 	switch (type) {
 	case VBH_OP_READ:
 		err = blockif_read(sc->bc, &io->io_req);
 		break;
 	case VBH_OP_WRITE:
 		err = blockif_write(sc->bc, &io->io_req);
 		break;
 	case VBH_OP_DISCARD:
 		/*
 		 * We currently only support a single request, if the guest
 		 * has submitted a request that doesn't conform to the
 		 * requirements, we return a error.
 		 */
 		if (iov[1].iov_len != sizeof (*discard)) {
 			pci_vtblk_done_locked(io, EINVAL);
 			return;
 		}
 
 		/* The segments to discard are provided rather than data */
 		discard = (struct virtio_blk_discard_write_zeroes *)
 		    iov[1].iov_base;
 
 		/*
 		 * virtio v1.1 5.2.6.2:
 		 * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP
 		 * for discard and write zeroes commands if any unknown flag is
 		 * set. Furthermore, the device MUST set the status byte to
 		 * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag
 		 * is set.
 		 *
 		 * Currently there are no known flags for a DISCARD request.
 		 */
 		if (discard->flags.unmap != 0 || discard->flags.reserved != 0) {
 			pci_vtblk_done_locked(io, ENOTSUP);
 			return;
 		}
 
 		/* Make sure the request doesn't exceed our size limit */
 		if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) {
 			pci_vtblk_done_locked(io, EINVAL);
 			return;
 		}
 
 		/* The range comes from the payload, not from the header. */
 		io->io_req.br_offset = discard->sector * VTBLK_BSIZE;
 		io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE;
 		err = blockif_delete(sc->bc, &io->io_req);
 		break;
 	case VBH_OP_FLUSH:
 	case VBH_OP_FLUSH_OUT:
 		err = blockif_flush(sc->bc, &io->io_req);
 		break;
 	case VBH_OP_IDENT:
 		/* Assume a single buffer */
 		/* S/n equal to buffer is not zero-terminated. */
 		memset(iov[1].iov_base, 0, iov[1].iov_len);
 		strncpy(iov[1].iov_base, sc->vbsc_ident,
 		    MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
 		pci_vtblk_done_locked(io, 0);
 		return;
 	default:
 		pci_vtblk_done_locked(io, EOPNOTSUPP);
 		return;
 	}
 	/* Only requests handed to blockif reach this point. */
 	assert(err == 0);
 }
 
 /*
  * vc_qnotify callback: process descriptor chains one at a time until the
  * queue reports that none are left.
  */
 static void
 pci_vtblk_notify(void *vsc, struct vqueue_info *vq)
 {
 	struct pci_vtblk_softc *blk = vsc;
 
 	if (!vq_has_descs(vq))
 		return;
 
 	do {
 		pci_vtblk_proc(blk, vq);
 	} while (vq_has_descs(vq));
 }
 
 /*
  * Resize callback registered with the block interface: store the new
  * capacity (in 512-byte units) in the config space and raise a
  * configuration-change interrupt.
  */
 static void
 pci_vtblk_resized(struct blockif_ctxt *bctxt __unused, void *arg,
     size_t new_size)
 {
 	struct pci_vtblk_softc *blk = arg;
 	struct virtio_softc *vs = &blk->vbsc_vs;
 
 	/* 512-byte units */
 	blk->vbsc_cfg.vbc_capacity = new_size / VTBLK_BSIZE;
 	vi_interrupt(vs, VIRTIO_PCI_ISR_CONFIG, vs->vs_msix_cfg_idx);
 }
 
 /*
  * pe_init callback: create one virtio block device instance.
  *
  * Opens the backing store described by nvl, registers it as a boot device
  * candidate, allocates and links the softc, derives the identifier string
  * (the "serial"/"ser" config value, or one built from the MD5 of "path"),
  * fills in the virtio config space and PCI config registers, and sets up
  * interrupts, the I/O BAR and the resize callback.
  *
  * Returns 0 on success and 1 on failure.  The block interface context is
  * closed again on every failure path after it has been opened.
  */
 static int
 pci_vtblk_init(struct pci_devinst *pi, nvlist_t *nvl)
 {
 	char bident[sizeof("XXX:XXX")];
 	struct blockif_ctxt *bctxt;
 	const char *path, *serial;
 	MD5_CTX mdctx;
 	u_char digest[16];
 	struct pci_vtblk_softc *sc;
 	off_t size;
 	int i, sectsz, sts, sto;
 
 	/*
 	 * The supplied backing file has to exist
 	 */
 	snprintf(bident, sizeof(bident), "%u:%u", pi->pi_slot, pi->pi_func);
 	bctxt = blockif_open(nvl, bident);
 	if (bctxt == NULL) {
 		perror("Could not open backing file");
 		return (1);
 	}
 
+	if (blockif_add_boot_device(pi, bctxt)) {
+		perror("Invalid boot device");
+		blockif_close(bctxt);
+		return (1);
+	}
+
 	size = blockif_size(bctxt);
 	sectsz = blockif_sectsz(bctxt);
 	blockif_psectsz(bctxt, &sts, &sto);
 
 	sc = calloc(1, sizeof(struct pci_vtblk_softc));
 	if (sc == NULL) {
 		perror("Could not allocate virtio-blk softc");
 		blockif_close(bctxt);
 		return (1);
 	}
 	sc->bc = bctxt;
 	/* Every ring slot completes through pci_vtblk_done(). */
 	for (i = 0; i < VTBLK_RINGSZ; i++) {
 		struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i];
 		io->io_req.br_callback = pci_vtblk_done;
 		io->io_req.br_param = io;
 		io->io_sc = sc;
 		io->io_idx = i;
 	}
 
 	/* Per-device copy so the capability mask can differ per device. */
 	bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts));
 	if (blockif_candelete(sc->bc))
 		sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD;
 
 	pthread_mutex_init(&sc->vsc_mtx, NULL);
 
 	/* init virtio softc and virtqueues */
 	vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq);
 	sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
 
 	sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
 	/* sc->vbsc_vq.vq_notify = we have no per-queue notify */
 
 	/*
 	 * If an explicit identifier is not given, create an
 	 * identifier using parts of the md5 sum of the filename.
 	 */
 	bzero(sc->vbsc_ident, VTBLK_BLK_ID_BYTES);
 	if ((serial = get_config_value_node(nvl, "serial")) != NULL ||
 	    (serial = get_config_value_node(nvl, "ser")) != NULL) {
 		strlcpy(sc->vbsc_ident, serial, VTBLK_BLK_ID_BYTES);
 	} else {
 		path = get_config_value_node(nvl, "path");
 		MD5Init(&mdctx);
 		MD5Update(&mdctx, path, strlen(path));
 		MD5Final(digest, &mdctx);
 		snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES,
 		    "BHYVE-%02X%02X-%02X%02X-%02X%02X",
 		    digest[0], digest[1], digest[2], digest[3], digest[4],
 		    digest[5]);
 	}
 
 	/* setup virtio block config space */
 	sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */
 	sc->vbsc_cfg.vbc_size_max = 0;	/* not negotiated */
 
 	/*
 	 * If Linux is presented with a seg_max greater than the virtio queue
 	 * size, it can stumble into situations where it violates its own
 	 * invariants and panics.  For safety, we keep seg_max clamped, paying
 	 * heed to the two extra descriptors needed for the header and status
 	 * of a request.
 	 */
 	sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX);
 	sc->vbsc_cfg.vbc_geometry.cylinders = 0;	/* no geometry */
 	sc->vbsc_cfg.vbc_geometry.heads = 0;
 	sc->vbsc_cfg.vbc_geometry.sectors = 0;
 	sc->vbsc_cfg.vbc_blk_size = sectsz;
 	sc->vbsc_cfg.vbc_topology.physical_block_exp =
 	    (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0;
 	sc->vbsc_cfg.vbc_topology.alignment_offset =
 	    (sto != 0) ? ((sts - sto) / sectsz) : 0;
 	sc->vbsc_cfg.vbc_topology.min_io_size = 0;
 	sc->vbsc_cfg.vbc_topology.opt_io_size = 0;
 	sc->vbsc_cfg.vbc_writeback = 0;
 	sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT;
 	sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG;
 	sc->vbsc_cfg.discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE;
 
 	/*
 	 * Should we move some of this into virtio.c?  Could
 	 * have the device, class, and subdev_0 as fields in
 	 * the virtio constants structure.
 	 */
 	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_BLOCK);
 	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
 
 	if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) {
 		blockif_close(sc->bc);
 		free(sc);
 		return (1);
 	}
 	vi_set_io_bar(&sc->vbsc_vs, 0);
 	blockif_register_resize_callback(sc->bc, pci_vtblk_resized, sc);
 	return (0);
 }
 
 /*
  * vc_cfgwrite callback: nothing in the config space is writable here, so
  * the attempt is logged (when debugging) and 1 is returned.
  */
 static int
 pci_vtblk_cfgwrite(void *vsc __unused, int offset, int size __unused,
     uint32_t value __unused)
 {
 	const int rejected = 1;
 
 	DPRINTF(("vtblk: write to readonly reg %d", offset));
 	return (rejected);
 }
 
 /*
  * vc_cfgread callback: copy "size" bytes starting at "offset" within the
  * config space structure into *retval.  Always returns 0.
  */
 static int
 pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
 {
 	struct pci_vtblk_softc *blk = vsc;
 	const uint8_t *cfg_bytes = (uint8_t *)&blk->vbsc_cfg;
 
 	/* our caller has already verified offset and size */
 	memcpy(retval, cfg_bytes + offset, size);
 	return (0);
 }
 
 /*
  * Device-emulation descriptor for "virtio-blk".  Init is device specific;
  * legacy option parsing comes from the block interface and the BAR and
  * snapshot/pause/resume callbacks are the generic vi_pci_* ones.
  * PCI_EMUL_SET() is applied to it below.
  */
 static const struct pci_devemu pci_de_vblk = {
 	.pe_emu =	"virtio-blk",
 	.pe_init =	pci_vtblk_init,
 	.pe_legacy_config = blockif_legacy_config,
 	.pe_barwrite =	vi_pci_write,
 	.pe_barread =	vi_pci_read,
 #ifdef BHYVE_SNAPSHOT
 	.pe_snapshot =	vi_pci_snapshot,
 	.pe_pause =     vi_pci_pause,
 	.pe_resume =    vi_pci_resume,
 #endif
 };
 PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr.sbin/bhyve/pci_virtio_scsi.c b/usr.sbin/bhyve/pci_virtio_scsi.c
index 7d5409cff6a1..fa6cb3a48787 100644
--- a/usr.sbin/bhyve/pci_virtio_scsi.c
+++ b/usr.sbin/bhyve/pci_virtio_scsi.c
@@ -1,764 +1,773 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>.
  * Copyright (c) 2018 Marcelo Araujo <araujo@FreeBSD.org>.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
 #include <sys/types.h>
 #include <sys/uio.h>
 #include <sys/time.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
 #include <unistd.h>
 #include <assert.h>
 #include <pthread.h>
 #include <pthread_np.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_ioctl.h>
 #include <cam/ctl/ctl_util.h>
 #include <cam/ctl/ctl_scsi_all.h>
 #include <camlib.h>
 
 #include "bhyverun.h"
 #include "config.h"
 #include "debug.h"
 #include "pci_emul.h"
 #include "virtio.h"
 #include "iov.h"
 
 #define VTSCSI_RINGSZ		64
 #define	VTSCSI_REQUESTQ		1
 #define	VTSCSI_THR_PER_Q	16
 #define	VTSCSI_MAXQ		(VTSCSI_REQUESTQ + 2)
 #define	VTSCSI_MAXSEG		64
 
 #define	VTSCSI_IN_HEADER_LEN(_sc)	\
 	(sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size)
 
 #define	VTSCSI_OUT_HEADER_LEN(_sc) 	\
 	(sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size)
 
 #define	VIRTIO_SCSI_MAX_CHANNEL	0
 #define	VIRTIO_SCSI_MAX_TARGET	0
 #define	VIRTIO_SCSI_MAX_LUN	16383
 
 #define	VIRTIO_SCSI_F_INOUT	(1 << 0)
 #define	VIRTIO_SCSI_F_HOTPLUG	(1 << 1)
 #define	VIRTIO_SCSI_F_CHANGE	(1 << 2)
 
 static int pci_vtscsi_debug = 0;
 /*
  * WPRINTF always expands to PRINTLN with a "virtio-scsi: " prefix; DPRINTF
  * does so only when pci_vtscsi_debug is non-zero.  DPRINTF is wrapped in
  * do/while (0) so it behaves as a single statement and cannot capture an
  * "else" that follows it.
  */
 #define	WPRINTF(msg, params...) PRINTLN("virtio-scsi: " msg, ##params)
 #define	DPRINTF(msg, params...) do {					\
 	if (pci_vtscsi_debug)						\
 		WPRINTF(msg, ##params);					\
 } while (0)
 
 /*
  * Device-specific configuration area.  pci_vtscsi_reset() assigns every
  * field and pci_vtscsi_cfgread() copies bytes out of it at the requested
  * offset; the struct is packed, so field offsets are exactly as declared.
  */
 struct pci_vtscsi_config {
 	uint32_t num_queues;
 	uint32_t seg_max;
 	uint32_t max_sectors;
 	uint32_t cmd_per_lun;
 	uint32_t event_info_size;
 	uint32_t sense_size;	/* used by VTSCSI_OUT_HEADER_LEN() */
 	uint32_t cdb_size;	/* used by VTSCSI_IN_HEADER_LEN() */
 	uint16_t max_channel;
 	uint16_t max_target;
 	uint32_t max_lun;
 } __attribute__((packed));
 
 /*
  * State of one request queue and the worker threads that serve it.
  * In pci_vtscsi_proc(), vsq_mtx is held while waiting on vsq_cv and while
  * taking entries off vsq_requests, and vsq_qmtx is held while releasing a
  * chain on vsq_vq.
  */
 struct pci_vtscsi_queue {
 	struct pci_vtscsi_softc *         vsq_sc;
 	struct vqueue_info *              vsq_vq;
 	pthread_mutex_t                   vsq_mtx;
 	pthread_mutex_t                   vsq_qmtx;
 	pthread_cond_t                    vsq_cv;
 	STAILQ_HEAD(, pci_vtscsi_request) vsq_requests;
 	LIST_HEAD(, pci_vtscsi_worker)    vsq_workers;
 };
 
 /*
  * One worker thread of a queue; passed as the argument to
  * pci_vtscsi_proc(), which leaves its loop once vsw_exiting is set.
  */
 struct pci_vtscsi_worker {
 	struct pci_vtscsi_queue *     vsw_queue;
 	pthread_t                     vsw_thread;
 	bool                          vsw_exiting;
 	LIST_ENTRY(pci_vtscsi_worker) vsw_link;
 };
 
 /*
  * A queued request: the "in" and "out" iovec arrays handed to
  * pci_vtscsi_request_handle() plus the index used to release the chain.
  * pci_vtscsi_proc() frees the structure after completing the request.
  */
 struct pci_vtscsi_request {
 	struct pci_vtscsi_queue * vsr_queue;
 	struct iovec              vsr_iov_in[VTSCSI_MAXSEG];
 	int                       vsr_niov_in;
 	struct iovec              vsr_iov_out[VTSCSI_MAXSEG];
 	int                       vsr_niov_out;
 	uint32_t                  vsr_idx;
 	STAILQ_ENTRY(pci_vtscsi_request) vsr_link;
 };
 
 /*
  * Per-device softc
  */
 struct pci_vtscsi_softc {
 	struct virtio_softc      vss_vs;	/* generic virtio state */
 	struct vqueue_info       vss_vq[VTSCSI_MAXQ];
 	struct pci_vtscsi_queue  vss_queues[VTSCSI_REQUESTQ];
 	pthread_mutex_t          vss_mtx;
 	int                      vss_iid;	/* passed to CTL as nexus.initid */
 	int                      vss_ctl_fd;	/* descriptor used for CTL_IO ioctls */
 	uint32_t                 vss_features;	/* set by pci_vtscsi_neg_features() */
 	struct pci_vtscsi_config vss_config;	/* config space contents */
 };
 
 #define	VIRTIO_SCSI_T_TMF			0
 #define	VIRTIO_SCSI_T_TMF_ABORT_TASK		0
 #define	VIRTIO_SCSI_T_TMF_ABORT_TASK_SET	1
 #define	VIRTIO_SCSI_T_TMF_CLEAR_ACA		2
 #define	VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET	3
 #define	VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET	4
 #define	VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET	5
 #define	VIRTIO_SCSI_T_TMF_QUERY_TASK		6
 #define	VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 	7
 
 /* command-specific response values */
 #define	VIRTIO_SCSI_S_FUNCTION_COMPLETE		0
 #define	VIRTIO_SCSI_S_FUNCTION_SUCCEEDED	10
 #define	VIRTIO_SCSI_S_FUNCTION_REJECTED		11
 
 /*
  * Task-management control request (type VIRTIO_SCSI_T_TMF).
  * pci_vtscsi_control_handle() only accepts buffers of exactly this size;
  * pci_vtscsi_tmf_handle() writes the result into "response".
  */
 struct pci_vtscsi_ctrl_tmf {
 	uint32_t type;
 	uint32_t subtype;	/* VIRTIO_SCSI_T_TMF_* */
 	uint8_t lun[8];
 	uint64_t id;
 	uint8_t response;
 } __attribute__((packed));
 
 #define	VIRTIO_SCSI_T_AN_QUERY			1
 #define	VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2
 #define	VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT	4
 #define	VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST	8
 #define	VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE	16
 #define	VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST	32
 #define	VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY	64
 
 /*
  * Control request of type VIRTIO_SCSI_T_AN_QUERY.
  * pci_vtscsi_control_handle() only accepts buffers of exactly this size.
  */
 struct pci_vtscsi_ctrl_an {
 	uint32_t type;
 	uint8_t lun[8];
 	uint32_t event_requested;
 	uint32_t event_actual;
 	uint8_t response;
 } __attribute__((packed));
 
 /* command-specific response values */
 #define	VIRTIO_SCSI_S_OK 			0
 #define	VIRTIO_SCSI_S_OVERRUN			1
 #define	VIRTIO_SCSI_S_ABORTED			2
 #define	VIRTIO_SCSI_S_BAD_TARGET		3
 #define	VIRTIO_SCSI_S_RESET			4
 #define	VIRTIO_SCSI_S_BUSY			5
 #define	VIRTIO_SCSI_S_TRANSPORT_FAILURE		6
 #define	VIRTIO_SCSI_S_TARGET_FAILURE		7
 #define	VIRTIO_SCSI_S_NEXUS_FAILURE		8
 #define	VIRTIO_SCSI_S_FAILURE			9
 #define	VIRTIO_SCSI_S_INCORRECT_LUN		12
 
 /* task_attr */
 #define	VIRTIO_SCSI_S_SIMPLE			0
 #define	VIRTIO_SCSI_S_ORDERED			1
 #define	VIRTIO_SCSI_S_HEAD			2
 #define	VIRTIO_SCSI_S_ACA			3
 
 /*
  * Event record; its size is what pci_vtscsi_reset() reports as
  * event_info_size in the config space.
  */
 struct pci_vtscsi_event {
 	uint32_t event;
 	uint8_t lun[8];
 	uint32_t reason;
 } __attribute__((packed));
 
 /*
  * Fixed part of a command request header, followed by the CDB.
  * VTSCSI_IN_HEADER_LEN() adds the configured cdb_size to sizeof() this
  * structure to get the full header length.
  */
 struct pci_vtscsi_req_cmd_rd {
 	uint8_t lun[8];
 	uint64_t id;
 	uint8_t task_attr;	/* VIRTIO_SCSI_S_SIMPLE/ORDERED/HEAD/ACA */
 	uint8_t prio;
 	uint8_t crn;
 	uint8_t cdb[];
 } __attribute__((packed));
 
 /*
  * Fixed part of a command response header, followed by sense data.
  * VTSCSI_OUT_HEADER_LEN() adds the configured sense_size to sizeof() this
  * structure to get the full header length.
  */
 struct pci_vtscsi_req_cmd_wr {
 	uint32_t sense_len;
 	uint32_t residual;
 	uint16_t status_qualifier;
 	uint8_t status;
 	uint8_t response;
 	uint8_t sense[];
 } __attribute__((packed));
 
 static void *pci_vtscsi_proc(void *);
 static void pci_vtscsi_reset(void *);
 static void pci_vtscsi_neg_features(void *, uint64_t);
 static int pci_vtscsi_cfgread(void *, int, int, uint32_t *);
 static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t);
 static inline int pci_vtscsi_get_lun(uint8_t *);
 static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t);
 static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *,
     struct pci_vtscsi_ctrl_tmf *);
 static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *,
     struct pci_vtscsi_ctrl_an *);
 static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *,
     int, struct iovec *, int);
 static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *);
 static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *);
 static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *);
 static int  pci_vtscsi_init_queue(struct pci_vtscsi_softc *,
     struct pci_vtscsi_queue *, int);
 static int pci_vtscsi_init(struct pci_devinst *, nvlist_t *);
 
 /*
  * Virtio constants and callbacks for the SCSI device.  No vc_qnotify is set
  * in this initializer and no host capability bits are advertised.
  */
 static struct virtio_consts vtscsi_vi_consts = {
 	.vc_name =	"vtscsi",
 	.vc_nvq =	VTSCSI_MAXQ,
 	.vc_cfgsize =	sizeof(struct pci_vtscsi_config),
 	.vc_reset =	pci_vtscsi_reset,
 	.vc_cfgread =	pci_vtscsi_cfgread,
 	.vc_cfgwrite =	pci_vtscsi_cfgwrite,
 	.vc_apply_features = pci_vtscsi_neg_features,
 	.vc_hv_caps =	0,
 };
 
 /*
  * Worker thread body.  Waits on the queue's condition variable until a
  * request is available or the worker is told to exit, handles each request
  * with the queue mutex dropped, then releases the descriptor chain under
  * vsq_qmtx and frees the request.  Returns NULL when vsw_exiting is set.
  */
 static void *
 pci_vtscsi_proc(void *arg)
 {
 	struct pci_vtscsi_worker *self = arg;
 	struct pci_vtscsi_queue *queue = self->vsw_queue;
 	struct pci_vtscsi_request *next;
 	int nbytes;
 
 	pthread_mutex_lock(&queue->vsq_mtx);
 	for (;;) {
 		while (STAILQ_EMPTY(&queue->vsq_requests) &&
 		    !self->vsw_exiting)
 			pthread_cond_wait(&queue->vsq_cv, &queue->vsq_mtx);
 
 		if (self->vsw_exiting)
 			break;
 
 		next = STAILQ_FIRST(&queue->vsq_requests);
 		STAILQ_REMOVE_HEAD(&queue->vsq_requests, vsr_link);
 		pthread_mutex_unlock(&queue->vsq_mtx);
 
 		/* The request is handled without the queue mutex held. */
 		nbytes = pci_vtscsi_request_handle(queue, next->vsr_iov_in,
 		    next->vsr_niov_in, next->vsr_iov_out, next->vsr_niov_out);
 
 		pthread_mutex_lock(&queue->vsq_qmtx);
 		vq_relchain(queue->vsq_vq, next->vsr_idx, nbytes);
 		vq_endchains(queue->vsq_vq, 0);
 		pthread_mutex_unlock(&queue->vsq_qmtx);
 
 		DPRINTF("request <idx=%d> completed", next->vsr_idx);
 		free(next);
 
 		/* Re-take the queue mutex for the next wait. */
 		pthread_mutex_lock(&queue->vsq_mtx);
 	}
 	pthread_mutex_unlock(&queue->vsq_mtx);
 
 	return (NULL);
 }
 
 static void
 pci_vtscsi_reset(void *vsc)
 {
 	struct pci_vtscsi_softc *sc;
 
 	sc = vsc;
 
 	DPRINTF("device reset requested");
 	vi_reset_dev(&sc->vss_vs);
 
 	/* initialize config structure */
 	sc->vss_config = (struct pci_vtscsi_config){
 		.num_queues = VTSCSI_REQUESTQ,
 		/* Leave room for the request and the response. */
 		.seg_max = VTSCSI_MAXSEG - 2,
 		.max_sectors = 2,
 		.cmd_per_lun = 1,
 		.event_info_size = sizeof(struct pci_vtscsi_event),
 		.sense_size = 96,
 		.cdb_size = 32,
 		.max_channel = VIRTIO_SCSI_MAX_CHANNEL,
 		.max_target = VIRTIO_SCSI_MAX_TARGET,
 		.max_lun = VIRTIO_SCSI_MAX_LUN
 	};
 }
 
 /*
  * vc_apply_features callback: store the negotiated feature bits in the
  * softc (the 64-bit value is assigned to the 32-bit vss_features field).
  */
 static void
 pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features)
 {
 	struct pci_vtscsi_softc *scsi;
 
 	scsi = vsc;
 	scsi->vss_features = negotiated_features;
 }
 
 /*
  * vc_cfgread callback: copy "size" bytes starting at "offset" within the
  * config space structure into *retval.  Always returns 0.
  */
 static int
 pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval)
 {
 	struct pci_vtscsi_softc *scsi = vsc;
 	const uint8_t *cfg_bytes = (uint8_t *)&scsi->vss_config;
 
 	memcpy(retval, cfg_bytes + offset, size);
 	return (0);
 }
 
 /*
  * vc_cfgwrite callback: all arguments are ignored and 0 is returned.
  */
 static int
 pci_vtscsi_cfgwrite(void *vsc __unused, int offset __unused, int size __unused,
     uint32_t val __unused)
 {
 	const int accepted = 0;
 
 	return (accepted);
 }
 
 /*
  * Extract the LUN number from an 8-byte LUN field: the low 6 bits of
  * byte 2 form the high part and byte 3 the low part (14 bits in total).
  */
 static inline int
 pci_vtscsi_get_lun(uint8_t *lun)
 {
 	int high, low;
 
 	high = lun[2] & 0x3f;
 	low = lun[3];
 	return ((high << 8) | low);
 }
 
 /*
  * Dispatch a control request by the 32-bit type at the start of the buffer.
  *
  * Task-management and AN-query requests are forwarded to their handlers
  * when the buffer is exactly the size of the matching structure.
  * Truncated, mis-sized and unknown requests yield 0; otherwise the
  * handler's return value is passed through.
  */
 static int
 pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf,
     size_t bufsize)
 {
 	uint32_t reqtype;
 
 	if (bufsize < sizeof(reqtype)) {
 		WPRINTF("ignoring truncated control request");
 		return (0);
 	}
 	reqtype = *(uint32_t *)buf;
 
 	switch (reqtype) {
 	case VIRTIO_SCSI_T_TMF:
 		if (bufsize != sizeof(struct pci_vtscsi_ctrl_tmf)) {
 			WPRINTF("ignoring tmf request with size %zu", bufsize);
 			return (0);
 		}
 		return (pci_vtscsi_tmf_handle(sc,
 		    (struct pci_vtscsi_ctrl_tmf *)buf));
 
 	case VIRTIO_SCSI_T_AN_QUERY:
 		if (bufsize != sizeof(struct pci_vtscsi_ctrl_an)) {
 			WPRINTF("ignoring AN request with size %zu", bufsize);
 			return (0);
 		}
 		return (pci_vtscsi_an_handle(sc,
 		    (struct pci_vtscsi_ctrl_an *)buf));
 
 	default:
 		return (0);
 	}
 }
 
 /*
  * Issue a task-management function to CTL.
  *
  * Builds a CTL_IO_TASK request for the LUN and tag named in the TMF, maps
  * the virtio subtype to the corresponding CTL task action, submits it with
  * the CTL_IO ioctl and copies the resulting task status into
  * tmf->response.  A subtype outside the mapped set leaves task_action as
  * ctl_scsi_zero_io() set it.  Always returns 1.
  */
 static int
 pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc,
     struct pci_vtscsi_ctrl_tmf *tmf)
 {
 	union ctl_io *ctlio;
 	int rc;
 
 	ctlio = ctl_scsi_alloc_io(sc->vss_iid);
 	ctl_scsi_zero_io(ctlio);
 
 	/* Request header. */
 	ctlio->io_hdr.io_type = CTL_IO_TASK;
 	ctlio->io_hdr.nexus.initid = sc->vss_iid;
 	ctlio->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun);
 	ctlio->io_hdr.flags |= CTL_FLAG_USER_TAG;
 
 	/* Task identification. */
 	ctlio->taskio.tag_type = CTL_TAG_SIMPLE;
 	ctlio->taskio.tag_num = tmf->id;
 
 	switch (tmf->subtype) {
 	case VIRTIO_SCSI_T_TMF_ABORT_TASK:
 		ctlio->taskio.task_action = CTL_TASK_ABORT_TASK;
 		break;
 	case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
 		ctlio->taskio.task_action = CTL_TASK_ABORT_TASK_SET;
 		break;
 	case VIRTIO_SCSI_T_TMF_CLEAR_ACA:
 		ctlio->taskio.task_action = CTL_TASK_CLEAR_ACA;
 		break;
 	case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET:
 		ctlio->taskio.task_action = CTL_TASK_CLEAR_TASK_SET;
 		break;
 	case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
 		ctlio->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET;
 		break;
 	case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
 		ctlio->taskio.task_action = CTL_TASK_LUN_RESET;
 		break;
 	case VIRTIO_SCSI_T_TMF_QUERY_TASK:
 		ctlio->taskio.task_action = CTL_TASK_QUERY_TASK;
 		break;
 	case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET:
 		ctlio->taskio.task_action = CTL_TASK_QUERY_TASK_SET;
 		break;
 	}
 
 	if (pci_vtscsi_debug) {
 		struct sbuf *dump = sbuf_new_auto();
 
 		ctl_io_sbuf(ctlio, dump);
 		sbuf_finish(dump);
 		DPRINTF("%s", sbuf_data(dump));
 		sbuf_delete(dump);
 	}
 
 	rc = ioctl(sc->vss_ctl_fd, CTL_IO, ctlio);
 	if (rc != 0)
 		WPRINTF("CTL_IO: err=%d (%s)", errno, strerror(errno));
 
 	/* The response is reported even when the ioctl failed. */
 	tmf->response = ctlio->taskio.task_status;
 	ctl_scsi_free_io(ctlio);
 	return (1);
 }
 
 /*
  * Handler for AN-query control requests: both arguments are ignored and 0
  * is returned.
  */
 static int
 pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc __unused,
     struct pci_vtscsi_ctrl_an *an __unused)
 {
 	const int nwritten = 0;
 
 	return (nwritten);
 }
 
 static int
 pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in,
     int niov_in, struct iovec *iov_out, int niov_out)
 {
 	struct pci_vtscsi_softc *sc = q->vsq_sc;
 	struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL;
 	struct pci_vtscsi_req_cmd_wr *cmd_wr;
 	struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG];
 	union ctl_io *io;
 	int data_niov_in, data_niov_out;
 	void *ext_data_ptr = NULL;
 	uint32_t ext_data_len = 0, ext_sg_entries = 0;
 	int err, nxferred;
 
 	if (count_iov(iov_out, niov_out) < VTSCSI_OUT_HEADER_LEN(sc)) {
 		WPRINTF("ignoring request with insufficient output");
 		return (0);
 	}
 	if (count_iov(iov_in, niov_in) < VTSCSI_IN_HEADER_LEN(sc)) {
 		WPRINTF("ignoring request with incomplete header");
 		return (0);
 	}
 
 	seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in,
 	    VTSCSI_IN_HEADER_LEN(sc));
 	seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out,
 	    VTSCSI_OUT_HEADER_LEN(sc));
 
 	truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc));
 	truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc));
 	iov_to_buf(iov_in, niov_in, (void **)&cmd_rd);
 
 	cmd_wr = calloc(1, VTSCSI_OUT_HEADER_LEN(sc));
 	io = ctl_scsi_alloc_io(sc->vss_iid);
 	ctl_scsi_zero_io(io);
 
 	io->io_hdr.nexus.initid = sc->vss_iid;
 	io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun);
 
 	io->io_hdr.io_type = CTL_IO_SCSI;
 
 	if (data_niov_in > 0) {
 		ext_data_ptr = (void *)data_iov_in;
 		ext_sg_entries = data_niov_in;
 		ext_data_len = count_iov(data_iov_in, data_niov_in);
 		io->io_hdr.flags |= CTL_FLAG_DATA_OUT;
 	} else if (data_niov_out > 0) {
 		ext_data_ptr = (void *)data_iov_out;
 		ext_sg_entries = data_niov_out;
 		ext_data_len = count_iov(data_iov_out, data_niov_out);
 		io->io_hdr.flags |= CTL_FLAG_DATA_IN;
 	}
 
 	io->scsiio.sense_len = sc->vss_config.sense_size;
 	io->scsiio.tag_num = cmd_rd->id;
 	io->io_hdr.flags |= CTL_FLAG_USER_TAG;
 	switch (cmd_rd->task_attr) {
 	case VIRTIO_SCSI_S_ORDERED:
 		io->scsiio.tag_type = CTL_TAG_ORDERED;
 		break;
 	case VIRTIO_SCSI_S_HEAD:
 		io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE;
 		break;
 	case VIRTIO_SCSI_S_ACA:
 		io->scsiio.tag_type = CTL_TAG_ACA;
 		break;
 	case VIRTIO_SCSI_S_SIMPLE:
 	default:
 		io->scsiio.tag_type = CTL_TAG_SIMPLE;
 		break;
 	}
 	io->scsiio.ext_sg_entries = ext_sg_entries;
 	io->scsiio.ext_data_ptr = ext_data_ptr;
 	io->scsiio.ext_data_len = ext_data_len;
 	io->scsiio.ext_data_filled = 0;
 	io->scsiio.cdb_len = sc->vss_config.cdb_size;
 	memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size);
 
 	if (pci_vtscsi_debug) {
 		struct sbuf *sb = sbuf_new_auto();
 		ctl_io_sbuf(io, sb);
 		sbuf_finish(sb);
 		DPRINTF("%s", sbuf_data(sb));
 		sbuf_delete(sb);
 	}
 
 	err = ioctl(sc->vss_ctl_fd, CTL_IO, io);
 	if (err != 0) {
 		WPRINTF("CTL_IO: err=%d (%s)", errno, strerror(errno));
 		cmd_wr->response = VIRTIO_SCSI_S_FAILURE;
 	} else {
 		cmd_wr->sense_len = MIN(io->scsiio.sense_len,
 		    sc->vss_config.sense_size);
 		cmd_wr->residual = ext_data_len - io->scsiio.ext_data_filled;
 		cmd_wr->status = io->scsiio.scsi_status;
 		cmd_wr->response = VIRTIO_SCSI_S_OK;
 		memcpy(&cmd_wr->sense, &io->scsiio.sense_data,
 		    cmd_wr->sense_len);
 	}
 
 	buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0);
 	nxferred = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled;
 	free(cmd_rd);
 	free(cmd_wr);
 	ctl_scsi_free_io(io);
 	return (nxferred);
 }
 
 /*
  * Notify callback for the control queue.
  *
  * Every available chain is flattened into a scratch buffer and passed to
  * pci_vtscsi_control_handle().  The handler's return value is used as the
  * length of the response, which sits at the tail of the buffer; that tail
  * is copied back into the chain at the same offset before the chain is
  * released.  The scratch buffer is shared by all chains and freed once the
  * queue has been drained.
  */
 static void
 pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq)
 {
 	struct pci_vtscsi_softc *sc = vsc;
 	struct iovec chain[VTSCSI_MAXSEG];
 	struct vi_req req;
 	void *scratch = NULL;
 	size_t total, resp_off;
 	int resp_len, n;
 
 	while (vq_has_descs(vq)) {
 		n = vq_getchain(vq, chain, VTSCSI_MAXSEG, &req);
 		assert(n >= 1 && n <= VTSCSI_MAXSEG);
 
 		total = iov_to_buf(chain, n, &scratch);
 		resp_len = pci_vtscsi_control_handle(sc, scratch, total);
 
 		/* The response is the last resp_len bytes of the request. */
 		resp_off = total - resp_len;
 		buf_to_iov((uint8_t *)scratch + resp_off, resp_len, chain, n,
 		    resp_off);
 
 		/* Hand the chain back and look for the next one. */
 		vq_relchain(vq, req.idx, resp_len);
 	}
 
 	/* Generate interrupt if appropriate. */
 	vq_endchains(vq, 1);
 	free(scratch);
 }
 
 /*
  * Notify callback for the event queue.
  *
  * No descriptors are consumed here; the only action is to disable further
  * kicks on the queue.  The softc argument is unused.
  */
 static void
 pci_vtscsi_eventq_notify(void *vsc __unused, struct vqueue_info *vq)
 {
 	vq_kick_disable(vq);
 }
 
 /*
  * Notify callback for a request queue.
  *
  * Each available chain is copied into a freshly allocated
  * pci_vtscsi_request and appended to the owning queue's request list,
  * waking one waiter on the queue's condition variable.  Nothing is
  * completed here; the chain index is stored in the request so that it can
  * be released later.
  */
 static void
 pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq)
 {
 	struct pci_vtscsi_softc *sc = vsc;
 	/* Request queues start at virtqueue number 2. */
 	struct pci_vtscsi_queue *q = &sc->vss_queues[vq->vq_num - 2];
 	struct pci_vtscsi_request *req;
 	struct iovec chain[VTSCSI_MAXSEG];
 	struct vi_req vireq;
 	int n;
 
 	while (vq_has_descs(vq)) {
 		n = vq_getchain(vq, chain, VTSCSI_MAXSEG, &vireq);
 		assert(n >= 1 && n <= VTSCSI_MAXSEG);
 
 		req = calloc(1, sizeof(*req));
 		req->vsr_queue = q;
 		req->vsr_idx = vireq.idx;
 
 		/* Readable segments come first, writable ones follow. */
 		req->vsr_niov_in = vireq.readable;
 		memcpy(req->vsr_iov_in, &chain[0],
 		    req->vsr_niov_in * sizeof(chain[0]));
 		req->vsr_niov_out = vireq.writable;
 		memcpy(req->vsr_iov_out, &chain[vireq.readable],
 		    req->vsr_niov_out * sizeof(chain[0]));
 
 		pthread_mutex_lock(&q->vsq_mtx);
 		STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link);
 		pthread_cond_signal(&q->vsq_cv);
 		pthread_mutex_unlock(&q->vsq_mtx);
 
 		DPRINTF("request <idx=%d> enqueued", vireq.idx);
 	}
 }
 
 /*
  * Set up request queue number `num`.
  *
  * Binds the queue to its softc and to virtqueue num + 2, initializes its
  * locks, condition variable and lists, and starts VTSCSI_THR_PER_Q worker
  * threads running pci_vtscsi_proc().  Each thread is named
  * "vtscsi:<num>-<worker>" and recorded on the queue's worker list.
  *
  * Always returns 0.
  */
 static int
 pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc,
     struct pci_vtscsi_queue *queue, int num)
 {
 	char name[MAXCOMLEN + 1];
 	struct pci_vtscsi_worker *w;
 	int t;
 
 	queue->vsq_vq = &sc->vss_vq[num + 2];
 	queue->vsq_sc = sc;
 
 	STAILQ_INIT(&queue->vsq_requests);
 	LIST_INIT(&queue->vsq_workers);
 	pthread_mutex_init(&queue->vsq_mtx, NULL);
 	pthread_mutex_init(&queue->vsq_qmtx, NULL);
 	pthread_cond_init(&queue->vsq_cv, NULL);
 
 	for (t = 0; t < VTSCSI_THR_PER_Q; t++) {
 		w = calloc(1, sizeof(*w));
 		w->vsw_queue = queue;
 
 		pthread_create(&w->vsw_thread, NULL, pci_vtscsi_proc, w);
 
 		snprintf(name, sizeof(name), "vtscsi:%d-%d", num, t);
 		pthread_set_name_np(w->vsw_thread, name);
 		LIST_INSERT_HEAD(&queue->vsq_workers, w, vsw_link);
 	}
 
 	return (0);
 }
 
 /*
  * Translate a legacy option string into configuration nodes.
  *
  * The text before the first comma (or the whole string when there is no
  * comma) is stored as the "dev" value.  Anything after the comma is passed
  * on to pci_parse_legacy_config(), whose result is returned.  A NULL
  * option string is accepted and leaves the configuration untouched.
  *
  * Returns 0 when there is nothing further to parse.
  */
 static int
 pci_vtscsi_legacy_config(nvlist_t *nvl, const char *opts)
 {
 	char *comma, *dev;
 
 	if (opts == NULL)
 		return (0);
 
 	comma = strchr(opts, ',');
 	if (comma != NULL) {
 		/* Copy out the leading device path, then parse the rest. */
 		dev = strndup(opts, comma - opts);
 		set_config_value_node(nvl, "dev", dev);
 		free(dev);
 		return (pci_parse_legacy_config(nvl, comma + 1));
 	}
 
 	set_config_value_node(nvl, "dev", opts);
 	return (0);
 }
 
 /*
  * Initialize a virtio-scsi device instance from its configuration nodes.
  *
  * Configuration values read:
  *   "iid"       - parsed with strtoul() (base 10) into vss_iid.
  *   "bootindex" - passed through atoi() to pci_emul_add_boot_device(); a
  *                 non-zero result from that call aborts initialization.
  *   "dev"       - path of the control device to open; defaults to
  *                 "/dev/cam/ctl".
  *
  * After the control device is open, the virtio softc is linked up, the
  * control, event and request virtqueues are configured (starting the
  * request-queue workers), PCI config space is filled in, and interrupts and
  * the I/O BAR are set up.
  *
  * Returns 0 on success, -1 for a rejected bootindex and 1 for every other
  * failure.
  *
  * NOTE(review): the calloc() result is used without a NULL check.
  * NOTE(review): atoi() cannot tell "0" from non-numeric text, so a malformed
  * bootindex is passed on as 0.
  * NOTE(review): the vi_intr_init() failure path returns without releasing
  * sc or closing vss_ctl_fd, and the worker threads started earlier are
  * still running.
  */
 static int
 pci_vtscsi_init(struct pci_devinst *pi, nvlist_t *nvl)
 {
 	struct pci_vtscsi_softc *sc;
 	const char *devname, *value;
 	int i;
 
 	sc = calloc(1, sizeof(struct pci_vtscsi_softc));
 	value = get_config_value_node(nvl, "iid");
 	if (value != NULL)
 		sc->vss_iid = strtoul(value, NULL, 10);
 
+	value = get_config_value_node(nvl, "bootindex");
+	if (value != NULL) {
+		if (pci_emul_add_boot_device(pi, atoi(value))) {
+			EPRINTLN("Invalid bootindex %d", atoi(value));
+			free(sc);
+			return (-1);
+		}
+	}
+
 	/* Open the control device, falling back to the default path. */
 	devname = get_config_value_node(nvl, "dev");
 	if (devname == NULL)
 		devname = "/dev/cam/ctl";
 	sc->vss_ctl_fd = open(devname, O_RDWR);
 	if (sc->vss_ctl_fd < 0) {
 		WPRINTF("cannot open %s: %s", devname, strerror(errno));
 		free(sc);
 		return (1);
 	}
 
 	pthread_mutex_init(&sc->vss_mtx, NULL);
 
 	vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq);
 	sc->vss_vs.vs_mtx = &sc->vss_mtx;
 
 	/* controlq */
 	sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ;
 	sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify;
 
 	/* eventq */
 	sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ;
 	sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify;
 
 	/*
 	 * request queues: virtqueue i is backed by vss_queues[i - 2], since
 	 * the first two virtqueues are the control and event queues.
 	 */
 	for (i = 2; i < VTSCSI_MAXQ; i++) {
 		sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ;
 		sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify;
 		pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2);
 	}
 
 	/* initialize config space */
 	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_SCSI);
 	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
 
 	/* Interrupt setup can fail; see the review note in the header. */
 	if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix()))
 		return (1);
 	vi_set_io_bar(&sc->vss_vs, 0);
 
 	return (0);
 }
 
 
 /*
  * Device emulation descriptor for the "virtio-scsi" device model.
  *
  * It wires in this file's init and legacy-option parsing routines and uses
  * vi_pci_read()/vi_pci_write() for BAR accesses.  PCI_EMUL_SET() presumably
  * registers the descriptor with the PCI emulation framework -- confirm
  * against the macro's definition.
  */
 static const struct pci_devemu pci_de_vscsi = {
 	.pe_emu =	"virtio-scsi",
 	.pe_init =	pci_vtscsi_init,
 	.pe_legacy_config = pci_vtscsi_legacy_config,
 	.pe_barwrite =	vi_pci_write,
 	.pe_barread =	vi_pci_read
 };
 PCI_EMUL_SET(pci_de_vscsi);