Index: head/usr.sbin/bhyve/Makefile
===================================================================
--- head/usr.sbin/bhyve/Makefile	(revision 335973)
+++ head/usr.sbin/bhyve/Makefile	(revision 335974)
@@ -1,91 +1,92 @@
#
# $FreeBSD$
#

.include <src.opts.mk>

CFLAGS+=-I${SRCTOP}/sys
.PATH:	${SRCTOP}/sys/cam/ctl

PROG=	bhyve
PACKAGE=	bhyve

DEBUG_FLAGS= -g -O0

MAN=	bhyve.8

BHYVE_SYSDIR?=${SRCTOP}

SRCS=	\
	atkbdc.c		\
	acpi.c			\
	bhyvegc.c		\
	bhyverun.c		\
	block_if.c		\
	bootrom.c		\
	console.c		\
	consport.c		\
	ctl_util.c		\
	ctl_scsi_all.c		\
	dbgport.c		\
	fwctl.c			\
	gdb.c			\
	inout.c			\
	ioapic.c		\
	mem.c			\
	mevent.c		\
	mptbl.c			\
	pci_ahci.c		\
	pci_e82545.c		\
	pci_emul.c		\
	pci_fbuf.c		\
	pci_hostbridge.c	\
	pci_irq.c		\
	pci_lpc.c		\
+	pci_nvme.c		\
	pci_passthru.c		\
	pci_virtio_block.c	\
	pci_virtio_console.c	\
	pci_virtio_net.c	\
	pci_virtio_rnd.c	\
	pci_virtio_scsi.c	\
	pci_uart.c		\
	pci_xhci.c		\
	pm.c			\
	post.c			\
	ps2kbd.c		\
	ps2mouse.c		\
	rfb.c			\
	rtc.c			\
	smbiostbl.c		\
	sockstream.c		\
	task_switch.c		\
	uart_emul.c		\
	usb_emul.c		\
	usb_mouse.c		\
	virtio.c		\
	vga.c			\
	xmsr.c			\
	spinup_ap.c		\
	iov.c

.PATH:	${BHYVE_SYSDIR}/sys/amd64/vmm
SRCS+=	vmm_instruction_emul.c

LIBADD=	vmmapi md pthread z util sbuf cam

.if ${MK_OPENSSL} == "no"
CFLAGS+=-DNO_OPENSSL
.else
LIBADD+=	crypto
.endif

CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000
CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii
CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller

.ifdef GDB_LOG
CFLAGS+=-DGDB_LOG
.endif

WARNS?=	2

.include <bsd.prog.mk>
Index: head/usr.sbin/bhyve/bhyve.8
===================================================================
--- head/usr.sbin/bhyve/bhyve.8	(revision 335973)
+++ head/usr.sbin/bhyve/bhyve.8	(revision 335974)
@@ -1,573 +1,596 @@
.\" Copyright (c) 2013 Peter Grehan
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\"    notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\"    notice, this list of conditions and the following disclaimer in the
.\"    documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\" .\" $FreeBSD$ .\" -.Dd Jun 11, 2018 +.Dd Jul 05, 2018 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl abehuwxACHPSWY .Oo .Fl c\~ Ns .Oo .Op Ar cpus= Ns .Ar numcpus Ns .Oc Ns .Op Ar ,sockets=n Ns .Op Ar ,cores=n Ns .Op Ar ,threads=n .Oc .Op Fl g Ar gdbport .Op Fl l Ar lpcdev Ns Op , Ns Ar conf .Op Fl m Ar memsize Ns Op Ar K|k|M|m|G|g|T|t .Op Fl p Ar vcpu:hostcpu .Op Fl s Ar slot,emulation Ns Op , Ns Ar conf .Op Fl G Ar port .Op Fl U Ar uuid .Ar vmname .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise, it is enough to run .Nm with a boot ROM of choice. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl a The guest's local APIC is configured in xAPIC mode. The xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl A Generate ACPI tables. Required for .Fx Ns /amd64 guests. .It Fl b Enable a low-level console device supported by .Fx kernels compiled with .Cd "device bvmconsole" . This option will be deprecated in a future version. .It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. The current maximum number of guest virtual CPUs is 16. If .Ar numcpus is not specified then it will be calculated from the other arguments. The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. .It Fl C Include guest memory in core file. .It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes. .It Fl g Ar gdbport For .Fx kernels compiled with .Cd "device bvmdebug" , allow a remote kernel kgdb to be relayed to the guest kernel gdb stub via a local IPv4 address and this port. This option will be deprecated in a future version. .It Fl G Ar port Start a debug server that uses the GDB protocol to export guest state to a debugger. An IPv4 TCP socket will be bound to the supplied .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If .Ar port begins with .Sq w , .Nm will pause execution at the first instruction waiting for a debugger to attach. .It Fl h Print help message and exit. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. .It Fl l Ar lpcdev Ns Op , Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Ar com1 and .Ar com2 and the boot ROM device .Ar bootrom . .It Fl m Ar memsize Ns Op Ar K|k|M|m|G|g|T|t Guest physical memory size in bytes. This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of K, M, G or T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. 
If no suffix is given, the value is assumed to be in megabytes. .Pp .Ar memsize defaults to 256M. .It Fl p Ar vcpu:hostcpu Pin guest's virtual CPU .Em vcpu to .Em hostcpu . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. .It Fl s Ar slot,emulation Ns Op , Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. .Bl -tag -width 10n .It Ar slot .Ar pcislot[:function] .Ar bus:pcislot:function .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. .It Ar emulation .Bl -tag -width 10n .It Li hostbridge | Li amd_hostbridge .Pp Provide a simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. The .Li amd_hostbridge emulation is identical but uses a PCI vendor ID of .Li AMD . .It Li passthru PCI pass-through device. .It Li virtio-net Virtio network interface. .It Li virtio-blk Virtio block storage interface. .It Li virtio-scsi Virtio SCSI interface. .It Li virtio-rnd Virtio RNG interface. .It Li virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. .It Li ahci AHCI controller attached to arbitrary devices. .It Li ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Li ahci-hd AHCI controller attached to a SATA hard-drive. .It Li e1000 Intel e82545 network interface. .It Li uart PCI 16550 serial device. .It Li lpc LPC PCI-ISA bridge with COM1 and COM2 16550 serial ports and a boot ROM. The LPC bridge emulation can only be configured on bus 0. .It Li fbuf Raw framebuffer device attached to VNC server. .It Li xhci eXtensible Host Controller Interface (xHCI) USB controller. +.It Li nvme +NVM Express (NVMe) controller. .El .It Op Ar conf This optional parameter describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Pp Network devices: .Bl -tag -width 10n .It Ar tapN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx .It Ar vmnetN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx .Pp If .Ar mac is not specified, the MAC address is derived from a fixed OUI and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .El .Pp Block storage devices: .Bl -tag -width 10n .It Pa /filename Ns Oo , Ns Ar block-device-options Oc .It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc .El .Pp The .Ar block-device-options are: .Bl -tag -width 8n .It Li nocache Open the file with .Dv O_DIRECT . .It Li direct Open the file using .Dv O_SYNC . .It Li ro Force the file to be opened read-only. .It Li sectorsize= Ns Ar logical Ns Oo / Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. .El .Pp SCSI devices: .Bl -tag -width 10n .It Pa /dev/cam/ Ns Oo , Ns Ar port and initiator_id Oc .El .Pp TTY devices: .Bl -tag -width 10n .It Li stdio Connect the serial port to the standard input and output of the .Nm process. .It Pa /dev/xxx Use the host TTY device for serial port I/O. 
.El .Pp Boot ROM device: .Bl -tag -width 10n .It Pa romfile Map .Ar romfile in the guest address space reserved for boot firmware. .El .Pp Pass-through devices: .Bl -tag -width 10n .It Ns Ar slot Ns / Ns Ar bus Ns / Ns Ar function Connect to a PCI device on the host at the selector described by .Ar slot , .Ar bus , and .Ar function numbers. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdev loader variable as described in .Xr vmm 4 . .Pp Virtio console devices: .Bl -tag -width 10n .It Li port1= Ns Pa /path/to/port1.sock Ns ,anotherport= Ns Pa ... A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet -offset 2n .It Due to lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It There is no way to use the "console port" feature, nor the console port resize at present. .It Emergency write is advertised, but no-op at present. .El .El .Pp Framebuffer devices: .Bl -tag -width 10n .It Oo rfb= Ns Oo Ar IP: Oc Ns Ar port Oc Ns Oo ,w= Ns Ar width Oc Ns Oo ,h= Ns Ar height Oc Ns Oo ,vga= Ns Ar vgaconf Oc Ns Oo Ns ,wait Oc Ns Oo ,password= Ns Ar password Oc .Bl -tag -width 8n .It Ar IP:port An .Ar IP address and a .Ar port VNC should listen on. The default is to listen on localhost IPv4 address and default VNC port 5900. Listening on an IPv6 address is not supported. .It Ar width No and Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. Minimal supported resolution is 640x480 pixels, and maximum is 1920x1200 pixels. .It Ar vgaconf Possible values for this option are .Dq io (default), .Dq on , and .Dq off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 . The default .Dq io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Dq on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Dq off option should be used for the UEFI guests that assume that VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. .It wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .El .Pp xHCI USB devices: .Bl -tag -width 10n .It Li tablet A USB tablet device which provides precise cursor synchronization when using VNC. 
+.El +.Pp +NVMe devices: +.Bl -tag -width 10n +.It Li devpath +Accepted device paths are: +.Ar /dev/blockdev +or +.Ar /path/to/image +or +.Ar ram=size_in_MiB . +.It Li maxq +Max number of queues. +.It Li qsz +Max elements in each queue. +.It Li ioslots +Max number of concurrent I/O requests. +.It Li sectsz +Sector size (defaults to blockif sector size). +.It Li ser +Serial number with maximum 20 characters. .El .El .It Fl S Wire guest memory. .It Fl u RTC keeps UTC time. .It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl x The guest's local APIC is configured in x2APIC mode. .It Fl Y Disable MPtable generation. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. .Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read by the debugger, but not written. Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp Breakpoints are not supported. .Sh SIGNAL HANDLING .Nm deals with the following signals: .Pp .Bl -tag -width indent -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 triple fault .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run. Otherwise, the boot loader is not needed. .Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -A -H -P -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -A -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device. 
.Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -A -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at: 0.0.0.0:5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr nmdm 4 , .Xr vmm 4 , .Xr ethers 5 , .Xr bhyvectl 8 , .Xr bhyveload 8 .Sh HISTORY .Nm first appeared in .Fx 10.0 . .Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org Index: head/usr.sbin/bhyve/block_if.h =================================================================== --- head/usr.sbin/bhyve/block_if.h (revision 335973) +++ head/usr.sbin/bhyve/block_if.h (revision 335974) @@ -1,72 +1,72 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * The block API to be used by bhyve block-device emulations. The routines * are thread safe, with no assumptions about the context of the completion * callback - it may occur in the caller's context, or asynchronously in * another thread. 
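 *
 * A minimal usage sketch (illustrative only; "buf" and "len" are
 * placeholders, not part of this header, and error handling is
 * omitted).  The completion routine may run in the caller's context
 * or in another thread:
 *
 *	static void
 *	rd_done(struct blockif_req *br, int err)
 *	{
 *		printf("read done: err %d resid %zd\n", err, br->br_resid);
 *	}
 *
 *	struct blockif_ctxt *bc = blockif_open("/tmp/disk.img", "test");
 *	static struct blockif_req br;
 *	br.br_iov[0].iov_base = buf;
 *	br.br_iov[0].iov_len = len;
 *	br.br_iovcnt = 1;
 *	br.br_offset = 0;
 *	br.br_callback = rd_done;
 *	blockif_read(bc, &br);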
 */

#ifndef _BLOCK_IF_H_
#define _BLOCK_IF_H_

#include <sys/uio.h>
#include <sys/unistd.h>

#define BLOCKIF_IOV_MAX		33	/* not practical to be IOV_MAX */

struct blockif_req {
-	struct iovec	br_iov[BLOCKIF_IOV_MAX];
	int		br_iovcnt;
	off_t		br_offset;
	ssize_t		br_resid;
	void		(*br_callback)(struct blockif_req *req, int err);
	void		*br_param;
+	struct iovec	br_iov[BLOCKIF_IOV_MAX];
};

struct blockif_ctxt;
struct blockif_ctxt *blockif_open(const char *optstr, const char *ident);
off_t	blockif_size(struct blockif_ctxt *bc);
void	blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
	    uint8_t *s);
int	blockif_sectsz(struct blockif_ctxt *bc);
void	blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off);
int	blockif_queuesz(struct blockif_ctxt *bc);
int	blockif_is_ro(struct blockif_ctxt *bc);
int	blockif_candelete(struct blockif_ctxt *bc);
int	blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
int	blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
int	blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
int	blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
int	blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
int	blockif_close(struct blockif_ctxt *bc);

#endif /* _BLOCK_IF_H_ */
Index: head/usr.sbin/bhyve/pci_nvme.c
===================================================================
--- head/usr.sbin/bhyve/pci_nvme.c	(nonexistent)
+++ head/usr.sbin/bhyve/pci_nvme.c	(revision 335974)
@@ -0,0 +1,1853 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017 Shunsuke Mie
+ * Copyright (c) 2018 Leon Dang
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * bhyve PCIe-NVMe device emulation.
+ * + * options: + * -s ,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z + * + * accepted devpath: + * /dev/blockdev + * /path/to/image + * ram=size_in_MiB + * + * maxq = max number of queues + * qsz = max elements in each queue + * ioslots = max number of concurrent io requests + * sectsz = sector size (defaults to blockif sector size) + * ser = serial number (20-chars max) + * + */ + +/* TODO: + - create async event for smart and log + - intr coalesce + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "bhyverun.h" +#include "block_if.h" +#include "pci_emul.h" + + +static int nvme_debug = 0; +#define DPRINTF(params) if (nvme_debug) printf params +#define WPRINTF(params) printf params + +/* defaults; can be overridden */ +#define NVME_MSIX_BAR 4 + +#define NVME_IOSLOTS 8 + +#define NVME_QUEUES 16 +#define NVME_MAX_QENTRIES 2048 + +#define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) +#define NVME_MAX_BLOCKIOVS 512 + +/* helpers */ + +#define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) + +enum nvme_controller_register_offsets { + NVME_CR_CAP_LOW = 0x00, + NVME_CR_CAP_HI = 0x04, + NVME_CR_VS = 0x08, + NVME_CR_INTMS = 0x0c, + NVME_CR_INTMC = 0x10, + NVME_CR_CC = 0x14, + NVME_CR_CSTS = 0x1c, + NVME_CR_NSSR = 0x20, + NVME_CR_AQA = 0x24, + NVME_CR_ASQ_LOW = 0x28, + NVME_CR_ASQ_HI = 0x2c, + NVME_CR_ACQ_LOW = 0x30, + NVME_CR_ACQ_HI = 0x34, +}; + +enum nvme_cmd_cdw11 { + NVME_CMD_CDW11_PC = 0x0001, + NVME_CMD_CDW11_IEN = 0x0002, + NVME_CMD_CDW11_IV = 0xFFFF0000, +}; + +#define NVME_CMD_GET_OPC(opc) \ + ((opc) >> NVME_CMD_OPC_SHIFT & NVME_CMD_OPC_MASK) + +#define NVME_CQ_INTEN 0x01 +#define NVME_CQ_INTCOAL 0x02 + +struct nvme_completion_queue { + struct nvme_completion *qbase; + uint32_t size; + uint16_t tail; /* nvme progress */ + uint16_t head; /* guest progress */ + uint16_t intr_vec; + uint32_t intr_en; + pthread_mutex_t mtx; +}; + +struct nvme_submission_queue { + struct nvme_command *qbase; + uint32_t size; + uint16_t head; /* nvme progress */ + uint16_t tail; /* guest progress */ + uint16_t cqid; /* completion queue id */ + int busy; /* queue is being processed */ + int qpriority; +}; + +enum nvme_storage_type { + NVME_STOR_BLOCKIF = 0, + NVME_STOR_RAM = 1, +}; + +struct pci_nvme_blockstore { + enum nvme_storage_type type; + void *ctx; + uint64_t size; + uint32_t sectsz; + uint32_t sectsz_bits; +}; + +struct pci_nvme_ioreq { + struct pci_nvme_softc *sc; + struct pci_nvme_ioreq *next; + struct nvme_submission_queue *nvme_sq; + uint16_t sqid; + + /* command information */ + uint16_t opc; + uint16_t cid; + uint32_t nsid; + + uint64_t prev_gpaddr; + size_t prev_size; + + /* + * lock if all iovs consumed (big IO); + * complete transaction before continuing + */ + pthread_mutex_t mtx; + pthread_cond_t cv; + + struct blockif_req io_req; + + /* pad to fit up to 512 page descriptors from guest IO request */ + struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; +}; + +struct pci_nvme_softc { + struct pci_devinst *nsc_pi; + + pthread_mutex_t mtx; + + struct nvme_registers regs; + + struct nvme_namespace_data nsdata; + struct nvme_controller_data ctrldata; + + struct pci_nvme_blockstore nvstore; + + uint16_t max_qentries; /* max entries per queue */ + uint32_t max_queues; + uint32_t num_cqueues; + uint32_t num_squeues; + + struct pci_nvme_ioreq *ioreqs; + struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */ + 
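	/*
	 * Count of in-flight blockif requests; after an enable, CSTS.RDY
	 * is deferred until this drains to zero (see pci_nvme_release_ioreq).
	 */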
uint32_t pending_ios; + uint32_t ioslots; + sem_t iosemlock; + + /* status and guest memory mapped queues */ + struct nvme_completion_queue *compl_queues; + struct nvme_submission_queue *submit_queues; + + /* controller features */ + uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ + uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ + uint32_t async_ev_config; /* 0x0B: async event config */ +}; + + +static void pci_nvme_io_partial(struct blockif_req *br, int err); + +/* Controller Configuration utils */ +#define NVME_CC_GET_EN(cc) \ + ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) +#define NVME_CC_GET_CSS(cc) \ + ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) +#define NVME_CC_GET_SHN(cc) \ + ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) +#define NVME_CC_GET_IOSQES(cc) \ + ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) +#define NVME_CC_GET_IOCQES(cc) \ + ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) + +#define NVME_CC_WRITE_MASK \ + ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ + (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ + (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) + +#define NVME_CC_NEN_WRITE_MASK \ + ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ + (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ + (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) + +/* Controller Status utils */ +#define NVME_CSTS_GET_RDY(sts) \ + ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) + +#define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) + +/* Completion Queue status word utils */ +#define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) +#define NVME_STATUS_MASK \ + ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ + (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) + +static __inline void +pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) +{ + + *status &= ~NVME_STATUS_MASK; + *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | + (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; +} + +static __inline void +pci_nvme_status_genc(uint16_t *status, uint16_t code) +{ + + pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); +} + +static __inline void +pci_nvme_toggle_phase(uint16_t *status, int prev) +{ + + if (prev) + *status &= ~NVME_STATUS_P; + else + *status |= NVME_STATUS_P; +} + +static void +pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) +{ + struct nvme_controller_data *cd = &sc->ctrldata; + + cd->vid = 0xFB5D; + cd->ssvid = 0x0000; + + cd->mn[0] = 'b'; + cd->mn[1] = 'h'; + cd->mn[2] = 'y'; + cd->mn[3] = 'v'; + cd->mn[4] = 'e'; + cd->mn[5] = '-'; + cd->mn[6] = 'N'; + cd->mn[7] = 'V'; + cd->mn[8] = 'M'; + cd->mn[9] = 'e'; + + cd->fr[0] = '1'; + cd->fr[1] = '.'; + cd->fr[2] = '0'; + + /* Num of submission commands that we can handle at a time (2^rab) */ + cd->rab = 4; + + /* FreeBSD OUI */ + cd->ieee[0] = 0x58; + cd->ieee[1] = 0x9c; + cd->ieee[2] = 0xfc; + + cd->mic = 0; + + cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ + + cd->ver = 0x00010300; + + cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; + cd->acl = 2; + cd->aerl = 4; + + cd->lpa = 0; /* TODO: support some simple things like SMART */ + cd->elpe = 0; /* max error log page entries */ + cd->npss = 1; /* number of power states support */ + + /* Warning Composite Temperature Threshold */ + cd->wctemp = 0x0157; + + cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | + (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); + cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | + (4 << 
NVME_CTRLR_DATA_CQES_MIN_SHIFT);
+	cd->nn = 1;	/* number of namespaces */
+
+	cd->fna = 0x03;
+
+	cd->power_state[0].mp = 10;
+}
+
+static void
+pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
+{
+	struct nvme_namespace_data *nd;
+
+	nd = &sc->nsdata;
+
+	nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
+	nd->ncap = nd->nsze;
+	nd->nuse = nd->nsze;
+
+	/* Get LBA and backstore information from backing store */
+	nd->nlbaf = 1;
+	/* LBA data-sz = 2^lbads */
+	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
+
+	nd->flbas = 0;
+}
+
+static void
+pci_nvme_reset(struct pci_nvme_softc *sc)
+{
+	DPRINTF(("%s\r\n", __func__));
+
+	sc->regs.cap_lo = (sc->max_qentries & NVME_CAP_LO_REG_MQES_MASK) |
+	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
+	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
+
+	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
+
+	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
+
+	sc->regs.cc = 0;
+	sc->regs.csts = 0;
+
+	if (sc->submit_queues != NULL) {
+		pthread_mutex_lock(&sc->mtx);
+		sc->num_cqueues = sc->num_squeues = sc->max_queues;
+
+		for (int i = 0; i <= sc->max_queues; i++) {
+			/*
+			 * The Admin Submission Queue is at index 0.
+			 * It must not be changed at reset otherwise the
+			 * emulation will be out of sync with the guest.
+			 */
+			if (i != 0) {
+				sc->submit_queues[i].qbase = NULL;
+				sc->submit_queues[i].size = 0;
+				sc->submit_queues[i].cqid = 0;
+
+				sc->compl_queues[i].qbase = NULL;
+				sc->compl_queues[i].size = 0;
+			}
+			sc->submit_queues[i].tail = 0;
+			sc->submit_queues[i].head = 0;
+			sc->submit_queues[i].busy = 0;
+
+			sc->compl_queues[i].tail = 0;
+			sc->compl_queues[i].head = 0;
+		}
+
+		pthread_mutex_unlock(&sc->mtx);
+	} else
+		sc->submit_queues = calloc(sc->max_queues + 1,
+		    sizeof(struct nvme_submission_queue));
+
+	if (sc->compl_queues == NULL) {
+		sc->compl_queues = calloc(sc->max_queues + 1,
+		    sizeof(struct nvme_completion_queue));
+
+		for (int i = 0; i <= sc->num_cqueues; i++)
+			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
+	}
+}
+
+static void
+pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
+{
+	uint16_t acqs, asqs;
+
+	DPRINTF(("%s\r\n", __func__));
+
+	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
+	sc->submit_queues[0].size = asqs;
+	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
+	    sizeof(struct nvme_command) * asqs);
+
+	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
+	    __func__, sc->regs.asq, sc->submit_queues[0].qbase));
+
+	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
+	    NVME_AQA_REG_ACQS_MASK) + 1;
+	sc->compl_queues[0].size = acqs;
+	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
+	    sizeof(struct nvme_completion) * acqs);
+	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
+	    __func__, sc->regs.acq, sc->compl_queues[0].qbase));
+}
+
+static int
+nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	uint16_t qid = command->cdw10 & 0xffff;
+
+	DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
+	if (qid == 0 || qid > sc->num_squeues) {
+		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
+		    __func__, qid, sc->num_squeues));
+		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+		return (1);
+	}
+
+	sc->submit_queues[qid].qbase = NULL;
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	return (1);
+}
+
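/*
 * Create I/O Submission Queue: only physically contiguous (CDW11.PC = 1)
 * queues are supported, which lets the guest ring be mapped with a single
 * vm_map_gpa() call below.
 */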
+static int
+nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	if (command->cdw11 & NVME_CMD_CDW11_PC) {
+		uint16_t qid = command->cdw10 & 0xffff;
+		struct nvme_submission_queue *nsq;
+
+		if (qid > sc->num_squeues) {
+			WPRINTF(("%s queue index %u > num_squeues %u\r\n",
+			    __func__, qid, sc->num_squeues));
+			pci_nvme_status_tc(&compl->status,
+			    NVME_SCT_COMMAND_SPECIFIC,
+			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+			return (1);
+		}
+
+		nsq = &sc->submit_queues[qid];
+		nsq->size = ((command->cdw10 >> 16) & 0xffff) + 1;
+
+		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+		    sizeof(struct nvme_command) * (size_t)nsq->size);
+		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
+		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
+
+		DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
+		    qid, nsq->size, nsq->qbase, nsq->cqid));
+
+		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+
+		DPRINTF(("%s completed creating IOSQ qid %u\r\n",
+		    __func__, qid));
+	} else {
+		/*
+		 * Guest sent non-contig submission queue request.
+		 * This setting is unsupported by this emulation.
+		 */
+		WPRINTF(("%s unsupported non-contig (list-based) "
+		    "create i/o submission queue\r\n", __func__));
+
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+	}
+	return (1);
+}
+
+static int
+nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	uint16_t qid = command->cdw10 & 0xffff;
+
+	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
+	if (qid == 0 || qid > sc->num_cqueues) {
+		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
+		    __func__, qid, sc->num_cqueues));
+		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+		return (1);
+	}
+
+	sc->compl_queues[qid].qbase = NULL;
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	return (1);
+}
+
+static int
+nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	if (command->cdw11 & NVME_CMD_CDW11_PC) {
+		uint16_t qid = command->cdw10 & 0xffff;
+		struct nvme_completion_queue *ncq;
+
+		if (qid > sc->num_cqueues) {
+			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
+			    __func__, qid, sc->num_cqueues));
+			pci_nvme_status_tc(&compl->status,
+			    NVME_SCT_COMMAND_SPECIFIC,
+			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+			return (1);
+		}
+
+		ncq = &sc->compl_queues[qid];
+		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
+		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
+		ncq->size = ((command->cdw10 >> 16) & 0xffff) + 1;
+
+		/* completion queue entries are struct nvme_completion */
+		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
+		    command->prp1,
+		    sizeof(struct nvme_completion) * (size_t)ncq->size);
+
+		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	} else {
+		/*
+		 * Non-contig completion queue unsupported.
+		 */
+		WPRINTF(("%s unsupported non-contig (list-based) "
+		    "create i/o completion queue\r\n",
+		    __func__));
+
+		/* 0x12 = Invalid Use of Controller Memory Buffer */
+		pci_nvme_status_genc(&compl->status, 0x12);
+	}
+
+	return (1);
+}
+
+static int
+nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	/* NUMD in CDW10 is a zero-based dword (4-byte) count */
+	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4;
+	uint8_t logpage = command->cdw10 & 0xFF;
+	void *data = NULL;
+
+	DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
+
+	if (logpage >= 1 && logpage <= 3)
+		data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+		    PAGE_SIZE);
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+
+	switch (logpage) {
+	case 0x01: /* Error information */
+		memset(data, 0, logsize > PAGE_SIZE ?
PAGE_SIZE : logsize); + break; + case 0x02: /* SMART/Health information */ + /* TODO: present some smart info */ + memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize); + break; + case 0x03: /* Firmware slot information */ + memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize); + break; + default: + WPRINTF(("%s get log page %x command not supported\r\n", + __func__, logpage)); + + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_LOG_PAGE); + } + + return (1); +} + +static int +nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + void *dest; + + DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__, + command->cdw10 & 0xFF, command->nsid)); + + switch (command->cdw10 & 0xFF) { + case 0x00: /* return Identify Namespace data structure */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(sc->nsdata)); + memcpy(dest, &sc->nsdata, sizeof(sc->nsdata)); + break; + case 0x01: /* return Identify Controller data structure */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(sc->ctrldata)); + memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata)); + break; + case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(uint32_t) * 1024); + ((uint32_t *)dest)[0] = 1; + ((uint32_t *)dest)[1] = 0; + break; + case 0x11: + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return (1); + case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ + case 0x10: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + default: + DPRINTF(("%s unsupported identify command requested 0x%x\r\n", + __func__, command->cdw10 & 0xFF)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0x0F; + uint32_t iv; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + sc->num_squeues = command->cdw11 & 0xFFFF; + sc->num_cqueues = (command->cdw11 >> 16) & 0xFFFF; + DPRINTF((" number of queues (submit %u, completion %u)\r\n", + sc->num_squeues, sc->num_cqueues)); + + if (sc->num_squeues == 0 || sc->num_squeues > sc->max_queues) + sc->num_squeues = sc->max_queues; + if (sc->num_cqueues == 0 || sc->num_cqueues > sc->max_queues) + sc->num_cqueues = sc->max_queues; + + compl->cdw0 = (sc->num_squeues & 0xFFFF) | + ((sc->num_cqueues & 0xFFFF) << 16); + + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); + + /* in uS */ + sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; + + sc->intr_coales_aggr_thresh = 
command->cdw11 & 0xFF; + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + iv = command->cdw11 & 0xFFFF; + + DPRINTF((" interrupt vector configuration 0x%x\r\n", + command->cdw11)); + + for (uint32_t i = 0; i <= sc->num_cqueues; i++) { + if (sc->compl_queues[i].intr_vec == iv) { + if (command->cdw11 & (1 << 16)) + sc->compl_queues[i].intr_en |= + NVME_CQ_INTCOAL; + else + sc->compl_queues[i].intr_en &= + ~NVME_CQ_INTCOAL; + } + } + break; + case NVME_FEAT_WRITE_ATOMICITY: + DPRINTF((" write atomicity 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + DPRINTF((" async event configuration 0x%x\r\n", + command->cdw11)); + sc->async_ev_config = command->cdw11; + break; + case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: + DPRINTF((" software progress marker 0x%x\r\n", + command->cdw11)); + break; + case 0x0C: + DPRINTF((" autonomous power state transition 0x%x\r\n", + command->cdw11)); + break; + default: + WPRINTF(("%s invalid feature\r\n", __func__)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0x0F; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration\r\n")); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management\r\n")); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range\r\n")); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold\r\n")); + switch ((command->cdw11 >> 20) & 0x3) { + case 0: + /* Over temp threshold */ + compl->cdw0 = 0xFFFF; + break; + case 1: + /* Under temp threshold */ + compl->cdw0 = 0; + break; + default: + WPRINTF((" invalid threshold type select\r\n")); + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_FIELD); + return (1); + } + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery\r\n")); + break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache\r\n")); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + compl->cdw0 = 0; + if (sc->num_squeues == 0) + compl->cdw0 |= sc->max_queues & 0xFFFF; + else + compl->cdw0 |= sc->num_squeues & 0xFFFF; + + if (sc->num_cqueues == 0) + compl->cdw0 |= (sc->max_queues & 0xFFFF) << 16; + else + compl->cdw0 |= (sc->num_cqueues & 0xFFFF) << 16; + + DPRINTF((" number of queues (submit %u, completion %u)\r\n", + compl->cdw0 & 0xFFFF, + (compl->cdw0 >> 16) & 0xFFFF)); + + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing\r\n")); + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + DPRINTF((" interrupt vector configuration\r\n")); + break; + case NVME_FEAT_WRITE_ATOMICITY: + DPRINTF((" write atomicity\r\n")); + break; + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + DPRINTF((" async event configuration\r\n")); + sc->async_ev_config = command->cdw11; + break; + case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: + DPRINTF((" software progress marker\r\n")); + break; + case 0x0C: + DPRINTF((" autonomous power state transition\r\n")); + break; + default: + WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_abort(struct 
pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__, + command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); + + /* TODO: search for the command ID and abort it */ + + compl->cdw0 = 1; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_async_event_req(struct pci_nvme_softc* sc, + struct nvme_command* command, struct nvme_completion* compl) +{ + DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11)); + + /* + * TODO: raise events when they happen based on the Set Features cmd. + * These events happen async, so only set completion successful if + * there is an event reflective of the request to get event. + */ + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); + return (0); +} + +static void +pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) +{ + struct nvme_completion compl; + struct nvme_command *cmd; + struct nvme_submission_queue *sq; + struct nvme_completion_queue *cq; + int do_intr = 0; + uint16_t sqhead; + + DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value)); + + sq = &sc->submit_queues[0]; + + sqhead = atomic_load_acq_short(&sq->head); + + if (atomic_testandset_int(&sq->busy, 1)) { + DPRINTF(("%s SQ busy, head %u, tail %u\r\n", + __func__, sqhead, sq->tail)); + return; + } + + DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail)); + + while (sqhead != atomic_load_acq_short(&sq->tail)) { + cmd = &(sq->qbase)[sqhead]; + compl.status = 0; + + switch (NVME_CMD_GET_OPC(cmd->opc_fuse)) { + case NVME_OPC_DELETE_IO_SQ: + DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_SQ: + DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_DELETE_IO_CQ: + DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_CQ: + DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_GET_LOG_PAGE: + DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__)); + do_intr |= nvme_opc_get_log_page(sc, cmd, &compl); + break; + case NVME_OPC_IDENTIFY: + DPRINTF(("%s command IDENTIFY\r\n", __func__)); + do_intr |= nvme_opc_identify(sc, cmd, &compl); + break; + case NVME_OPC_ABORT: + DPRINTF(("%s command ABORT\r\n", __func__)); + do_intr |= nvme_opc_abort(sc, cmd, &compl); + break; + case NVME_OPC_SET_FEATURES: + DPRINTF(("%s command SET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_set_features(sc, cmd, &compl); + break; + case NVME_OPC_GET_FEATURES: + DPRINTF(("%s command GET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_get_features(sc, cmd, &compl); + break; + case NVME_OPC_ASYNC_EVENT_REQUEST: + DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__)); + /* XXX dont care, unhandled for now + do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); + */ + break; + default: + WPRINTF(("0x%x command is not implemented\r\n", + NVME_CMD_GET_OPC(cmd->opc_fuse))); + } + + /* for now skip async event generation */ + if (NVME_CMD_GET_OPC(cmd->opc_fuse) != + NVME_OPC_ASYNC_EVENT_REQUEST) { + struct nvme_completion *cp; + int phase; + + cq = &sc->compl_queues[0]; + + cp = &(cq->qbase)[cq->tail]; + cp->sqid = 0; + cp->sqhd = sqhead; + cp->cid = cmd->cid; + + phase = 
NVME_STATUS_GET_P(cp->status); + cp->status = compl.status; + pci_nvme_toggle_phase(&cp->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + } + sqhead = (sqhead + 1) % sq->size; + } + + DPRINTF(("setting sqhead %u\r\n", sqhead)); + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); + + if (do_intr) + pci_generate_msix(sc->nsc_pi, 0); + +} + +static int +pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, + uint64_t gpaddr, size_t size, int do_write, uint64_t lba) +{ + int iovidx; + + if (req != NULL) { + /* concatenate contig block-iovs to minimize number of iovs */ + if ((req->prev_gpaddr + req->prev_size) == gpaddr) { + iovidx = req->io_req.br_iovcnt - 1; + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + req->prev_gpaddr, size); + + req->prev_size += size; + req->io_req.br_resid += size; + + req->io_req.br_iov[iovidx].iov_len = req->prev_size; + } else { + pthread_mutex_lock(&req->mtx); + + iovidx = req->io_req.br_iovcnt; + if (iovidx == NVME_MAX_BLOCKIOVS) { + int err = 0; + + DPRINTF(("large I/O, doing partial req\r\n")); + + iovidx = 0; + req->io_req.br_iovcnt = 0; + + req->io_req.br_callback = pci_nvme_io_partial; + + if (!do_write) + err = blockif_read(sc->nvstore.ctx, + &req->io_req); + else + err = blockif_write(sc->nvstore.ctx, + &req->io_req); + + /* wait until req completes before cont */ + if (err == 0) + pthread_cond_wait(&req->cv, &req->mtx); + } + if (iovidx == 0) { + req->io_req.br_offset = lba; + req->io_req.br_resid = 0; + req->io_req.br_param = req; + } + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + gpaddr, size); + + req->io_req.br_iov[iovidx].iov_len = size; + + req->prev_gpaddr = gpaddr; + req->prev_size = size; + req->io_req.br_resid += size; + + req->io_req.br_iovcnt++; + + pthread_mutex_unlock(&req->mtx); + } + } else { + /* RAM buffer: read/write directly */ + void *p = sc->nvstore.ctx; + void *gptr; + + if ((lba + size) > sc->nvstore.size) { + WPRINTF(("%s write would overflow RAM\r\n", __func__)); + return (-1); + } + + p = (void *)((uintptr_t)p + (uintptr_t)lba); + gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); + if (do_write) + memcpy(p, gptr, size); + else + memcpy(gptr, p, size); + } + return (0); +} + +static void +pci_nvme_set_completion(struct pci_nvme_softc *sc, + struct nvme_submission_queue *sq, int sqid, uint16_t cid, + uint32_t cdw0, uint16_t status, int ignore_busy) +{ + struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; + struct nvme_completion *compl; + int do_intr = 0; + int phase; + + DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n", + __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), + NVME_STATUS_GET_SC(status))); + + pthread_mutex_lock(&cq->mtx); + + assert(cq->qbase != NULL); + + compl = &cq->qbase[cq->tail]; + + compl->sqhd = atomic_load_acq_short(&sq->head); + compl->sqid = sqid; + compl->cid = cid; + + // toggle phase + phase = NVME_STATUS_GET_P(compl->status); + compl->status = status; + pci_nvme_toggle_phase(&compl->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + + if (cq->intr_en & NVME_CQ_INTEN) + do_intr = 1; + + pthread_mutex_unlock(&cq->mtx); + + if (ignore_busy || !atomic_load_acq_int(&sq->busy)) + if (do_intr) + pci_generate_msix(sc->nsc_pi, cq->intr_vec); +} + +static void +pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) +{ + req->sc = NULL; + req->nvme_sq = NULL; + req->sqid = 0; + + 
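	/*
	 * Return the slot to the free list and drop the pending-I/O count
	 * under the softc lock; the semaphore post below wakes any thread
	 * blocked in pci_nvme_get_ioreq().
	 */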
pthread_mutex_lock(&sc->mtx);
+
+	req->next = sc->ioreqs_free;
+	sc->ioreqs_free = req;
+	sc->pending_ios--;
+
+	/* when no more IO pending, can set to ready if device reset/enabled */
+	if (sc->pending_ios == 0 &&
+	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
+		sc->regs.csts |= NVME_CSTS_RDY;
+
+	pthread_mutex_unlock(&sc->mtx);
+
+	sem_post(&sc->iosemlock);
+}
+
+static struct pci_nvme_ioreq *
+pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
+{
+	struct pci_nvme_ioreq *req = NULL;
+
+	sem_wait(&sc->iosemlock);
+	pthread_mutex_lock(&sc->mtx);
+
+	req = sc->ioreqs_free;
+	assert(req != NULL);
+
+	sc->ioreqs_free = req->next;
+
+	req->next = NULL;
+	req->sc = sc;
+
+	sc->pending_ios++;
+
+	pthread_mutex_unlock(&sc->mtx);
+
+	req->io_req.br_iovcnt = 0;
+	req->io_req.br_offset = 0;
+	req->io_req.br_resid = 0;
+	req->io_req.br_param = req;
+	req->prev_gpaddr = 0;
+	req->prev_size = 0;
+
+	return (req);
+}
+
+static void
+pci_nvme_io_done(struct blockif_req *br, int err)
+{
+	struct pci_nvme_ioreq *req = br->br_param;
+	struct nvme_submission_queue *sq = req->nvme_sq;
+	uint16_t code, status;
+
+	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
+
+	/* TODO return correct error */
+	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
+	pci_nvme_status_genc(&status, code);
+
+	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
+	pci_nvme_release_ioreq(req->sc, req);
+}
+
+static void
+pci_nvme_io_partial(struct blockif_req *br, int err)
+{
+	struct pci_nvme_ioreq *req = br->br_param;
+
+	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
+
+	pthread_cond_signal(&req->cv);
+}
+
+
+static void
+pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
+{
+	struct nvme_submission_queue *sq;
+	uint16_t status;
+	uint16_t sqhead;
+	int err;
+
+	/* handle all submissions up to sq->tail index */
+	sq = &sc->submit_queues[idx];
+
+	if (atomic_testandset_int(&sq->busy, 1)) {
+		DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
+		return;
+	}
+
+	sqhead = atomic_load_acq_short(&sq->head);
+
+	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
+	    idx, sqhead, sq->tail, sq->qbase));
+
+	while (sqhead != atomic_load_acq_short(&sq->tail)) {
+		struct nvme_command *cmd;
+		struct pci_nvme_ioreq *req = NULL;
+		uint64_t lba;
+		uint64_t nblocks, bytes, size, cpsz;
+
+		/* TODO: support scatter gather list handling */
+
+		cmd = &sq->qbase[sqhead];
+		sqhead = (sqhead + 1) % sq->size;
+
+		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
+
+		if (NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_FLUSH) {
+			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
+			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
+			    status, 1);
+
+			continue;
+		} else if (NVME_CMD_GET_OPC(cmd->opc_fuse) == 0x08) {
+			/* TODO: write zeroes */
+			WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
+			    __func__, lba, cmd->cdw12 & 0xFFFF));
+			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
+			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
+			    status, 1);
+
+			continue;
+		}
+
+		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
+
+		bytes = nblocks * sc->nvstore.sectsz;
+
+		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
+			req = pci_nvme_get_ioreq(sc);
+			req->nvme_sq = sq;
+			req->sqid = idx;
+		}
+
+		/*
+		 * If data starts mid-page and flows into the next page, then
+		 * increase page count
+		 */
+
+		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
+		    "(%lu-bytes)\r\n",
+		    sqhead==0 ?
sq->size-1 : sqhead-1, sq->tail, sq->size, + NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_WRITE ? + "WRITE" : "READ", + lba, nblocks, bytes)); + + cmd->prp1 &= ~(0x03UL); + cmd->prp2 &= ~(0x03UL); + + DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2)); + + size = bytes; + lba *= sc->nvstore.sectsz; + + cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); + + if (cpsz > bytes) + cpsz = bytes; + + if (req != NULL) { + req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | + cmd->cdw10; + req->opc = NVME_CMD_GET_OPC(cmd->opc_fuse); + req->cid = cmd->cid; + req->nsid = cmd->nsid; + } + + err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, + NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_WRITE, lba); + lba += cpsz; + size -= cpsz; + + if (size == 0) + goto iodone; + + if (size <= PAGE_SIZE) { + /* prp2 is second (and final) page in transfer */ + + err = pci_nvme_append_iov_req(sc, req, cmd->prp2, + size, + NVME_CMD_GET_OPC(cmd->opc_fuse) == NVME_OPC_WRITE, + lba); + } else { + uint64_t *prp_list; + int i; + + /* prp2 is pointer to a physical region page list */ + prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, + cmd->prp2, PAGE_SIZE); + + i = 0; + while (size != 0) { + cpsz = MIN(size, PAGE_SIZE); + + /* + * Move to linked physical region page list + * in last item. + */ + if (i == (NVME_PRP2_ITEMS-1) && + size > PAGE_SIZE) { + assert((prp_list[i] & (PAGE_SIZE-1)) == 0); + prp_list = paddr_guest2host( + sc->nsc_pi->pi_vmctx, + prp_list[i], PAGE_SIZE); + i = 0; + } + if (prp_list[i] == 0) { + WPRINTF(("PRP2[%d] = 0 !!!\r\n", i)); + err = 1; + break; + } + + err = pci_nvme_append_iov_req(sc, req, + prp_list[i], cpsz, + NVME_CMD_GET_OPC(cmd->opc_fuse) == + NVME_OPC_WRITE, lba); + if (err) + break; + + lba += cpsz; + size -= cpsz; + i++; + } + } + +iodone: + if (sc->nvstore.type == NVME_STOR_RAM) { + uint16_t code, status; + + code = err ? NVME_SC_LBA_OUT_OF_RANGE : + NVME_SC_SUCCESS; + pci_nvme_status_genc(&status, code); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } + + + if (err) + goto do_error; + + req->io_req.br_callback = pci_nvme_io_done; + + err = 0; + switch (NVME_CMD_GET_OPC(cmd->opc_fuse)) { + case NVME_OPC_READ: + err = blockif_read(sc->nvstore.ctx, &req->io_req); + break; + case NVME_OPC_WRITE: + err = blockif_write(sc->nvstore.ctx, &req->io_req); + break; + default: + WPRINTF(("%s unhandled io command 0x%x\r\n", + __func__, NVME_CMD_GET_OPC(cmd->opc_fuse))); + err = 1; + } + +do_error: + if (err) { + uint16_t status; + + pci_nvme_status_genc(&status, + NVME_SC_DATA_TRANSFER_ERROR); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + pci_nvme_release_ioreq(sc, req); + } + } + + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); +} + +static void +pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t idx, int is_sq, uint64_t value) +{ + DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n", + idx, is_sq ? 
"SQ" : "CQ", value & 0xFFFF)); + + if (is_sq) { + atomic_store_short(&sc->submit_queues[idx].tail, + (uint16_t)value); + + if (idx == 0) { + pci_nvme_handle_admin_cmd(sc, value); + } else { + /* submission queue; handle new entries in SQ */ + if (idx > sc->num_squeues) { + WPRINTF(("%s SQ index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_squeues)); + return; + } + pci_nvme_handle_io_cmd(sc, (uint16_t)idx); + } + } else { + if (idx > sc->num_cqueues) { + WPRINTF(("%s queue index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_cqueues)); + return; + } + + sc->compl_queues[idx].head = (uint16_t)value; + } +} + +static void +pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) +{ + const char *s = iswrite ? "WRITE" : "READ"; + + switch (offset) { + case NVME_CR_CAP_LOW: + DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s)); + break; + case NVME_CR_CAP_HI: + DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s)); + break; + case NVME_CR_VS: + DPRINTF(("%s %s NVME_CR_VS\r\n", func, s)); + break; + case NVME_CR_INTMS: + DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s)); + break; + case NVME_CR_INTMC: + DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s)); + break; + case NVME_CR_CC: + DPRINTF(("%s %s NVME_CR_CC\r\n", func, s)); + break; + case NVME_CR_CSTS: + DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s)); + break; + case NVME_CR_NSSR: + DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s)); + break; + case NVME_CR_AQA: + DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s)); + break; + case NVME_CR_ASQ_LOW: + DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s)); + break; + case NVME_CR_ASQ_HI: + DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s)); + break; + case NVME_CR_ACQ_LOW: + DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s)); + break; + case NVME_CR_ACQ_HI: + DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s)); + break; + default: + DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset)); + } + +} + +static void +pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t offset, int size, uint64_t value) +{ + uint32_t ccreg; + + if (offset >= NVME_DOORBELL_OFFSET) { + uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; + uint64_t idx = belloffset / 8; /* door bell size = 2*int */ + int is_sq = (belloffset % 8) < 4; + + if (belloffset > ((sc->max_queues+1) * 8 - 4)) { + WPRINTF(("guest attempted an overflow write offset " + "0x%lx, val 0x%lx in %s", + offset, value, __func__)); + return; + } + + pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); + return; + } + + DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n", + offset, size, value)); + + if (size != 4) { + WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " + "val 0x%lx) to bar0 in %s", + size, offset, value, __func__)); + /* TODO: shutdown device */ + return; + } + + pci_nvme_bar0_reg_dumps(__func__, offset, 1); + + pthread_mutex_lock(&sc->mtx); + + switch (offset) { + case NVME_CR_CAP_LOW: + case NVME_CR_CAP_HI: + /* readonly */ + break; + case NVME_CR_VS: + /* readonly */ + break; + case NVME_CR_INTMS: + /* MSI-X, so ignore */ + break; + case NVME_CR_INTMC: + /* MSI-X, so ignore */ + break; + case NVME_CR_CC: + ccreg = (uint32_t)value; + + DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u " + "iocqes %u\r\n", + __func__, + NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), + NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), + NVME_CC_GET_IOCQES(ccreg))); + + if (NVME_CC_GET_SHN(ccreg)) { + /* perform shutdown - flush out data to backend */ + sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << + 
NVME_CSTS_REG_SHST_SHIFT);
+			sc->regs.csts |= NVME_SHST_COMPLETE <<
+			    NVME_CSTS_REG_SHST_SHIFT;
+		}
+		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
+			if (NVME_CC_GET_EN(ccreg) == 0)
+				/* transition 1->0 causes controller reset */
+				pci_nvme_reset(sc);
+			else
+				pci_nvme_init_controller(ctx, sc);
+		}
+
+		/* Insert the iocqes, iosqes and en bits from the write */
+		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
+		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
+		if (NVME_CC_GET_EN(ccreg) == 0) {
+			/* Insert the ams, mps and css bit fields */
+			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
+			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
+			sc->regs.csts &= ~NVME_CSTS_RDY;
+		} else if (sc->pending_ios == 0) {
+			sc->regs.csts |= NVME_CSTS_RDY;
+		}
+		break;
+	case NVME_CR_CSTS:
+		break;
+	case NVME_CR_NSSR:
+		/* ignore writes; don't support subsystem reset */
+		break;
+	case NVME_CR_AQA:
+		sc->regs.aqa = (uint32_t)value;
+		break;
+	case NVME_CR_ASQ_LOW:
+		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
+		    (0xFFFFF000 & value);
+		break;
+	case NVME_CR_ASQ_HI:
+		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
+		    (value << 32);
+		break;
+	case NVME_CR_ACQ_LOW:
+		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
+		    (0xFFFFF000 & value);
+		break;
+	case NVME_CR_ACQ_HI:
+		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
+		    (value << 32);
+		break;
+	default:
+		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
+		    __func__, offset, value, size));
+	}
+	pthread_mutex_unlock(&sc->mtx);
+}
+
+static void
+pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+    int baridx, uint64_t offset, int size, uint64_t value)
+{
+	struct pci_nvme_softc* sc = pi->pi_arg;
+
+	if (baridx == pci_msix_table_bar(pi) ||
+	    baridx == pci_msix_pba_bar(pi)) {
+		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
+		    "value 0x%lx\r\n", baridx, offset, size, value));
+
+		pci_emul_msix_twrite(pi, offset, size, value);
+		return;
+	}
+
+	switch (baridx) {
+	case 0:
+		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
+		break;
+
+	default:
+		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
+		    __func__, baridx, value));
+	}
+}
+
+static uint64_t
+pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
+{
+	uint64_t value;
+
+	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
+
+	if (offset < NVME_DOORBELL_OFFSET) {
+		void *p = &(sc->regs);
+		pthread_mutex_lock(&sc->mtx);
+		memcpy(&value, (void *)((uintptr_t)p + offset), size);
+		pthread_mutex_unlock(&sc->mtx);
+	} else {
+		value = 0;
+		WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
+	}
+
+	switch (size) {
+	case 1:
+		value &= 0xFF;
+		break;
+	case 2:
+		value &= 0xFFFF;
+		break;
+	case 4:
+		value &= 0xFFFFFFFF;
+		break;
+	}
+
+	DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
+	    offset, size, (uint32_t)value));
+
+	return (value);
+}
+
+static uint64_t
+pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+    uint64_t offset, int size)
+{
+	struct pci_nvme_softc* sc = pi->pi_arg;
+
+	if (baridx == pci_msix_table_bar(pi) ||
+	    baridx == pci_msix_pba_bar(pi)) {
+		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
+		    baridx, offset, size));
+
+		return pci_emul_msix_tread(pi, offset, size);
+	}
+
+	switch (baridx) {
+	case 0:
+		return pci_nvme_read_bar_0(sc, offset, size);
+
+	default:
+		DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
+	}
+
+	return (0);
+}
+
+static int
+pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
+{
+	char bident[sizeof("XX:X:X")];
+	char *uopt, *xopts, *config;
+	uint32_t sectsz;
+	int optidx;
+
+	sc->max_queues = NVME_QUEUES;
+	sc->max_qentries = NVME_MAX_QENTRIES;
+	sc->ioslots = NVME_IOSLOTS;
+	sc->num_squeues = sc->max_queues;
+	sc->num_cqueues = sc->max_queues;
+	sectsz = 0;
+
+	/* an option string is required; avoid strdup(NULL) below */
+	if (opts == NULL) {
+		fprintf(stderr, "backing store not specified\n");
+		return (-1);
+	}
+
+	uopt = strdup(opts);
+	optidx = 0;
+	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
+	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
+	for (xopts = strtok(uopt, ",");
+	     xopts != NULL;
+	     xopts = strtok(NULL, ",")) {
+
+		if ((config = strchr(xopts, '=')) != NULL)
+			*config++ = '\0';
+
+		if (!strcmp("maxq", xopts)) {
+			sc->max_queues = atoi(config);
+		} else if (!strcmp("qsz", xopts)) {
+			sc->max_qentries = atoi(config);
+		} else if (!strcmp("ioslots", xopts)) {
+			sc->ioslots = atoi(config);
+		} else if (!strcmp("sectsz", xopts)) {
+			sectsz = atoi(config);
+		} else if (!strcmp("ser", xopts)) {
+			memset(sc->ctrldata.sn, 0, sizeof(sc->ctrldata.sn));
+			strncpy(sc->ctrldata.sn, config,
+			    sizeof(sc->ctrldata.sn));
+		} else if (!strcmp("ram", xopts)) {
+			uint64_t sz = strtoull(&xopts[4], NULL, 10);
+
+			sc->nvstore.type = NVME_STOR_RAM;
+			sc->nvstore.size = sz * 1024 * 1024;
+			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
+			sc->nvstore.sectsz = 4096;
+			sc->nvstore.sectsz_bits = 12;
+			if (sc->nvstore.ctx == NULL) {
+				perror("Unable to allocate RAM");
+				return (-1);
+			}
+		} else if (optidx == 0) {
+			snprintf(bident, sizeof(bident), "%d:%d",
+			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
+			sc->nvstore.ctx = blockif_open(xopts, bident);
+			if (sc->nvstore.ctx == NULL) {
+				perror("Could not open backing file");
+				return (-1);
+			}
+			sc->nvstore.type = NVME_STOR_BLOCKIF;
+			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
+		} else {
+			fprintf(stderr, "Invalid option %s\n", xopts);
+			return (-1);
+		}
+
+		optidx++;
+	}
+	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
+		fprintf(stderr, "backing store not specified\n");
+		return (-1);
+	}
+	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
+		sc->nvstore.sectsz = sectsz;
+	else if (sc->nvstore.type != NVME_STOR_RAM)
+		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
+	for (sc->nvstore.sectsz_bits = 9;
+	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
+	     sc->nvstore.sectsz_bits++)
+		;
+
+	if (sc->max_queues == 0) {
+		fprintf(stderr, "Invalid maxq option\n");
+		return (-1);
+	}
+	if (sc->max_qentries <= 0) {
+		fprintf(stderr, "Invalid qsz option\n");
+		return (-1);
+	}
+	if (sc->ioslots <= 0) {
+		fprintf(stderr, "Invalid ioslots option\n");
+		return (-1);
+	}
+
+	return (0);
+}
+
+static int
+pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	struct pci_nvme_softc *sc;
+	uint32_t pci_membar_sz;
+	int error;
+
+	error = 0;
+
+	sc = calloc(1, sizeof(struct pci_nvme_softc));
+	pi->pi_arg = sc;
+	sc->nsc_pi = pi;
+
+	error = pci_nvme_parse_opts(sc, opts);
+	if (error < 0)
+		goto done;
+	else
+		error = 0;
+
+	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
+	for (int i = 0; i < sc->ioslots; i++) {
+		if (i < (sc->ioslots-1))
+			sc->ioreqs[i].next = &sc->ioreqs[i+1];
+		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
+		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
+	}
+	sc->ioreqs_free = sc->ioreqs;
+	sc->intr_coales_aggr_thresh = 1;
+
+	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
+	pci_set_cfgdata8(pi, PCIR_PROGIF,
+	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
+
+	/* 
allocate size of nvme registers + doorbell space for all queues */ + pci_membar_sz = sizeof(struct nvme_registers) + + 2*sizeof(uint32_t)*(sc->max_queues); + + DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz)); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); + if (error) { + WPRINTF(("%s pci alloc mem bar failed\r\n", __func__)); + goto done; + } + + error = pci_emul_add_msixcap(pi, sc->max_queues, NVME_MSIX_BAR); + if (error) { + WPRINTF(("%s pci add msixcap failed\r\n", __func__)); + goto done; + } + + pthread_mutex_init(&sc->mtx, NULL); + sem_init(&sc->iosemlock, 0, sc->ioslots); + + pci_nvme_reset(sc); + pci_nvme_init_ctrldata(sc); + pci_nvme_init_nsdata(sc); + + pci_lintr_request(pi); + +done: + return (error); +} + + +struct pci_devemu pci_de_nvme = { + .pe_emu = "nvme", + .pe_init = pci_nvme_init, + .pe_barwrite = pci_nvme_write, + .pe_barread = pci_nvme_read +}; +PCI_EMUL_SET(pci_de_nvme); Property changes on: head/usr.sbin/bhyve/pci_nvme.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property
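For quick reference, the option string parsed by pci_nvme_parse_opts() above implies device configurations along the following lines. This is an illustrative sketch only: the slot numbers, zvol path, serial string, and the surrounding bhyve arguments are assumptions, not part of this change.

    # NVMe namespace backed by a disk image or zvol via blockif
    # (the first option names the backing store; the rest are optional):
    bhyve -c 2 -m 1G -A -H \
        -s 0,hostbridge \
        -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=4,qsz=1024,ioslots=8,sectsz=512,ser=NVME0001 \
        -s 31,lpc -l com1,stdio \
        vm0

    # NVMe namespace backed by 1024 MiB of host RAM
    # (ram=<size in MiB>; the sector size is fixed at 4096):
    bhyve ... -s 4,nvme,ram=1024 ... vm0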