diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 479ab75be60f..4f6d771cb93d 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -1,1048 +1,1066 @@ .\" Copyright (c) 2013 Peter Grehan .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd May 5, 2023 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl AaCDeHhPSuWwxY .Oo .Sm off .Fl c\~ .Oo .Op Cm cpus= .Ar numcpus .Oc .Op Cm ,sockets= Ar n .Op Cm ,cores= Ar n .Op Cm ,threads= Ar n .Oc .Sm on .Oo Fl f .Sm off .Ar name Cm \&, .Oo .Cm string No | Cm file .Oc .Cm \&= Ar data .Sm on .Oc .Oo .Sm off .Fl G\~ .Oo Ar w Oc .Oo Ar bind_address Cm \&: Oc .Ar port .Sm on .Oc .Op Fl k Ar config_file .Op Fl K Ar layout .Oo Fl l .Sm off .Ar lpcdev Op Cm \&, Ar conf .Sm on .Oc .Sm off .Oo Fl m\~ .Ar memsize .Oo .Cm K | Cm k | Cm M | Cm m | Cm G | Cm g | Cm T | Cm t .Oc .Sm on .Oc .Op Fl o Ar var Ns Cm = Ns Ar value .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu .Op Fl r Ar file .Oo Fl R .Sm off .Ar host Op Cm \&: Ar port .Sm on .Oc .Sm off .Oo Fl s\~ .Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf .Sm on .Oc .Op Fl U Ar uuid .Ar vmname .Nm .Fl l Cm help .Nm .Fl s Cm help .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise, it is enough to run .Nm with a boot ROM of choice. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl A Generate ACPI tables. Required for .Fx Ns /amd64 guests. .It Fl a The guest's local APIC is configured in xAPIC mode. The xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl C Include guest memory in core file. .It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. The current maximum number of guest virtual CPUs is 16. If .Ar numcpus is not specified then it will be calculated from the other arguments. The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. .It Fl D Destroy the VM on guest initiated power-off. .It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes. .It Fl f Ar name Ns Cm \&, Ns Oo Cm string Ns No | Ns Cm file Ns Oc Ns Cm \&= Ns Ar data Add a fw_cfg file .Ar name to the fw_cfg interface. If a .Cm string is specified, the fw_cfg file contains the string as data. If a .Cm file is specified, bhyve reads the file and adds the file content as fw_cfg data. .It Fl G Xo .Sm off .Oo Ar w Oc .Oo Ar bind_address Cm \&: Oc .Ar port .Sm on .Xc Start a debug server that uses the GDB protocol to export guest state to a debugger. An IPv4 TCP socket will be bound to the supplied .Ar bind_address and .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If the option begins with .Sq w , .Nm will pause execution at the first instruction waiting for a debugger to attach. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. .It Fl h Print help message and exit. .It Fl k Ar config_file Set configuration variables from a simple, key-value config file. Each line of the config file is expected to consist of a config variable name, an equals sign .It Fl K Ar layout Specify the keyboard layout. The value that can be specified sets the file name in .Ar /usr/share/bhyve/kbdlayout . This specification only works when loaded with UEFI mode for VNC. When using a VNC client that supports QEMU Extended Key Event Message (e.g. TigerVNC), this option isn't needed. When using a VNC client that doesn't support QEMU Extended Key Event Message (e.g. tightVNC), the layout defaults to the US keyboard unless specified otherwise. .Pq Sq = , and a value. No spaces are permitted between the variable name, equals sign, or value. Blank lines and lines starting with .Sq # are ignored. See .Xr bhyve_config 5 for more details. .It Fl l Cm help Print a list of supported LPC devices. .It Fl l Ar lpcdev Ns Op Cm \&, Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Cm com1 , com2 , com3 , and .Cm com4 , the boot ROM device .Cm bootrom , the .Cm fwcfg type and the debug/test device .Cm pc-testdev . .Pp The possible values for the .Ar conf argument are listed in the .Fl s flag description. .It Xo .Fl m Ar memsize Ns Oo .Sm off .Cm K | k | M | m | G | g | T | t .Sm on .Oc .Xc Set the guest physical memory size This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of .Cm K , M , G or .Cm T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. .Pp The default is 256M. .It Fl o Ar var Ns Cm = Ns Ar value Set the configuration variable .Ar var to .Ar value . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. .It Fl p Ar vcpu Ns Cm \& : Ns Ar hostcpu Pin guest's virtual CPU .Em vcpu to .Em hostcpu . .It Fl r Ar file Resume a guest from a snapshot. The guest memory contents are restored from .Ar file , and the guest device and vCPU state are restored from the file .Dq Ar file Ns .kern . .Pp Note that the current snapshot file format requires that the configuration of devices in the new VM match the VM from which the snapshot was taken by specifying the same .Fl s and .Fl l options. The count of vCPUs and memory configuration are read from the snapshot. .It Fl R Ar host Ns Op Cm \&: Ns Ar port Receive migration from a source guest. Await for a connection from .Ar host on the specified .Ar port and resume execution. The default migration port is 24983. .It Fl S Wire guest memory. .It Fl s Cm help Print a list of supported PCI devices. .It Fl s Ar slot Ns Cm \&, Ns Ar emulation Ns Op Cm \&, Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. .Pp The .Ar slot can be specified in one of the following formats: .Pp .Bl -bullet -compact .It .Ar pcislot .It .Sm off .Ar pcislot Cm \&: Ar function .Sm on .It .Sm off .Ar bus Cm \&: Ar pcislot Cm \&: Ar function .Sm on .El .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. .Pp The .Ar emulation argument can be one of the following: .Bl -tag -width "amd_hostbridge" .It Cm hostbridge A simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. .It Cm amd_hostbridge Emulation identical to .Cm hostbridge using a PCI vendor ID of AMD. .It Cm passthru PCI pass-through device. .It Cm virtio-net Virtio network interface. .It Cm virtio-blk Virtio block storage interface. .It Cm virtio-scsi Virtio SCSI interface. .It Cm virtio-9p Virtio 9p (VirtFS) interface. .It Cm virtio-rnd Virtio RNG interface. .It Cm virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. .It Cm virtio-input Virtio input interface. .It Cm ahci AHCI controller attached to arbitrary devices. .It Cm ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Cm ahci-hd AHCI controller attached to a SATA hard drive. .It Cm e1000 Intel e82545 network interface. .It Cm uart PCI 16550 serial device. .It Cm lpc LPC PCI-ISA bridge with COM1, COM2, COM3, and COM4 16550 serial ports, a boot ROM, and, optionally, a fwcfg type and the debug/test device. The LPC bridge emulation can only be configured on bus 0. .It Cm fbuf Raw framebuffer device attached to VNC server. .It Cm xhci eXtensible Host Controller Interface (xHCI) USB controller. .It Cm nvme NVM Express (NVMe) controller. .It Cm hda High Definition Audio Controller. .El .Pp The optional parameter .Ar conf describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Pp Network device backends: .Sm off .Bl -bullet .It .Xo .Cm tap Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm vmnet Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm netgraph,path= Ar ADDRESS Cm \&,peerhook= Ar HOOK .Op Cm \&,socket= Ar NAME .Op Cm \&,hook= Ar HOOK .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .El .Sm on .Pp If .Cm mac is not specified, the MAC address is derived from a fixed OUI and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .Pp With .Cm virtio-net devices, the .Cm mtu parameter can be specified to inform the guest about the largest MTU that should be allowed, expressed in bytes. .Pp With .Cm netgraph backend, the .Cm path and .Cm peerhook parameters must be specified to set the destination node and corresponding hook. The optional parameters .Cm socket and .Cm hook may be used to set the .Xr ng_socket 4 node name and source hook. The .Ar ADDRESS , .Ar HOOK , and .Ar NAME must comply with .Xr netgraph 4 addressing rules. .Pp Block storage device backends: .Sm off .Bl -bullet .It .Ar /filename Op Cm \&, Ar block-device-options .It .Ar /dev/xxx Op Cm \&, Ar block-device-options .El .Sm on .Pp The .Ar block-device-options are: .Bl -tag -width 10n .It Cm nocache Open the file with .Dv O_DIRECT . .It Cm direct Open the file using .Dv O_SYNC . .It Cm ro Force the file to be opened read-only. .It Cm sectorsize= Ns Ar logical Ns Oo Cm \&/ Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. .It Cm nodelete Disable emulation of guest trim requests via .Dv DIOCGDELETE requests. +.It Li bootindex= Ns Ar index +Add the device to the bootorder at +.Ar index . +A fwcfg file is used to specify the bootorder. +The guest firmware may ignore or doesn't support this fwcfg file. +In that case, this feature doesn't work as expected. .El .Pp SCSI device backends: .Sm off .Bl -bullet .It .Pa /dev/cam/ctl Oo Ar pp Cm \&. Ar vp Oc Oo Cm \&, Ar scsi-device-options Oc .El .Sm on .Pp The .Ar scsi-device-options are: .Bl -tag -width 10n .It Cm iid= Ns Ar IID Initiator ID to use when sending requests to specified CTL port. The default value is 0. +.It Li bootindex= Ns Ar index +Add the device to the bootorder at +.Ar index . +A fwcfg file is used to specify the bootorder. +The guest firmware may ignore or doesn't support this fwcfg file. +In that case, this feature doesn't work as expected. .El .Pp 9P device backends: .Sm off .Bl -bullet .It .Ar sharename Cm = Ar /path/to/share Op Cm \&, Ar 9p-device-options .El .Sm on .Pp The .Ar 9p-device-options are: .Bl -tag -width 10n .It Cm ro Expose the share in read-only mode. .El .Pp TTY device backends: .Bl -tag -width 10n .It Cm stdio Connect the serial port to the standard input and output of the .Nm process. .It Ar /dev/xxx Use the host TTY device for serial port I/O. .El .Pp Boot ROM device backends: .Bl -tag -width 10n .It Ar romfile Ns Op Cm \&, Ns Ar varfile Map .Ar romfile in the guest address space reserved for boot firmware. If .Ar varfile is provided, that file is also mapped in the boot firmware guest address space, and any modifications the guest makes will be saved to that file. .El .Pp Fwcfg types: .Bl -tag -width 10n .It Ar fwcfg The fwcfg interface is used to pass information such as the CPU count or ACPI tables to the guest firmware. Supported values are .Ql bhyve and .Ql qemu . Due to backward compatibility reasons, .Ql bhyve is the default option. When .Ql bhyve is used, bhyve's fwctl interface is used. It currently reports only the CPU count to the guest firmware. The .Ql qemu option uses QEMU's fwcfg interface. This interface is widely used and allows user-defined information to be passed to the guest. It is used for passing the CPU count, ACPI tables, a boot order and many other things to the guest. Some operating systems such as Fedora CoreOS can be configured by qemu's fwcfg interface as well. .El .Pp Pass-through device backends: .Sm off .Bl -bullet .It .Cm ppt Ar N Oo , Ar passthru-device-options Oc .It .Ns Ar bus Cm \&/ Ar slot Cm \&/ Ar function .Op , Ar passthru-device-options .It .Cm pci Ar bus Cm : Ar slot Cm : Ns Ar function .Op , Ar passthru-device-options .El .Sm on .Pp Connect to a PCI device on the host either named ppt .Ns Ar N or at the selector described by .Ar slot , .Ar bus , and .Ar function numbers. .Pp The .Ar passthru-device-options are: .Bl -tag -width 10n .It Cm rom= Ns Ar romfile Add .Ar romfile as option ROM to the PCI device. The ROM will be loaded by firmware and should be capable of initializing the device. +.It Li bootindex= Ns Ar index +Add the device to the bootorder at +.Ar index . +A fwcfg file is used to specify the bootorder. +The guest firmware may ignore or doesn't support this fwcfg file. +In that case, this feature doesn't work as expected. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdevs loader variable as described in .Xr vmm 4 . .Pp Virtio console device backends: .Bl -bullet .Sm off .It .Cm port1= Ns Ar /path/to/port1.sock Ns Op Cm ,port Ns Ar N Cm \&= Ns Ar /path/to/port2.sock No \~ Ar ... .Sm on .El .Pp A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet .It Due to lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It There is no way to use the .Dq console port feature, nor the console port resize at present. .It Emergency write is advertised, but no-op at present. .El .Pp Virtio input device backends: .Bl -tag -width 10n .It Ar /dev/input/eventX Send input events of .Ar /dev/input/eventX to guest by VirtIO Input Interface. .El .Pp Framebuffer devices backends: .Bl -bullet .Sm off .It .Op Cm rfb= Ar ip-and-port .Op Cm ,w= Ar width .Op Cm ,h= Ar height .Op Cm ,vga= Ar vgaconf .Op Cm ,wait .Op Cm ,password= Ar password .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm rfb= Ns Ar ip-and-port Pq or Cm tcp= Ns Ar ip-and-port An IP address and a port VNC should listen on. There are two formats: .Pp .Bl -bullet -compact .It .Sm off .Op Ar IPv4 Cm \&: .Ar port .Sm on .It .Sm off .Cm \&[ Ar IPv6%zone Cm \&] Cm \&: Ar port .Sm on .El .Pp The default is to listen on localhost IPv4 address and default VNC port 5900. An IPv6 address must be enclosed in square brackets and may contain an optional zone identifier. .It Cm w= Ns Ar width No and Cm h= Ns Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. Minimal supported resolution is 640x480 pixels, and maximum is 1920x1200 pixels. .It Cm vga= Ns Ar vgaconf Possible values for this option are .Cm io (default), .Cm on , and .Cm off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 . The default .Cm io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Cm on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Cm off option should be used for the UEFI guests that assume that VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. .It Cm wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It Cm password= Ns Ar password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .Pp xHCI USB device backends: .Bl -tag -width 10n .It Cm tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Pp NVMe device backends: .Bl -bullet .Sm off .It .Ar devpath .Op Cm ,maxq= Ar # .Op Cm ,qsz= Ar # .Op Cm ,ioslots= Ar # .Op Cm ,sectsz= Ar # .Op Cm ,ser= Ar # .Op Cm ,eui64= Ar # .Op Cm ,dsm= Ar opt .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Ar devpath Accepted device paths are: .Ar /dev/blockdev or .Ar /path/to/image or .Cm ram= Ns Ar size_in_MiB . .It Cm maxq Max number of queues. .It Cm qsz Max elements in each queue. .It Cm ioslots Max number of concurrent I/O requests. .It Cm sectsz Sector size (defaults to blockif sector size). .It Cm ser Serial number with maximum 20 characters. .It Cm eui64 IEEE Extended Unique Identifier (8 byte value). .It Cm dsm DataSet Management support. Supported values are: .Cm auto , enable , and .Cm disable . .El .Pp AHCI device backends: .Bl -bullet .It .Sm off .Op Oo Cm hd\&: | cd\&: Oc Ar path .Op Cm ,nmrr= Ar nmrr .Op Cm ,ser= Ar # .Op Cm ,rev= Ar # .Op Cm ,model= Ar # .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm nmrr Nominal Media Rotation Rate, known as RPM. Value 1 will indicate device as Solid State Disk. Default value is 0, not report. .It Cm ser Serial Number with maximum 20 characters. .It Cm rev Revision Number with maximum 8 characters. .It Cm model Model Number with maximum 40 characters. .El .Pp HD Audio device backends: .Bl -bullet .It .Sm off .Op Cm play= Ar playback .Op Cm ,rec= Ar recording .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm play Playback device, typically .Ar /dev/dsp0 . .It Cm rec Recording device, typically .Ar /dev/dsp0 . .El .It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl u RTC keeps UTC time. .It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl x The guest's local APIC is configured in x2APIC mode. .It Fl Y Disable MPtable generation. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El .Sh CONFIGURATION VARIABLES .Nm uses an internal tree of configuration variables to describe global and per-device settings. When .Nm starts, it parses command line options (including config files) in the order given on the command line. Each command line option sets one or more configuration variables. For example, the .Fl s option creates a new tree node for a PCI device and sets one or more variables under that node including the device model and device model-specific variables. Variables may be set multiple times during this parsing stage with the final value overriding previous values. .Pp Once all of the command line options have been processed, the configuration values are frozen. .Nm then uses the value of configuration values to initialize device models and global settings. .Pp More details on configuration variables can be found in .Xr bhyve_config 5 . .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. .Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read and written by the debugger. Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp Breakpoints are supported on Intel CPUs that support single stepping. Note that continuing from a breakpoint while interrupts are enabled in the guest may not work as expected due to timer interrupts firing while single stepping over the breakpoint. .Sh SIGNAL HANDLING .Nm deals with the following signals: .Pp .Bl -tag -width SIGTERM -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 triple fault .It 4 exited due to an error .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run. Otherwise, the boot loader is not needed. .Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -A -H -P -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -A -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device. .Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -A -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at: 0.0.0.0:5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VNC display that is bound to all IPv6 addresses on port 5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VARS file to save EFI variables. Note that .Nm will write guest modifications to the given VARS file. Be sure to create a per-guest copy of the template VARS file from .Pa /usr . .Bd -literal -offset indent bhyve -c 2 -m 4g -w -H \\ -s 0,hostbridge \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI_CODE.fd,BHYVE_UEFI_VARS.fd uefivm .Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr netgraph 4 , .Xr ng_socket 4 , .Xr nmdm 4 , .Xr vmm 4 , .Xr bhyve_config 5 , .Xr ethers 5 , .Xr bhyvectl 8 , .Xr bhyveload 8 .Pp .Rs .%A Intel .%B 64 and IA-32 Architectures Software Developer’s Manual .%V Volume 3 .Re .Sh HISTORY .Nm first appeared in .Fx 10.0 . .Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c index 42c1b5c27293..62f8a9f21661 100644 --- a/usr.sbin/bhyve/block_if.c +++ b/usr.sbin/bhyve/block_if.c @@ -1,1027 +1,1046 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Peter Grehan * All rights reserved. * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "mevent.h" #include "pci_emul.h" #include "block_if.h" #define BLOCKIF_SIG 0xb109b109 #define BLOCKIF_NUMTHR 8 #define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) enum blockop { BOP_READ, BOP_WRITE, BOP_FLUSH, BOP_DELETE }; enum blockstat { BST_FREE, BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE }; struct blockif_elem { TAILQ_ENTRY(blockif_elem) be_link; struct blockif_req *be_req; enum blockop be_op; enum blockstat be_status; pthread_t be_tid; off_t be_block; }; struct blockif_ctxt { unsigned int bc_magic; int bc_fd; int bc_ischr; int bc_isgeom; int bc_candelete; int bc_rdonly; off_t bc_size; int bc_sectsz; int bc_psectsz; int bc_psectoff; int bc_closing; int bc_paused; pthread_t bc_btid[BLOCKIF_NUMTHR]; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; pthread_cond_t bc_work_done_cond; blockif_resize_cb *bc_resize_cb; void *bc_resize_cb_arg; struct mevent *bc_resize_event; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_pendq; TAILQ_HEAD(, blockif_elem) bc_busyq; struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; + int bc_bootindex; }; static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; struct blockif_sig_elem { pthread_mutex_t bse_mtx; pthread_cond_t bse_cond; int bse_pending; struct blockif_sig_elem *bse_next; }; static struct blockif_sig_elem *blockif_bse_head; static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { struct blockif_elem *be, *tbe; off_t off; int i; be = TAILQ_FIRST(&bc->bc_freeq); assert(be != NULL); assert(be->be_status == BST_FREE); TAILQ_REMOVE(&bc->bc_freeq, be, be_link); be->be_req = breq; be->be_op = op; switch (op) { case BOP_READ: case BOP_WRITE: case BOP_DELETE: off = breq->br_offset; for (i = 0; i < breq->br_iovcnt; i++) off += breq->br_iov[i].iov_len; break; default: off = OFF_MAX; } be->be_block = off; TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_block == breq->br_offset) break; } if (tbe == NULL) { TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { if (tbe->be_block == breq->br_offset) break; } } if (tbe == NULL) be->be_status = BST_PEND; else be->be_status = BST_BLOCK; TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); return (be->be_status == BST_PEND); } static int blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_status == BST_PEND) break; assert(be->be_status == BST_BLOCK); } if (be == NULL) return (0); TAILQ_REMOVE(&bc->bc_pendq, be, be_link); be->be_status = BST_BUSY; be->be_tid = t; TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); *bep = be; return (1); } static void blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) { struct blockif_elem *tbe; if (be->be_status == BST_DONE || be->be_status == BST_BUSY) TAILQ_REMOVE(&bc->bc_busyq, be, be_link); else TAILQ_REMOVE(&bc->bc_pendq, be, be_link); TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_req->br_offset == be->be_block) tbe->be_status = BST_PEND; } be->be_tid = 0; be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); } static int blockif_flush_bc(struct blockif_ctxt *bc) { if (bc->bc_ischr) { if (ioctl(bc->bc_fd, DIOCGFLUSH)) return (errno); } else if (fsync(bc->bc_fd)) return (errno); return (0); } static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct spacectl_range range; struct blockif_req *br; off_t arg[2]; ssize_t n; size_t clen, len, off, boff, voff; int i, err; br = be->be_req; assert(br->br_resid >= 0); if (br->br_iovcnt <= 1) buf = NULL; err = 0; switch (be->be_op) { case BOP_READ: if (buf == NULL) { if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= n; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); n = pread(bc->bc_fd, buf, len, br->br_offset + off); if (n < 0) { err = errno; break; } len = (size_t)n; boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy((uint8_t *)br->br_iov[i].iov_base + voff, buf + boff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); off += len; br->br_resid -= len; } break; case BOP_WRITE: if (bc->bc_rdonly) { err = EROFS; break; } if (buf == NULL) { if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= n; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(buf + boff, (uint8_t *)br->br_iov[i].iov_base + voff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); n = pwrite(bc->bc_fd, buf, len, br->br_offset + off); if (n < 0) { err = errno; break; } off += n; br->br_resid -= n; } break; case BOP_FLUSH: err = blockif_flush_bc(bc); break; case BOP_DELETE: if (!bc->bc_candelete) err = EOPNOTSUPP; else if (bc->bc_rdonly) err = EROFS; else if (bc->bc_ischr) { arg[0] = br->br_offset; arg[1] = br->br_resid; if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) err = errno; else br->br_resid = 0; } else { range.r_offset = br->br_offset; range.r_len = br->br_resid; while (range.r_len > 0) { if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC, &range, 0, &range) != 0) { err = errno; break; } } if (err == 0) br->br_resid = 0; } break; default: err = EINVAL; break; } be->be_status = BST_DONE; (*br->br_callback)(br, err); } static inline bool blockif_empty(const struct blockif_ctxt *bc) { return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq)); } static void * blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; pthread_t t; uint8_t *buf; bc = arg; if (bc->bc_isgeom) buf = malloc(MAXPHYS); else buf = NULL; t = pthread_self(); pthread_mutex_lock(&bc->bc_mtx); for (;;) { while (blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } /* If none to work, notify the main thread */ if (blockif_empty(bc)) pthread_cond_broadcast(&bc->bc_work_done_cond); /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) break; pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } pthread_mutex_unlock(&bc->bc_mtx); if (buf) free(buf); pthread_exit(NULL); return (NULL); } static void blockif_sigcont_handler(int signal __unused, enum ev_type type __unused, void *arg __unused) { struct blockif_sig_elem *bse; for (;;) { /* * Process the entire list even if not intended for * this thread. */ do { bse = blockif_bse_head; if (bse == NULL) return; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)bse, (uintptr_t)bse->bse_next)); pthread_mutex_lock(&bse->bse_mtx); bse->bse_pending = 0; pthread_cond_signal(&bse->bse_cond); pthread_mutex_unlock(&bse->bse_mtx); } } static void blockif_init(void) { mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); (void) signal(SIGCONT, SIG_IGN); } int blockif_legacy_config(nvlist_t *nvl, const char *opts) { char *cp, *path; if (opts == NULL) return (0); cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "path", opts); return (0); } path = strndup(opts, cp - opts); set_config_value_node(nvl, "path", path); free(path); return (pci_parse_legacy_config(nvl, cp + 1)); } +int +blockif_add_boot_device(struct pci_devinst *const pi, + struct blockif_ctxt *const bc) +{ + if (bc->bc_bootindex < 0) + return (0); + + return (pci_emul_add_boot_device(pi, bc->bc_bootindex)); +} + struct blockif_ctxt * blockif_open(nvlist_t *nvl, const char *ident) { char tname[MAXCOMLEN + 1]; char name[MAXPATHLEN]; - const char *path, *pssval, *ssval; + const char *path, *pssval, *ssval, *bootindex_val; char *cp; struct blockif_ctxt *bc; struct stat sbuf; struct diocgattr_arg arg; off_t size, psectsz, psectoff; int extra, fd, i, sectsz; int ro, candelete, geom, ssopt, pssopt; int nodelete; + int bootindex; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE }; #endif pthread_once(&blockif_once, blockif_init); fd = -1; extra = 0; ssopt = 0; ro = 0; nodelete = 0; + bootindex = -1; if (get_config_bool_node_default(nvl, "nocache", false)) extra |= O_DIRECT; if (get_config_bool_node_default(nvl, "nodelete", false)) nodelete = 1; if (get_config_bool_node_default(nvl, "sync", false) || get_config_bool_node_default(nvl, "direct", false)) extra |= O_SYNC; if (get_config_bool_node_default(nvl, "ro", false)) ro = 1; ssval = get_config_value_node(nvl, "sectorsize"); if (ssval != NULL) { ssopt = strtol(ssval, &cp, 10); if (cp == ssval) { EPRINTLN("Invalid sector size \"%s\"", ssval); goto err; } if (*cp == '\0') { pssopt = ssopt; } else if (*cp == '/') { pssval = cp + 1; pssopt = strtol(pssval, &cp, 10); if (cp == pssval || *cp != '\0') { EPRINTLN("Invalid sector size \"%s\"", ssval); goto err; } } else { EPRINTLN("Invalid sector size \"%s\"", ssval); goto err; } } + bootindex_val = get_config_value_node(nvl, "bootindex"); + if (bootindex_val != NULL) { + bootindex = atoi(bootindex_val); + } + path = get_config_value_node(nvl, "path"); if (path == NULL) { EPRINTLN("Missing \"path\" for block device."); goto err; } fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra); if (fd < 0 && !ro) { /* Attempt a r/w fail with a r/o open */ fd = open(path, O_RDONLY | extra); ro = 1; } if (fd < 0) { warn("Could not open backing file: %s", path); goto err; } if (fstat(fd, &sbuf) < 0) { warn("Could not stat backing file %s", path); goto err; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF); if (ro) cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; psectsz = psectoff = 0; candelete = geom = 0; if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, §sz)) { perror("Could not fetch dev blk/sector size"); goto err; } assert(size != 0); assert(sectsz != 0); if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) candelete = arg.value.i; if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) geom = 1; } else { psectsz = sbuf.st_blksize; /* Avoid fallback implementation */ candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1; } #ifndef WITHOUT_CAPSICUM if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (ssopt != 0) { if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || ssopt > pssopt) { EPRINTLN("Invalid sector size %d/%d", ssopt, pssopt); goto err; } /* * Some backend drivers (e.g. cd0, ada0) require that the I/O * size be a multiple of the device's sector size. * * Validate that the emulated sector size complies with this * requirement. */ if (S_ISCHR(sbuf.st_mode)) { if (ssopt < sectsz || (ssopt % sectsz) != 0) { EPRINTLN("Sector size %d incompatible " "with underlying device sector size %d", ssopt, sectsz); goto err; } } sectsz = ssopt; psectsz = pssopt; psectoff = 0; } bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { perror("calloc"); goto err; } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; bc->bc_ischr = S_ISCHR(sbuf.st_mode); bc->bc_isgeom = geom; bc->bc_candelete = candelete; bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; bc->bc_psectsz = psectsz; bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); bc->bc_paused = 0; pthread_cond_init(&bc->bc_work_done_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); + bc->bc_bootindex = bootindex; for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } for (i = 0; i < BLOCKIF_NUMTHR; i++) { pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); pthread_set_name_np(bc->bc_btid[i], tname); } return (bc); err: if (fd >= 0) close(fd); return (NULL); } static void blockif_resized(int fd, enum ev_type type __unused, void *arg) { struct blockif_ctxt *bc; struct stat sb; off_t mediasize; if (fstat(fd, &sb) != 0) return; if (S_ISCHR(sb.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) { EPRINTLN("blockif_resized: get mediasize failed: %s", strerror(errno)); return; } } else mediasize = sb.st_size; bc = arg; pthread_mutex_lock(&bc->bc_mtx); if (mediasize != bc->bc_size) { bc->bc_size = mediasize; bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size); } pthread_mutex_unlock(&bc->bc_mtx); } int blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb, void *cb_arg) { struct stat sb; int err; if (cb == NULL) return (EINVAL); err = 0; pthread_mutex_lock(&bc->bc_mtx); if (bc->bc_resize_cb != NULL) { err = EBUSY; goto out; } assert(bc->bc_closing == 0); if (fstat(bc->bc_fd, &sb) != 0) { err = errno; goto out; } bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE, EVFF_ATTRIB, blockif_resized, bc); if (bc->bc_resize_event == NULL) { err = ENXIO; goto out; } bc->bc_resize_cb = cb; bc->bc_resize_cb_arg = cb_arg; out: pthread_mutex_unlock(&bc->bc_mtx); return (err); } static int blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { int err; err = 0; pthread_mutex_lock(&bc->bc_mtx); assert(!bc->bc_paused); if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ if (blockif_enqueue(bc, breq, op)) pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than * the specified blockif queue limit. Return an * error to indicate that the queue length has been * exceeded. */ err = E2BIG; } pthread_mutex_unlock(&bc->bc_mtx); return (err); } int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_READ)); } int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_WRITE)); } int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_FLUSH)); } int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_DELETE)); } int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { struct blockif_elem *be; assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); /* XXX: not waiting while paused */ /* * Check pending requests. */ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_req == breq) break; } if (be != NULL) { /* * Found it. */ blockif_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); } /* * Check in-flight requests. */ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { if (be->be_req == breq) break; } if (be == NULL) { /* * Didn't find it. */ pthread_mutex_unlock(&bc->bc_mtx); return (EINVAL); } /* * Interrupt the processing thread to force it return * prematurely via it's normal callback path. */ while (be->be_status == BST_BUSY) { struct blockif_sig_elem bse, *old_head; pthread_mutex_init(&bse.bse_mtx, NULL); pthread_cond_init(&bse.bse_cond, NULL); bse.bse_pending = 1; do { old_head = blockif_bse_head; bse.bse_next = old_head; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)old_head, (uintptr_t)&bse)); pthread_kill(be->be_tid, SIGCONT); pthread_mutex_lock(&bse.bse_mtx); while (bse.bse_pending) pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); pthread_mutex_unlock(&bse.bse_mtx); } pthread_mutex_unlock(&bc->bc_mtx); /* * The processing thread has been interrupted. Since it's not * clear if the callback has been invoked yet, return EBUSY. */ return (EBUSY); } int blockif_close(struct blockif_ctxt *bc) { void *jval; int i; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ pthread_mutex_lock(&bc->bc_mtx); bc->bc_closing = 1; if (bc->bc_resize_event != NULL) mevent_disable(bc->bc_resize_event); pthread_mutex_unlock(&bc->bc_mtx); pthread_cond_broadcast(&bc->bc_cond); for (i = 0; i < BLOCKIF_NUMTHR; i++) pthread_join(bc->bc_btid[i], &jval); /* XXX Cancel queued i/o's ??? */ /* * Release resources */ bc->bc_magic = 0; close(bc->bc_fd); free(bc); return (0); } /* * Return virtual C/H/S values for a given block. Use the algorithm * outlined in the VHD specification to calculate values. */ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { off_t sectors; /* total sectors of the block dev */ off_t hcyl; /* cylinders times heads */ uint16_t secpt; /* sectors per track */ uint8_t heads; assert(bc->bc_magic == BLOCKIF_SIG); sectors = bc->bc_size / bc->bc_sectsz; /* Clamp the size to the largest possible with CHS */ if (sectors > 65535L * 16 * 255) sectors = 65535L * 16 * 255; if (sectors >= 65536L * 16 * 63) { secpt = 255; heads = 16; hcyl = sectors / secpt; } else { secpt = 17; hcyl = sectors / secpt; heads = (hcyl + 1023) / 1024; if (heads < 4) heads = 4; if (hcyl >= (heads * 1024) || heads > 16) { secpt = 31; heads = 16; hcyl = sectors / secpt; } if (hcyl >= (heads * 1024)) { secpt = 63; heads = 16; hcyl = sectors / secpt; } } *c = hcyl / heads; *h = heads; *s = secpt; } /* * Accessors */ off_t blockif_size(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_size); } int blockif_sectsz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_sectsz); } void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) { assert(bc->bc_magic == BLOCKIF_SIG); *size = bc->bc_psectsz; *off = bc->bc_psectoff; } int blockif_queuesz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (BLOCKIF_MAXREQ - 1); } int blockif_is_ro(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } int blockif_candelete(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_candelete); } #ifdef BHYVE_SNAPSHOT void blockif_pause(struct blockif_ctxt *bc) { assert(bc != NULL); assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); bc->bc_paused = 1; /* The interface is paused. Wait for workers to finish their work */ while (!blockif_empty(bc)) pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx); pthread_mutex_unlock(&bc->bc_mtx); if (!bc->bc_rdonly && blockif_flush_bc(bc)) fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n", __func__); } void blockif_resume(struct blockif_ctxt *bc) { assert(bc != NULL); assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); bc->bc_paused = 0; pthread_mutex_unlock(&bc->bc_mtx); } #endif /* BHYVE_SNAPSHOT */ diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h index b36d0c367890..52ebd8634b8e 100644 --- a/usr.sbin/bhyve/block_if.h +++ b/usr.sbin/bhyve/block_if.h @@ -1,92 +1,94 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * The block API to be used by bhyve block-device emulations. The routines * are thread safe, with no assumptions about the context of the completion * callback - it may occur in the caller's context, or asynchronously in * another thread. */ #ifndef _BLOCK_IF_H_ #define _BLOCK_IF_H_ #include #include #include struct vm_snapshot_meta; /* * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in * a single request. BLOCKIF_RING_MAX is the maxmimum number of * pending requests that can be queued. */ #define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */ #define BLOCKIF_RING_MAX 128 struct blockif_req { int br_iovcnt; off_t br_offset; ssize_t br_resid; void (*br_callback)(struct blockif_req *req, int err); void *br_param; struct iovec br_iov[BLOCKIF_IOV_MAX]; }; +struct pci_devinst; struct blockif_ctxt; typedef void blockif_resize_cb(struct blockif_ctxt *, void *, size_t); int blockif_legacy_config(nvlist_t *nvl, const char *opts); +int blockif_add_boot_device(struct pci_devinst *const pi, struct blockif_ctxt *const bc); struct blockif_ctxt *blockif_open(nvlist_t *nvl, const char *ident); int blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb, void *cb_arg); off_t blockif_size(struct blockif_ctxt *bc); void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s); int blockif_sectsz(struct blockif_ctxt *bc); void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); int blockif_queuesz(struct blockif_ctxt *bc); int blockif_is_ro(struct blockif_ctxt *bc); int blockif_candelete(struct blockif_ctxt *bc); int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_close(struct blockif_ctxt *bc); #ifdef BHYVE_SNAPSHOT void blockif_pause(struct blockif_ctxt *bc); void blockif_resume(struct blockif_ctxt *bc); #endif #endif /* _BLOCK_IF_H_ */ diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 2c36e1d5cbb9..a2731876bbea 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -1,2739 +1,2746 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Zhixiang Yu * Copyright (c) 2015-2016 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "ahci.h" #include "block_if.h" #define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ enum sata_fis_type { FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ }; /* * SCSI opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define INQUIRY 0x12 #define START_STOP_UNIT 0x1B #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define POSITION_TO_ELEMENT 0x2B #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A #define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE /* * SCSI mode page codes */ #define MODEPAGE_RW_ERROR_RECOVERY 0x01 #define MODEPAGE_CD_CAPABILITIES 0x2A /* * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 #define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* * Debug printf */ #ifdef AHCI_DEBUG static FILE *dbg; #define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) #else #define DPRINTF(format, arg...) #endif #define WPRINTF(format, arg...) printf(format, ##arg) #define AHCI_PORT_IDENT 20 + 1 struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; STAILQ_ENTRY(ahci_ioreq) io_flist; TAILQ_ENTRY(ahci_ioreq) io_blist; uint8_t *cfis; uint32_t len; uint32_t done; int slot; int more; int readop; }; struct ahci_port { struct blockif_ctxt *bctx; struct pci_ahci_softc *pr_sc; struct ata_params ata_ident; uint8_t *cmd_lst; uint8_t *rfis; int port; int atapi; int reset; int waitforclear; int mult_sectors; uint8_t xfermode; uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; u_int ccs; uint32_t pending; uint32_t clb; uint32_t clbu; uint32_t fb; uint32_t fbu; uint32_t is; uint32_t ie; uint32_t cmd; uint32_t unused0; uint32_t tfd; uint32_t sig; uint32_t ssts; uint32_t sctl; uint32_t serr; uint32_t sact; uint32_t ci; uint32_t sntf; uint32_t fbs; /* * i/o request info */ struct ahci_ioreq *ioreq; int ioqsz; STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; }; struct ahci_cmd_hdr { uint16_t flags; uint16_t prdtl; uint32_t prdbc; uint64_t ctba; uint32_t reserved[4]; }; struct ahci_prdt_entry { uint64_t dba; uint32_t reserved; #define DBCMASK 0x3fffff uint32_t dbc; }; struct pci_ahci_softc { struct pci_devinst *asc_pi; pthread_mutex_t mtx; int ports; uint32_t cap; uint32_t ghc; uint32_t is; uint32_t pi; uint32_t vs; uint32_t ccc_ctl; uint32_t ccc_pts; uint32_t em_loc; uint32_t em_ctl; uint32_t cap2; uint32_t bohc; uint32_t lintr; struct ahci_port port[MAX_PORTS]; }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) static void ahci_handle_port(struct ahci_port *p); static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; buf[0] = (lba / 75) / 60; buf[1] = (lba / 75) % 60; buf[2] = lba % 75; } /* * Generate HBA interrupts on global IS register write. */ static void ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { struct pci_devinst *pi = sc->asc_pi; struct ahci_port *p; int i, nmsg; uint32_t mmask; /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { p = &sc->port[i]; if (p->is & p->ie) sc->is |= (1 << i); } DPRINTF("%s(%08x) %08x", __func__, mask, sc->is); /* If there is nothing enabled -- clear legacy interrupt and exit. */ if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { if (sc->lintr) { pci_lintr_deassert(pi); sc->lintr = 0; } return; } /* If there is anything and no MSI -- assert legacy interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (nmsg == 0) { if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } return; } /* Assert respective MSIs for ports that were touched. */ for (i = 0; i < nmsg; i++) { if (sc->ports <= nmsg || i < nmsg - 1) mmask = 1 << i; else mmask = 0xffffffff << i; if (sc->is & mask && mmask & mask) pci_generate_msi(pi, i); } } /* * Generate HBA interrupt on specific port event. */ static void ahci_port_intr(struct ahci_port *p) { struct pci_ahci_softc *sc = p->pr_sc; struct pci_devinst *pi = sc->asc_pi; int nmsg; DPRINTF("%s(%d) %08x/%08x %08x", __func__, p->port, p->is, p->ie, sc->is); /* If there is nothing enabled -- we are done. */ if ((p->is & p->ie) == 0) return; /* In case of non-shared MSI always generate interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (sc->ports <= nmsg || p->port < nmsg - 1) { sc->is |= (1 << p->port); if ((sc->ghc & AHCI_GHC_IE) == 0) return; pci_generate_msi(pi, p->port); return; } /* If IS for this port is already set -- do nothing. */ if (sc->is & (1 << p->port)) return; sc->is |= (1 << p->port); /* If interrupts are enabled -- generate one. */ if ((sc->ghc & AHCI_GHC_IE) == 0) return; if (nmsg > 0) { pci_generate_msi(pi, nmsg - 1); } else if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } } static void ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) { int offset, len, irq; if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) return; switch (ft) { case FIS_TYPE_REGD2H: offset = 0x40; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d", ft); return; } if (fis[2] & ATA_S_ERROR) { p->waitforclear = 1; irq |= AHCI_P_IX_TFE; } memcpy(p->rfis + offset, fis, len); if (irq) { if (~p->is & irq) { p->is |= irq; ahci_port_intr(p); } } } static void ahci_write_fis_piosetup(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_PIOSETUP; ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); } static void ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; tfd &= 0x77; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_SETDEVBITS; fis[1] = (1 << 6); fis[2] = tfd; fis[3] = error; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = slot; p->err_cfis[2] = tfd; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else { *(uint32_t *)(fis + 4) = (1 << slot); p->sact &= ~(1 << slot); } p->tfd &= ~0x77; p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } static void ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[20]; uint8_t error; error = (tfd >> 8) & 0xff; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = (1 << 6); fis[2] = tfd & 0xff; fis[3] = error; fis[4] = cfis[4]; fis[5] = cfis[5]; fis[6] = cfis[6]; fis[7] = cfis[7]; fis[8] = cfis[8]; fis[9] = cfis[9]; fis[10] = cfis[10]; fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = 0x80; p->err_cfis[2] = tfd & 0xff; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) { uint8_t fis[20]; p->tfd = ATA_S_READY | ATA_S_DSC; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = 0; /* No interrupt */ fis[2] = p->tfd; /* Status */ fis[3] = 0; /* No error */ p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[3] = 1; fis[4] = 1; if (p->atapi) { fis[5] = 0x14; fis[6] = 0xeb; } fis[12] = 1; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_check_stopped(struct ahci_port *p) { /* * If we are no longer processing the command list and nothing * is in-flight, clear the running bit, the current command * slot, the command issue and active bits. */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; p->waitforclear = 0; } } } static void ahci_port_stop(struct ahci_port *p) { struct ahci_ioreq *aior; uint8_t *cfis; int slot; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); TAILQ_FOREACH(aior, &p->iobhd, io_blist) { /* * Try to cancel the outstanding blockif request. */ error = blockif_cancel(p->bctx, &aior->io_req); if (error != 0) continue; slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); /* * This command is now done. */ p->pending &= ~(1 << slot); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); } ahci_check_stopped(p); } static void ahci_port_reset(struct ahci_port *pr) { pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; pr->mult_sectors = 128; if (!pr->bctx) { pr->ssts = ATA_SS_DET_NO_DEVICE; pr->sig = 0xFFFFFFFF; pr->tfd = 0x7F; return; } pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; if (pr->sctl & ATA_SC_SPD_MASK) pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); else pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; pr->tfd |= ATA_S_READY; } else pr->sig = PxSIG_ATAPI; ahci_write_reset_fis_d2h(pr); } static void ahci_reset(struct pci_ahci_softc *sc) { int i; sc->ghc = AHCI_GHC_AE; sc->is = 0; if (sc->lintr) { pci_lintr_deassert(sc->asc_pi); sc->lintr = 0; } for (i = 0; i < sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); if (sc->port[i].bctx) sc->port[i].cmd |= AHCI_P_CMD_CPS; sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } static void ata_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i ^ 1] = *src++; else dest[i ^ 1] = ' '; } } static void atapi_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i] = *src++; else dest[i] = ' '; } } /* * Build up the iovec based on the PRDT, 'done' and 'len'. */ static void ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, struct ahci_prdt_entry *prdt, uint16_t prdtl) { struct blockif_req *breq = &aior->io_req; uint32_t dbcsz, extra, left, skip, todo; int i, j; assert(aior->len >= aior->done); /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ skip = aior->done; left = aior->len - aior->done; todo = 0; for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; i++, prdt++) { dbcsz = (prdt->dbc & DBCMASK) + 1; /* Skip already done part of the PRDT */ if (dbcsz <= skip) { skip -= dbcsz; continue; } dbcsz -= skip; if (dbcsz > left) dbcsz = left; breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba + skip, dbcsz); breq->br_iov[j].iov_len = dbcsz; todo += dbcsz; left -= dbcsz; skip = 0; j++; } /* If we got limited by IOV length, round I/O down to sector size. */ if (j == BLOCKIF_IOV_MAX) { extra = todo % blockif_sectsz(p->bctx); todo -= extra; assert(todo > 0); while (extra > 0) { if (breq->br_iov[j - 1].iov_len > extra) { breq->br_iov[j - 1].iov_len -= extra; break; } extra -= breq->br_iov[j - 1].iov_len; j--; } } breq->br_iovcnt = j; breq->br_resid = todo; aior->done += todo; aior->more = (aior->done < aior->len && i < prdtl); } static void ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; int err, first, ncq, readop; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; first = (done == 0); if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[11] << 8 | cfis[3]; if (!len) len = 65536; ncq = 1; } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[13] << 8 | cfis[12]; if (!len) len = 65536; } else { lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | (cfis[5] << 8) | cfis[4]; len = cfis[12]; if (!len) len = 256; } lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); /* Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = readop; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); } static void ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_ioreq *aior; struct blockif_req *breq; int err; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = 0; aior->done = 0; aior->more = 0; breq = &aior->io_req; /* * Mark this command in-flight. */ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_flush(p->bctx, breq); assert(err == 0); } static inline void read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, unsigned int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; uint8_t *to; unsigned int len; int i; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; to = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; unsigned int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(to, ptr, sublen); len -= sublen; to += sublen; prdt++; } } static void ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; uint8_t *entry; uint64_t elba; uint32_t len, elen; int err, first, ncq; uint8_t buf[512]; first = (done == 0); if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { len = (uint16_t)cfis[13] << 8 | cfis[12]; len *= 512; ncq = 0; } else { /* ATA_SEND_FPDMA_QUEUED */ len = (uint16_t)cfis[11] << 8 | cfis[3]; len *= 512; ncq = 1; } read_prdt(p, slot, cfis, buf, sizeof(buf)); next: entry = &buf[done]; elba = ((uint64_t)entry[5] << 40) | ((uint64_t)entry[4] << 32) | ((uint64_t)entry[3] << 24) | ((uint64_t)entry[2] << 16) | ((uint64_t)entry[1] << 8) | entry[0]; elen = (uint16_t)entry[7] << 8 | entry[6]; done += 8; if (elen == 0) { if (done >= len) { if (ncq) { if (first) ahci_write_fis_d2h_ncq(p, slot); ahci_write_fis_sdb(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } else { ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } p->pending &= ~(1 << slot); ahci_check_stopped(p); if (!first) ahci_handle_port(p); return; } goto next; } /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->more = (len != done); breq = &aior->io_req; breq->br_offset = elba * blockif_sectsz(p->bctx); breq->br_resid = elen * blockif_sectsz(p->bctx); /* * Mark this command in-flight. */ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); err = blockif_delete(p->bctx, breq); assert(err == 0); } static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, unsigned int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; uint8_t *from; unsigned int len; int i; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; from = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; prdt++; } hdr->prdbc = size - len; } static void ahci_checksum(uint8_t *buf, int size) { int i; uint8_t sum = 0; for (i = 0; i < size - 1; i++) sum += buf[i]; buf[size - 1] = 0x100 - sum; } static void ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; uint32_t buf[128]; uint8_t *buf8 = (uint8_t *)buf; uint16_t *buf16 = (uint16_t *)buf; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } memset(buf, 0, sizeof(buf)); if (cfis[4] == 0x00) { /* Log directory */ buf16[0x00] = 1; /* Version -- 1 */ buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); ahci_checksum(buf8, sizeof(buf)); } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { buf[0x00] = 1; /* SFQ DSM supported */ buf[0x01] = 1; /* SFQ DSM TRIM supported */ } } else { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } if (cfis[2] == ATA_READ_LOG_EXT) ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void*)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void ata_identify_init(struct ahci_port* p, int atapi) { struct ata_params* ata_ident = &p->ata_ident; if (atapi) { ata_ident->config = ATA_PROTO_ATAPI | ATA_ATAPI_TYPE_CDROM | ATA_ATAPI_REMOVABLE | ATA_DRQ_FAST; ata_ident->capabilities1 = ATA_SUPPORT_LBA | ATA_SUPPORT_DMA; ata_ident->capabilities2 = (1 << 14 | 1); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; ata_ident->obsolete62 = 0x3f; ata_ident->mwdmamodes = 7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); ata_ident->satacapabilities2 = ((p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->satasupport = ATA_SUPPORT_NCQ_STREAM; ata_ident->version_major = 0x3f0; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->support.command2 = (1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } else { uint64_t sectors; int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; ro = blockif_is_ro(p->bctx); candelete = blockif_candelete(p->bctx); sectsz = blockif_sectsz(p->bctx); sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); blockif_psectsz(p->bctx, &psectsz, &psectoff); ata_ident->config = ATA_DRQ_FAST; ata_ident->cylinders = cyl; ata_ident->heads = heads; ata_ident->sectors = sech; ata_ident->sectors_intr = (0x8000 | 128); ata_ident->tcg = 0; ata_ident->capabilities1 = ATA_SUPPORT_DMA | ATA_SUPPORT_LBA | ATA_SUPPORT_IORDY; ata_ident->capabilities2 = (1 << 14); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; if (p->mult_sectors) ata_ident->multi = (ATA_MULTI_VALID | p->mult_sectors); if (sectors <= 0x0fffffff) { ata_ident->lba_size_1 = sectors; ata_ident->lba_size_2 = (sectors >> 16); } else { ata_ident->lba_size_1 = 0xffff; ata_ident->lba_size_2 = 0x0fff; } ata_ident->mwdmamodes = 0x7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 0x3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->support3 = 0; ata_ident->queue = 31; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | ATA_SUPPORT_NCQ); ata_ident->satacapabilities2 = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | (p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->version_major = 0x3f0; ata_ident->version_minor = 0x28; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->support.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->enabled.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->lba_size48_1 = sectors; ata_ident->lba_size48_2 = (sectors >> 16); ata_ident->lba_size48_3 = (sectors >> 32); ata_ident->lba_size48_4 = (sectors >> 48); if (candelete && !ro) { ata_ident->support3 |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; ata_ident->max_dsm_blocks = 1; ata_ident->support_dsm = ATA_SUPPORT_DSM_TRIM; } ata_ident->pss = ATA_PSS_VALID_VALUE; ata_ident->lsalign = 0x4000; if (psectsz > sectsz) { ata_ident->pss |= ATA_PSS_MULTLS; ata_ident->pss |= ffsl(psectsz / sectsz) - 1; ata_ident->lsalign |= (psectoff / sectsz); } if (sectsz > 512) { ata_ident->pss |= ATA_PSS_LSSABOVE512; ata_ident->lss_1 = sectsz / 2; ata_ident->lss_2 = ((sectsz / 2) >> 16); } ata_ident->support2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->enabled2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } ahci_checksum((uint8_t*)ata_ident, sizeof(struct ata_params)); } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[36]; uint8_t *acmd; unsigned int len; uint32_t tfd; acmd = cfis + 0x40; if (acmd[1] & 1) { /* VPD */ if (acmd[2] == 0) { /* Supported VPD pages */ buf[0] = 0x05; buf[1] = 0; buf[2] = 0; buf[3] = 1; buf[4] = 0; len = 4 + buf[3]; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } } else { buf[0] = 0x05; buf[1] = 0x80; buf[2] = 0x00; buf[3] = 0x21; buf[4] = 31; buf[5] = 0; buf[6] = 0; buf[7] = 0; atapi_string(buf + 8, "BHYVE", 8); atapi_string(buf + 16, "BHYVE DVD-ROM", 16); atapi_string(buf + 32, "001", 4); len = sizeof(buf); } if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, len); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[8]; uint64_t sectors; sectors = blockif_size(p->bctx) / 2048; be32enc(buf, sectors - 1); be32enc(buf + 4, 2048); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint8_t format; unsigned int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); format = acmd[9] >> 6; switch (format) { case 0: { size_t size; int msf; uint64_t sectors; uint8_t start_track, buf[20], *bp; msf = (acmd[1] >> 1) & 1; start_track = acmd[6]; if (start_track > 1 && start_track != 0xaa) { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } bp = buf + 2; *bp++ = 1; *bp++ = 1; if (start_track <= 1) { *bp++ = 0; *bp++ = 0x14; *bp++ = 1; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } } *bp++ = 0; *bp++ = 0x14; *bp++ = 0xaa; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 1: { uint8_t buf[12]; memset(buf, 0, sizeof(buf)); buf[1] = 0xa; buf[2] = 0x1; buf[3] = 0x1; if (len > sizeof(buf)) len = sizeof(buf); write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 2: { size_t size; int msf; uint64_t sectors; uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) & 1; bp = buf + 2; *bp++ = 1; *bp++ = 1; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa1; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa2; *bp++ = 0; *bp++ = 0; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } default: { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); break; } } } static void atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[16]; memset(buf, 0, sizeof(buf)); buf[3] = 8; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; uint8_t *acmd; uint64_t lba; uint32_t len; int err; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); else len = be32dec(acmd + 6); if (len == 0) { cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } lba *= 2048; len *= 2048; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = 1; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_read(p->bctx, breq); assert(err == 0); } static void atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[64]; uint8_t *acmd; unsigned int len; acmd = cfis + 0x40; len = acmd[4]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, len); buf[0] = 0x70 | (1 << 7); buf[2] = p->sense_key; buf[7] = 10; buf[12] = p->asc; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd = cfis + 0x40; uint32_t tfd; switch (acmd[4] & 3) { case 0: case 1: case 3: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; tfd = ATA_S_READY | ATA_S_DSC; break; case 2: /* TODO eject media */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x53; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; uint8_t pc, code; unsigned int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); pc = acmd[2] >> 6; code = acmd[2] & 0x3f; switch (pc) { case 0: switch (code) { case MODEPAGE_RW_ERROR_RECOVERY: { uint8_t buf[16]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 16 - 2); buf[2] = 0x70; buf[8] = 0x01; buf[9] = 16 - 10; buf[11] = 0x05; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } case MODEPAGE_CD_CAPABILITIES: { uint8_t buf[30]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 30 - 2); buf[2] = 0x70; buf[8] = 0x2A; buf[9] = 30 - 10; buf[10] = 0x08; buf[12] = 0x71; be16enc(&buf[18], 2); be16enc(&buf[20], 512); write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } default: goto error; break; } break; case 3: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x39; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; error: case 1: case 2: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_get_event_status_notification(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; acmd = cfis + 0x40; /* we don't support asynchronous operation */ if (!(acmd[1] & 1)) { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } else { uint8_t buf[8]; unsigned int len; len = be16dec(acmd + 7); if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 8 - 2); buf[2] = 0x04; buf[3] = 0x10; buf[5] = 0x02; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; acmd = cfis + 0x40; #ifdef AHCI_DEBUG { int i; DPRINTF("ACMD:"); for (i = 0; i < 16; i++) DPRINTF("%02x ", acmd[i]); DPRINTF(""); } #endif switch (acmd[0]) { case TEST_UNIT_READY: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case INQUIRY: atapi_inquiry(p, slot, cfis); break; case READ_CAPACITY: atapi_read_capacity(p, slot, cfis); break; case PREVENT_ALLOW: /* TODO */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case READ_TOC: atapi_read_toc(p, slot, cfis); break; case REPORT_LUNS: atapi_report_luns(p, slot, cfis); break; case READ_10: case READ_12: atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); break; case START_STOP_UNIT: atapi_start_stop_unit(p, slot, cfis); break; case MODE_SENSE_10: atapi_mode_sense(p, slot, cfis); break; case GET_EVENT_STATUS_NOTIFICATION: atapi_get_event_status_notification(p, slot, cfis); break; default: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x20; ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); break; case ATA_SETFEATURES: { switch (cfis[3]) { case ATA_SF_ENAB_SATA_SF: switch (cfis[12]) { case ATA_SATA_SF_AN: p->tfd = ATA_S_DSC | ATA_S_READY; break; default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } break; case ATA_SF_ENAB_WCACHE: case ATA_SF_DIS_WCACHE: case ATA_SF_ENAB_RCACHE: case ATA_SF_DIS_RCACHE: p->tfd = ATA_S_DSC | ATA_S_READY; break; case ATA_SF_SETXFER: { switch (cfis[12] & 0xf8) { case ATA_PIO: case ATA_PIO0: break; case ATA_WDMA0: case ATA_UDMA0: p->xfermode = (cfis[12] & 0x7); break; } p->tfd = ATA_S_DSC | ATA_S_READY; break; } default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; } case ATA_SET_MULTI: if (cfis[12] != 0 && (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); } else { p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; case ATA_READ: case ATA_WRITE: case ATA_READ48: case ATA_WRITE48: case ATA_READ_MUL: case ATA_WRITE_MUL: case ATA_READ_MUL48: case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; case ATA_DATA_SET_MANAGEMENT: if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && cfis[13] == 0 && cfis[12] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_SEND_FPDMA_QUEUED: if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && cfis[11] == 0 && cfis[3] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_READ_LOG_EXT: case ATA_READ_LOG_DMA_EXT: ahci_handle_read_log(p, slot, cfis); break; case ATA_SECURITY_FREEZE_LOCK: case ATA_SMART_CMD: case ATA_NOP: ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_CHECK_POWER_MODE: cfis[12] = 0xff; /* always on */ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: case ATA_READ_VERIFY: case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_ATAPI_IDENTIFY: handle_atapi_identify(p, slot, cfis); break; case ATA_PACKET_CMD: if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: WPRINTF("Unsupported cmd:%02x", cfis[2]); ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; #ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; #endif struct pci_ahci_softc *sc; uint8_t *cfis; #ifdef AHCI_DEBUG int cfl, i; #endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); #ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; #endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); #ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); DPRINTF("cfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) DPRINTF(""); DPRINTF("%02x ", cfis[i]); } DPRINTF(""); for (i = 0; i < hdr->prdtl; i++) { DPRINTF("%d@%08"PRIx64"", prdt->dbc & 0x3fffff, prdt->dba); prdt++; } #endif if (cfis[0] != FIS_TYPE_REGH2D) { WPRINTF("Not a H2D FIS:%02x", cfis[0]); return; } if (cfis[1] & 0x80) { ahci_handle_cmd(p, slot, cfis); } else { if (cfis[15] & (1 << 2)) p->reset = 1; else if (p->reset) { p->reset = 0; ahci_port_reset(p); } p->ci &= ~(1 << slot); } } static void ahci_handle_port(struct ahci_port *p) { if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that * are already in-flight. Stop if device is busy or in error. */ for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) break; if (p->waitforclear) break; if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; ahci_handle_slot(p, p->ccs); } } } /* * blockif callback routine - this runs in the context of the blockif * i/o thread, so the mutex needs to be acquired. */ static void ata_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; int slot, ncq, dsm; DPRINTF("%s %d", __func__, err); ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; if (cfis[2] == ATA_DATA_SET_MANAGEMENT || (cfis[2] == ATA_SEND_FPDMA_QUEUED && (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) dsm = 1; pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { if (dsm) ahci_handle_dsm_trim(p, slot, cfis, aior->done); else ahci_handle_rw(p, slot, cfis, aior->done); goto out; } if (!err) tfd = ATA_S_READY | ATA_S_DSC; else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; if (ncq) ahci_write_fis_sdb(p, slot, cfis, tfd); else ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void atapi_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; int slot; DPRINTF("%s %d", __func__, err); aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { atapi_read(p, slot, cfis, aior->done); goto out; } if (!err) { tfd = ATA_S_READY | ATA_S_DSC; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void pci_ahci_ioreq_init(struct ahci_port *pr) { struct ahci_ioreq *vr; int i; pr->ioqsz = blockif_queuesz(pr->bctx); pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); STAILQ_INIT(&pr->iofhd); /* * Add all i/o request entries to the free queue */ for (i = 0; i < pr->ioqsz; i++) { vr = &pr->ioreq[i]; vr->io_pr = pr; if (!pr->atapi) vr->io_req.br_callback = ata_ioreq_cb; else vr->io_req.br_callback = atapi_ioreq_cb; vr->io_req.br_param = vr; STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); } TAILQ_INIT(&pr->iobhd); } static void pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; struct ahci_port *p = &sc->port[port]; DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"", port, offset, value); switch (offset) { case AHCI_P_CLB: p->clb = value; break; case AHCI_P_CLBU: p->clbu = value; break; case AHCI_P_FB: p->fb = value; break; case AHCI_P_FBU: p->fbu = value; break; case AHCI_P_IS: p->is &= ~value; ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; ahci_port_intr(p); break; case AHCI_P_CMD: { p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { uint64_t clb; p->cmd |= AHCI_P_CMD_CR; clb = (uint64_t)p->clbu << 32 | p->clb; p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, AHCI_CL_SIZE * AHCI_MAX_SLOTS); } if (value & AHCI_P_CMD_FRE) { uint64_t fb; p->cmd |= AHCI_P_CMD_FR; fb = (uint64_t)p->fbu << 32 | p->fb; /* we don't support FBSCP, so rfis size is 256Bytes */ p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); } else { p->cmd &= ~AHCI_P_CMD_FR; } if (value & AHCI_P_CMD_CLO) { p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } if (value & AHCI_P_CMD_ICC_MASK) { p->cmd &= ~AHCI_P_CMD_ICC_MASK; } ahci_handle_port(p); break; } case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"", offset); break; case AHCI_P_SCTL: p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); } break; case AHCI_P_SERR: p->serr &= ~value; break; case AHCI_P_SACT: p->sact |= value; break; case AHCI_P_CI: p->ci |= value; ahci_handle_port(p); break; case AHCI_P_SNTF: case AHCI_P_FBS: default: break; } } static void pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"", offset, value); switch (offset) { case AHCI_CAP: case AHCI_PI: case AHCI_VS: case AHCI_CAP2: DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"", offset); break; case AHCI_GHC: if (value & AHCI_GHC_HR) { ahci_reset(sc); break; } if (value & AHCI_GHC_IE) sc->ghc |= AHCI_GHC_IE; else sc->ghc &= ~AHCI_GHC_IE; ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; ahci_generate_intr(sc, value); break; default: break; } } static void pci_ahci_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); if (offset < AHCI_OFFSET) pci_ahci_host_write(sc, offset, value); else if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) pci_ahci_port_write(sc, offset, value); else WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"", offset); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; switch (offset) { case AHCI_CAP: case AHCI_GHC: case AHCI_IS: case AHCI_PI: case AHCI_VS: case AHCI_CCCC: case AHCI_CCCP: case AHCI_EM_LOC: case AHCI_EM_CTL: case AHCI_CAP2: { uint32_t *p = &sc->cap; p += (offset - AHCI_CAP) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x", offset, value); return (value); } static uint64_t pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; switch (offset) { case AHCI_P_CLB: case AHCI_P_CLBU: case AHCI_P_FB: case AHCI_P_FBU: case AHCI_P_IS: case AHCI_P_IE: case AHCI_P_CMD: case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: case AHCI_P_SCTL: case AHCI_P_SERR: case AHCI_P_SACT: case AHCI_P_CI: case AHCI_P_SNTF: case AHCI_P_FBS: { uint32_t *p= &sc->port[port].clb; p += (offset - AHCI_P_CLB) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x", port, offset, value); return value; } static uint64_t pci_ahci_read(struct pci_devinst *pi, int baridx, uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; uint64_t offset; uint32_t value; assert(baridx == 5); assert(size == 1 || size == 2 || size == 4); assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"", regoff); } value >>= 8 * (regoff & 0x3); pthread_mutex_unlock(&sc->mtx); return (value); } /* * Each AHCI controller has a "port" node which contains nodes for * each port named after the decimal number of the port (no leading * zeroes). Port nodes contain a "type" ("hd" or "cd"), as well as * options for blockif. For example: * * pci.0.1.0 * .device="ahci" * .port * .0 * .type="hd" * .path="/path/to/image" */ static int pci_ahci_legacy_config_port(nvlist_t *nvl, int port, const char *type, const char *opts) { char node_name[sizeof("XX")]; nvlist_t *port_nvl; snprintf(node_name, sizeof(node_name), "%d", port); port_nvl = create_relative_config_node(nvl, node_name); set_config_value_node(port_nvl, "type", type); return (blockif_legacy_config(port_nvl, opts)); } static int pci_ahci_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; const char *type; char *next, *next2, *str, *tofree; int p, ret; if (opts == NULL) return (0); ports_nvl = create_relative_config_node(nvl, "port"); ret = 1; tofree = str = strdup(opts); for (p = 0; p < MAX_PORTS && str != NULL; p++, str = next) { /* Identify and cut off type of present port. */ if (strncmp(str, "hd:", 3) == 0) { type = "hd"; str += 3; } else if (strncmp(str, "cd:", 3) == 0) { type = "cd"; str += 3; } else type = NULL; /* Find and cut off the next port options. */ next = strstr(str, ",hd:"); next2 = strstr(str, ",cd:"); if (next == NULL || (next2 != NULL && next2 < next)) next = next2; if (next != NULL) { next[0] = 0; next++; } if (str[0] == 0) continue; if (type == NULL) { EPRINTLN("Missing or invalid type for port %d: \"%s\"", p, str); goto out; } if (pci_ahci_legacy_config_port(ports_nvl, p, type, str) != 0) goto out; } ret = 0; out: free(tofree); return (ret); } static int pci_ahci_cd_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; ports_nvl = create_relative_config_node(nvl, "port"); return (pci_ahci_legacy_config_port(ports_nvl, 0, "cd", opts)); } static int pci_ahci_hd_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; ports_nvl = create_relative_config_node(nvl, "port"); return (pci_ahci_legacy_config_port(ports_nvl, 0, "hd", opts)); } static int pci_ahci_init(struct pci_devinst *pi, nvlist_t *nvl) { char bident[sizeof("XXX:XXX:XXX")]; char node_name[sizeof("XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; int atapi, ret, slots, p; MD5_CTX mdctx; u_char digest[16]; const char *path, *type, *value; nvlist_t *ports_nvl, *port_nvl; ret = 0; #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; pthread_mutex_init(&sc->mtx, NULL); sc->ports = 0; sc->pi = 0; slots = 32; ports_nvl = find_relative_config_node(nvl, "port"); for (p = 0; ports_nvl != NULL && p < MAX_PORTS; p++) { struct ata_params *ata_ident = &sc->port[p].ata_ident; char ident[AHCI_PORT_IDENT]; snprintf(node_name, sizeof(node_name), "%d", p); port_nvl = find_relative_config_node(ports_nvl, node_name); if (port_nvl == NULL) continue; type = get_config_value_node(port_nvl, "type"); if (type == NULL) continue; if (strcmp(type, "hd") == 0) atapi = 0; else atapi = 1; /* * Attempt to open the backing image. Use the PCI slot/func * and the port number for the identifier string. */ snprintf(bident, sizeof(bident), "%u:%u:%u", pi->pi_slot, pi->pi_func, p); bctxt = blockif_open(port_nvl, bident); if (bctxt == NULL) { sc->ports = p; ret = 1; goto open_fail; } + + ret = blockif_add_boot_device(pi, bctxt); + if (ret) { + sc->ports = p; + goto open_fail; + } + sc->port[p].bctx = bctxt; sc->port[p].pr_sc = sc; sc->port[p].port = p; sc->port[p].atapi = atapi; /* * Create an identifier for the backing file. * Use parts of the md5 sum of the filename */ path = get_config_value_node(port_nvl, "path"); MD5Init(&mdctx); MD5Update(&mdctx, path, strlen(path)); MD5Final(digest, &mdctx); snprintf(ident, AHCI_PORT_IDENT, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); memset(ata_ident, 0, sizeof(struct ata_params)); ata_string((uint8_t*)&ata_ident->serial, ident, 20); ata_string((uint8_t*)&ata_ident->revision, "001", 8); if (atapi) ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DVD ROM", 40); else ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DISK", 40); value = get_config_value_node(port_nvl, "nmrr"); if (value != NULL) ata_ident->media_rotation_rate = atoi(value); value = get_config_value_node(port_nvl, "ser"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->serial), value, 20); value = get_config_value_node(port_nvl, "rev"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->revision), value, 8); value = get_config_value_node(port_nvl, "model"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->model), value, 40); ata_identify_init(&sc->port[p], atapi); /* * Allocate blockif request structures and add them * to the free list */ pci_ahci_ioreq_init(&sc->port[p]); sc->pi |= (1 << p); if (sc->port[p].ioqsz < slots) slots = sc->port[p].ioqsz; } sc->ports = p; /* Intel ICH8 AHCI */ --slots; if (sc->ports < DEF_PORTS) sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); p = MIN(sc->ports, 16); p = flsl(p) - ((p & (p - 1)) ? 0 : 1); pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); pci_lintr_request(pi); open_fail: if (ret) { for (p = 0; p < sc->ports; p++) { if (sc->port[p].bctx != NULL) blockif_close(sc->port[p].bctx); } free(sc); } return (ret); } #ifdef BHYVE_SNAPSHOT static int pci_ahci_snapshot(struct vm_snapshot_meta *meta) { int i, ret; void *bctx; struct pci_devinst *pi; struct pci_ahci_softc *sc; struct ahci_port *port; pi = meta->dev_data; sc = pi->pi_arg; /* TODO: add mtx lock/unlock */ SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done); for (i = 0; i < MAX_PORTS; i++) { port = &sc->port[i]; if (meta->op == VM_SNAPSHOT_SAVE) bctx = port->bctx; SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done); /* Mostly for restore; save is ensured by the lines above. */ if (((bctx == NULL) && (port->bctx != NULL)) || ((bctx != NULL) && (port->bctx == NULL))) { fprintf(stderr, "%s: ports not matching\r\n", __func__); ret = EINVAL; goto done; } if (port->bctx == NULL) continue; if (port->port != i) { fprintf(stderr, "%s: ports not matching: " "actual: %d expected: %d\r\n", __func__, port->port, i); ret = EINVAL; goto done; } SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, port->cmd_lst, AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, port->rfis, 256, false, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ata_ident, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done); assert(TAILQ_EMPTY(&port->iobhd)); } done: return (ret); } static int pci_ahci_pause(struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_pause(bctxt); } return (0); } static int pci_ahci_resume(struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_resume(bctxt); } return (0); } #endif /* BHYVE_SNAPSHOT */ /* * Use separate emulation names to distinguish drive and atapi devices */ static const struct pci_devemu pci_de_ahci = { .pe_emu = "ahci", .pe_init = pci_ahci_init, .pe_legacy_config = pci_ahci_legacy_config, .pe_barwrite = pci_ahci_write, .pe_barread = pci_ahci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_ahci_snapshot, .pe_pause = pci_ahci_pause, .pe_resume = pci_ahci_resume, #endif }; PCI_EMUL_SET(pci_de_ahci); static const struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", .pe_legacy_config = pci_ahci_hd_legacy_config, .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_hd); static const struct pci_devemu pci_de_ahci_cd = { .pe_emu = "ahci-cd", .pe_legacy_config = pci_ahci_cd_legacy_config, .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr.sbin/bhyve/pci_nvme.c b/usr.sbin/bhyve/pci_nvme.c index de5865220155..a18413a50367 100644 --- a/usr.sbin/bhyve/pci_nvme.c +++ b/usr.sbin/bhyve/pci_nvme.c @@ -1,3342 +1,3350 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2017 Shunsuke Mie * Copyright (c) 2018 Leon Dang * Copyright (c) 2020 Chuck Tuffli * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * bhyve PCIe-NVMe device emulation. * * options: * -s ,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm= * * accepted devpath: * /dev/blockdev * /path/to/image * ram=size_in_MiB * * maxq = max number of queues * qsz = max elements in each queue * ioslots = max number of concurrent io requests * sectsz = sector size (defaults to blockif sector size) * ser = serial number (20-chars max) * eui64 = IEEE Extended Unique Identifier (8 byte value) * dsm = DataSet Management support. Option is one of auto, enable,disable * */ /* TODO: - create async event for smart and log - intr coalesce */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "block_if.h" #include "config.h" #include "debug.h" #include "pci_emul.h" static int nvme_debug = 0; #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) /* defaults; can be overridden */ #define NVME_MSIX_BAR 4 #define NVME_IOSLOTS 8 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ #define NVME_MMIO_SPACE_MIN (1 << 14) #define NVME_QUEUES 16 #define NVME_MAX_QENTRIES 2048 /* Memory Page size Minimum reported in CAP register */ #define NVME_MPSMIN 0 /* MPSMIN converted to bytes */ #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) #define NVME_MDTS 9 /* Note the + 1 allows for the initial descriptor to not be page aligned */ #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) /* This is a synthetic status code to indicate there is no status */ #define NVME_NO_STATUS 0xffff #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) /* Reported temperature in Kelvin (i.e. room temperature) */ #define NVME_TEMPERATURE 296 /* helpers */ /* Convert a zero-based value into a one-based value */ #define ONE_BASED(zero) ((zero) + 1) /* Convert a one-based value into a zero-based value */ #define ZERO_BASED(one) ((one) - 1) /* Encode number of SQ's and CQ's for Set/Get Features */ #define NVME_FEATURE_NUM_QUEUES(sc) \ (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) enum nvme_controller_register_offsets { NVME_CR_CAP_LOW = 0x00, NVME_CR_CAP_HI = 0x04, NVME_CR_VS = 0x08, NVME_CR_INTMS = 0x0c, NVME_CR_INTMC = 0x10, NVME_CR_CC = 0x14, NVME_CR_CSTS = 0x1c, NVME_CR_NSSR = 0x20, NVME_CR_AQA = 0x24, NVME_CR_ASQ_LOW = 0x28, NVME_CR_ASQ_HI = 0x2c, NVME_CR_ACQ_LOW = 0x30, NVME_CR_ACQ_HI = 0x34, }; enum nvme_cmd_cdw11 { NVME_CMD_CDW11_PC = 0x0001, NVME_CMD_CDW11_IEN = 0x0002, NVME_CMD_CDW11_IV = 0xFFFF0000, }; enum nvme_copy_dir { NVME_COPY_TO_PRP, NVME_COPY_FROM_PRP, }; #define NVME_CQ_INTEN 0x01 #define NVME_CQ_INTCOAL 0x02 struct nvme_completion_queue { struct nvme_completion *qbase; pthread_mutex_t mtx; uint32_t size; uint16_t tail; /* nvme progress */ uint16_t head; /* guest progress */ uint16_t intr_vec; uint32_t intr_en; }; struct nvme_submission_queue { struct nvme_command *qbase; pthread_mutex_t mtx; uint32_t size; uint16_t head; /* nvme progress */ uint16_t tail; /* guest progress */ uint16_t cqid; /* completion queue id */ int qpriority; }; enum nvme_storage_type { NVME_STOR_BLOCKIF = 0, NVME_STOR_RAM = 1, }; struct pci_nvme_blockstore { enum nvme_storage_type type; void *ctx; uint64_t size; uint32_t sectsz; uint32_t sectsz_bits; uint64_t eui64; uint32_t deallocate:1; }; /* * Calculate the number of additional page descriptors for guest IO requests * based on the advertised Max Data Transfer (MDTS) and given the number of * default iovec's in a struct blockif_req. */ #define MDTS_PAD_SIZE \ ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 0 ) struct pci_nvme_ioreq { struct pci_nvme_softc *sc; STAILQ_ENTRY(pci_nvme_ioreq) link; struct nvme_submission_queue *nvme_sq; uint16_t sqid; /* command information */ uint16_t opc; uint16_t cid; uint32_t nsid; uint64_t prev_gpaddr; size_t prev_size; size_t bytes; struct blockif_req io_req; struct iovec iovpadding[MDTS_PAD_SIZE]; }; enum nvme_dsm_type { /* Dataset Management bit in ONCS reflects backing storage capability */ NVME_DATASET_MANAGEMENT_AUTO, /* Unconditionally set Dataset Management bit in ONCS */ NVME_DATASET_MANAGEMENT_ENABLE, /* Unconditionally clear Dataset Management bit in ONCS */ NVME_DATASET_MANAGEMENT_DISABLE, }; struct pci_nvme_softc; struct nvme_feature_obj; typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); struct nvme_feature_obj { uint32_t cdw11; nvme_feature_cb set; nvme_feature_cb get; bool namespace_specific; }; #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) typedef enum { PCI_NVME_AE_TYPE_ERROR = 0, PCI_NVME_AE_TYPE_SMART, PCI_NVME_AE_TYPE_NOTICE, PCI_NVME_AE_TYPE_IO_CMD = 6, PCI_NVME_AE_TYPE_VENDOR = 7, PCI_NVME_AE_TYPE_MAX /* Must be last */ } pci_nvme_async_type; /* Asynchronous Event Requests */ struct pci_nvme_aer { STAILQ_ENTRY(pci_nvme_aer) link; uint16_t cid; /* Command ID of the submitted AER */ }; /** Asynchronous Event Information - Notice */ typedef enum { PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, PCI_NVME_AEI_NOTICE_FW_ACTIVATION, PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, PCI_NVME_AEI_NOTICE_ANA_CHANGE, PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, PCI_NVME_AEI_NOTICE_MAX, } pci_nvme_async_event_info_notice; #define PCI_NVME_AEI_NOTICE_SHIFT 8 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) /* Asynchronous Event Notifications */ struct pci_nvme_aen { pci_nvme_async_type atype; uint32_t event_data; bool posted; }; /* * By default, enable all Asynchrnous Event Notifications: * SMART / Health Critical Warnings * Namespace Attribute Notices */ #define PCI_NVME_AEN_DEFAULT_MASK 0x11f typedef enum { NVME_CNTRLTYPE_IO = 1, NVME_CNTRLTYPE_DISCOVERY = 2, NVME_CNTRLTYPE_ADMIN = 3, } pci_nvme_cntrl_type; struct pci_nvme_softc { struct pci_devinst *nsc_pi; pthread_mutex_t mtx; struct nvme_registers regs; struct nvme_namespace_data nsdata; struct nvme_controller_data ctrldata; struct nvme_error_information_entry err_log; struct nvme_health_information_page health_log; struct nvme_firmware_page fw_log; struct nvme_ns_list ns_log; struct pci_nvme_blockstore nvstore; uint16_t max_qentries; /* max entries per queue */ uint32_t max_queues; /* max number of IO SQ's or CQ's */ uint32_t num_cqueues; uint32_t num_squeues; bool num_q_is_set; /* Has host set Number of Queues */ struct pci_nvme_ioreq *ioreqs; STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ uint32_t pending_ios; uint32_t ioslots; sem_t iosemlock; /* * Memory mapped Submission and Completion queues * Each array includes both Admin and IO queues */ struct nvme_completion_queue *compl_queues; struct nvme_submission_queue *submit_queues; struct nvme_feature_obj feat[NVME_FID_MAX]; enum nvme_dsm_type dataset_management; /* Accounting for SMART data */ __uint128_t read_data_units; __uint128_t write_data_units; __uint128_t read_commands; __uint128_t write_commands; uint32_t read_dunits_remainder; uint32_t write_dunits_remainder; STAILQ_HEAD(, pci_nvme_aer) aer_list; pthread_mutex_t aer_mtx; uint32_t aer_count; struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; pthread_t aen_tid; pthread_mutex_t aen_mtx; pthread_cond_t aen_cond; }; static void pci_nvme_cq_update(struct pci_nvme_softc *sc, struct nvme_completion_queue *cq, uint32_t cdw0, uint16_t cid, uint16_t sqid, uint16_t status); static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); static void pci_nvme_io_done(struct blockif_req *, int); /* Controller Configuration utils */ #define NVME_CC_GET_EN(cc) \ ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) #define NVME_CC_GET_CSS(cc) \ ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) #define NVME_CC_GET_SHN(cc) \ ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) #define NVME_CC_GET_IOSQES(cc) \ ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) #define NVME_CC_GET_IOCQES(cc) \ ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) #define NVME_CC_WRITE_MASK \ ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) #define NVME_CC_NEN_WRITE_MASK \ ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) /* Controller Status utils */ #define NVME_CSTS_GET_RDY(sts) \ ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT) /* Completion Queue status word utils */ #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) #define NVME_STATUS_MASK \ ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ NVME_CTRLR_DATA_ONCS_DSM_SHIFT) static void nvme_feature_invalid_cb(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_temperature(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_num_queues(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_iv_config(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_async_event(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void *aen_thr(void *arg); static __inline void cpywithpad(char *dst, size_t dst_size, const char *src, char pad) { size_t len; len = strnlen(src, dst_size); memset(dst, pad, dst_size); memcpy(dst, src, len); } static __inline void pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) { *status &= ~NVME_STATUS_MASK; *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; } static __inline void pci_nvme_status_genc(uint16_t *status, uint16_t code) { pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); } /* * Initialize the requested number or IO Submission and Completion Queues. * Admin queues are allocated implicitly. */ static void pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) { uint32_t i; /* * Allocate and initialize the Submission Queues */ if (nsq > NVME_QUEUES) { WPRINTF("%s: clamping number of SQ from %u to %u", __func__, nsq, NVME_QUEUES); nsq = NVME_QUEUES; } sc->num_squeues = nsq; sc->submit_queues = calloc(sc->num_squeues + 1, sizeof(struct nvme_submission_queue)); if (sc->submit_queues == NULL) { WPRINTF("%s: SQ allocation failed", __func__); sc->num_squeues = 0; } else { struct nvme_submission_queue *sq = sc->submit_queues; for (i = 0; i < sc->num_squeues + 1; i++) pthread_mutex_init(&sq[i].mtx, NULL); } /* * Allocate and initialize the Completion Queues */ if (ncq > NVME_QUEUES) { WPRINTF("%s: clamping number of CQ from %u to %u", __func__, ncq, NVME_QUEUES); ncq = NVME_QUEUES; } sc->num_cqueues = ncq; sc->compl_queues = calloc(sc->num_cqueues + 1, sizeof(struct nvme_completion_queue)); if (sc->compl_queues == NULL) { WPRINTF("%s: CQ allocation failed", __func__); sc->num_cqueues = 0; } else { struct nvme_completion_queue *cq = sc->compl_queues; for (i = 0; i < sc->num_cqueues + 1; i++) pthread_mutex_init(&cq[i].mtx, NULL); } } static void pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; cd->vid = 0xFB5D; cd->ssvid = 0x0000; cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); /* Num of submission commands that we can handle at a time (2^rab) */ cd->rab = 4; /* FreeBSD OUI */ cd->ieee[0] = 0x58; cd->ieee[1] = 0x9c; cd->ieee[2] = 0xfc; cd->mic = 0; cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ cd->ver = NVME_REV(1,4); cd->cntrltype = NVME_CNTRLTYPE_IO; cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); cd->acl = 2; cd->aerl = 4; /* Advertise 1, Read-only firmware slot */ cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); cd->lpa = 0; /* TODO: support some simple things like SMART */ cd->elpe = 0; /* max error log page entries */ /* * Report a single power state (zero-based value) * power_state[] values are left as zero to indicate "Not reported" */ cd->npss = 0; /* Warning Composite Temperature Threshold */ cd->wctemp = 0x0157; cd->cctemp = 0x0157; /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO << NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT); cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); cd->nn = 1; /* number of namespaces */ cd->oncs = 0; switch (sc->dataset_management) { case NVME_DATASET_MANAGEMENT_AUTO: if (sc->nvstore.deallocate) cd->oncs |= NVME_ONCS_DSM; break; case NVME_DATASET_MANAGEMENT_ENABLE: cd->oncs |= NVME_ONCS_DSM; break; default: break; } cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; } static void pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, struct nvme_namespace_data *nd) { /* Get capacity and block size information from backing store */ nd->nsze = nvstore->size / nvstore->sectsz; nd->ncap = nd->nsze; nd->nuse = nd->nsze; } static void pci_nvme_init_nsdata(struct pci_nvme_softc *sc, struct nvme_namespace_data *nd, uint32_t nsid, struct pci_nvme_blockstore *nvstore) { pci_nvme_init_nsdata_size(nvstore, nd); if (nvstore->type == NVME_STOR_BLOCKIF) nvstore->deallocate = blockif_candelete(nvstore->ctx); nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ nd->flbas = 0; /* Create an EUI-64 if user did not provide one */ if (nvstore->eui64 == 0) { char *data = NULL; uint64_t eui64 = nvstore->eui64; asprintf(&data, "%s%u%u%u", get_config_value("name"), sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); if (data != NULL) { eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); free(data); } nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); } be64enc(nd->eui64, nvstore->eui64); /* LBA data-sz = 2^lbads */ nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; } static void pci_nvme_init_logpages(struct pci_nvme_softc *sc) { __uint128_t power_cycles = 1; memset(&sc->err_log, 0, sizeof(sc->err_log)); memset(&sc->health_log, 0, sizeof(sc->health_log)); memset(&sc->fw_log, 0, sizeof(sc->fw_log)); memset(&sc->ns_log, 0, sizeof(sc->ns_log)); /* Set read/write remainder to round up according to spec */ sc->read_dunits_remainder = 999; sc->write_dunits_remainder = 999; /* Set nominal Health values checked by implementations */ sc->health_log.temperature = NVME_TEMPERATURE; sc->health_log.available_spare = 100; sc->health_log.available_spare_threshold = 10; /* Set Active Firmware Info to slot 1 */ sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT); memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, sizeof(sc->fw_log.revision[0])); memcpy(&sc->health_log.power_cycles, &power_cycles, sizeof(sc->health_log.power_cycles)); } static void pci_nvme_init_features(struct pci_nvme_softc *sc) { enum nvme_feature fid; for (fid = 0; fid < NVME_FID_MAX; fid++) { switch (fid) { case NVME_FEAT_ARBITRATION: case NVME_FEAT_POWER_MANAGEMENT: case NVME_FEAT_INTERRUPT_COALESCING: //XXX case NVME_FEAT_WRITE_ATOMICITY: /* Mandatory but no special handling required */ //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: // this returns a data buffer break; case NVME_FEAT_TEMPERATURE_THRESHOLD: sc->feat[fid].set = nvme_feature_temperature; break; case NVME_FEAT_ERROR_RECOVERY: sc->feat[fid].namespace_specific = true; break; case NVME_FEAT_NUMBER_OF_QUEUES: sc->feat[fid].set = nvme_feature_num_queues; break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: sc->feat[fid].set = nvme_feature_iv_config; break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: sc->feat[fid].set = nvme_feature_async_event; /* Enable all AENs by default */ sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; break; default: sc->feat[fid].set = nvme_feature_invalid_cb; sc->feat[fid].get = nvme_feature_invalid_cb; } } } static void pci_nvme_aer_reset(struct pci_nvme_softc *sc) { STAILQ_INIT(&sc->aer_list); sc->aer_count = 0; } static void pci_nvme_aer_init(struct pci_nvme_softc *sc) { pthread_mutex_init(&sc->aer_mtx, NULL); pci_nvme_aer_reset(sc); } static void pci_nvme_aer_destroy(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer = NULL; pthread_mutex_lock(&sc->aer_mtx); while (!STAILQ_EMPTY(&sc->aer_list)) { aer = STAILQ_FIRST(&sc->aer_list); STAILQ_REMOVE_HEAD(&sc->aer_list, link); free(aer); } pthread_mutex_unlock(&sc->aer_mtx); pci_nvme_aer_reset(sc); } static bool pci_nvme_aer_available(struct pci_nvme_softc *sc) { return (sc->aer_count != 0); } static bool pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; /* AERL is a zero based value while aer_count is one's based */ return (sc->aer_count == (cd->aerl + 1U)); } /* * Add an Async Event Request * * Stores an AER to be returned later if the Controller needs to notify the * host of an event. * Note that while the NVMe spec doesn't require Controllers to return AER's * in order, this implementation does preserve the order. */ static int pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) { struct pci_nvme_aer *aer = NULL; aer = calloc(1, sizeof(struct pci_nvme_aer)); if (aer == NULL) return (-1); /* Save the Command ID for use in the completion message */ aer->cid = cid; pthread_mutex_lock(&sc->aer_mtx); sc->aer_count++; STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); pthread_mutex_unlock(&sc->aer_mtx); return (0); } /* * Get an Async Event Request structure * * Returns a pointer to an AER previously submitted by the host or NULL if * no AER's exist. Caller is responsible for freeing the returned struct. */ static struct pci_nvme_aer * pci_nvme_aer_get(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer = NULL; pthread_mutex_lock(&sc->aer_mtx); aer = STAILQ_FIRST(&sc->aer_list); if (aer != NULL) { STAILQ_REMOVE_HEAD(&sc->aer_list, link); sc->aer_count--; } pthread_mutex_unlock(&sc->aer_mtx); return (aer); } static void pci_nvme_aen_reset(struct pci_nvme_softc *sc) { uint32_t atype; memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { sc->aen[atype].atype = atype; } } static void pci_nvme_aen_init(struct pci_nvme_softc *sc) { char nstr[80]; pci_nvme_aen_reset(sc); pthread_mutex_init(&sc->aen_mtx, NULL); pthread_create(&sc->aen_tid, NULL, aen_thr, sc); snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); pthread_set_name_np(sc->aen_tid, nstr); } static void pci_nvme_aen_destroy(struct pci_nvme_softc *sc) { pci_nvme_aen_reset(sc); } /* Notify the AEN thread of pending work */ static void pci_nvme_aen_notify(struct pci_nvme_softc *sc) { pthread_cond_signal(&sc->aen_cond); } /* * Post an Asynchronous Event Notification */ static int32_t pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, uint32_t event_data) { struct pci_nvme_aen *aen; if (atype >= PCI_NVME_AE_TYPE_MAX) { return(EINVAL); } pthread_mutex_lock(&sc->aen_mtx); aen = &sc->aen[atype]; /* Has the controller already posted an event of this type? */ if (aen->posted) { pthread_mutex_unlock(&sc->aen_mtx); return(EALREADY); } aen->event_data = event_data; aen->posted = true; pthread_mutex_unlock(&sc->aen_mtx); pci_nvme_aen_notify(sc); return(0); } static void pci_nvme_aen_process(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer; struct pci_nvme_aen *aen; pci_nvme_async_type atype; uint32_t mask; uint16_t status; uint8_t lid; assert(pthread_mutex_isowned_np(&sc->aen_mtx)); for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { aen = &sc->aen[atype]; /* Previous iterations may have depleted the available AER's */ if (!pci_nvme_aer_available(sc)) { DPRINTF("%s: no AER", __func__); break; } if (!aen->posted) { DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); continue; } status = NVME_SC_SUCCESS; /* Is the event masked? */ mask = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); switch (atype) { case PCI_NVME_AE_TYPE_ERROR: lid = NVME_LOG_ERROR; break; case PCI_NVME_AE_TYPE_SMART: mask &= 0xff; if ((mask & aen->event_data) == 0) continue; lid = NVME_LOG_HEALTH_INFORMATION; break; case PCI_NVME_AE_TYPE_NOTICE: if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { EPRINTLN("%s unknown AEN notice type %u", __func__, aen->event_data); status = NVME_SC_INTERNAL_DEVICE_ERROR; lid = 0; break; } if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) continue; switch (aen->event_data) { case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: lid = NVME_LOG_CHANGED_NAMESPACE; break; case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: lid = NVME_LOG_FIRMWARE_SLOT; break; case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; break; case PCI_NVME_AEI_NOTICE_ANA_CHANGE: lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; break; case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; break; case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: lid = NVME_LOG_LBA_STATUS_INFORMATION; break; case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; break; default: lid = 0; } break; default: /* bad type?!? */ EPRINTLN("%s unknown AEN type %u", __func__, atype); status = NVME_SC_INTERNAL_DEVICE_ERROR; lid = 0; break; } aer = pci_nvme_aer_get(sc); assert(aer != NULL); DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); pci_nvme_cq_update(sc, &sc->compl_queues[0], (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ aer->cid, 0, /* SQID */ status); aen->event_data = 0; aen->posted = false; pci_generate_msix(sc->nsc_pi, 0); } } static void * aen_thr(void *arg) { struct pci_nvme_softc *sc; sc = arg; pthread_mutex_lock(&sc->aen_mtx); for (;;) { pci_nvme_aen_process(sc); pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); } pthread_mutex_unlock(&sc->aen_mtx); pthread_exit(NULL); return (NULL); } static void pci_nvme_reset_locked(struct pci_nvme_softc *sc) { uint32_t i; DPRINTF("%s", __func__); sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | (1 << NVME_CAP_LO_REG_CQR_SHIFT) | (60 << NVME_CAP_LO_REG_TO_SHIFT); sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ sc->regs.cc = 0; assert(sc->submit_queues != NULL); for (i = 0; i < sc->num_squeues + 1; i++) { sc->submit_queues[i].qbase = NULL; sc->submit_queues[i].size = 0; sc->submit_queues[i].cqid = 0; sc->submit_queues[i].tail = 0; sc->submit_queues[i].head = 0; } assert(sc->compl_queues != NULL); for (i = 0; i < sc->num_cqueues + 1; i++) { sc->compl_queues[i].qbase = NULL; sc->compl_queues[i].size = 0; sc->compl_queues[i].tail = 0; sc->compl_queues[i].head = 0; } sc->num_q_is_set = false; pci_nvme_aer_destroy(sc); pci_nvme_aen_destroy(sc); /* * Clear CSTS.RDY last to prevent the host from enabling Controller * before cleanup completes */ sc->regs.csts = 0; } static void pci_nvme_reset(struct pci_nvme_softc *sc) { pthread_mutex_lock(&sc->mtx); pci_nvme_reset_locked(sc); pthread_mutex_unlock(&sc->mtx); } static int pci_nvme_init_controller(struct pci_nvme_softc *sc) { uint16_t acqs, asqs; DPRINTF("%s", __func__); /* * NVMe 2.0 states that "enabling a controller while this field is * cleared to 0h produces undefined results" for both ACQS and * ASQS. If zero, set CFS and do not become ready. */ asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK); if (asqs < 2) { EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, asqs - 1, sc->regs.aqa); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->submit_queues[0].size = asqs; sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, sc->regs.asq, sizeof(struct nvme_command) * asqs); if (sc->submit_queues[0].qbase == NULL) { EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, sc->regs.asq); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", __func__, sc->regs.asq, sc->submit_queues[0].qbase); acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & NVME_AQA_REG_ACQS_MASK); if (acqs < 2) { EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, acqs - 1, sc->regs.aqa); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->compl_queues[0].size = acqs; sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, sc->regs.acq, sizeof(struct nvme_completion) * acqs); if (sc->compl_queues[0].qbase == NULL) { EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, sc->regs.acq); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->compl_queues[0].intr_en = NVME_CQ_INTEN; DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", __func__, sc->regs.acq, sc->compl_queues[0].qbase); return (0); } static int nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, size_t len, enum nvme_copy_dir dir) { uint8_t *p; size_t bytes; if (len > (8 * 1024)) { return (-1); } /* Copy from the start of prp1 to the end of the physical page */ bytes = PAGE_SIZE - (prp1 & PAGE_MASK); bytes = MIN(bytes, len); p = vm_map_gpa(ctx, prp1, bytes); if (p == NULL) { return (-1); } if (dir == NVME_COPY_TO_PRP) memcpy(p, b, bytes); else memcpy(b, p, bytes); b += bytes; len -= bytes; if (len == 0) { return (0); } len = MIN(len, PAGE_SIZE); p = vm_map_gpa(ctx, prp2, len); if (p == NULL) { return (-1); } if (dir == NVME_COPY_TO_PRP) memcpy(p, b, len); else memcpy(b, p, len); return (0); } /* * Write a Completion Queue Entry update * * Write the completion and update the doorbell value */ static void pci_nvme_cq_update(struct pci_nvme_softc *sc, struct nvme_completion_queue *cq, uint32_t cdw0, uint16_t cid, uint16_t sqid, uint16_t status) { struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; struct nvme_completion *cqe; assert(cq->qbase != NULL); pthread_mutex_lock(&cq->mtx); cqe = &cq->qbase[cq->tail]; /* Flip the phase bit */ status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; cqe->cdw0 = cdw0; cqe->sqhd = sq->head; cqe->sqid = sqid; cqe->cid = cid; cqe->status = status; cq->tail++; if (cq->tail >= cq->size) { cq->tail = 0; } pthread_mutex_unlock(&cq->mtx); } static int nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); if (qid == 0 || qid > sc->num_squeues || (sc->submit_queues[qid].qbase == NULL)) { WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", __func__, qid, sc->num_squeues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } sc->submit_queues[qid].qbase = NULL; sc->submit_queues[qid].cqid = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { if (command->cdw11 & NVME_CMD_CDW11_PC) { uint16_t qid = command->cdw10 & 0xffff; struct nvme_submission_queue *nsq; if ((qid == 0) || (qid > sc->num_squeues) || (sc->submit_queues[qid].qbase != NULL)) { WPRINTF("%s queue index %u > num_squeues %u", __func__, qid, sc->num_squeues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } nsq = &sc->submit_queues[qid]; nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { /* * Queues must specify at least two entries * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec */ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); return (1); } nsq->head = nsq->tail = 0; nsq->cqid = (command->cdw11 >> 16) & 0xffff; if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } if (sc->compl_queues[nsq->cqid].qbase == NULL) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_COMPLETION_QUEUE_INVALID); return (1); } nsq->qpriority = (command->cdw11 >> 1) & 0x03; nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)nsq->size); DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, qid, nsq->size, nsq->qbase, nsq->cqid); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); DPRINTF("%s completed creating IOSQ qid %u", __func__, qid); } else { /* * Guest sent non-cont submission queue request. * This setting is unsupported by this emulation. */ WPRINTF("%s unsupported non-contig (list-based) " "create i/o submission queue", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } return (1); } static int nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; uint16_t sqid; DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); if (qid == 0 || qid > sc->num_cqueues || (sc->compl_queues[qid].qbase == NULL)) { WPRINTF("%s queue index %u / num_cqueues %u", __func__, qid, sc->num_cqueues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } /* Deleting an Active CQ is an error */ for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) if (sc->submit_queues[sqid].cqid == qid) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_DELETION); return (1); } sc->compl_queues[qid].qbase = NULL; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { struct nvme_completion_queue *ncq; uint16_t qid = command->cdw10 & 0xffff; /* Only support Physically Contiguous queues */ if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { WPRINTF("%s unsupported non-contig (list-based) " "create i/o completion queue", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if ((qid == 0) || (qid > sc->num_cqueues) || (sc->compl_queues[qid].qbase != NULL)) { WPRINTF("%s queue index %u > num_cqueues %u", __func__, qid, sc->num_cqueues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } ncq = &sc->compl_queues[qid]; ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; if (ncq->intr_vec > (sc->max_queues + 1)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_INTERRUPT_VECTOR); return (1); } ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { /* * Queues must specify at least two entries * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec */ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); return (1); } ncq->head = ncq->tail = 0; ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)ncq->size); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint64_t logoff; uint32_t logsize; uint8_t logpage; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); /* * Command specifies the number of dwords to return in fields NUMDU * and NUMDL. This is a zero-based value. */ logpage = command->cdw10 & 0xFF; logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; logsize *= sizeof(uint32_t); logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; DPRINTF("%s log page %u len %u", __func__, logpage, logsize); switch (logpage) { case NVME_LOG_ERROR: if (logoff >= sizeof(sc->err_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->err_log + logoff, MIN(logsize - logoff, sizeof(sc->err_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_HEALTH_INFORMATION: if (logoff >= sizeof(sc->health_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } pthread_mutex_lock(&sc->mtx); memcpy(&sc->health_log.data_units_read, &sc->read_data_units, sizeof(sc->health_log.data_units_read)); memcpy(&sc->health_log.data_units_written, &sc->write_data_units, sizeof(sc->health_log.data_units_written)); memcpy(&sc->health_log.host_read_commands, &sc->read_commands, sizeof(sc->health_log.host_read_commands)); memcpy(&sc->health_log.host_write_commands, &sc->write_commands, sizeof(sc->health_log.host_write_commands)); pthread_mutex_unlock(&sc->mtx); nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->health_log + logoff, MIN(logsize - logoff, sizeof(sc->health_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_FIRMWARE_SLOT: if (logoff >= sizeof(sc->fw_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->fw_log + logoff, MIN(logsize - logoff, sizeof(sc->fw_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_CHANGED_NAMESPACE: if (logoff >= sizeof(sc->ns_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->ns_log + logoff, MIN(logsize - logoff, sizeof(sc->ns_log)), NVME_COPY_TO_PRP); memset(&sc->ns_log, 0, sizeof(sc->ns_log)); break; default: DPRINTF("%s get log page %x command not supported", __func__, logpage); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_LOG_PAGE); } return (1); } static int nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { void *dest; uint16_t status; DPRINTF("%s identify 0x%x nsid 0x%x", __func__, command->cdw10 & 0xFF, command->nsid); status = 0; pci_nvme_status_genc(&status, NVME_SC_SUCCESS); switch (command->cdw10 & 0xFF) { case 0x00: /* return Identify Namespace data structure */ /* Global NS only valid with NS Management */ if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), NVME_COPY_TO_PRP); break; case 0x01: /* return Identify Controller data structure */ nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->ctrldata, sizeof(sc->ctrldata), NVME_COPY_TO_PRP); break; case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); /* All unused entries shall be zero */ memset(dest, 0, sizeof(uint32_t) * 1024); ((uint32_t *)dest)[0] = 1; break; case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ if (command->nsid != 1) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); break; } dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); /* All bytes after the descriptor shall be zero */ memset(dest, 0, sizeof(uint32_t) * 1024); /* Return NIDT=1 (i.e. EUI64) descriptor */ ((uint8_t *)dest)[0] = 1; ((uint8_t *)dest)[1] = sizeof(uint64_t); memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); break; case 0x13: /* * Controller list is optional but used by UNH tests. Return * a valid but empty list. */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint16_t) * 2048); memset(dest, 0, sizeof(uint16_t) * 2048); break; default: DPRINTF("%s unsupported identify command requested 0x%x", __func__, command->cdw10 & 0xFF); pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); break; } compl->status = status; return (1); } static const char * nvme_fid_to_name(uint8_t fid) { const char *name; switch (fid) { case NVME_FEAT_ARBITRATION: name = "Arbitration"; break; case NVME_FEAT_POWER_MANAGEMENT: name = "Power Management"; break; case NVME_FEAT_LBA_RANGE_TYPE: name = "LBA Range Type"; break; case NVME_FEAT_TEMPERATURE_THRESHOLD: name = "Temperature Threshold"; break; case NVME_FEAT_ERROR_RECOVERY: name = "Error Recovery"; break; case NVME_FEAT_VOLATILE_WRITE_CACHE: name = "Volatile Write Cache"; break; case NVME_FEAT_NUMBER_OF_QUEUES: name = "Number of Queues"; break; case NVME_FEAT_INTERRUPT_COALESCING: name = "Interrupt Coalescing"; break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: name = "Interrupt Vector Configuration"; break; case NVME_FEAT_WRITE_ATOMICITY: name = "Write Atomicity Normal"; break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: name = "Asynchronous Event Configuration"; break; case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: name = "Autonomous Power State Transition"; break; case NVME_FEAT_HOST_MEMORY_BUFFER: name = "Host Memory Buffer"; break; case NVME_FEAT_TIMESTAMP: name = "Timestamp"; break; case NVME_FEAT_KEEP_ALIVE_TIMER: name = "Keep Alive Timer"; break; case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: name = "Host Controlled Thermal Management"; break; case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: name = "Non-Operation Power State Config"; break; case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: name = "Read Recovery Level Config"; break; case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: name = "Predictable Latency Mode Config"; break; case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: name = "Predictable Latency Mode Window"; break; case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: name = "LBA Status Information Report Interval"; break; case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: name = "Host Behavior Support"; break; case NVME_FEAT_SANITIZE_CONFIG: name = "Sanitize Config"; break; case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: name = "Endurance Group Event Configuration"; break; case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: name = "Software Progress Marker"; break; case NVME_FEAT_HOST_IDENTIFIER: name = "Host Identifier"; break; case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: name = "Reservation Notification Mask"; break; case NVME_FEAT_RESERVATION_PERSISTENCE: name = "Reservation Persistence"; break; case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: name = "Namespace Write Protection Config"; break; default: name = "Unknown"; break; } return (name); } static void nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, struct nvme_feature_obj *feat __unused, struct nvme_command *command __unused, struct nvme_completion *compl) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } static void nvme_feature_iv_config(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint32_t i; uint32_t cdw11 = command->cdw11; uint16_t iv; bool cd; pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); iv = cdw11 & 0xffff; cd = cdw11 & (1 << 16); if (iv > (sc->max_queues + 1)) { return; } /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ if ((iv == 0) && !cd) return; /* Requested Interrupt Vector must be used by a CQ */ for (i = 0; i < sc->num_cqueues + 1; i++) { if (sc->compl_queues[i].intr_vec == iv) { pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); } } } #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) static void nvme_feature_async_event(struct pci_nvme_softc *sc __unused, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } #define NVME_TEMP_THRESH_OVER 0 #define NVME_TEMP_THRESH_UNDER 1 static void nvme_feature_temperature(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint16_t tmpth; /* Temperature Threshold */ uint8_t tmpsel; /* Threshold Temperature Select */ uint8_t thsel; /* Threshold Type Select */ bool set_crit = false; bool report_crit; tmpth = command->cdw11 & 0xffff; tmpsel = (command->cdw11 >> 16) & 0xf; thsel = (command->cdw11 >> 20) & 0x3; DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); /* Check for unsupported values */ if (((tmpsel != 0) && (tmpsel != 0xf)) || (thsel > NVME_TEMP_THRESH_UNDER)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) set_crit = true; pthread_mutex_lock(&sc->mtx); if (set_crit) sc->health_log.critical_warning |= NVME_CRIT_WARN_ST_TEMPERATURE; else sc->health_log.critical_warning &= ~NVME_CRIT_WARN_ST_TEMPERATURE; pthread_mutex_unlock(&sc->mtx); report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 & NVME_CRIT_WARN_ST_TEMPERATURE; if (set_crit && report_crit) pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, sc->health_log.critical_warning); DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status); } static void nvme_feature_num_queues(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint16_t nqr; /* Number of Queues Requested */ if (sc->num_q_is_set) { WPRINTF("%s: Number of Queues already set", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_COMMAND_SEQUENCE_ERROR); return; } nqr = command->cdw11 & 0xFFFF; if (nqr == 0xffff) { WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } sc->num_squeues = ONE_BASED(nqr); if (sc->num_squeues > sc->max_queues) { DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, sc->max_queues); sc->num_squeues = sc->max_queues; } nqr = (command->cdw11 >> 16) & 0xFFFF; if (nqr == 0xffff) { WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } sc->num_cqueues = ONE_BASED(nqr); if (sc->num_cqueues > sc->max_queues) { DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, sc->max_queues); sc->num_cqueues = sc->max_queues; } /* Patch the command value which will be saved on callback's return */ command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); sc->num_q_is_set = true; } static int nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, struct nvme_completion *compl) { struct nvme_feature_obj *feat; uint32_t nsid = command->nsid; uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10); bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10); DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); if (fid >= NVME_FID_MAX) { DPRINTF("%s invalid feature 0x%x", __func__, fid); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (sv) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_SAVEABLE); return (1); } feat = &sc->feat[fid]; if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (!feat->namespace_specific && !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_NS_SPECIFIC); return (1); } compl->cdw0 = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); if (feat->set) feat->set(sc, feat, command, compl); else { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_CHANGEABLE); return (1); } DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); if (compl->status == NVME_SC_SUCCESS) { feat->cdw11 = command->cdw11; if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && (command->cdw11 != 0)) pci_nvme_aen_notify(sc); } return (0); } #define NVME_FEATURES_SEL_SUPPORTED 0x3 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) static int nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { struct nvme_feature_obj *feat; uint8_t fid = command->cdw10 & 0xFF; uint8_t sel = (command->cdw10 >> 8) & 0x7; DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); if (fid >= NVME_FID_MAX) { DPRINTF("%s invalid feature 0x%x", __func__, fid); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } compl->cdw0 = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); feat = &sc->feat[fid]; if (feat->get) { feat->get(sc, feat, command, compl); } if (compl->status == NVME_SC_SUCCESS) { if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; else compl->cdw0 = feat->cdw11; } return (0); } static int nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint8_t ses, lbaf, pi; /* Only supports Secure Erase Setting - User Data Erase */ ses = (command->cdw10 >> 9) & 0x7; if (ses > 0x1) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } /* Only supports a single LBA Format */ lbaf = command->cdw10 & 0xf; if (lbaf != 0) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_FORMAT); return (1); } /* Doesn't support Protection Information */ pi = (command->cdw10 >> 5) & 0x7; if (pi != 0) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (sc->nvstore.type == NVME_STOR_RAM) { if (sc->nvstore.ctx) free(sc->nvstore.ctx); sc->nvstore.ctx = calloc(1, sc->nvstore.size); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); } else { struct pci_nvme_ioreq *req; int err; req = pci_nvme_get_ioreq(sc); if (req == NULL) { pci_nvme_status_genc(&compl->status, NVME_SC_INTERNAL_DEVICE_ERROR); WPRINTF("%s: unable to allocate IO req", __func__); return (1); } req->nvme_sq = &sc->submit_queues[0]; req->sqid = 0; req->opc = command->opc; req->cid = command->cid; req->nsid = command->nsid; req->io_req.br_offset = 0; req->io_req.br_resid = sc->nvstore.size; req->io_req.br_callback = pci_nvme_io_done; err = blockif_delete(sc->nvstore.ctx, &req->io_req); if (err) { pci_nvme_status_genc(&compl->status, NVME_SC_INTERNAL_DEVICE_ERROR); pci_nvme_release_ioreq(sc, req); } else compl->status = NVME_NO_STATUS; } return (1); } static int nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, struct nvme_completion *compl) { DPRINTF("%s submission queue %u, command ID 0x%x", __func__, command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); /* TODO: search for the command ID and abort it */ compl->cdw0 = 1; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_async_event_req(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, sc->aer_count, sc->ctrldata.aerl, command->cid); /* Don't exceed the Async Event Request Limit (AERL). */ if (pci_nvme_aer_limit_reached(sc)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); return (1); } if (pci_nvme_aer_add(sc, command->cid)) { pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, NVME_SC_INTERNAL_DEVICE_ERROR); return (1); } /* * Raise events when they happen based on the Set Features cmd. * These events happen async, so only set completion successful if * there is an event reflective of the request to get event. */ compl->status = NVME_NO_STATUS; pci_nvme_aen_notify(sc); return (0); } static void pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) { struct nvme_completion compl; struct nvme_command *cmd; struct nvme_submission_queue *sq; struct nvme_completion_queue *cq; uint16_t sqhead; DPRINTF("%s index %u", __func__, (uint32_t)value); sq = &sc->submit_queues[0]; cq = &sc->compl_queues[0]; pthread_mutex_lock(&sq->mtx); sqhead = sq->head; DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); while (sqhead != atomic_load_acq_short(&sq->tail)) { cmd = &(sq->qbase)[sqhead]; compl.cdw0 = 0; compl.status = 0; switch (cmd->opc) { case NVME_OPC_DELETE_IO_SQ: DPRINTF("%s command DELETE_IO_SQ", __func__); nvme_opc_delete_io_sq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_SQ: DPRINTF("%s command CREATE_IO_SQ", __func__); nvme_opc_create_io_sq(sc, cmd, &compl); break; case NVME_OPC_DELETE_IO_CQ: DPRINTF("%s command DELETE_IO_CQ", __func__); nvme_opc_delete_io_cq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_CQ: DPRINTF("%s command CREATE_IO_CQ", __func__); nvme_opc_create_io_cq(sc, cmd, &compl); break; case NVME_OPC_GET_LOG_PAGE: DPRINTF("%s command GET_LOG_PAGE", __func__); nvme_opc_get_log_page(sc, cmd, &compl); break; case NVME_OPC_IDENTIFY: DPRINTF("%s command IDENTIFY", __func__); nvme_opc_identify(sc, cmd, &compl); break; case NVME_OPC_ABORT: DPRINTF("%s command ABORT", __func__); nvme_opc_abort(sc, cmd, &compl); break; case NVME_OPC_SET_FEATURES: DPRINTF("%s command SET_FEATURES", __func__); nvme_opc_set_features(sc, cmd, &compl); break; case NVME_OPC_GET_FEATURES: DPRINTF("%s command GET_FEATURES", __func__); nvme_opc_get_features(sc, cmd, &compl); break; case NVME_OPC_FIRMWARE_ACTIVATE: DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); pci_nvme_status_tc(&compl.status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_FIRMWARE_SLOT); break; case NVME_OPC_ASYNC_EVENT_REQUEST: DPRINTF("%s command ASYNC_EVENT_REQ", __func__); nvme_opc_async_event_req(sc, cmd, &compl); break; case NVME_OPC_FORMAT_NVM: DPRINTF("%s command FORMAT_NVM", __func__); if ((sc->ctrldata.oacs & (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); break; } nvme_opc_format_nvm(sc, cmd, &compl); break; case NVME_OPC_SECURITY_SEND: case NVME_OPC_SECURITY_RECEIVE: case NVME_OPC_SANITIZE: case NVME_OPC_GET_LBA_STATUS: DPRINTF("%s command OPC=%#x (unsupported)", __func__, cmd->opc); /* Valid but unsupported opcodes */ pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); break; default: DPRINTF("%s command OPC=%#X (not implemented)", __func__, cmd->opc); pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); } sqhead = (sqhead + 1) % sq->size; if (NVME_COMPLETION_VALID(compl)) { pci_nvme_cq_update(sc, &sc->compl_queues[0], compl.cdw0, cmd->cid, 0, /* SQID */ compl.status); } } DPRINTF("setting sqhead %u", sqhead); sq->head = sqhead; if (cq->head != cq->tail) pci_generate_msix(sc->nsc_pi, 0); pthread_mutex_unlock(&sq->mtx); } /* * Update the Write and Read statistics reported in SMART data * * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. */ static void pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, size_t bytes, uint16_t status) { pthread_mutex_lock(&sc->mtx); switch (opc) { case NVME_OPC_WRITE: sc->write_commands++; if (status != NVME_SC_SUCCESS) break; sc->write_dunits_remainder += (bytes / 512); while (sc->write_dunits_remainder >= 1000) { sc->write_data_units++; sc->write_dunits_remainder -= 1000; } break; case NVME_OPC_READ: sc->read_commands++; if (status != NVME_SC_SUCCESS) break; sc->read_dunits_remainder += (bytes / 512); while (sc->read_dunits_remainder >= 1000) { sc->read_data_units++; sc->read_dunits_remainder -= 1000; } break; default: DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); break; } pthread_mutex_unlock(&sc->mtx); } /* * Check if the combination of Starting LBA (slba) and number of blocks * exceeds the range of the underlying storage. * * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores * the capacity in bytes as a uint64_t, care must be taken to avoid integer * overflow. */ static bool pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, uint32_t nblocks) { size_t offset, bytes; /* Overflow check of multiplying Starting LBA by the sector size */ if (slba >> (64 - nvstore->sectsz_bits)) return (true); offset = slba << nvstore->sectsz_bits; bytes = nblocks << nvstore->sectsz_bits; /* Overflow check of Number of Logical Blocks */ if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) return (true); return (false); } static int pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) { int iovidx; bool range_is_contiguous; if (req == NULL) return (-1); if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { return (-1); } /* * Minimize the number of IOVs by concatenating contiguous address * ranges. If the IOV count is zero, there is no previous range to * concatenate. */ if (req->io_req.br_iovcnt == 0) range_is_contiguous = false; else range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; if (range_is_contiguous) { iovidx = req->io_req.br_iovcnt - 1; req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, req->prev_gpaddr, size); if (req->io_req.br_iov[iovidx].iov_base == NULL) return (-1); req->prev_size += size; req->io_req.br_resid += size; req->io_req.br_iov[iovidx].iov_len = req->prev_size; } else { iovidx = req->io_req.br_iovcnt; if (iovidx == 0) { req->io_req.br_offset = offset; req->io_req.br_resid = 0; req->io_req.br_param = req; } req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, gpaddr, size); if (req->io_req.br_iov[iovidx].iov_base == NULL) return (-1); req->io_req.br_iov[iovidx].iov_len = size; req->prev_gpaddr = gpaddr; req->prev_size = size; req->io_req.br_resid += size; req->io_req.br_iovcnt++; } return (0); } static void pci_nvme_set_completion(struct pci_nvme_softc *sc, struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) { struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), NVME_STATUS_GET_SC(status)); pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); if (cq->head != cq->tail) { if (cq->intr_en & NVME_CQ_INTEN) { pci_generate_msix(sc->nsc_pi, cq->intr_vec); } else { DPRINTF("%s: CQ%u interrupt disabled", __func__, sq->cqid); } } } static void pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) { req->sc = NULL; req->nvme_sq = NULL; req->sqid = 0; pthread_mutex_lock(&sc->mtx); STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); sc->pending_ios--; /* when no more IO pending, can set to ready if device reset/enabled */ if (sc->pending_ios == 0 && NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) sc->regs.csts |= NVME_CSTS_RDY; pthread_mutex_unlock(&sc->mtx); sem_post(&sc->iosemlock); } static struct pci_nvme_ioreq * pci_nvme_get_ioreq(struct pci_nvme_softc *sc) { struct pci_nvme_ioreq *req = NULL; sem_wait(&sc->iosemlock); pthread_mutex_lock(&sc->mtx); req = STAILQ_FIRST(&sc->ioreqs_free); assert(req != NULL); STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); req->sc = sc; sc->pending_ios++; pthread_mutex_unlock(&sc->mtx); req->io_req.br_iovcnt = 0; req->io_req.br_offset = 0; req->io_req.br_resid = 0; req->io_req.br_param = req; req->prev_gpaddr = 0; req->prev_size = 0; return req; } static void pci_nvme_io_done(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; struct nvme_submission_queue *sq = req->nvme_sq; uint16_t code, status; DPRINTF("%s error %d %s", __func__, err, strerror(err)); /* TODO return correct error */ code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; status = 0; pci_nvme_status_genc(&status, code); pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); pci_nvme_stats_write_read_update(req->sc, req->opc, req->bytes, status); pci_nvme_release_ioreq(req->sc, req); } /* * Implements the Flush command. The specification states: * If a volatile write cache is not present, Flush commands complete * successfully and have no effect * in the description of the Volatile Write Cache (VWC) field of the Identify * Controller data. Therefore, set status to Success if the command is * not supported (i.e. RAM or as indicated by the blockif). */ static bool nvme_opc_flush(struct pci_nvme_softc *sc __unused, struct nvme_command *cmd __unused, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { bool pending = false; if (nvstore->type == NVME_STOR_RAM) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); } else { int err; req->io_req.br_callback = pci_nvme_io_done; err = blockif_flush(nvstore->ctx, &req->io_req); switch (err) { case 0: pending = true; break; case EOPNOTSUPP: pci_nvme_status_genc(status, NVME_SC_SUCCESS); break; default: pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); } } return (pending); } static uint16_t nvme_write_read_ram(struct pci_nvme_softc *sc, struct pci_nvme_blockstore *nvstore, uint64_t prp1, uint64_t prp2, size_t offset, uint64_t bytes, bool is_write) { uint8_t *buf = nvstore->ctx; enum nvme_copy_dir dir; uint16_t status; if (is_write) dir = NVME_COPY_TO_PRP; else dir = NVME_COPY_FROM_PRP; status = 0; if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, buf + offset, bytes, dir)) pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); else pci_nvme_status_genc(&status, NVME_SC_SUCCESS); return (status); } static uint16_t nvme_write_read_blockif(struct pci_nvme_softc *sc, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint64_t prp1, uint64_t prp2, size_t offset, uint64_t bytes, bool is_write) { uint64_t size; int err; uint16_t status = NVME_NO_STATUS; size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { err = -1; goto out; } offset += size; bytes -= size; if (bytes == 0) { ; } else if (bytes <= PAGE_SIZE) { size = bytes; if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { err = -1; goto out; } } else { void *vmctx = sc->nsc_pi->pi_vmctx; uint64_t *prp_list = &prp2; uint64_t *last = prp_list; /* PRP2 is pointer to a physical region page list */ while (bytes) { /* Last entry in list points to the next list */ if ((prp_list == last) && (bytes > PAGE_SIZE)) { uint64_t prp = *prp_list; prp_list = paddr_guest2host(vmctx, prp, PAGE_SIZE - (prp % PAGE_SIZE)); if (prp_list == NULL) { err = -1; goto out; } last = prp_list + (NVME_PRP2_ITEMS - 1); } size = MIN(bytes, PAGE_SIZE); if (pci_nvme_append_iov_req(sc, req, *prp_list, size, offset)) { err = -1; goto out; } offset += size; bytes -= size; prp_list++; } } req->io_req.br_callback = pci_nvme_io_done; if (is_write) err = blockif_write(nvstore->ctx, &req->io_req); else err = blockif_read(nvstore->ctx, &req->io_req); out: if (err) pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); return (status); } static bool nvme_opc_write_read(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { uint64_t lba, nblocks, bytes; size_t offset; bool is_write = cmd->opc == NVME_OPC_WRITE; bool pending = false; lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; nblocks = (cmd->cdw12 & 0xFFFF) + 1; bytes = nblocks << nvstore->sectsz_bits; if (bytes > NVME_MAX_DATA_SIZE) { WPRINTF("%s command would exceed MDTS", __func__); pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); goto out; } if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", __func__, lba, nblocks); pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } offset = lba << nvstore->sectsz_bits; req->bytes = bytes; req->io_req.br_offset = lba; /* PRP bits 1:0 must be zero */ cmd->prp1 &= ~0x3UL; cmd->prp2 &= ~0x3UL; if (nvstore->type == NVME_STOR_RAM) { *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, cmd->prp2, offset, bytes, is_write); } else { *status = nvme_write_read_blockif(sc, nvstore, req, cmd->prp1, cmd->prp2, offset, bytes, is_write); if (*status == NVME_NO_STATUS) pending = true; } out: if (!pending) pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); return (pending); } static void pci_nvme_dealloc_sm(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; struct pci_nvme_softc *sc = req->sc; bool done = true; uint16_t status; status = 0; if (err) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { pci_nvme_status_genc(&status, NVME_SC_SUCCESS); } else { struct iovec *iov = req->io_req.br_iov; req->prev_gpaddr++; iov += req->prev_gpaddr; /* The iov_* values already include the sector size */ req->io_req.br_offset = (off_t)iov->iov_base; req->io_req.br_resid = iov->iov_len; if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); } else done = false; } if (done) { pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, status); pci_nvme_release_ioreq(sc, req); } } static bool nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { struct nvme_dsm_range *range = NULL; uint32_t nr, r, non_zero, dr; int err; bool pending = false; if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); goto out; } nr = cmd->cdw10 & 0xff; /* copy locally because a range entry could straddle PRPs */ range = calloc(1, NVME_MAX_DSM_TRIM); if (range == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); /* Check for invalid ranges and the number of non-zero lengths */ non_zero = 0; for (r = 0; r <= nr; r++) { if (pci_nvme_out_of_range(nvstore, range[r].starting_lba, range[r].length)) { pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } if (range[r].length != 0) non_zero++; } if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { size_t offset, bytes; int sectsz_bits = sc->nvstore.sectsz_bits; /* * DSM calls are advisory only, and compliant controllers * may choose to take no actions (i.e. return Success). */ if (!nvstore->deallocate) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } /* If all ranges have a zero length, return Success */ if (non_zero == 0) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } if (req == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } offset = range[0].starting_lba << sectsz_bits; bytes = range[0].length << sectsz_bits; /* * If the request is for more than a single range, store * the ranges in the br_iov. Optimize for the common case * of a single range. * * Note that NVMe Number of Ranges is a zero based value */ req->io_req.br_iovcnt = 0; req->io_req.br_offset = offset; req->io_req.br_resid = bytes; if (nr == 0) { req->io_req.br_callback = pci_nvme_io_done; } else { struct iovec *iov = req->io_req.br_iov; for (r = 0, dr = 0; r <= nr; r++) { offset = range[r].starting_lba << sectsz_bits; bytes = range[r].length << sectsz_bits; if (bytes == 0) continue; if ((nvstore->size - offset) < bytes) { pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } iov[dr].iov_base = (void *)offset; iov[dr].iov_len = bytes; dr++; } req->io_req.br_callback = pci_nvme_dealloc_sm; /* * Use prev_gpaddr to track the current entry and * prev_size to track the number of entries */ req->prev_gpaddr = 0; req->prev_size = dr; } err = blockif_delete(nvstore->ctx, &req->io_req); if (err) pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); else pending = true; } out: free(range); return (pending); } static void pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) { struct nvme_submission_queue *sq; uint16_t status; uint16_t sqhead; /* handle all submissions up to sq->tail index */ sq = &sc->submit_queues[idx]; pthread_mutex_lock(&sq->mtx); sqhead = sq->head; DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", idx, sqhead, sq->tail, sq->qbase); while (sqhead != atomic_load_acq_short(&sq->tail)) { struct nvme_command *cmd; struct pci_nvme_ioreq *req; uint32_t nsid; bool pending; pending = false; req = NULL; status = 0; cmd = &sq->qbase[sqhead]; sqhead = (sqhead + 1) % sq->size; nsid = le32toh(cmd->nsid); if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); status |= NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; goto complete; } req = pci_nvme_get_ioreq(sc); if (req == NULL) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); WPRINTF("%s: unable to allocate IO req", __func__); goto complete; } req->nvme_sq = sq; req->sqid = idx; req->opc = cmd->opc; req->cid = cmd->cid; req->nsid = cmd->nsid; switch (cmd->opc) { case NVME_OPC_FLUSH: pending = nvme_opc_flush(sc, cmd, &sc->nvstore, req, &status); break; case NVME_OPC_WRITE: case NVME_OPC_READ: pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, req, &status); break; case NVME_OPC_WRITE_ZEROES: /* TODO: write zeroes WPRINTF("%s write zeroes lba 0x%lx blocks %u", __func__, lba, cmd->cdw12 & 0xFFFF); */ pci_nvme_status_genc(&status, NVME_SC_SUCCESS); break; case NVME_OPC_DATASET_MANAGEMENT: pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req, &status); break; default: WPRINTF("%s unhandled io command 0x%x", __func__, cmd->opc); pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); } complete: if (!pending) { pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); if (req != NULL) pci_nvme_release_ioreq(sc, req); } } sq->head = sqhead; pthread_mutex_unlock(&sq->mtx); } static void pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, uint64_t idx, int is_sq, uint64_t value) { DPRINTF("nvme doorbell %lu, %s, val 0x%lx", idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); if (is_sq) { if (idx > sc->num_squeues) { WPRINTF("%s queue index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_squeues); return; } atomic_store_short(&sc->submit_queues[idx].tail, (uint16_t)value); if (idx == 0) { pci_nvme_handle_admin_cmd(sc, value); } else { /* submission queue; handle new entries in SQ */ if (idx > sc->num_squeues) { WPRINTF("%s SQ index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_squeues); return; } pci_nvme_handle_io_cmd(sc, (uint16_t)idx); } } else { if (idx > sc->num_cqueues) { WPRINTF("%s queue index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_cqueues); return; } atomic_store_short(&sc->compl_queues[idx].head, (uint16_t)value); } } static void pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) { const char *s = iswrite ? "WRITE" : "READ"; switch (offset) { case NVME_CR_CAP_LOW: DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); break; case NVME_CR_CAP_HI: DPRINTF("%s %s NVME_CR_CAP_HI", func, s); break; case NVME_CR_VS: DPRINTF("%s %s NVME_CR_VS", func, s); break; case NVME_CR_INTMS: DPRINTF("%s %s NVME_CR_INTMS", func, s); break; case NVME_CR_INTMC: DPRINTF("%s %s NVME_CR_INTMC", func, s); break; case NVME_CR_CC: DPRINTF("%s %s NVME_CR_CC", func, s); break; case NVME_CR_CSTS: DPRINTF("%s %s NVME_CR_CSTS", func, s); break; case NVME_CR_NSSR: DPRINTF("%s %s NVME_CR_NSSR", func, s); break; case NVME_CR_AQA: DPRINTF("%s %s NVME_CR_AQA", func, s); break; case NVME_CR_ASQ_LOW: DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); break; case NVME_CR_ASQ_HI: DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); break; case NVME_CR_ACQ_LOW: DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); break; case NVME_CR_ACQ_HI: DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); break; default: DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); } } static void pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, uint64_t value) { uint32_t ccreg; if (offset >= NVME_DOORBELL_OFFSET) { uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; uint64_t idx = belloffset / 8; /* door bell size = 2*int */ int is_sq = (belloffset % 8) < 4; if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", offset); return; } if (belloffset > ((sc->max_queues+1) * 8 - 4)) { WPRINTF("guest attempted an overflow write offset " "0x%lx, val 0x%lx in %s", offset, value, __func__); return; } if (is_sq) { if (sc->submit_queues[idx].qbase == NULL) return; } else if (sc->compl_queues[idx].qbase == NULL) return; pci_nvme_handle_doorbell(sc, idx, is_sq, value); return; } DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", offset, size, value); if (size != 4) { WPRINTF("guest wrote invalid size %d (offset 0x%lx, " "val 0x%lx) to bar0 in %s", size, offset, value, __func__); /* TODO: shutdown device */ return; } pci_nvme_bar0_reg_dumps(__func__, offset, 1); pthread_mutex_lock(&sc->mtx); switch (offset) { case NVME_CR_CAP_LOW: case NVME_CR_CAP_HI: /* readonly */ break; case NVME_CR_VS: /* readonly */ break; case NVME_CR_INTMS: /* MSI-X, so ignore */ break; case NVME_CR_INTMC: /* MSI-X, so ignore */ break; case NVME_CR_CC: ccreg = (uint32_t)value; DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " "iocqes %u", __func__, NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), NVME_CC_GET_IOCQES(ccreg)); if (NVME_CC_GET_SHN(ccreg)) { /* perform shutdown - flush out data to backend */ sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << NVME_CSTS_REG_SHST_SHIFT); sc->regs.csts |= NVME_SHST_COMPLETE << NVME_CSTS_REG_SHST_SHIFT; } if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { if (NVME_CC_GET_EN(ccreg) == 0) /* transition 1-> causes controller reset */ pci_nvme_reset_locked(sc); else pci_nvme_init_controller(sc); } /* Insert the iocqes, iosqes and en bits from the write */ sc->regs.cc &= ~NVME_CC_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; if (NVME_CC_GET_EN(ccreg) == 0) { /* Insert the ams, mps and css bit fields */ sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; sc->regs.csts &= ~NVME_CSTS_RDY; } else if ((sc->pending_ios == 0) && !(sc->regs.csts & NVME_CSTS_CFS)) { sc->regs.csts |= NVME_CSTS_RDY; } break; case NVME_CR_CSTS: break; case NVME_CR_NSSR: /* ignore writes; don't support subsystem reset */ break; case NVME_CR_AQA: sc->regs.aqa = (uint32_t)value; break; case NVME_CR_ASQ_LOW: sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ASQ_HI: sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | (value << 32); break; case NVME_CR_ACQ_LOW: sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ACQ_HI: sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | (value << 32); break; default: DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", __func__, offset, value, size); } pthread_mutex_unlock(&sc->mtx); } static void pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " " value 0x%lx", baridx, offset, size, value); pci_emul_msix_twrite(pi, offset, size, value); return; } switch (baridx) { case 0: pci_nvme_write_bar_0(sc, offset, size, value); break; default: DPRINTF("%s unknown baridx %d, val 0x%lx", __func__, baridx, value); } } static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size) { uint64_t value; pci_nvme_bar0_reg_dumps(__func__, offset, 0); if (offset < NVME_DOORBELL_OFFSET) { void *p = &(sc->regs); pthread_mutex_lock(&sc->mtx); memcpy(&value, (void *)((uintptr_t)p + offset), size); pthread_mutex_unlock(&sc->mtx); } else { value = 0; WPRINTF("pci_nvme: read invalid offset %ld", offset); } switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", offset, size, (uint32_t)value); return (value); } static uint64_t pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", baridx, offset, size); return pci_emul_msix_tread(pi, offset, size); } switch (baridx) { case 0: return pci_nvme_read_bar_0(sc, offset, size); default: DPRINTF("unknown bar %d, 0x%lx", baridx, offset); } return (0); } static int pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) { char bident[sizeof("XXX:XXX")]; const char *value; uint32_t sectsz; sc->max_queues = NVME_QUEUES; sc->max_qentries = NVME_MAX_QENTRIES; sc->ioslots = NVME_IOSLOTS; sc->num_squeues = sc->max_queues; sc->num_cqueues = sc->max_queues; sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; sectsz = 0; snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); value = get_config_value_node(nvl, "maxq"); if (value != NULL) sc->max_queues = atoi(value); value = get_config_value_node(nvl, "qsz"); if (value != NULL) { sc->max_qentries = atoi(value); if (sc->max_qentries <= 0) { EPRINTLN("nvme: Invalid qsz option %d", sc->max_qentries); return (-1); } } value = get_config_value_node(nvl, "ioslots"); if (value != NULL) { sc->ioslots = atoi(value); if (sc->ioslots <= 0) { EPRINTLN("Invalid ioslots option %d", sc->ioslots); return (-1); } } value = get_config_value_node(nvl, "sectsz"); if (value != NULL) sectsz = atoi(value); value = get_config_value_node(nvl, "ser"); if (value != NULL) { /* * This field indicates the Product Serial Number in * 7-bit ASCII, unused bytes should be space characters. * Ref: NVMe v1.3c. */ cpywithpad((char *)sc->ctrldata.sn, sizeof(sc->ctrldata.sn), value, ' '); } value = get_config_value_node(nvl, "eui64"); if (value != NULL) sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); value = get_config_value_node(nvl, "dsm"); if (value != NULL) { if (strcmp(value, "auto") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; else if (strcmp(value, "enable") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; else if (strcmp(value, "disable") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; } + value = get_config_value_node(nvl, "bootindex"); + if (value != NULL) { + if (pci_emul_add_boot_device(sc->nsc_pi, atoi(value))) { + EPRINTLN("Invalid bootindex %d", atoi(value)); + return (-1); + } + } + value = get_config_value_node(nvl, "ram"); if (value != NULL) { uint64_t sz = strtoull(value, NULL, 10); sc->nvstore.type = NVME_STOR_RAM; sc->nvstore.size = sz * 1024 * 1024; sc->nvstore.ctx = calloc(1, sc->nvstore.size); sc->nvstore.sectsz = 4096; sc->nvstore.sectsz_bits = 12; if (sc->nvstore.ctx == NULL) { EPRINTLN("nvme: Unable to allocate RAM"); return (-1); } } else { snprintf(bident, sizeof(bident), "%u:%u", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); sc->nvstore.ctx = blockif_open(nvl, bident); if (sc->nvstore.ctx == NULL) { EPRINTLN("nvme: Could not open backing file: %s", strerror(errno)); return (-1); } sc->nvstore.type = NVME_STOR_BLOCKIF; sc->nvstore.size = blockif_size(sc->nvstore.ctx); } if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) sc->nvstore.sectsz = sectsz; else if (sc->nvstore.type != NVME_STOR_RAM) sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); for (sc->nvstore.sectsz_bits = 9; (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; sc->nvstore.sectsz_bits++); if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) sc->max_queues = NVME_QUEUES; return (0); } static void pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, size_t new_size) { struct pci_nvme_softc *sc; struct pci_nvme_blockstore *nvstore; struct nvme_namespace_data *nd; sc = arg; nvstore = &sc->nvstore; nd = &sc->nsdata; nvstore->size = new_size; pci_nvme_init_nsdata_size(nvstore, nd); /* Add changed NSID to list */ sc->ns_log.ns[0] = 1; sc->ns_log.ns[1] = 0; pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); } static int pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_nvme_softc *sc; uint32_t pci_membar_sz; int error; error = 0; sc = calloc(1, sizeof(struct pci_nvme_softc)); pi->pi_arg = sc; sc->nsc_pi = pi; error = pci_nvme_parse_config(sc, nvl); if (error < 0) goto done; else error = 0; STAILQ_INIT(&sc->ioreqs_free); sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); for (uint32_t i = 0; i < sc->ioslots; i++) { STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); } pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); /* * Allocate size of NVMe registers + doorbell space for all queues. * * The specification requires a minimum memory I/O window size of 16K. * The Windows driver will refuse to start a device with a smaller * window. */ pci_membar_sz = sizeof(struct nvme_registers) + 2 * sizeof(uint32_t) * (sc->max_queues + 1); pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); DPRINTF("nvme membar size: %u", pci_membar_sz); error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); if (error) { WPRINTF("%s pci alloc mem bar failed", __func__); goto done; } error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); if (error) { WPRINTF("%s pci add msixcap failed", __func__); goto done; } error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); if (error) { WPRINTF("%s pci add Express capability failed", __func__); goto done; } pthread_mutex_init(&sc->mtx, NULL); sem_init(&sc->iosemlock, 0, sc->ioslots); blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); /* * Controller data depends on Namespace data so initialize Namespace * data first. */ pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); pci_nvme_init_ctrldata(sc); pci_nvme_init_logpages(sc); pci_nvme_init_features(sc); pci_nvme_aer_init(sc); pci_nvme_aen_init(sc); pci_nvme_reset(sc); pci_lintr_request(pi); done: return (error); } static int pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) { char *cp, *ram; if (opts == NULL) return (0); if (strncmp(opts, "ram=", 4) == 0) { cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "ram", opts + 4); return (0); } ram = strndup(opts + 4, cp - opts - 4); set_config_value_node(nvl, "ram", ram); free(ram); return (pci_parse_legacy_config(nvl, cp + 1)); } else return (blockif_legacy_config(nvl, opts)); } static const struct pci_devemu pci_de_nvme = { .pe_emu = "nvme", .pe_init = pci_nvme_init, .pe_legacy_config = pci_nvme_legacy_config, .pe_barwrite = pci_nvme_write, .pe_barread = pci_nvme_read }; PCI_EMUL_SET(pci_de_nvme); diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c index 9fd6db41dba8..c8ec62a66793 100644 --- a/usr.sbin/bhyve/pci_virtio_block.c +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -1,601 +1,606 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright 2020-2021 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "block_if.h" #define VTBLK_BSIZE 512 #define VTBLK_RINGSZ 128 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ #define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ #define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ #define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ #define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ #define VTBLK_F_RO (1 << 5) /* Disk is read-only */ #define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ #define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ #define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ #define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ #define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ #define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ #define VTBLK_F_MQ (1 << 12) /* Multi-Queue */ #define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ #define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ /* * Host capabilities */ #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* * The current blockif_delete() interface only allows a single delete * request at a time. */ #define VTBLK_MAX_DISCARD_SEG 1 /* * An arbitrary limit to prevent excessive latency due to large * delete requests. */ #define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ /* * Config space "registers" */ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; struct { uint16_t cylinders; uint8_t heads; uint8_t sectors; } vbc_geometry; uint32_t vbc_blk_size; struct { uint8_t physical_block_exp; uint8_t alignment_offset; uint16_t min_io_size; uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; uint8_t unused0[1]; uint16_t num_queues; uint32_t max_discard_sectors; uint32_t max_discard_seg; uint32_t discard_sector_alignment; uint32_t max_write_zeroes_sectors; uint32_t max_write_zeroes_seg; uint8_t write_zeroes_may_unmap; uint8_t unused1[3]; } __packed; /* * Fixed-size block header */ struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 #define VBH_OP_SCSI_CMD 2 #define VBH_OP_SCSI_CMD_OUT 3 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 #define VBH_OP_DISCARD 11 #define VBH_OP_WRITE_ZEROES 13 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; /* * Debug printf */ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtblk_ioreq { struct blockif_req io_req; struct pci_vtblk_softc *io_sc; uint8_t *io_status; uint16_t io_idx; }; struct virtio_blk_discard_write_zeroes { uint64_t sector; uint32_t num_sectors; struct { uint32_t unmap:1; uint32_t reserved:31; } flags; }; /* * Per-device softc */ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; struct virtio_consts vbsc_consts; struct blockif_ctxt *bc; char vbsc_ident[VTBLK_BLK_ID_BYTES]; struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); #ifdef BHYVE_SNAPSHOT static void pci_vtblk_pause(void *); static void pci_vtblk_resume(void *); static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *); #endif static struct virtio_consts vtblk_vi_consts = { .vc_name = "vtblk", .vc_nvq = 1, .vc_cfgsize = sizeof(struct vtblk_config), .vc_reset = pci_vtblk_reset, .vc_qnotify = pci_vtblk_notify, .vc_cfgread = pci_vtblk_cfgread, .vc_cfgwrite = pci_vtblk_cfgwrite, .vc_apply_features = NULL, .vc_hv_caps = VTBLK_S_HOSTCAPS, #ifdef BHYVE_SNAPSHOT .vc_pause = pci_vtblk_pause, .vc_resume = pci_vtblk_resume, .vc_snapshot = pci_vtblk_snapshot, #endif }; static void pci_vtblk_reset(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device reset requested !")); vi_reset_dev(&sc->vbsc_vs); } static void pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) { struct pci_vtblk_softc *sc = io->io_sc; /* convert errno into a virtio block error return */ if (err == EOPNOTSUPP || err == ENOSYS) *io->io_status = VTBLK_S_UNSUPP; else if (err != 0) *io->io_status = VTBLK_S_IOERR; else *io->io_status = VTBLK_S_OK; /* * Return the descriptor back to the host. * We wrote 1 byte (our status) to host. */ vq_relchain(&sc->vbsc_vq, io->io_idx, 1); vq_endchains(&sc->vbsc_vq, 0); } #ifdef BHYVE_SNAPSHOT static void pci_vtblk_pause(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device pause requested !\n")); blockif_pause(sc->bc); } static void pci_vtblk_resume(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device resume requested !\n")); blockif_resume(sc->bc); } static int pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta) { int ret; struct pci_vtblk_softc *sc = vsc; SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident), meta, ret, done); done: return (ret); } #endif static void pci_vtblk_done(struct blockif_req *br, int err) { struct pci_vtblk_ioreq *io = br->br_param; struct pci_vtblk_softc *sc = io->io_sc; pthread_mutex_lock(&sc->vsc_mtx); pci_vtblk_done_locked(io, err); pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; struct pci_vtblk_ioreq *io; int i, n; int err; ssize_t iolen; int writeop, type; struct vi_req req; struct iovec iov[BLOCKIF_IOV_MAX + 2]; struct virtio_blk_discard_write_zeroes *discard; n = vq_getchain(vq, iov, BLOCKIF_IOV_MAX + 2, &req); /* * The first descriptor will be the read-only fixed header, * and the last is for status (hence +2 above and below). * The remaining iov's are the actual data I/O vectors. * * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); io = &sc->vbsc_ios[req.idx]; assert(req.readable != 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); vbh = (struct virtio_blk_hdr *)iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; io->io_status = (uint8_t *)iov[--n].iov_base; assert(req.writable != 0); assert(iov[n].iov_len == 1); /* * XXX * The guest should not be setting the BARRIER flag because * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); /* * - Write op implies read-only descriptor * - Read/ident op implies write-only descriptor * * By taking away either the read-only fixed header or the write-only * status iovec, the following condition should hold true. */ assert(n == (writeop ? req.readable : req.writable)); iolen = 0; for (i = 1; i < n; i++) { iolen += iov[i].iov_len; } io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld", writeop ? "write/discard" : "read/ident", iolen, i - 1, io->io_req.br_offset)); switch (type) { case VBH_OP_READ: err = blockif_read(sc->bc, &io->io_req); break; case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; case VBH_OP_DISCARD: /* * We currently only support a single request, if the guest * has submitted a request that doesn't conform to the * requirements, we return a error. */ if (iov[1].iov_len != sizeof (*discard)) { pci_vtblk_done_locked(io, EINVAL); return; } /* The segments to discard are provided rather than data */ discard = (struct virtio_blk_discard_write_zeroes *) iov[1].iov_base; /* * virtio v1.1 5.2.6.2: * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP * for discard and write zeroes commands if any unknown flag is * set. Furthermore, the device MUST set the status byte to * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag * is set. * * Currently there are no known flags for a DISCARD request. */ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { pci_vtblk_done_locked(io, ENOTSUP); return; } /* Make sure the request doesn't exceed our size limit */ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { pci_vtblk_done_locked(io, EINVAL); return; } io->io_req.br_offset = discard->sector * VTBLK_BSIZE; io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; err = blockif_delete(sc->bc, &io->io_req); break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ /* S/n equal to buffer is not zero-terminated. */ memset(iov[1].iov_base, 0, iov[1].iov_len); strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); pci_vtblk_done_locked(io, 0); return; default: pci_vtblk_done_locked(io, EOPNOTSUPP); return; } assert(err == 0); } static void pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); } static void pci_vtblk_resized(struct blockif_ctxt *bctxt __unused, void *arg, size_t new_size) { struct pci_vtblk_softc *sc; sc = arg; sc->vbsc_cfg.vbc_capacity = new_size / VTBLK_BSIZE; /* 512-byte units */ vi_interrupt(&sc->vbsc_vs, VIRTIO_PCI_ISR_CONFIG, sc->vbsc_vs.vs_msix_cfg_idx); } static int pci_vtblk_init(struct pci_devinst *pi, nvlist_t *nvl) { char bident[sizeof("XXX:XXX")]; struct blockif_ctxt *bctxt; const char *path, *serial; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; off_t size; int i, sectsz, sts, sto; /* * The supplied backing file has to exist */ snprintf(bident, sizeof(bident), "%u:%u", pi->pi_slot, pi->pi_func); bctxt = blockif_open(nvl, bident); if (bctxt == NULL) { perror("Could not open backing file"); return (1); } + if (blockif_add_boot_device(pi, bctxt)) { + perror("Invalid boot device"); + return (1); + } + size = blockif_size(bctxt); sectsz = blockif_sectsz(bctxt); blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); sc->bc = bctxt; for (i = 0; i < VTBLK_RINGSZ; i++) { struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; io->io_req.br_callback = pci_vtblk_done; io->io_req.br_param = io; io->io_sc = sc; io->io_idx = i; } bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); if (blockif_candelete(sc->bc)) sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ /* * If an explicit identifier is not given, create an * identifier using parts of the md5 sum of the filename. */ bzero(sc->vbsc_ident, VTBLK_BLK_ID_BYTES); if ((serial = get_config_value_node(nvl, "serial")) != NULL || (serial = get_config_value_node(nvl, "ser")) != NULL) { strlcpy(sc->vbsc_ident, serial, VTBLK_BLK_ID_BYTES); } else { path = get_config_value_node(nvl, "path"); MD5Init(&mdctx); MD5Update(&mdctx, path, strlen(path)); MD5Final(digest, &mdctx); snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); } /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ /* * If Linux is presented with a seg_max greater than the virtio queue * size, it can stumble into situations where it violates its own * invariants and panics. For safety, we keep seg_max clamped, paying * heed to the two extra descriptors needed for the header and status * of a request. */ sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ sc->vbsc_cfg.vbc_geometry.heads = 0; sc->vbsc_cfg.vbc_geometry.sectors = 0; sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_topology.physical_block_exp = (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; sc->vbsc_cfg.vbc_topology.alignment_offset = (sto != 0) ? ((sts - sto) / sectsz) : 0; sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; sc->vbsc_cfg.discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could * have the device, class, and subdev_0 as fields in * the virtio constants structure. */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_BLOCK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { blockif_close(sc->bc); free(sc); return (1); } vi_set_io_bar(&sc->vbsc_vs, 0); blockif_register_resize_callback(sc->bc, pci_vtblk_resized, sc); return (0); } static int pci_vtblk_cfgwrite(void *vsc __unused, int offset, int size __unused, uint32_t value __unused) { DPRINTF(("vtblk: write to readonly reg %d", offset)); return (1); } static int pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtblk_softc *sc = vsc; void *ptr; /* our caller has already verified offset and size */ ptr = (uint8_t *)&sc->vbsc_cfg + offset; memcpy(retval, ptr, size); return (0); } static const struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, .pe_legacy_config = blockif_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, .pe_pause = vi_pci_pause, .pe_resume = vi_pci_resume, #endif }; PCI_EMUL_SET(pci_de_vblk); diff --git a/usr.sbin/bhyve/pci_virtio_scsi.c b/usr.sbin/bhyve/pci_virtio_scsi.c index 7d5409cff6a1..fa6cb3a48787 100644 --- a/usr.sbin/bhyve/pci_virtio_scsi.c +++ b/usr.sbin/bhyve/pci_virtio_scsi.c @@ -1,764 +1,773 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2016 Jakub Klama . * Copyright (c) 2018 Marcelo Araujo . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "iov.h" #define VTSCSI_RINGSZ 64 #define VTSCSI_REQUESTQ 1 #define VTSCSI_THR_PER_Q 16 #define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) #define VTSCSI_MAXSEG 64 #define VTSCSI_IN_HEADER_LEN(_sc) \ (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) #define VTSCSI_OUT_HEADER_LEN(_sc) \ (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) #define VIRTIO_SCSI_MAX_CHANNEL 0 #define VIRTIO_SCSI_MAX_TARGET 0 #define VIRTIO_SCSI_MAX_LUN 16383 #define VIRTIO_SCSI_F_INOUT (1 << 0) #define VIRTIO_SCSI_F_HOTPLUG (1 << 1) #define VIRTIO_SCSI_F_CHANGE (1 << 2) static int pci_vtscsi_debug = 0; #define WPRINTF(msg, params...) PRINTLN("virtio-scsi: " msg, ##params) #define DPRINTF(msg, params...) if (pci_vtscsi_debug) WPRINTF(msg, ##params) struct pci_vtscsi_config { uint32_t num_queues; uint32_t seg_max; uint32_t max_sectors; uint32_t cmd_per_lun; uint32_t event_info_size; uint32_t sense_size; uint32_t cdb_size; uint16_t max_channel; uint16_t max_target; uint32_t max_lun; } __attribute__((packed)); struct pci_vtscsi_queue { struct pci_vtscsi_softc * vsq_sc; struct vqueue_info * vsq_vq; pthread_mutex_t vsq_mtx; pthread_mutex_t vsq_qmtx; pthread_cond_t vsq_cv; STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; LIST_HEAD(, pci_vtscsi_worker) vsq_workers; }; struct pci_vtscsi_worker { struct pci_vtscsi_queue * vsw_queue; pthread_t vsw_thread; bool vsw_exiting; LIST_ENTRY(pci_vtscsi_worker) vsw_link; }; struct pci_vtscsi_request { struct pci_vtscsi_queue * vsr_queue; struct iovec vsr_iov_in[VTSCSI_MAXSEG]; int vsr_niov_in; struct iovec vsr_iov_out[VTSCSI_MAXSEG]; int vsr_niov_out; uint32_t vsr_idx; STAILQ_ENTRY(pci_vtscsi_request) vsr_link; }; /* * Per-device softc */ struct pci_vtscsi_softc { struct virtio_softc vss_vs; struct vqueue_info vss_vq[VTSCSI_MAXQ]; struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; pthread_mutex_t vss_mtx; int vss_iid; int vss_ctl_fd; uint32_t vss_features; struct pci_vtscsi_config vss_config; }; #define VIRTIO_SCSI_T_TMF 0 #define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 #define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 #define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 #define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 #define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 #define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 #define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 #define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 /* command-specific response values */ #define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 #define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 #define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 struct pci_vtscsi_ctrl_tmf { uint32_t type; uint32_t subtype; uint8_t lun[8]; uint64_t id; uint8_t response; } __attribute__((packed)); #define VIRTIO_SCSI_T_AN_QUERY 1 #define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 #define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 #define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 #define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 #define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 #define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 struct pci_vtscsi_ctrl_an { uint32_t type; uint8_t lun[8]; uint32_t event_requested; uint32_t event_actual; uint8_t response; } __attribute__((packed)); /* command-specific response values */ #define VIRTIO_SCSI_S_OK 0 #define VIRTIO_SCSI_S_OVERRUN 1 #define VIRTIO_SCSI_S_ABORTED 2 #define VIRTIO_SCSI_S_BAD_TARGET 3 #define VIRTIO_SCSI_S_RESET 4 #define VIRTIO_SCSI_S_BUSY 5 #define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 #define VIRTIO_SCSI_S_TARGET_FAILURE 7 #define VIRTIO_SCSI_S_NEXUS_FAILURE 8 #define VIRTIO_SCSI_S_FAILURE 9 #define VIRTIO_SCSI_S_INCORRECT_LUN 12 /* task_attr */ #define VIRTIO_SCSI_S_SIMPLE 0 #define VIRTIO_SCSI_S_ORDERED 1 #define VIRTIO_SCSI_S_HEAD 2 #define VIRTIO_SCSI_S_ACA 3 struct pci_vtscsi_event { uint32_t event; uint8_t lun[8]; uint32_t reason; } __attribute__((packed)); struct pci_vtscsi_req_cmd_rd { uint8_t lun[8]; uint64_t id; uint8_t task_attr; uint8_t prio; uint8_t crn; uint8_t cdb[]; } __attribute__((packed)); struct pci_vtscsi_req_cmd_wr { uint32_t sense_len; uint32_t residual; uint16_t status_qualifier; uint8_t status; uint8_t response; uint8_t sense[]; } __attribute__((packed)); static void *pci_vtscsi_proc(void *); static void pci_vtscsi_reset(void *); static void pci_vtscsi_neg_features(void *, uint64_t); static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); static inline int pci_vtscsi_get_lun(uint8_t *); static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, struct pci_vtscsi_ctrl_tmf *); static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, struct pci_vtscsi_ctrl_an *); static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, int, struct iovec *, int); static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, struct pci_vtscsi_queue *, int); static int pci_vtscsi_init(struct pci_devinst *, nvlist_t *); static struct virtio_consts vtscsi_vi_consts = { .vc_name = "vtscsi", .vc_nvq = VTSCSI_MAXQ, .vc_cfgsize = sizeof(struct pci_vtscsi_config), .vc_reset = pci_vtscsi_reset, .vc_cfgread = pci_vtscsi_cfgread, .vc_cfgwrite = pci_vtscsi_cfgwrite, .vc_apply_features = pci_vtscsi_neg_features, .vc_hv_caps = 0, }; static void * pci_vtscsi_proc(void *arg) { struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; struct pci_vtscsi_queue *q = worker->vsw_queue; struct pci_vtscsi_request *req; int iolen; for (;;) { pthread_mutex_lock(&q->vsq_mtx); while (STAILQ_EMPTY(&q->vsq_requests) && !worker->vsw_exiting) pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); if (worker->vsw_exiting) break; req = STAILQ_FIRST(&q->vsq_requests); STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); pthread_mutex_unlock(&q->vsq_mtx); iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); pthread_mutex_lock(&q->vsq_qmtx); vq_relchain(q->vsq_vq, req->vsr_idx, iolen); vq_endchains(q->vsq_vq, 0); pthread_mutex_unlock(&q->vsq_qmtx); DPRINTF("request completed", req->vsr_idx); free(req); } pthread_mutex_unlock(&q->vsq_mtx); return (NULL); } static void pci_vtscsi_reset(void *vsc) { struct pci_vtscsi_softc *sc; sc = vsc; DPRINTF("device reset requested"); vi_reset_dev(&sc->vss_vs); /* initialize config structure */ sc->vss_config = (struct pci_vtscsi_config){ .num_queues = VTSCSI_REQUESTQ, /* Leave room for the request and the response. */ .seg_max = VTSCSI_MAXSEG - 2, .max_sectors = 2, .cmd_per_lun = 1, .event_info_size = sizeof(struct pci_vtscsi_event), .sense_size = 96, .cdb_size = 32, .max_channel = VIRTIO_SCSI_MAX_CHANNEL, .max_target = VIRTIO_SCSI_MAX_TARGET, .max_lun = VIRTIO_SCSI_MAX_LUN }; } static void pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtscsi_softc *sc = vsc; sc->vss_features = negotiated_features; } static int pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtscsi_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vss_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vtscsi_cfgwrite(void *vsc __unused, int offset __unused, int size __unused, uint32_t val __unused) { return (0); } static inline int pci_vtscsi_get_lun(uint8_t *lun) { return (((lun[2] << 8) | lun[3]) & 0x3fff); } static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, size_t bufsize) { struct pci_vtscsi_ctrl_tmf *tmf; struct pci_vtscsi_ctrl_an *an; uint32_t type; if (bufsize < sizeof(uint32_t)) { WPRINTF("ignoring truncated control request"); return (0); } type = *(uint32_t *)buf; if (type == VIRTIO_SCSI_T_TMF) { if (bufsize != sizeof(*tmf)) { WPRINTF("ignoring tmf request with size %zu", bufsize); return (0); } tmf = (struct pci_vtscsi_ctrl_tmf *)buf; return (pci_vtscsi_tmf_handle(sc, tmf)); } if (type == VIRTIO_SCSI_T_AN_QUERY) { if (bufsize != sizeof(*an)) { WPRINTF("ignoring AN request with size %zu", bufsize); return (0); } an = (struct pci_vtscsi_ctrl_an *)buf; return (pci_vtscsi_an_handle(sc, an)); } return (0); } static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, struct pci_vtscsi_ctrl_tmf *tmf) { union ctl_io *io; int err; io = ctl_scsi_alloc_io(sc->vss_iid); ctl_scsi_zero_io(io); io->io_hdr.io_type = CTL_IO_TASK; io->io_hdr.nexus.initid = sc->vss_iid; io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); io->taskio.tag_type = CTL_TAG_SIMPLE; io->taskio.tag_num = tmf->id; io->io_hdr.flags |= CTL_FLAG_USER_TAG; switch (tmf->subtype) { case VIRTIO_SCSI_T_TMF_ABORT_TASK: io->taskio.task_action = CTL_TASK_ABORT_TASK; break; case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; break; case VIRTIO_SCSI_T_TMF_CLEAR_ACA: io->taskio.task_action = CTL_TASK_CLEAR_ACA; break; case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; break; case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; break; case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: io->taskio.task_action = CTL_TASK_LUN_RESET; break; case VIRTIO_SCSI_T_TMF_QUERY_TASK: io->taskio.task_action = CTL_TASK_QUERY_TASK; break; case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; break; } if (pci_vtscsi_debug) { struct sbuf *sb = sbuf_new_auto(); ctl_io_sbuf(io, sb); sbuf_finish(sb); DPRINTF("%s", sbuf_data(sb)); sbuf_delete(sb); } err = ioctl(sc->vss_ctl_fd, CTL_IO, io); if (err != 0) WPRINTF("CTL_IO: err=%d (%s)", errno, strerror(errno)); tmf->response = io->taskio.task_status; ctl_scsi_free_io(io); return (1); } static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc __unused, struct pci_vtscsi_ctrl_an *an __unused) { return (0); } static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, int niov_in, struct iovec *iov_out, int niov_out) { struct pci_vtscsi_softc *sc = q->vsq_sc; struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL; struct pci_vtscsi_req_cmd_wr *cmd_wr; struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG]; union ctl_io *io; int data_niov_in, data_niov_out; void *ext_data_ptr = NULL; uint32_t ext_data_len = 0, ext_sg_entries = 0; int err, nxferred; if (count_iov(iov_out, niov_out) < VTSCSI_OUT_HEADER_LEN(sc)) { WPRINTF("ignoring request with insufficient output"); return (0); } if (count_iov(iov_in, niov_in) < VTSCSI_IN_HEADER_LEN(sc)) { WPRINTF("ignoring request with incomplete header"); return (0); } seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in, VTSCSI_IN_HEADER_LEN(sc)); seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out, VTSCSI_OUT_HEADER_LEN(sc)); truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc)); truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc)); iov_to_buf(iov_in, niov_in, (void **)&cmd_rd); cmd_wr = calloc(1, VTSCSI_OUT_HEADER_LEN(sc)); io = ctl_scsi_alloc_io(sc->vss_iid); ctl_scsi_zero_io(io); io->io_hdr.nexus.initid = sc->vss_iid; io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun); io->io_hdr.io_type = CTL_IO_SCSI; if (data_niov_in > 0) { ext_data_ptr = (void *)data_iov_in; ext_sg_entries = data_niov_in; ext_data_len = count_iov(data_iov_in, data_niov_in); io->io_hdr.flags |= CTL_FLAG_DATA_OUT; } else if (data_niov_out > 0) { ext_data_ptr = (void *)data_iov_out; ext_sg_entries = data_niov_out; ext_data_len = count_iov(data_iov_out, data_niov_out); io->io_hdr.flags |= CTL_FLAG_DATA_IN; } io->scsiio.sense_len = sc->vss_config.sense_size; io->scsiio.tag_num = cmd_rd->id; io->io_hdr.flags |= CTL_FLAG_USER_TAG; switch (cmd_rd->task_attr) { case VIRTIO_SCSI_S_ORDERED: io->scsiio.tag_type = CTL_TAG_ORDERED; break; case VIRTIO_SCSI_S_HEAD: io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE; break; case VIRTIO_SCSI_S_ACA: io->scsiio.tag_type = CTL_TAG_ACA; break; case VIRTIO_SCSI_S_SIMPLE: default: io->scsiio.tag_type = CTL_TAG_SIMPLE; break; } io->scsiio.ext_sg_entries = ext_sg_entries; io->scsiio.ext_data_ptr = ext_data_ptr; io->scsiio.ext_data_len = ext_data_len; io->scsiio.ext_data_filled = 0; io->scsiio.cdb_len = sc->vss_config.cdb_size; memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size); if (pci_vtscsi_debug) { struct sbuf *sb = sbuf_new_auto(); ctl_io_sbuf(io, sb); sbuf_finish(sb); DPRINTF("%s", sbuf_data(sb)); sbuf_delete(sb); } err = ioctl(sc->vss_ctl_fd, CTL_IO, io); if (err != 0) { WPRINTF("CTL_IO: err=%d (%s)", errno, strerror(errno)); cmd_wr->response = VIRTIO_SCSI_S_FAILURE; } else { cmd_wr->sense_len = MIN(io->scsiio.sense_len, sc->vss_config.sense_size); cmd_wr->residual = ext_data_len - io->scsiio.ext_data_filled; cmd_wr->status = io->scsiio.scsi_status; cmd_wr->response = VIRTIO_SCSI_S_OK; memcpy(&cmd_wr->sense, &io->scsiio.sense_data, cmd_wr->sense_len); } buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0); nxferred = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled; free(cmd_rd); free(cmd_wr); ctl_scsi_free_io(io); return (nxferred); } static void pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtscsi_softc *sc; struct iovec iov[VTSCSI_MAXSEG]; struct vi_req req; void *buf = NULL; size_t bufsize; int iolen, n; sc = vsc; while (vq_has_descs(vq)) { n = vq_getchain(vq, iov, VTSCSI_MAXSEG, &req); assert(n >= 1 && n <= VTSCSI_MAXSEG); bufsize = iov_to_buf(iov, n, &buf); iolen = pci_vtscsi_control_handle(sc, buf, bufsize); buf_to_iov((uint8_t *)buf + bufsize - iolen, iolen, iov, n, bufsize - iolen); /* * Release this chain and handle more */ vq_relchain(vq, req.idx, iolen); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ free(buf); } static void pci_vtscsi_eventq_notify(void *vsc __unused, struct vqueue_info *vq) { vq_kick_disable(vq); } static void pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtscsi_softc *sc; struct pci_vtscsi_queue *q; struct pci_vtscsi_request *req; struct iovec iov[VTSCSI_MAXSEG]; struct vi_req vireq; int n; sc = vsc; q = &sc->vss_queues[vq->vq_num - 2]; while (vq_has_descs(vq)) { n = vq_getchain(vq, iov, VTSCSI_MAXSEG, &vireq); assert(n >= 1 && n <= VTSCSI_MAXSEG); req = calloc(1, sizeof(struct pci_vtscsi_request)); req->vsr_idx = vireq.idx; req->vsr_queue = q; req->vsr_niov_in = vireq.readable; req->vsr_niov_out = vireq.writable; memcpy(req->vsr_iov_in, iov, req->vsr_niov_in * sizeof(struct iovec)); memcpy(req->vsr_iov_out, iov + vireq.readable, req->vsr_niov_out * sizeof(struct iovec)); pthread_mutex_lock(&q->vsq_mtx); STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link); pthread_cond_signal(&q->vsq_cv); pthread_mutex_unlock(&q->vsq_mtx); DPRINTF("request enqueued", vireq.idx); } } static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, struct pci_vtscsi_queue *queue, int num) { struct pci_vtscsi_worker *worker; char tname[MAXCOMLEN + 1]; int i; queue->vsq_sc = sc; queue->vsq_vq = &sc->vss_vq[num + 2]; pthread_mutex_init(&queue->vsq_mtx, NULL); pthread_mutex_init(&queue->vsq_qmtx, NULL); pthread_cond_init(&queue->vsq_cv, NULL); STAILQ_INIT(&queue->vsq_requests); LIST_INIT(&queue->vsq_workers); for (i = 0; i < VTSCSI_THR_PER_Q; i++) { worker = calloc(1, sizeof(struct pci_vtscsi_worker)); worker->vsw_queue = queue; pthread_create(&worker->vsw_thread, NULL, &pci_vtscsi_proc, (void *)worker); snprintf(tname, sizeof(tname), "vtscsi:%d-%d", num, i); pthread_set_name_np(worker->vsw_thread, tname); LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link); } return (0); } static int pci_vtscsi_legacy_config(nvlist_t *nvl, const char *opts) { char *cp, *devname; if (opts == NULL) return (0); cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "dev", opts); return (0); } devname = strndup(opts, cp - opts); set_config_value_node(nvl, "dev", devname); free(devname); return (pci_parse_legacy_config(nvl, cp + 1)); } static int pci_vtscsi_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtscsi_softc *sc; const char *devname, *value; int i; sc = calloc(1, sizeof(struct pci_vtscsi_softc)); value = get_config_value_node(nvl, "iid"); if (value != NULL) sc->vss_iid = strtoul(value, NULL, 10); + value = get_config_value_node(nvl, "bootindex"); + if (value != NULL) { + if (pci_emul_add_boot_device(pi, atoi(value))) { + EPRINTLN("Invalid bootindex %d", atoi(value)); + free(sc); + return (-1); + } + } + devname = get_config_value_node(nvl, "dev"); if (devname == NULL) devname = "/dev/cam/ctl"; sc->vss_ctl_fd = open(devname, O_RDWR); if (sc->vss_ctl_fd < 0) { WPRINTF("cannot open %s: %s", devname, strerror(errno)); free(sc); return (1); } pthread_mutex_init(&sc->vss_mtx, NULL); vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq); sc->vss_vs.vs_mtx = &sc->vss_mtx; /* controlq */ sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; /* eventq */ sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; /* request queues */ for (i = 2; i < VTSCSI_MAXQ; i++) { sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify; pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_SCSI); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vss_vs, 0); return (0); } static const struct pci_devemu pci_de_vscsi = { .pe_emu = "virtio-scsi", .pe_init = pci_vtscsi_init, .pe_legacy_config = pci_vtscsi_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vscsi);