diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index a4fc3deea77e..e35d528ab605 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -1,134 +1,135 @@ # # $FreeBSD$ # .include CFLAGS+=-I${.CURDIR}/../../contrib/lib9p CFLAGS+=-I${SRCTOP}/sys .PATH: ${SRCTOP}/sys/cam/ctl PROG= bhyve PACKAGE= bhyve -MAN= bhyve.8 +MAN= bhyve.8 bhyve_config.5 BHYVE_SYSDIR?=${SRCTOP} SRCS= \ atkbdc.c \ acpi.c \ audio.c \ bhyvegc.c \ bhyverun.c \ block_if.c \ bootrom.c \ + config.c \ console.c \ ctl_util.c \ ctl_scsi_all.c \ fwctl.c \ gdb.c \ hda_codec.c \ inout.c \ ioapic.c \ kernemu_dev.c \ mem.c \ mevent.c \ mptbl.c \ net_backends.c \ net_utils.c \ pci_ahci.c \ pci_e82545.c \ pci_emul.c \ pci_hda.c \ pci_fbuf.c \ pci_hostbridge.c \ pci_irq.c \ pci_lpc.c \ pci_nvme.c \ pci_passthru.c \ pci_virtio_9p.c \ pci_virtio_block.c \ pci_virtio_console.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ pci_virtio_scsi.c \ pci_uart.c \ pci_xhci.c \ pctestdev.c \ pm.c \ post.c \ ps2kbd.c \ ps2mouse.c \ rfb.c \ rtc.c \ smbiostbl.c \ sockstream.c \ task_switch.c \ uart_emul.c \ usb_emul.c \ usb_mouse.c \ virtio.c \ vga.c \ vmgenc.c \ xmsr.c \ spinup_ap.c \ iov.c .if ${MK_BHYVE_SNAPSHOT} != "no" SRCS+= snapshot.c .endif CFLAGS.kernemu_dev.c+= -I${SRCTOP}/sys/amd64 .PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm SRCS+= vmm_instruction_emul.c -LIBADD= vmmapi md pthread z util sbuf cam 9p +LIBADD= vmmapi md nv pthread z util sbuf cam 9p .if ${MK_CASPER} != "no" LIBADD+= casper LIBADD+= cap_pwd LIBADD+= cap_grp # Temporary disable capsicum, until we integrate checkpoint code with it. #CFLAGS+=-DWITH_CASPER .endif .if ${MK_BHYVE_SNAPSHOT} != "no" LIBADD+= ucl xo .endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET .endif .if ${MK_INET6_SUPPORT} != "no" CFLAGS+=-DINET6 .endif .if ${MK_NETGRAPH_SUPPORT} != "no" CFLAGS+=-DNETGRAPH LIBADD+= netgraph .endif .if ${MK_OPENSSL} == "no" CFLAGS+=-DNO_OPENSSL .else LIBADD+= crypto .endif CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller .if ${MK_BHYVE_SNAPSHOT} != "no" CFLAGS+= -I${SRCTOP}/contrib/libucl/include # Temporary disable capsicum, until we integrate checkpoint code with it. CFLAGS+= -DWITHOUT_CAPSICUM CFLAGS+= -DBHYVE_SNAPSHOT .endif .ifdef GDB_LOG CFLAGS+=-DGDB_LOG .endif WARNS?= 2 .include diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 115727a136a7..04e22302d9d7 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -1,735 +1,780 @@ .\" Copyright (c) 2013 Peter Grehan .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd January 18, 2021 +.Dd March 18, 2021 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl AaCDeHhPSuWwxY .Oo .Sm off .Fl c\~ .Oo .Op Cm cpus= .Ar numcpus .Oc .Op Cm ,sockets= Ar n .Op Cm ,cores= Ar n .Op Cm ,threads= Ar n .Oc .Sm on .Op Fl G Ar port +.Op Fl k Ar file .Oo Fl l .Sm off .Cm help | Ar lpcdev Op Cm \&, Ar conf .Sm on .Oc .Oo Fl m .Sm off .Ar memsize .Oo .Cm K No | Cm k No | Cm M No | Cm m No | Cm G No | Cm g No | Cm T No | Cm t .Oc .Sm on .Oc +.Op Fl o Ar var Ns Cm = Ns Ar value .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu .Op Fl r Ar file .Oo Fl s .Sm off .Cm help | Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf .Sm on .Oc .Op Fl U Ar uuid .Ar vmname .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise, it is enough to run .Nm with a boot ROM of choice. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl a The guest's local APIC is configured in xAPIC mode. The xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl A Generate ACPI tables. Required for .Fx Ns /amd64 guests. .It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. The current maximum number of guest virtual CPUs is 16. If .Ar numcpus is not specified then it will be calculated from the other arguments. The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. .It Fl C Include guest memory in core file. .It Fl D Destroy the VM on guest initiated power-off. .It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes. .It Fl G Ar port Start a debug server that uses the GDB protocol to export guest state to a debugger. An IPv4 TCP socket will be bound to the supplied .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If .Ar port begins with .Sq w , .Nm will pause execution at the first instruction waiting for a debugger to attach. .It Fl h Print help message and exit. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. +.It Fl k Ar file +Set configuration variables from a simple, key-value config file. 
+Each line of the config file is expected to consist of a config variable +name, an equals sign +.Pq Sq = , +and a value. +No spaces are permitted between the variable name, equals sign, or +value. +Blank lines and lines starting with +.Sq # +are ignored. .It Fl l Op Ar help|lpcdev Ns Op , Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Ar com1 through .Ar com4 , the boot ROM device .Ar bootrom , and the debug/test device .Ar pc-testdev . .Pp .Ar help print a list of supported LPC devices. .It Fl m Ar memsize Ns Op Ar K|k|M|m|G|g|T|t Guest physical memory size in bytes. This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of K, M, G or T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. .Pp .Ar memsize defaults to 256M. +.It Fl o Ar var Ns Cm = Ns Ar value +Set the configuration variable +.Ar var +to +.Ar value . .It Fl p Ar vcpu:hostcpu Pin guest's virtual CPU .Em vcpu to .Em hostcpu . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. .It Fl r Ar file Resume a guest from a snapshot. The guest memory contents are restored from .Ar file , and the guest device and vCPU state are restored from the file .Dq Ar file Ns .kern . .Pp Note that the current snapshot file format requires that the configuration of devices in the new VM match the VM from which the snapshot was taken by specifying the same .Op Fl s and .Op Fl l options. The count of vCPUs and memory configuration are read from the snapshot. .It Fl s Op Ar help|slot,emulation Ns Op , Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. .Bl -tag -width 10n .It Ar help print a list of supported PCI devices. .It Ar slot .Ar pcislot[:function] .Ar bus:pcislot:function .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. .It Ar emulation .Bl -tag -width 10n .It Li hostbridge | Li amd_hostbridge .Pp Provide a simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. The .Li amd_hostbridge emulation is identical but uses a PCI vendor ID of .Li AMD . .It Li passthru PCI pass-through device. .It Li virtio-net Virtio network interface. .It Li virtio-blk Virtio block storage interface. .It Li virtio-scsi Virtio SCSI interface. .It Li virtio-9p Virtio 9p (VirtFS) interface. .It Li virtio-rnd Virtio RNG interface. .It Li virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. .It Li ahci AHCI controller attached to arbitrary devices. .It Li ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Li ahci-hd AHCI controller attached to a SATA hard-drive. .It Li e1000 Intel e82545 network interface. .It Li uart PCI 16550 serial device. .It Li lpc LPC PCI-ISA bridge with COM1 and COM2 16550 serial ports, a boot ROM, and, optionally, the debug/test device. The LPC bridge emulation can only be configured on bus 0. 
.It Li fbuf Raw framebuffer device attached to VNC server. .It Li xhci eXtensible Host Controller Interface (xHCI) USB controller. .It Li nvme NVM Express (NVMe) controller. .It Li hda High Definition Audio Controller. .El .It Op Ar conf This optional parameter describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Pp Network backends: .Bl -tag -width 10n .It Ar tapN Ns Oo , Ns Ar mac=xx:xx:xx:xx:xx:xx Oc Ns Oo , Ns Ar mtu=N Oc .It Ar vmnetN Ns Oo , Ns Ar mac=xx:xx:xx:xx:xx:xx Oc Ns Oo , Ns Ar mtu=N Oc .It Ar netgraph,path=ADDRESS,peerhook=HOOK Ns Oo , Ns Ar socket=NAME Oc Ns Oo , Ns Ar hook=HOOK Oc Ns Oo , Ns Ar mac=xx:xx:xx:xx:xx:xx Oc Ns Oo , Ns Ar mtu=N Oc .Pp If .Ar mac is not specified, the MAC address is derived from a fixed OUI and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .Pp With virtio-net devices, the .Ar mtu parameter can be specified to inform the guest about the largest MTU that should be allowed, expressed in bytes. .Pp With netgraph backend, the .Ar path and .Ar peerhook parameters must be specified to set the destination node and corresponding hook. The optional parameters .Ar socket and .Ar hook may be used to set the .Xr ng_socket 4 node name and source hook. The .Ar ADDRESS , .Ar HOOK and .Ar NAME must comply with .Xr netgraph 4 addressing rules. .El .Pp Block storage devices: .Bl -tag -width 10n .It Pa /filename Ns Oo , Ns Ar block-device-options Oc .It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc .El .Pp The .Ar block-device-options are: .Bl -tag -width 8n .It Li nocache Open the file with .Dv O_DIRECT . .It Li direct Open the file using .Dv O_SYNC . .It Li ro Force the file to be opened read-only. .It Li sectorsize= Ns Ar logical Ns Oo / Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. .It Li nodelete Disable emulation of guest trim requests via .Dv DIOCGDELETE requests. .El .Pp SCSI devices: .Bl -tag -width 10n .It Pa /dev/cam/ctl Ns Oo Ar pp . Ns Ar vp Oc Ns Oo , Ns Ar scsi-device-options Oc .El .Pp The .Ar scsi-device-options are: .Bl -tag -width 10n .It Li iid= Ns Ar IID Initiator ID to use when sending requests to specified CTL port. The default value is 0. .El .Pp 9P devices: .Bl -tag -width 10n .It Pa sharename=/path/to/share[,9p-device-options] .El .Pp The .Ar 9p-device-options are: .Bl -tag -width 10n .It Li ro Expose the share in read-only mode. .El .Pp TTY devices: .Bl -tag -width 10n .It Li stdio Connect the serial port to the standard input and output of the .Nm process. .It Pa /dev/xxx Use the host TTY device for serial port I/O. .El .Pp Boot ROM device: .Bl -tag -width 10n .It Pa romfile Map .Ar romfile in the guest address space reserved for boot firmware. .El .Pp Pass-through devices: .Bl -tag -width 10n .It Ns Ar slot Ns / Ns Ar bus Ns / Ns Ar function Connect to a PCI device on the host at the selector described by .Ar slot , .Ar bus , and .Ar function numbers. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdevs loader variable as described in .Xr vmm 4 . .Pp Virtio console devices: .Bl -tag -width 10n .It Li port1= Ns Pa /path/to/port1.sock Ns ,anotherport= Ns Pa ... 
A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet -offset 2n .It Due to lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It Neither the "console port" feature nor console port resizing is supported at present. .It Emergency write is advertised, but is a no-op at present. .El .El .Pp Framebuffer devices: .Bl -tag -width 10n .It Xo .Oo rfb= Ns Oo Ar IP\&: Oc Ns Ar port Oc Ns Oo ,w= Ns Ar width Oc Ns Oo ,h= Ns .Ar height Oc Ns Oo ,vga= Ns Ar vgaconf Oc Ns Oo Ns ,wait Oc Ns Oo ,password= Ns .Ar password Oc .Xc .Bl -tag -width 8n .It Ar IPv4:port No or Ar [IPv6%zone]:port The .Ar IP address and .Ar port that the VNC server should listen on. The default is to listen on the localhost IPv4 address using the default VNC port 5900. An IPv6 address must be enclosed in square brackets and may contain an optional zone identifier. .It Ar width No and Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. The minimum supported resolution is 640x480 pixels, and the maximum is 1920x1200 pixels. .It Ar vgaconf Possible values for this option are .Dq io (default), .Dq on , and .Dq off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 . The default .Dq io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Dq on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Dq off option should be used for UEFI guests that assume a VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. .It wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .El .Pp xHCI USB devices: .Bl -tag -width 10n .It Li tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Pp NVMe devices: .Bl -tag -width 10n .It Li devpath Accepted device paths are: .Ar /dev/blockdev or .Ar /path/to/image or .Ar ram=size_in_MiB . .It Li maxq Max number of queues. .It Li qsz Max elements in each queue. .It Li ioslots Max number of concurrent I/O requests. .It Li sectsz Sector size (defaults to blockif sector size). .It Li ser Serial number with maximum 20 characters. .El .Pp AHCI devices: .Bl -tag -width 10n .It Li nmrr Nominal Media Rotation Rate, also known as RPM. A value of 1 indicates a device with no rotation rate, such as a Solid State Disk. The default value of 0 means the rate is not reported. .It Li ser Serial Number with maximum 20 characters. .It Li rev Revision Number with maximum 8 characters.
.It Li model Model Number with maximum 40 characters. .El .Pp HD Audio devices: .Bl -tag -width 10n .It Li play Playback device, typically .Ar /dev/dsp0 . .It Li rec Recording device, typically .Ar /dev/dsp0 . .El .El .It Fl S Wire guest memory. .It Fl u RTC keeps UTC time. .It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl x The guest's local APIC is configured in x2APIC mode. .It Fl Y Disable MPtable generation. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El +.Sh CONFIGURATION VARIABLES +.Nm +uses an internal tree of configuration variables to describe global and +per-device settings. +When +.Nm +starts, +it parses command line options (including config files) in the order given +on the command line. +Each command line option sets one or more configuration variables. +For example, +the +.Fl s +option creates a new tree node for a PCI device and sets one or more variables +under that node, including the device model and device model-specific variables. +Variables may be set multiple times during this parsing stage with the final +value overriding previous values. +.Pp +Once all of the command line options have been processed, +the configuration values are frozen. +.Nm +then uses the values of these configuration variables to initialize +device models and global settings. +.Pp +More details on configuration variables can be found in +.Xr bhyve_config 5 . .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. .Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read and written by the debugger. Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp Breakpoints are supported on Intel CPUs that support single stepping. Note that continuing from a breakpoint while interrupts are enabled in the guest may not work as expected due to timer interrupts firing while single stepping over the breakpoint. .Sh SIGNAL HANDLING .Nm deals with the following signals: .Pp .Bl -tag -width indent -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 triple fault .It 4 exited due to an error .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run. Otherwise, the boot loader is not needed.
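The command lines below can equally be expressed through the new configuration interface. Here is a minimal sketch of a key=value file for the -k option, using only variables documented in bhyve_config.5 later in this diff; the boot ROM path mirrors the UEFI examples below and is illustrative, not a default:

    # guest memory, topology, and ACPI tables
    memory.size=1G
    cpus=2
    acpi_tables=true
    destroy_on_poweroff=true
    x86.vmexit_on_hlt=true
    # serial console and boot ROM live under the top-level "lpc" node
    lpc.com1.path=stdio
    lpc.bootrom=/usr/local/share/uefi-firmware/BHYVE_UEFI.fd

Per the -k description, no whitespace is permitted around the equals sign and lines starting with # are ignored; any single variable can still be overridden on the command line with -o, for example -o memory.size=2G.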
.Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -A -H -P -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -A -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device. .Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -A -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at: 0.0.0.0:5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VNC display that is bound to all IPv6 addresses on port 5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr netgraph 4 , .Xr ng_socket 4 , .Xr nmdm 4 , .Xr vmm 4 , +.Xr bhyve_config 5 , .Xr ethers 5 , .Xr bhyvectl 8 , .Xr bhyveload 8 .Pp .Rs .%A Intel .%B 64 and IA-32 Architectures Software Developer’s Manual .%V Volume 3 .Re .Sh HISTORY .Nm first appeared in .Fx 10.0 . .Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org diff --git a/usr.sbin/bhyve/bhyve_config.5 b/usr.sbin/bhyve/bhyve_config.5 new file mode 100644 index 000000000000..4e200a779d50 --- /dev/null +++ b/usr.sbin/bhyve/bhyve_config.5 @@ -0,0 +1,560 @@ +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2021 John H. Baldwin +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.Dd March 18, 2021 +.Dt BHYVE_CONFIG 5 +.Os +.Sh NAME +.Nm bhyve_config +.Nd "bhyve configuration variables" +.Sh DESCRIPTION +.Xr bhyve 8 +uses a hierarchical tree of configuration variables to describe global and +per-device settings. +Internal nodes in this tree do not have a value; +only leaf nodes have values. +This manual describes the configuration variables understood by +.Xr bhyve 8 . +If additional variables are defined, +.Xr bhyve 8 +will ignore them and will not emit errors for unknown variables. +However, these additional variables can be referenced by other +variables as described below. +.Sh VARIABLE VALUES +Configuration variable values are stored as strings. +A configuration variable value may refer to one or more other +configuration variables by name. +Instances of the pattern +.Sq % Ns Pq Ar var +are replaced by the value of the configuration variable +.Va var . +To avoid unwanted expansion, +.Sq % +characters can be escaped by a leading +.Sq % . +For example, +if a configuration variable +.Va disk +uses the value +.Pa /dev/zvol/bhyve/%(name) , +then the final value of the +.Va disk +variable will be set to the path of a ZFS volume whose name matches +the name of the virtual machine on the pool +.Pa bhyve . +.Pp +Some configuration variables may be interpreted as a boolean value. +For those variables the following case-insensitive values may be used to +indicate true: +.Pp +.Bl -bullet -offset indent -compact +.It +true +.It +on +.It +yes +.It +1 +.El +.Pp +The following values may be used to indicate false: +.Pp +.Bl -bullet -offset indent -compact +.It +false +.It +off +.It +no +.It +0 +.El +.Pp +Some configuration variables may be interpreted as an integer. +For those variables, +any syntax supported by +.Xr strtol 3 +may be used. +.Sh GLOBAL SETTINGS +.Ss Architecture Neutral Settings +.Bl -column "memory.guest_in_core" "integer" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va name Ta string Ta Ta +The name of the VM. +.It Va cpus Ta integer Ta 1 Ta +The total number of virtual CPUs. +.It Va cores Ta integer Ta 1 Ta +The number of virtual cores in each virtual socket. +.It Va threads Ta integer Ta 1 Ta +The number of virtual CPUs in each virtual core. +.It Va sockets Ta integer Ta 1 Ta +The number of virtual sockets. +.It Va memory.guest_in_core Ta bool Ta false Ta +Include guest memory in core file. +.It Va memory.size Ta string Ta 256M Ta +Guest physical memory size in bytes. +The value must be formatted as described in +.Xr expand_number 3 . +.It Va memory.wired Ta bool Ta false Ta +Wire guest memory. +.It Va acpi_tables Ta bool Ta false Ta +Generate ACPI tables. +.It Va destroy_on_poweroff Ta bool Ta false Ta +Destroy the VM on guest-initiated power-off. +.It Va gdb.port Ta integer Ta 0 Ta +TCP port number for the debug server. +If this is set to a non-zero value, a debug server +will listen for connections on this port.
+.It Va gdb.wait Ta bool Ta false Ta +If the debug server is enabled, wait for a debugger to connect +before starting the guest. +.It Va rtc.use_localtime Ta bool Ta true Ta +The real time clock uses the local time of the host. +If this is set to false, the real time clock uses UTC. +.It Va uuid Ta string Ta Ta +The universally unique identifier (UUID) to use in the guest's +System Management BIOS System Information structure. +If an explicit value is not set, a valid UUID is generated from +the host's hostname and the VM name. +.It Va virtio_msix Ta bool Ta true Ta +Use MSI-X interrupts for PCI VirtIO devices. +If set to false, MSI interrupts are used instead. +.It Va config.dump Ta bool Ta false Ta +If this value is set to true, +then +.Xr bhyve 8 +will write all of its configuration variables to stdout and exit +after it has finished parsing command line options. +.El +.Ss x86-Specific Settings +.Bl -column "x86.vmexit_on_pause" "integer" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va x86.mptable Ta bool Ta true Ta +Generate an MPTable. +.It Va x86.x2apic Ta bool Ta false Ta +Configure guest's local APICs in x2APIC mode. +.It Va x86.strictio Ta bool Ta false Ta +Exit if a guest accesses an I/O port that is not emulated. +By default, writes are ignored and reads return all bits set. +.It Va x86.strictmsr Ta bool Ta true Ta +Inject a general protection fault if a guest accesses a Model Specific +Register (MSR) that is not emulated. +If this is false, writes are ignored and reads return zero. +.It Va x86.vmexit_on_hlt Ta bool Ta false Ta +Force a VM exit when a guest CPU executes the +.Dv HLT +instruction. +This allows idle guest CPUs to yield the host CPU. +.It Va x86.vmexit_on_pause Ta bool Ta false Ta +Force a VM exit when a guest CPU executes the +.Dv PAUSE +instruction. +.El +.Sh DEVICE SETTINGS +Device settings are stored under a device node. +The device node's name is set by the parent bus of the device. +.Ss PCI Device Settings +PCI devices are described by a device node named +.Dq pci Ns Ar bus . Ns Ar slot . Ns Ar function +where each of +.Ar bus , +.Ar slot , +and +.Ar function +are formatted as decimal values with no padding. +All PCI device nodes must contain a configuration variable named +.Dq device +which specifies the device model to use. +The following PCI device models are supported: +.Bl -tag -indent +.It Li hostbridge +Provide a simple PCI-Host bridge device. +This is usually configured at pci0:0:0 and is required by most guest +operating systems. +.It Li ahci +AHCI storage controller. +.It Li e1000 +Intel e82545 network interface. +.It Li fbuf +VGA framebuffer device attached to VNC server. +.It Li lpc +LPC PCI-ISA bridge with COM1-COM4 16550 serial ports, +a boot ROM, +and an optional debug/test device. +This device must be configured on bus 0. +.It Li hda +High Definition audio controller. +.It Li nvme +NVM Express (NVMe) controller. +.It Li passthru +PCI pass-through device. +.It Li uart +PCI 16550 serial device. +.It Li virtio-9p +VirtIO 9p (VirtFS) interface. +.It Li virtio-blk +VirtIO block storage interface. +.It Li virtio-console +VirtIO console interface. +.It Li virtio-net +VirtIO network interface. +.It Li virtio-rnd +VirtIO RNG interface. +.It Li virtio-scsi +VirtIO SCSI interface. +.It Li xhci +Extensible Host Controller Interface (XHCI) USB controller. +.El +.Ss USB Device Settings +USB controller devices contain zero or more child USB devices +attached to slots. +Each USB device stores its settings in a node named +.Dq slot. 
Ns Va N +under the controller's device node. +.Va N +is the number of the slot to which the USB device is attached. +Note that USB slot numbers begin at 1. +All USB device nodes must contain a configuration variable named +.Dq device +which specifies the device model to use. +The following USB device models are supported: +.Bl -tag -indent +.It Li tablet +A USB tablet device which provides precise cursor synchronization +when using VNC. +.El +.Ss Block Device Settings +Block devices use the following settings to configure their backing store. +These settings are stored in the configuration node of the respective device. +.Bl -column "sectorsize" "logical[/physical]" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It path Ta string Ta Ta +The path of the file or disk device to use as the backing store. +.It nocache Ta bool Ta false Ta +Disable caching on the backing file by opening the backing file with +.Dv O_DIRECT . +.It nodelete Ta bool Ta false Ta +Disable emulation of guest trim requests via +.Dv DIOCGDELETE +requests. +.It sync Ta bool Ta false Ta +Write changes to the backing file with synchronous writes. +.It direct Ta bool Ta false Ta +An alias for +.Va sync . +.It ro Ta bool Ta false Ta +Disable writes to the backing file. +.It sectorsize Ta Va logical Ns Op / Ns Va physical Ta Ta +Specify the logical and physical sector size of the emulated disk. +If the physical size is not specified, +it is equal to the logical size. +.El +.Ss Network Backend Settings +Network devices use the following settings to configure their backend. +The backend is responsible for passing packets between the device model +and a desired destination. +Configuring a backend requires setting the +.Va backend +variable to one of the following values: +.Bl -tag +.It tap Ns Va N +Use the named +.Xr tap 4 +interface as the backend. +.It vmnet Ns Va N +Use the named +.Xr vmnet 4 +interface as the backend. +.It netgraph +Use a +.Xr netgraph 4 +socket hook as the backend. +This backend uses the following additional variables: +.Bl -column "peerhook" "Format" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va path Ta string Ta Ta +The name of the +.Xr netgraph 4 +destination node. +.It Va peerhook Ta string Ta Ta +The name of the destination hook. +.It Va socket Ta string Ta Ta +The name of the created +.Xr ng_socket 4 +node. +.It Va hook Ta string Ta vmlink Ta +The name of the source hook on the created +.Xr ng_socket 4 +node. +.El +.It netmap: Ns Va interface +Use +.Xr netmap 4 +on a network interface as the backend. +.It vale Ns Va bridge : Ns Va port +Use a port on a +.Xr vale 4 +bridge as the backend. +.El +.Ss UART Device Settings +.Bl -column "Name" "Format" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va path Ta path Ta Ta +Backend device for the serial port. +Either the pathname of a character device or +.Dq stdio +to use standard input and output of the +.Xr bhyve 8 +process. +.El +.Ss Host Bridge Settings +.Bl -column "vendor" "integer" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va vendor Ta integer Ta 0x1275 Ta +PCI vendor ID. +.It Va device Ta integer Ta 0x1275 Ta +PCI device ID. +.El +.Ss AHCI Controller Settings +AHCI controller devices contain zero or more ports each of which +provides a storage device. +Each port stores its settings in a node named +.Dq port. Ns Va N +under the controller's device node. +The +.Va N +values are formatted as successive decimal values starting with 0. 
+In addition to the block device settings described above, each +port supports the following settings: +.Bl -column "model" "integer" "generated" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va type Ta string Ta Ta +The type of storage device to emulate. +Must be set to either +.Dq cd +or +.Dq hd . +.It Va nmrr Ta integer Ta 0 Ta +Nominal Media Rotation Rate, also known as RPM. +A value of 1 indicates a device with no rotation rate, such as a Solid State Disk. +.It Va ser Ta string Ta generated Ta +Serial number of up to twenty characters. +A default serial number is generated using a hash of the backing +store's pathname. +.It Va rev Ta string Ta 001 Ta +Revision number of up to eight characters. +.It Va model Ta string Ta Ta +Model number of up to forty characters. +Separate default model strings are used for +.Dq cd +and +.Dq hd +device types. +.El +.Ss e1000 Settings +In addition to the network backend settings, +Intel e82545 network interfaces support the following variables: +.Bl -column "Name" "MAC address" "generated" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va mac Ta MAC address Ta generated Ta +MAC address. +If an explicit address is not provided, +a MAC address is generated from a hash of the device's PCI address. +.El +.Ss Frame Buffer Settings +.Bl -column "password" "[IP:]port" "127.0.0.1:5900" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va wait Ta bool Ta false Ta +Wait for a remote connection before starting the VM. +.It Va rfb Ta Oo Ar IP Ns : Oc Ns Ar port Ta 127.0.0.1:5900 Ta +TCP address to listen on for remote connections. +The IP address must be given as a numeric address. +IPv6 addresses must be enclosed in square brackets and +support scoped identifiers as described in +.Xr getaddrinfo 3 . +A bare port number may be given, in which case the IPv4 +localhost address is used. +.It Va vga Ta string Ta io Ta +VGA configuration. +More details are provided in +.Xr bhyve 8 . +.It Va w Ta integer Ta 1024 Ta +Frame buffer width in pixels. +.It Va h Ta integer Ta 768 Ta +Frame buffer height in pixels. +.It Va password Ta string Ta Ta +Password to use for VNC authentication. +This type of authentication is known to be cryptographically weak and is not +intended for use on untrusted networks. +.El +.Ss High Definition Audio Settings +.Bl -column "Name" "Format" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va play Ta path Ta Ta +Host playback device, +typically +.Pa /dev/dsp0 . +.It Va rec Ta path Ta Ta +Host recording device, +typically +.Pa /dev/dsp0 . +.El +.Ss LPC Device Settings +The LPC bridge stores its configuration under a top-level +.Va lpc +node rather than under the PCI LPC device's node. +The following nodes are available under +.Va lpc : +.Bl -column "pc-testdev" "Format" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va bootrom Ta path Ta Ta +Path to a boot ROM. +The contents of this file are copied into the guest's +memory ending just before the 4GB physical address. +If a boot ROM is present, a firmware interface device is +also enabled for use by the boot ROM. +.It Va com1 Ta node Ta Ta +Settings for the COM1 serial port device. +.It Va com2 Ta node Ta Ta +Settings for the COM2 serial port device. +.It Va com3 Ta node Ta Ta +Settings for the COM3 serial port device. +.It Va com4 Ta node Ta Ta +Settings for the COM4 serial port device. +.It Va pc-testdev Ta bool Ta false Ta +Enable the PC debug/test device.
+.El +.Ss NVMe Controller Settings +Each NVMe controller supports a single storage device. +The device can be backed either by a memory disk, described by the +.Va ram +variable, or by a block device using the block device settings described above. +In addition, each controller supports the following settings: +.Bl -column "ioslots" "Format" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va maxq Ta integer Ta 16 Ta +Maximum number of I/O submission and completion queue pairs. +.It Va qsz Ta integer Ta 2058 Ta +Number of elements in each I/O queue. +.It Va ioslots Ta integer Ta 8 Ta +Maximum number of concurrent I/O requests. +.It Va sectsz Ta integer Ta Ta +Sector size. +Can be one of 512, 4096, or 8192. +Devices backed by a memory disk use 4096 as the default. +Devices backed by a block device use the block device's sector size +as the default. +.It Va ser Ta string Ta Ta +Serial number of up to twenty characters. +A default serial number is generated using a hash of the device's PCI address. +.It Va eui64 Ta integer Ta Ta +IEEE Extended Unique Identifier. +If an EUI is not provided, a default is generated using a checksum of the +device's PCI address. +.It Va dsm Ta string Ta auto Ta +Whether or not to advertise DataSet Management support. +One of +.Dq auto , +.Dq enable , +or +.Dq disable . +The +.Dq auto +setting only advertises support if the backing store supports +resource freeing, for example via TRIM. +.It Va ram Ta integer Ta Ta +If set, allocate a memory disk as the backing store. +The value of this variable is the size of the memory disk in megabytes. +.El +.Ss PCI Passthrough Settings +.Bl -column "Name" "integer" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va bus Ta integer Ta Ta +Host PCI bus address of device to pass through. +.It Va slot Ta integer Ta Ta +Host PCI slot address of device to pass through. +.It Va func Ta integer Ta Ta +Host PCI function address of device to pass through. +.El +.Ss VirtIO 9p Settings +Each VirtIO 9p device exposes a single filesystem from a host path. +.Bl -column "sharename" "Format" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va sharename Ta string Ta Ta +The share name exposed to the guest. +.It Va path Ta path Ta Ta +The path of a directory on the host to export to the guest. +.It Va ro Ta bool Ta false Ta +If true, the guest filesystem is read-only. +.El +.Ss VirtIO Console Device Settings +Each VirtIO Console device contains one or more console ports. +Each port stores its settings in a node named +.Dq port. Ns Va N +under the controller's device node. +The +.Va N +values are formatted as successive decimal values starting with 0. +Each port supports the following settings: +.Bl -column "Name" "Format" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va name Ta string Ta Ta +The name of the port exposed to the guest. +.It Va path Ta path Ta Ta +The path of a UNIX domain socket providing the host connection for the port. +.El +.Ss VirtIO Network Interface Settings +In addition to the network backend settings, +VirtIO network interfaces support the following variables: +.Bl -column "Name" "MAC address" "generated" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va mac Ta MAC address Ta generated Ta +MAC address. +If an explicit address is not provided, +a MAC address is generated from a hash of the device's PCI address. +.It Va mtu Ta integer Ta 1500 Ta +The largest supported MTU advertised to the guest.
+.El +.Ss VirtIO SCSI Settings +.Bl -column "Name" "integer" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va dev Ta path Ta Ta +The path of a CAM target layer (CTL) device to export: +.Pa /dev/cam/ctl Ns Oo Ar pp . Ns Ar vp Oc . +.It Va iid Ta integer Ta 0 Ta +Initiator ID to use when sending requests to the CTL port. +.El +.Sh SEE ALSO +.Xr expand_number 3 , +.Xr getaddrinfo 3 , +.Xr strtol 3 , +.Xr netgraph 4 , +.Xr netmap 4 , +.Xr ng_socket 4 , +.Xr tap 4 , +.Xr vale 4 , +.Xr vmnet 4 , +.Xr bhyve 8 diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index aafab4af8d8c..a3e6ef3c4724 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1442 +1,1579 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "bootrom.h" +#include "config.h" #include "inout.h" #include "debug.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "kernemu_dev.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", 
[EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); -const char *vmname; - int guest_ncpus; uint16_t cores, maxcpus, sockets, threads; -char *guest_uuid_str; - int raw_stdio = 0; -static int gdb_port = 0; -static int guest_vmexit_on_hlt, guest_vmexit_on_pause; -static int virtio_msix = 1; -static int x2apic_mode = 0; /* default is xAPIC */ -static int destroy_on_poweroff = 0; - -static int strictio; -static int strictmsr = 1; - -static int acpi; - static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; static void usage(int code) { fprintf(stderr, "Usage: %s [-aehuwxACDHPSWY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" - " %*s [-l ]\n" - " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" + " %*s [-k ] [-l ] [-m mem] [-o =]\n" + " %*s [-p vcpu:hostcpu] [-s ] [-U uuid] []\n" " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create ACPI tables\n" " -c: number of cpus and/or topology specification\n" " -C: include guest memory in core file\n" " -D: destroy on power-off\n" " -e: exit on unhandled I/O access\n" " -h: help\n" " -H: vmexit from the guest on hlt\n" + " -k: key=value flat config file\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" + " -o: set config 'var' to 'value'\n" + " -p: pin 'vcpu' to 'hostcpu'\n" + " -P: vmexit from the guest on pause\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" #endif - " -p: pin 'vcpu' to 'hostcpu'\n" - " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" " -S: guest memory cannot be swapped\n" " -u: RTC keeps UTC time\n" " -U: uuid\n" " -w: ignore unimplemented MSRs\n" " -W: force virtio to use single-vector MSI\n" " -x: local apic is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: - * 1. It accepts null key=value tokens ",,". - * 2. It accepts whitespace after = and before value. - * 3. Values out of range of INT are silently wrapped. - * 4. It doesn't check non-final values. - * 5. The apparently bogus limits of UINT16_MAX are for future expansion. + * 1. It accepts null key=value tokens ",," as setting "cpus" to an + * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. 
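+ *
+ * For example, "-c 2" and "-c cpus=2" are equivalent, while an
+ * unrecognized key=value token such as "-c foo=1" is rejected.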
*/ static int topology_parse(const char *opt) { - uint64_t ncpus; - int c, chk, n, s, t, tmp; char *cp, *str; - bool ns, scts; - c = 1, n = 1, s = 1, t = 1; - ns = false, scts = false; + if (*opt == '\0') { + set_config_value("sockets", "1"); + set_config_value("cores", "1"); + set_config_value("threads", "1"); + set_config_value("cpus", "1"); + return (0); + } + str = strdup(opt); if (str == NULL) - goto out; + errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { - if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { - n = tmp; - ns = true; - } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) { - n = tmp; - ns = true; - } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) { - s = tmp; - scts = true; - } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) { - c = tmp; - scts = true; - } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) { - t = tmp; - scts = true; + if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) + set_config_value("cpus", cp + strlen("cpus=")); + else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) + set_config_value("sockets", cp + strlen("sockets=")); + else if (strncmp(cp, "cores=", strlen("cores=")) == 0) + set_config_value("cores", cp + strlen("cores=")); + else if (strncmp(cp, "threads=", strlen("threads=")) == 0) + set_config_value("threads", cp + strlen("threads=")); #ifdef notyet /* Do not expose this until vmm.ko implements it */ - } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) { - m = tmp; + else if (strncmp(cp, "maxcpus=", strlen("maxcpus=")) == 0) + set_config_value("maxcpus", cp + strlen("maxcpus=")); #endif - /* Skip the empty argument case from -c "" */ - } else if (cp[0] == '\0') - continue; - else - goto out; - /* Any trailing garbage causes an error */ - if (cp[chk] != '\0') + else if (strchr(cp, '=') != NULL) goto out; + else + set_config_value("cpus", cp); } free(str); - str = NULL; - - /* - * Range check 1 <= n <= UINT16_MAX all values - */ - if (n < 1 || s < 1 || c < 1 || t < 1 || - n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX || - t > UINT16_MAX) - return (-1); - - /* If only the cpus was specified, use that as sockets */ - if (!scts) - s = n; - /* - * Compute sockets * cores * threads avoiding overflow - * The range check above insures these are 16 bit values - * If n was specified check it against computed ncpus - */ - ncpus = (uint64_t)s * c * t; - if (ncpus > UINT16_MAX || (ns && n != ncpus)) - return (-1); - - guest_ncpus = ncpus; - sockets = s; - cores = c; - threads = t; - return(0); + return (0); out: free(str); return (-1); } +static int +parse_int_value(const char *key, const char *value, int minval, int maxval) +{ + char *cp; + long lval; + + errno = 0; + lval = strtol(value, &cp, 0); + if (errno != 0 || *cp != '\0' || cp == value || lval < minval || + lval > maxval) + errx(4, "Invalid value for %s: '%s'", key, value); + return (lval); +} + +/* + * Set the sockets, cores, threads, and guest_ncpus variables based on + * the configured topology. + * + * The limits of UINT16_MAX are due to the types passed to + * vm_set_topology(). vmm.ko may enforce tighter limits.
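+ *
+ * For example, "-c 4,sockets=2,cores=2" stores cpus=4, sockets=2 and
+ * cores=2; threads then defaults to 1, the product 2 * 2 * 1 matches
+ * the explicit vCPU count, and guest_ncpus remains 4.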
+ */ +static void +calc_topolopgy(void) +{ + const char *value; + bool explicit_cpus; + uint64_t ncpus; + + value = get_config_value("cpus"); + if (value != NULL) { + guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); + explicit_cpus = true; + } else { + guest_ncpus = 1; + explicit_cpus = false; + } + value = get_config_value("cores"); + if (value != NULL) + cores = parse_int_value("cores", value, 1, UINT16_MAX); + else + cores = 1; + value = get_config_value("threads"); + if (value != NULL) + threads = parse_int_value("threads", value, 1, UINT16_MAX); + else + threads = 1; + value = get_config_value("sockets"); + if (value != NULL) + sockets = parse_int_value("sockets", value, 1, UINT16_MAX); + else + sockets = guest_ncpus; + + /* + * Compute sockets * cores * threads avoiding overflow. The + * range check above ensures these are 16-bit values. + */ + ncpus = (uint64_t)sockets * cores * threads; + if (ncpus > UINT16_MAX) + errx(4, "Computed number of vCPUs too high: %ju", + (uintmax_t)ncpus); + + if (explicit_cpus) { + if (guest_ncpus != ncpus) + errx(4, "Topology (%d sockets, %d cores, %d threads) " + "does not match %d vCPUs", sockets, cores, threads, + guest_ncpus); + } else + guest_ncpus = ncpus; +} + static int pincpu_parse(const char *opt) { + const char *value; + char *newval; + char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0 || vcpu >= VM_MAXCPU) { fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", vcpu, VM_MAXCPU - 1); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } - if (vcpumap[vcpu] == NULL) { - if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { - perror("malloc"); - return (-1); - } - CPU_ZERO(vcpumap[vcpu]); + snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); + value = get_config_value(key); + + if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", + value != NULL ?
"," : "", pcpu) == -1) { + perror("failed to build new cpuset string"); + return (-1); } - CPU_SET(pcpu, vcpumap[vcpu]); + + set_config_value(key, newval); + free(newval); return (0); } +static void +parse_cpuset(int vcpu, const char *list, cpuset_t *set) +{ + char *cp, *token; + int pcpu, start; + + CPU_ZERO(set); + start = -1; + token = __DECONST(char *, list); + for (;;) { + pcpu = strtoul(token, &cp, 0); + if (cp == token) + errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); + if (pcpu < 0 || pcpu >= CPU_SETSIZE) + errx(4, "hostcpu '%d' outside valid range from 0 to %d", + pcpu, CPU_SETSIZE - 1); + switch (*cp) { + case ',': + case '\0': + if (start >= 0) { + if (start > pcpu) + errx(4, "Invalid hostcpu range %d-%d", + start, pcpu); + while (start < pcpu) { + CPU_SET(start, vcpumap[vcpu]); + start++; + } + start = -1; + } + CPU_SET(pcpu, vcpumap[vcpu]); + break; + case '-': + if (start >= 0) + errx(4, "invalid cpuset for vcpu %d: '%s'", + vcpu, list); + start = pcpu; + break; + default: + errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); + } + if (*cp == '\0') + break; + token = cp + 1; + } +} + +static void +build_vcpumaps(void) +{ + char key[16]; + const char *value; + int vcpu; + + for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { + snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); + value = get_config_value(key); + if (value == NULL) + continue; + vcpumap[vcpu] = malloc(sizeof(cpuset_t)); + if (vcpumap[vcpu] == NULL) + err(4, "Failed to allocate cpuset for vcpu %d", vcpu); + parse_cpuset(vcpu, value, vcpumap[vcpu]); + } +} + void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; int error, restart_instruction; ctx = arg; restart_instruction = 1; error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif -int -fbsdrun_vmexit_on_pause(void) -{ - - return (guest_vmexit_on_pause); -} - -int -fbsdrun_vmexit_on_hlt(void) -{ - - return (guest_vmexit_on_hlt); -} - int fbsdrun_virtio_msix(void) { - return (virtio_msix); + return (get_config_bool_default("virtio_msix", true)); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vcpu); #endif - if (gdb_port != 0) - gdb_cpu_add(vcpu); + gdb_cpu_add(vcpu); vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) { int error; assert(fromcpu == BSP); /* * The 'newcpu' must be activated in the context of 'fromcpu'. If * vm_activate_cpu() is delayed until newcpu's pthread starts running * then vmm.ko is out-of-sync with bhyve and this can create a race * with vm_suspend(). 
*/ error = vm_activate_cpu(ctx, newcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); /* * Set up the vmexit struct to allow execution to start * at the given RIP */ vmexit[newcpu].rip = rip; vmexit[newcpu].inst_length = 0; mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); assert(error == 0); } static int fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); return (error); } - error = emulate_inout(ctx, vcpu, vme, strictio); + error = emulate_inout(ctx, vcpu, vme); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vmexit->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); - if (strictmsr) { + if (get_config_bool("x86.strictmsr")) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); - if (strictmsr) { + if (get_config_bool("x86.strictmsr")) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { (void)spinup_ap(ctx, *pvcpu, vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, 
"\tstatus\t\t%d\n", vmexit->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason, vmexit_vmx_desc(vmexit->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(ctx, *pvcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_mtrap++; #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(*pvcpu); #endif - if (gdb_port != 0) - gdb_cpu_mtrap(*pvcpu); + gdb_cpu_mtrap(*pvcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(*pvcpu); #endif return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { int err, i, cs_d; struct vie *vie; enum vm_cpu_mode mode; stats.vmexit_inst_emul++; vie = &vmexit->u.inst_emul.vie; if (!vie->decoded) { /* * Attempt to decode in userspace as a fallback. This allows * updating instruction decode in bhyve without rebooting the * kernel (rapid prototyping), albeit with much slower * emulation. 
*/ vie_restart(vie); mode = vmexit->u.inst_emul.paging.cpu_mode; cs_d = vmexit->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; if (vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RIP, vmexit->rip + vie->num_processed) != 0) goto fail; } err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, vie, &vmexit->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vmexit->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vmexit->rip); return (VMEXIT_ABORT); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { enum vm_suspend_how how; how = vmexit->u.suspended.how; fbsdrun_deletecpu(ctx, *pvcpu); if (*pvcpu != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: - if (destroy_on_poweroff) + if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(*pvcpu); #endif - if (gdb_port != 0) - gdb_cpu_suspend(*pvcpu); + gdb_cpu_suspend(*pvcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(*pvcpu); #endif return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { - if (gdb_port == 0) { - fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); - exit(4); - } gdb_cpu_breakpoint(*pvcpu, vmexit); return (VMEXIT_CONTINUE); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, }; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) { int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); while (1) { error = vm_run(ctx, vcpu, &vmexit[vcpu]); if (error != 0) break; exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); 
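/*
 * (Illustrative sketch, not part of this change: every entry in
 * handler[] has the vmexit_handler_t shape used above, returning
 * VMEXIT_CONTINUE to resume the vcpu or VMEXIT_ABORT to abort().
 * A minimal handler for a hypothetical exit code would be:
 *
 *	static int
 *	vmexit_example(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 *	{
 *		return (VMEXIT_CONTINUE);
 *	}
 *	...
 *	[VM_EXITCODE_EXAMPLE] = vmexit_example,
 *
 * where VM_EXITCODE_EXAMPLE is a made-up name for illustration.)
 */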
exit(4); } rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { int tmp, error; error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ if (error == 0) return (VM_MAXCPU); else return (1); } void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; - if (fbsdrun_vmexit_on_hlt()) { + if (get_config_bool_default("x86.vmexit_on_hlt", false)) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } - if (fbsdrun_vmexit_on_pause()) { + if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } - if (x2apic_mode) + if (get_config_bool_default("x86.x2apic", false)) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; const cap_ioctl_t *cmds; size_t ncmds; #endif reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. 
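 *
 * (Illustrative usage, with made-up paths and sizes; not part of
 * this change. The two boot flows this check distinguishes are:
 *
 *	# VM pre-created and loaded by a userspace loader:
 *	bhyveload -m 1G -d disk.img vm0 && bhyve ... vm0
 *
 *	# VM created here, so a boot ROM is mandatory:
 *	bhyve -l bootrom,/path/to/uefi.rom ... vm0
 * )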
*/ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); vm_get_ioctls(&ncmds); cmds = vm_get_ioctls(NULL); if (cmds == NULL) errx(EX_OSERR, "out of memory"); if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); free((cap_ioctl_t *)cmds); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } void spinup_vcpu(struct vmctx *ctx, int vcpu) { int error; uint64_t rip; error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); assert(error == 0); fbsdrun_set_capabilities(ctx, vcpu); error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); fbsdrun_addcpu(ctx, BSP, vcpu, rip); } +static bool +parse_config_option(const char *option) +{ + const char *value; + char *path; + + value = strchr(option, '='); + if (value == NULL || value[1] == '\0') + return (false); + path = strndup(option, value - option); + if (path == NULL) + err(4, "Failed to allocate memory"); + set_config_value(path, value + 1); + return (true); +} + +static void +parse_simple_config_file(const char *path) +{ + FILE *fp; + char *line, *cp; + size_t linecap; + unsigned int lineno; + + fp = fopen(path, "r"); + if (fp == NULL) + err(4, "Failed to open configuration file %s", path); + line = NULL; + linecap = 0; + lineno = 1; + for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { + if (*line == '#' || *line == '\n') + continue; + cp = strchr(line, '\n'); + if (cp != NULL) + *cp = '\0'; + if (!parse_config_option(line)) + errx(4, "%s line %u: invalid config option '%s'", path, + lineno, line); + } + free(line); + fclose(fp); +} + +static void +set_defaults(void) +{ + + set_config_bool("acpi_tables", false); + set_config_value("memory.size", "256M"); + set_config_bool("x86.strictmsr", true); +} + int main(int argc, char *argv[]) { int c, error, err; - int max_vcpus, mptgen, memflags; - int rtc_localtime; - bool gdb_stop; + int max_vcpus, memflags; struct vmctx *ctx; uint64_t rip; size_t memsize; + const char *value, *vmname; char *optstr; #ifdef BHYVE_SNAPSHOT char *restore_file; struct restore_state rstate; int vcpu; restore_file = NULL; #endif + init_config(); + set_defaults(); progname = basename(argv[0]); - gdb_stop = false; - guest_ncpus = 1; - sockets = cores = threads = 1; - maxcpus = 0; - memsize = 256 * MB; - mptgen = 1; - rtc_localtime = 1; - memflags = 0; #ifdef BHYVE_SNAPSHOT - optstr = "aehuwxACDHIPSWYp:G:c:s:m:l:U:r:"; + optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:r:"; #else - optstr = "aehuwxACDHIPSWYp:G:c:s:m:l:U:"; + optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': - x2apic_mode = 0; + set_config_bool("x86.x2apic", false); break; case 'A': - acpi = 1; + set_config_bool("acpi_tables", true); break; case 'D': - destroy_on_poweroff = 1; + set_config_bool("destroy_on_poweroff", true); break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } 
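/*
 * (Illustrative equivalence, not part of this change:
 * topology_parse() fills the same config keys that calc_topology()
 * later consumes, so
 *
 *	-c 4,sockets=2,cores=2
 *
 * behaves like
 *
 *	-o cpus=4 -o sockets=2 -o cores=2
 *
 * and calc_topology() verifies that sockets * cores * threads
 * matches the explicit vcpu count.)
 */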
break; case 'C': - memflags |= VM_MEM_F_INCORE; + set_config_bool("memory.guest_in_core", true); break; case 'G': if (optarg[0] == 'w') { - gdb_stop = true; + set_config_bool("gdb.wait", true); optarg++; } - gdb_port = atoi(optarg); + set_config_value("gdb.port", optarg); + break; + case 'k': + parse_simple_config_file(optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': - memflags |= VM_MEM_F_WIRED; + set_config_bool("memory.wired", true); break; case 'm': - error = vm_parse_memsize(optarg, &memsize); - if (error) - errx(EX_USAGE, "invalid memsize '%s'", optarg); + set_config_value("memory.size", optarg); + break; + case 'o': + if (!parse_config_option(optarg)) + errx(EX_USAGE, "invalid configuration option '%s'", optarg); break; case 'H': - guest_vmexit_on_hlt = 1; + set_config_bool("x86.vmexit_on_hlt", true); break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': - guest_vmexit_on_pause = 1; + set_config_bool("x86.vmexit_on_pause", true); break; case 'e': - strictio = 1; + set_config_bool("x86.strictio", true); break; case 'u': - rtc_localtime = 0; + set_config_bool("rtc.use_localtime", false); break; case 'U': - guest_uuid_str = optarg; + set_config_value("uuid", optarg); break; case 'w': - strictmsr = 0; + set_config_bool("x86.strictmsr", false); break; case 'W': - virtio_msix = 0; + set_config_bool("virtio_msix", false); break; case 'x': - x2apic_mode = 1; + set_config_bool("x86.x2apic", true); break; case 'Y': - mptgen = 0; + set_config_bool("x86.mptable", false); break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; -#ifdef BHYVE_SNAPSHOT - if (argc > 1 || (argc == 0 && restore_file == NULL)) + if (argc > 1) usage(1); +#ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } - } - - if (argc == 1) { - vmname = argv[0]; - } else { vmname = lookup_vmname(&rstate); - if (vmname == NULL) { - fprintf(stderr, "Cannot find VM name in restore file. 
" - "Please specify one.\n"); - exit(1); - } + if (vmname != NULL) + set_config_value("name", vmname); } -#else - if (argc != 1) +#endif + + if (argc == 1) + set_config_value("name", argv[0]); + + vmname = get_config_value("name"); + if (vmname == NULL) usage(1); - vmname = argv[0]; -#endif + if (get_config_bool_default("config.dump", false)) { + dump_config(); + exit(1); + } + + calc_topolopgy(); + build_vcpumaps(); + + value = get_config_value("memory.size"); + error = vm_parse_memsize(value, &memsize); + if (error) + errx(EX_USAGE, "invalid memsize '%s'", value); + ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(ctx, BSP); + memflags = 0; + if (get_config_bool_default("memory.wired", false)) + memflags |= VM_MEM_F_WIRED; + if (get_config_bool_default("memory.guest_in_core", false)) + memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(); init_inout(); kernemu_dev_init(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); - rtc_init(ctx, rtc_localtime); + rtc_init(ctx); sci_init(ctx); /* * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ - if (acpi) + if (get_config_bool("acpi_tables")) vmgenc_init(ctx); - if (gdb_port != 0) - init_gdb(ctx, gdb_port, gdb_stop); + value = get_config_value("gdb.port"); + if (value != NULL) + init_gdb(ctx, atoi(value), get_config_bool_default("gdb.wait", + false)); if (lpc_bootrom()) { if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(ctx, BSP); assert(error == 0); } #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { fprintf(stdout, "Pausing pci devs...\r\n"); if (vm_pause_user_devs(ctx) != 0) { fprintf(stderr, "Failed to pause PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring vm mem...\r\n"); if (restore_vm_mem(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore VM memory.\n"); exit(1); } fprintf(stdout, "Restoring pci devs...\r\n"); if (vm_restore_user_devs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring kernel structs...\r\n"); if (vm_restore_kern_structs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore kernel structs.\n"); exit(1); } fprintf(stdout, "Resuming pci devs...\r\n"); if (vm_resume_user_devs(ctx) != 0) { fprintf(stderr, "Failed to resume PCI device state.\n"); exit(1); } } #endif error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. 
*/ - if (mptgen) { + if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); assert(error == 0); - if (acpi) { + if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } if (lpc_bootrom()) fwctl_init(); /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) destroy_restore_state(&rstate); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) < 0) printf("Failed to start checkpoint thread!\r\n"); if (restore_file != NULL) vm_restore_time(ctx); #endif /* * Add CPU 0 */ fbsdrun_addcpu(ctx, BSP, BSP, rip); #ifdef BHYVE_SNAPSHOT /* * If we restore a VM, start all vCPUs now (including APs), otherwise, * let the guest OS to spin them up later via vmexits. */ if (restore_file != NULL) { for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { if (vcpu == BSP) continue; fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu); spinup_vcpu(ctx, vcpu); } } #endif /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h index 0177baca14e9..9b0c9fb533e2 100644 --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -1,55 +1,50 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _FBSDRUN_H_ #define _FBSDRUN_H_ #define VMEXIT_CONTINUE (0) #define VMEXIT_ABORT (-1) struct vmctx; extern int guest_ncpus; extern uint16_t cores, sockets, threads; -extern char *guest_uuid_str; -extern const char *vmname; void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr); #endif void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); -int fbsdrun_muxed(void); -int fbsdrun_vmexit_on_hlt(void); -int fbsdrun_vmexit_on_pause(void); -int fbsdrun_disable_x2apic(void); int fbsdrun_virtio_msix(void); + #endif diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c index 4c91038ca765..ae09bc71d622 100644 --- a/usr.sbin/bhyve/block_if.c +++ b/usr.sbin/bhyve/block_if.c @@ -1,988 +1,1012 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Peter Grehan * All rights reserved. * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "mevent.h" +#include "pci_emul.h" #include "block_if.h" #define BLOCKIF_SIG 0xb109b109 #define BLOCKIF_NUMTHR 8 #define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) enum blockop { BOP_READ, BOP_WRITE, BOP_FLUSH, BOP_DELETE }; enum blockstat { BST_FREE, BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE }; struct blockif_elem { TAILQ_ENTRY(blockif_elem) be_link; struct blockif_req *be_req; enum blockop be_op; enum blockstat be_status; pthread_t be_tid; off_t be_block; }; struct blockif_ctxt { int bc_magic; int bc_fd; int bc_ischr; int bc_isgeom; int bc_candelete; int bc_rdonly; off_t bc_size; int bc_sectsz; int bc_psectsz; int bc_psectoff; int bc_closing; int bc_paused; int bc_work_count; pthread_t bc_btid[BLOCKIF_NUMTHR]; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; pthread_cond_t bc_paused_cond; pthread_cond_t bc_work_done_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_pendq; TAILQ_HEAD(, blockif_elem) bc_busyq; struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; }; static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; struct blockif_sig_elem { pthread_mutex_t bse_mtx; pthread_cond_t bse_cond; int bse_pending; struct blockif_sig_elem *bse_next; }; static struct blockif_sig_elem *blockif_bse_head; static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { struct blockif_elem *be, *tbe; off_t off; int i; be = TAILQ_FIRST(&bc->bc_freeq); assert(be != NULL); assert(be->be_status == BST_FREE); TAILQ_REMOVE(&bc->bc_freeq, be, be_link); be->be_req = breq; be->be_op = op; switch (op) { case BOP_READ: case BOP_WRITE: case BOP_DELETE: off = breq->br_offset; for (i = 0; i < breq->br_iovcnt; i++) off += breq->br_iov[i].iov_len; break; default: off = OFF_MAX; } be->be_block = off; TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_block == breq->br_offset) break; } if (tbe == NULL) { TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { if (tbe->be_block == breq->br_offset) break; } } if (tbe == NULL) be->be_status = BST_PEND; else be->be_status = BST_BLOCK; TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); return (be->be_status == BST_PEND); } static int blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_status == BST_PEND) break; assert(be->be_status == BST_BLOCK); } if (be == NULL) return (0); TAILQ_REMOVE(&bc->bc_pendq, be, be_link); be->be_status = BST_BUSY; be->be_tid = t; TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); *bep = be; return (1); } static void blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) { struct blockif_elem *tbe; if (be->be_status == BST_DONE || be->be_status == BST_BUSY) TAILQ_REMOVE(&bc->bc_busyq, be, be_link); else TAILQ_REMOVE(&bc->bc_pendq, be, be_link); TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_req->br_offset == be->be_block) tbe->be_status = BST_PEND; } be->be_tid = 0; be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); } static int blockif_flush_bc(struct blockif_ctxt *bc) { if (bc->bc_ischr) { if (ioctl(bc->bc_fd, 
DIOCGFLUSH)) return (errno); } else if (fsync(bc->bc_fd)) return (errno); return (0); } static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct blockif_req *br; off_t arg[2]; ssize_t clen, len, off, boff, voff; int i, err; br = be->be_req; if (br->br_iovcnt <= 1) buf = NULL; err = 0; switch (be->be_op) { case BOP_READ: if (buf == NULL) { if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= len; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); if (pread(bc->bc_fd, buf, len, br->br_offset + off) < 0) { err = errno; break; } boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(br->br_iov[i].iov_base + voff, buf + boff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); off += len; br->br_resid -= len; } break; case BOP_WRITE: if (bc->bc_rdonly) { err = EROFS; break; } if (buf == NULL) { if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= len; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(buf + boff, br->br_iov[i].iov_base + voff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); if (pwrite(bc->bc_fd, buf, len, br->br_offset + off) < 0) { err = errno; break; } off += len; br->br_resid -= len; } break; case BOP_FLUSH: err = blockif_flush_bc(bc); break; case BOP_DELETE: if (!bc->bc_candelete) err = EOPNOTSUPP; else if (bc->bc_rdonly) err = EROFS; else if (bc->bc_ischr) { arg[0] = br->br_offset; arg[1] = br->br_resid; if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) err = errno; else br->br_resid = 0; } else err = EOPNOTSUPP; break; default: err = EINVAL; break; } be->be_status = BST_DONE; (*br->br_callback)(br, err); } static void * blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; pthread_t t; uint8_t *buf; bc = arg; if (bc->bc_isgeom) buf = malloc(MAXPHYS); else buf = NULL; t = pthread_self(); pthread_mutex_lock(&bc->bc_mtx); for (;;) { bc->bc_work_count++; /* We cannot process work if the interface is paused */ while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } bc->bc_work_count--; /* If none of the workers are busy, notify the main thread */ if (bc->bc_work_count == 0) pthread_cond_broadcast(&bc->bc_work_done_cond); /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) break; /* Make all worker threads wait here if the device is paused */ while (bc->bc_paused) pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx); pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } pthread_mutex_unlock(&bc->bc_mtx); if (buf) free(buf); pthread_exit(NULL); return (NULL); } static void blockif_sigcont_handler(int signal, enum ev_type type, void *arg) { struct blockif_sig_elem *bse; for (;;) { /* * Process the entire list even if not intended for * this thread. 
*/ do { bse = blockif_bse_head; if (bse == NULL) return; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)bse, (uintptr_t)bse->bse_next)); pthread_mutex_lock(&bse->bse_mtx); bse->bse_pending = 0; pthread_cond_signal(&bse->bse_cond); pthread_mutex_unlock(&bse->bse_mtx); } } static void blockif_init(void) { mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); (void) signal(SIGCONT, SIG_IGN); } +int +blockif_legacy_config(nvlist_t *nvl, const char *opts) +{ + char *cp, *path; + + if (opts == NULL) + return (0); + + cp = strchr(opts, ','); + if (cp == NULL) { + set_config_value_node(nvl, "path", opts); + return (0); + } + path = strndup(opts, cp - opts); + set_config_value_node(nvl, "path", path); + free(path); + return (pci_parse_legacy_config(nvl, cp + 1)); +} + struct blockif_ctxt * -blockif_open(const char *optstr, const char *ident) +blockif_open(nvlist_t *nvl, const char *ident) { char tname[MAXCOMLEN + 1]; char name[MAXPATHLEN]; - char *nopt, *xopts, *cp; + const char *path, *pssval, *ssval; + char *cp; struct blockif_ctxt *bc; struct stat sbuf; struct diocgattr_arg arg; off_t size, psectsz, psectoff; int extra, fd, i, sectsz; - int nocache, sync, ro, candelete, geom, ssopt, pssopt; + int ro, candelete, geom, ssopt, pssopt; int nodelete; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; #endif pthread_once(&blockif_once, blockif_init); fd = -1; + extra = 0; ssopt = 0; - nocache = 0; - sync = 0; ro = 0; nodelete = 0; - /* - * The first element in the optstring is always a pathname. - * Optional elements follow - */ - nopt = xopts = strdup(optstr); - while (xopts != NULL) { - cp = strsep(&xopts, ","); - if (cp == nopt) /* file or device pathname */ - continue; - else if (!strcmp(cp, "nocache")) - nocache = 1; - else if (!strcmp(cp, "nodelete")) - nodelete = 1; - else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) - sync = 1; - else if (!strcmp(cp, "ro")) - ro = 1; - else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) - ; - else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) + if (get_config_bool_node_default(nvl, "nocache", false)) + extra |= O_DIRECT; + if (get_config_bool_node_default(nvl, "nodelete", false)) + nodelete = 1; + if (get_config_bool_node_default(nvl, "sync", false) || + get_config_bool_node_default(nvl, "direct", false)) + extra |= O_SYNC; + if (get_config_bool_node_default(nvl, "ro", false)) + ro = 1; + ssval = get_config_value_node(nvl, "sectorsize"); + if (ssval != NULL) { + ssopt = strtol(ssval, &cp, 10); + if (cp == ssval) { + EPRINTLN("Invalid sector size \"%s\"", ssval); + goto err; + } + if (*cp == '\0') { pssopt = ssopt; - else { - EPRINTLN("Invalid device option \"%s\"", cp); + } else if (*cp == '/') { + pssval = cp + 1; + pssopt = strtol(pssval, &cp, 10); + if (cp == pssval || *cp != '\0') { + EPRINTLN("Invalid sector size \"%s\"", ssval); + goto err; + } + } else { + EPRINTLN("Invalid sector size \"%s\"", ssval); goto err; } } - extra = 0; - if (nocache) - extra |= O_DIRECT; - if (sync) - extra |= O_SYNC; + path = get_config_value_node(nvl, "path"); + if (path == NULL) { + EPRINTLN("Missing \"path\" for block device."); + goto err; + } - fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); + fd = open(path, (ro ? 
O_RDONLY : O_RDWR) | extra); if (fd < 0 && !ro) { /* Attempt a r/w fail with a r/o open */ - fd = open(nopt, O_RDONLY | extra); + fd = open(path, O_RDONLY | extra); ro = 1; } if (fd < 0) { - warn("Could not open backing file: %s", nopt); + warn("Could not open backing file: %s", path); goto err; } if (fstat(fd, &sbuf) < 0) { - warn("Could not stat backing file %s", nopt); + warn("Could not stat backing file %s", path); goto err; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, CAP_WRITE); if (ro) cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; psectsz = psectoff = 0; candelete = geom = 0; if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, §sz)) { perror("Could not fetch dev blk/sector size"); goto err; } assert(size != 0); assert(sectsz != 0); if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) candelete = arg.value.i; if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) geom = 1; } else psectsz = sbuf.st_blksize; #ifndef WITHOUT_CAPSICUM if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (ssopt != 0) { if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || ssopt > pssopt) { EPRINTLN("Invalid sector size %d/%d", ssopt, pssopt); goto err; } /* * Some backend drivers (e.g. cd0, ada0) require that the I/O * size be a multiple of the device's sector size. * * Validate that the emulated sector size complies with this * requirement. 
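 *
 * (Illustrative values: with the nvlist options parsed above this is
 * spelled "sectorsize=512" for matching logical/physical sizes, or
 * "sectorsize=512/4096" to emulate 512-byte logical sectors on a
 * 4K-native backing device.)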
*/ if (S_ISCHR(sbuf.st_mode)) { if (ssopt < sectsz || (ssopt % sectsz) != 0) { EPRINTLN("Sector size %d incompatible " "with underlying device sector size %d", ssopt, sectsz); goto err; } } sectsz = ssopt; psectsz = pssopt; psectoff = 0; } bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { perror("calloc"); goto err; } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; bc->bc_ischr = S_ISCHR(sbuf.st_mode); bc->bc_isgeom = geom; bc->bc_candelete = candelete; bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; bc->bc_psectsz = psectsz; bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); bc->bc_paused = 0; bc->bc_work_count = 0; pthread_cond_init(&bc->bc_paused_cond, NULL); pthread_cond_init(&bc->bc_work_done_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } for (i = 0; i < BLOCKIF_NUMTHR; i++) { pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); pthread_set_name_np(bc->bc_btid[i], tname); } return (bc); err: if (fd >= 0) close(fd); - free(nopt); return (NULL); } static int blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { int err; err = 0; pthread_mutex_lock(&bc->bc_mtx); if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ if (blockif_enqueue(bc, breq, op)) pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than * the specified blockif queue limit. Return an * error to indicate that the queue length has been * exceeded. */ err = E2BIG; } pthread_mutex_unlock(&bc->bc_mtx); return (err); } int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_READ)); } int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_WRITE)); } int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_FLUSH)); } int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_DELETE)); } int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { struct blockif_elem *be; assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); /* XXX: not waiting while paused */ /* * Check pending requests. */ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_req == breq) break; } if (be != NULL) { /* * Found it. */ blockif_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); } /* * Check in-flight requests. */ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { if (be->be_req == breq) break; } if (be == NULL) { /* * Didn't find it. */ pthread_mutex_unlock(&bc->bc_mtx); return (EINVAL); } /* * Interrupt the processing thread to force it return * prematurely via it's normal callback path. 
*/ while (be->be_status == BST_BUSY) { struct blockif_sig_elem bse, *old_head; pthread_mutex_init(&bse.bse_mtx, NULL); pthread_cond_init(&bse.bse_cond, NULL); bse.bse_pending = 1; do { old_head = blockif_bse_head; bse.bse_next = old_head; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)old_head, (uintptr_t)&bse)); pthread_kill(be->be_tid, SIGCONT); pthread_mutex_lock(&bse.bse_mtx); while (bse.bse_pending) pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); pthread_mutex_unlock(&bse.bse_mtx); } pthread_mutex_unlock(&bc->bc_mtx); /* * The processing thread has been interrupted. Since it's not * clear if the callback has been invoked yet, return EBUSY. */ return (EBUSY); } int blockif_close(struct blockif_ctxt *bc) { void *jval; int i; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ pthread_mutex_lock(&bc->bc_mtx); bc->bc_closing = 1; pthread_mutex_unlock(&bc->bc_mtx); pthread_cond_broadcast(&bc->bc_cond); for (i = 0; i < BLOCKIF_NUMTHR; i++) pthread_join(bc->bc_btid[i], &jval); /* XXX Cancel queued i/o's ??? */ /* * Release resources */ bc->bc_magic = 0; close(bc->bc_fd); free(bc); return (0); } /* * Return virtual C/H/S values for a given block. Use the algorithm * outlined in the VHD specification to calculate values. */ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { off_t sectors; /* total sectors of the block dev */ off_t hcyl; /* cylinders times heads */ uint16_t secpt; /* sectors per track */ uint8_t heads; assert(bc->bc_magic == BLOCKIF_SIG); sectors = bc->bc_size / bc->bc_sectsz; /* Clamp the size to the largest possible with CHS */ if (sectors > 65535UL*16*255) sectors = 65535UL*16*255; if (sectors >= 65536UL*16*63) { secpt = 255; heads = 16; hcyl = sectors / secpt; } else { secpt = 17; hcyl = sectors / secpt; heads = (hcyl + 1023) / 1024; if (heads < 4) heads = 4; if (hcyl >= (heads * 1024) || heads > 16) { secpt = 31; heads = 16; hcyl = sectors / secpt; } if (hcyl >= (heads * 1024)) { secpt = 63; heads = 16; hcyl = sectors / secpt; } } *c = hcyl / heads; *h = heads; *s = secpt; } /* * Accessors */ off_t blockif_size(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_size); } int blockif_sectsz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_sectsz); } void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) { assert(bc->bc_magic == BLOCKIF_SIG); *size = bc->bc_psectsz; *off = bc->bc_psectoff; } int blockif_queuesz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (BLOCKIF_MAXREQ - 1); } int blockif_is_ro(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } int blockif_candelete(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_candelete); } #ifdef BHYVE_SNAPSHOT void blockif_pause(struct blockif_ctxt *bc) { assert(bc != NULL); assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); bc->bc_paused = 1; /* The interface is paused. 
Wait for workers to finish their work */ while (bc->bc_work_count) pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx); pthread_mutex_unlock(&bc->bc_mtx); if (blockif_flush_bc(bc)) fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n", __func__); } void blockif_resume(struct blockif_ctxt *bc) { assert(bc != NULL); assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); bc->bc_paused = 0; /* resume the threads waiting for paused */ pthread_cond_broadcast(&bc->bc_paused_cond); /* kick the threads after restore */ pthread_cond_broadcast(&bc->bc_cond); pthread_mutex_unlock(&bc->bc_mtx); } int blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta) { int i; struct iovec *iov; int ret; SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done); /* * XXX: The callback and parameter must be filled by the virtualized * device that uses the interface, during its init; we're not touching * them here. */ /* Snapshot the iovecs. */ for (i = 0; i < br->br_iovcnt; i++) { iov = &br->br_iov[i]; SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done); /* We assume the iov is a guest-mapped address. */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len, false, meta, ret, done); } done: return (ret); } int blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta) { int ret; if (bc->bc_paused == 0) { fprintf(stderr, "%s: Snapshot failed: " "interface not paused.\r\n", __func__); return (ENXIO); } pthread_mutex_lock(&bc->bc_mtx); SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done); done: pthread_mutex_unlock(&bc->bc_mtx); return (ret); } #endif diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h index f3b5b6938ef1..87ae4d169969 100644 --- a/usr.sbin/bhyve/block_if.h +++ b/usr.sbin/bhyve/block_if.h @@ -1,89 +1,91 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * The block API to be used by bhyve block-device emulations. The routines * are thread safe, with no assumptions about the context of the completion * callback - it may occur in the caller's context, or asynchronously in * another thread. */ #ifndef _BLOCK_IF_H_ #define _BLOCK_IF_H_ +#include #include #include struct vm_snapshot_meta; /* * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in * a single request. BLOCKIF_RING_MAX is the maxmimum number of * pending requests that can be queued. */ #define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */ #define BLOCKIF_RING_MAX 128 struct blockif_req { int br_iovcnt; off_t br_offset; ssize_t br_resid; void (*br_callback)(struct blockif_req *req, int err); void *br_param; struct iovec br_iov[BLOCKIF_IOV_MAX]; }; struct blockif_ctxt; -struct blockif_ctxt *blockif_open(const char *optstr, const char *ident); +int blockif_legacy_config(nvlist_t *nvl, const char *opts); +struct blockif_ctxt *blockif_open(nvlist_t *nvl, const char *ident); off_t blockif_size(struct blockif_ctxt *bc); void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s); int blockif_sectsz(struct blockif_ctxt *bc); void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); int blockif_queuesz(struct blockif_ctxt *bc); int blockif_is_ro(struct blockif_ctxt *bc); int blockif_candelete(struct blockif_ctxt *bc); int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_close(struct blockif_ctxt *bc); #ifdef BHYVE_SNAPSHOT void blockif_pause(struct blockif_ctxt *bc); void blockif_resume(struct blockif_ctxt *bc); int blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta); int blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta); #endif #endif /* _BLOCK_IF_H_ */ diff --git a/usr.sbin/bhyve/config.c b/usr.sbin/bhyve/config.c new file mode 100644 index 000000000000..1df750e9576c --- /dev/null +++ b/usr.sbin/bhyve/config.c @@ -0,0 +1,431 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 John H. Baldwin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+
+#include "config.h"
+
+static nvlist_t *config_root;
+
+void
+init_config(void)
+{
+
+	config_root = nvlist_create(0);
+	if (config_root == NULL)
+		err(4, "Failed to create configuration root nvlist");
+}
+
+static nvlist_t *
+_lookup_config_node(nvlist_t *parent, const char *path, bool create)
+{
+	char *copy, *name, *tofree;
+	nvlist_t *nvl, *new_nvl;
+
+	copy = strdup(path);
+	if (copy == NULL)
+		errx(4, "Failed to allocate memory");
+	tofree = copy;
+	nvl = parent;
+	while ((name = strsep(&copy, ".")) != NULL) {
+		if (*name == '\0') {
+			warnx("Invalid configuration node: %s", path);
+			nvl = NULL;
+			break;
+		}
+		if (nvlist_exists_nvlist(nvl, name))
+			nvl = (nvlist_t *)nvlist_get_nvlist(nvl, name);
+		else if (nvlist_exists(nvl, name)) {
+			for (copy = tofree; copy < name; copy++)
+				if (*copy == '\0')
+					*copy = '.';
+			warnx(
+		    "Configuration node %s is a child of existing variable %s",
+			    path, tofree);
+			nvl = NULL;
+			break;
+		} else if (create) {
+			new_nvl = nvlist_create(0);
+			if (new_nvl == NULL)
+				errx(4, "Failed to allocate memory");
+			nvlist_move_nvlist(nvl, name, new_nvl);
+			nvl = new_nvl;
+		} else {
+			nvl = NULL;
+			break;
+		}
+	}
+	free(tofree);
+	return (nvl);
+}
+
+nvlist_t *
+create_config_node(const char *path)
+{
+
+	return (_lookup_config_node(config_root, path, true));
+}
+
+nvlist_t *
+find_config_node(const char *path)
+{
+
+	return (_lookup_config_node(config_root, path, false));
+}
+
+nvlist_t *
+create_relative_config_node(nvlist_t *parent, const char *path)
+{
+
+	return (_lookup_config_node(parent, path, true));
+}
+
+nvlist_t *
+find_relative_config_node(nvlist_t *parent, const char *path)
+{
+
+	return (_lookup_config_node(parent, path, false));
+}
+
+void
+set_config_value_node(nvlist_t *parent, const char *name, const char *value)
+{
+
+	if (strchr(name, '.') != NULL)
+		errx(4, "Invalid config node name %s", name);
+	if (parent == NULL)
+		parent = config_root;
+	if (nvlist_exists_string(parent, name))
+		nvlist_free_string(parent, name);
+	else if (nvlist_exists(parent, name))
+		errx(4,
+		    "Attempting to add value %s to existing node %s of list %p",
+		    value, name, parent);
+	nvlist_add_string(parent, name, value);
+}
+
+void
+set_config_value(const char *path, const char *value)
+{
+	const char *name;
+	char *node_name;
+	nvlist_t *nvl;
+
+	/* Look for last separator.
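 * (Illustrative example of the split performed here: a call such as
 *
 *	set_config_value("gdb.port", "1234");
 *
 * yields node path "gdb" and leaf name "port", creating the
 * intermediate "gdb" nvlist on demand via create_config_node().)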
*/ + name = strrchr(path, '.'); + if (name == NULL) { + nvl = config_root; + name = path; + } else { + node_name = strndup(path, name - path); + if (node_name == NULL) + errx(4, "Failed to allocate memory"); + nvl = create_config_node(node_name); + if (nvl == NULL) + errx(4, "Failed to create configuration node %s", + node_name); + free(node_name); + + /* Skip over '.'. */ + name++; + } + + if (nvlist_exists_nvlist(nvl, name)) + errx(4, "Attempting to add value %s to existing node %s", + value, path); + set_config_value_node(nvl, name, value); +} + +static const char * +get_raw_config_value(const char *path) +{ + const char *name; + char *node_name; + nvlist_t *nvl; + + /* Look for last separator. */ + name = strrchr(path, '.'); + if (name == NULL) { + nvl = config_root; + name = path; + } else { + node_name = strndup(path, name - path); + if (node_name == NULL) + errx(4, "Failed to allocate memory"); + nvl = find_config_node(node_name); + free(node_name); + if (nvl == NULL) + return (NULL); + + /* Skip over '.'. */ + name++; + } + + if (nvlist_exists_string(nvl, name)) + return (nvlist_get_string(nvl, name)); + if (nvlist_exists_nvlist(nvl, name)) + warnx("Attempting to fetch value of node %s", path); + return (NULL); +} + +static char * +_expand_config_value(const char *value, int depth) +{ + FILE *valfp; + const char *cp, *vp; + char *nestedval, *path, *valbuf; + size_t valsize; + + valfp = open_memstream(&valbuf, &valsize); + if (valfp == NULL) + errx(4, "Failed to allocate memory"); + + vp = value; + while (*vp != '\0') { + switch (*vp) { + case '%': + if (depth > 15) { + warnx( + "Too many recursive references in configuration value"); + fputc('%', valfp); + vp++; + break; + } + if (vp[1] != '(' || vp[2] == '\0') + cp = NULL; + else + cp = strchr(vp + 2, ')'); + if (cp == NULL) { + warnx( + "Invalid reference in configuration value \"%s\"", + value); + fputc('%', valfp); + vp++; + break; + } + vp += 2; + + if (cp == vp) { + warnx( + "Empty reference in configuration value \"%s\"", + value); + vp++; + break; + } + + /* Allocate a C string holding the path. */ + path = strndup(vp, cp - vp); + if (path == NULL) + errx(4, "Failed to allocate memory"); + + /* Advance 'vp' past the reference. */ + vp = cp + 1; + + /* Fetch the referenced value. 
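 * (Illustrative example of the %() expansion implemented here: with
 *
 *	bhyve -o memsz=2G -o 'memory.size=%(memsz)' ...
 *
 * a later get_config_value("memory.size") returns "2G". "memsz" is a
 * made-up variable name; the expansion syntax itself is documented in
 * config.h.)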
+			 */
+			cp = get_raw_config_value(path);
+			if (cp == NULL)
+				warnx(
+				    "Failed to fetch referenced configuration variable %s",
+				    path);
+			else {
+				nestedval = _expand_config_value(cp, depth + 1);
+				fputs(nestedval, valfp);
+				free(nestedval);
+			}
+			free(path);
+			break;
+		case '\\':
+			vp++;
+			if (*vp == '\0') {
+				warnx(
+				    "Trailing \\ in configuration value \"%s\"",
+				    value);
+				break;
+			}
+			/* FALLTHROUGH */
+		default:
+			fputc(*vp, valfp);
+			vp++;
+			break;
+		}
+	}
+	fclose(valfp);
+	return (valbuf);
+}
+
+const char *
+expand_config_value(const char *value)
+{
+	static char *valbuf;
+
+	if (strchr(value, '%') == NULL)
+		return (value);
+
+	free(valbuf);
+	valbuf = _expand_config_value(value, 0);
+	return (valbuf);
+}
+
+const char *
+get_config_value(const char *path)
+{
+	const char *value;
+
+	value = get_raw_config_value(path);
+	if (value == NULL)
+		return (NULL);
+	return (expand_config_value(value));
+}
+
+const char *
+get_config_value_node(const nvlist_t *parent, const char *name)
+{
+
+	if (strchr(name, '.') != NULL)
+		errx(4, "Invalid config node name %s", name);
+	if (parent == NULL)
+		parent = config_root;
+	if (nvlist_exists_nvlist(parent, name))
+		warnx("Attempt to fetch value of node %s of list %p", name,
+		    parent);
+	if (!nvlist_exists_string(parent, name))
+		return (NULL);
+
+	return (expand_config_value(nvlist_get_string(parent, name)));
+}
+
+bool
+_bool_value(const char *name, const char *value)
+{
+
+	if (strcasecmp(value, "true") == 0 ||
+	    strcasecmp(value, "on") == 0 ||
+	    strcasecmp(value, "yes") == 0 ||
+	    strcmp(value, "1") == 0)
+		return (true);
+	if (strcasecmp(value, "false") == 0 ||
+	    strcasecmp(value, "off") == 0 ||
+	    strcasecmp(value, "no") == 0 ||
+	    strcmp(value, "0") == 0)
+		return (false);
+	errx(4, "Invalid value %s for boolean variable %s", value, name);
+}
+
+bool
+get_config_bool(const char *path)
+{
+	const char *value;
+
+	value = get_config_value(path);
+	if (value == NULL)
+		errx(4, "Failed to fetch boolean variable %s", path);
+	return (_bool_value(path, value));
+}
+
+bool
+get_config_bool_default(const char *path, bool def)
+{
+	const char *value;
+
+	value = get_config_value(path);
+	if (value == NULL)
+		return (def);
+	return (_bool_value(path, value));
+}
+
+bool
+get_config_bool_node(const nvlist_t *parent, const char *name)
+{
+	const char *value;
+
+	value = get_config_value_node(parent, name);
+	if (value == NULL)
+		errx(4, "Failed to fetch boolean variable %s", name);
+	return (_bool_value(name, value));
+}
+
+bool
+get_config_bool_node_default(const nvlist_t *parent, const char *name,
+    bool def)
+{
+	const char *value;
+
+	value = get_config_value_node(parent, name);
+	if (value == NULL)
+		return (def);
+	return (_bool_value(name, value));
+}
+
+void
+set_config_bool(const char *path, bool value)
+{
+
+	set_config_value(path, value ? "true" : "false");
+}
+
+void
+set_config_bool_node(nvlist_t *parent, const char *name, bool value)
+{
+
+	set_config_value_node(parent, name, value ?
"true" : "false"); +} + +static void +dump_tree(const char *prefix, const nvlist_t *nvl) +{ + const char *name; + void *cookie; + int type; + + cookie = NULL; + while ((name = nvlist_next(nvl, &type, &cookie)) != NULL) { + if (type == NV_TYPE_NVLIST) { + char *new_prefix; + + asprintf(&new_prefix, "%s%s.", prefix, name); + dump_tree(new_prefix, nvlist_get_nvlist(nvl, name)); + free(new_prefix); + } else { + assert(type == NV_TYPE_STRING); + printf("%s%s=%s\n", prefix, name, + nvlist_get_string(nvl, name)); + } + } +} + +void +dump_config(void) +{ + dump_tree("", config_root); +} diff --git a/usr.sbin/bhyve/config.h b/usr.sbin/bhyve/config.h new file mode 100644 index 000000000000..9f82afb267b2 --- /dev/null +++ b/usr.sbin/bhyve/config.h @@ -0,0 +1,119 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 John H. Baldwin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __CONFIG_H__ +#define __CONFIG_H__ + +#include + +/*- + * Manages a configuration database backed by an nv(9) list. + * + * The database only stores string values. Callers should parse + * values into other types if needed. String values can reference + * other configuration variables using a '%(name)' syntax. In this + * case, the name must be the the full path of the configuration + * variable. The % character can be escaped with a preceding \ to + * avoid expansion. Any \ characters must be escaped. + * + * Configuration variables are stored in a tree. The full path of a + * variable is specified as a dot-separated name similar to sysctl(8) + * OIDs. + */ + +/* + * Fetches the value of a configuration variable. If the "raw" value + * contains references to other configuration variables, this function + * expands those references and returns a pointer to the parsed + * string. The string's storage is only stable until the next call to + * this function. + * + * If no node is found, returns NULL. + * + * If 'parent' is NULL, 'name' is assumed to be a top-level variable. + */ +const char *get_config_value_node(const nvlist_t *parent, const char *name); + +/* + * Similar to get_config_value_node but expects a full path to the + * leaf node. + */ +const char *get_config_value(const char *path); + +/* Initializes the tree to an empty state. 
+ */
+void init_config(void);
+
+/*
+ * Creates a configuration node via a dot-separated OID path.  Will
+ * fail if the path names an existing leaf configuration variable.
+ * If the node already exists, this returns a pointer to the existing
+ * node.
+ */
+nvlist_t *create_config_node(const char *path);
+
+/*
+ * Looks for an existing configuration node via a dot-separated OID
+ * path.  Will fail if the path names an existing leaf configuration
+ * variable.
+ */
+nvlist_t *find_config_node(const char *path);
+
+/*
+ * Similar to the above, but treats the path relative to an existing
+ * 'parent' node rather than as an absolute path.
+ */
+nvlist_t *create_relative_config_node(nvlist_t *parent, const char *path);
+nvlist_t *find_relative_config_node(nvlist_t *parent, const char *path);
+
+/*
+ * Adds or replaces the value of the specified variable.
+ *
+ * If 'parent' is NULL, 'name' is assumed to be a top-level variable.
+ */
+void set_config_value_node(nvlist_t *parent, const char *name,
+    const char *value);
+
+/*
+ * Similar to set_config_value_node but expects a full path to the
+ * leaf node.
+ */
+void set_config_value(const char *path, const char *value);
+
+/* Convenience wrappers for boolean variables. */
+bool get_config_bool(const char *path);
+bool get_config_bool_node(const nvlist_t *parent, const char *name);
+bool get_config_bool_default(const char *path, bool def);
+bool get_config_bool_node_default(const nvlist_t *parent, const char *name,
+    bool def);
+void set_config_bool(const char *path, bool value);
+void set_config_bool_node(nvlist_t *parent, const char *name, bool value);
+
+void dump_config(void);
+
+#endif /* !__CONFIG_H__ */
diff --git a/usr.sbin/bhyve/gdb.c b/usr.sbin/bhyve/gdb.c
index b0f0c034b2c7..c4c37ab80bc7 100644
--- a/usr.sbin/bhyve/gdb.c
+++ b/usr.sbin/bhyve/gdb.c
@@ -1,1862 +1,1874 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017-2018 John H. Baldwin
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "gdb.h" #include "mem.h" #include "mevent.h" /* * GDB_SIGNAL_* numbers are part of the GDB remote protocol. Most stops * use SIGTRAP. */ #define GDB_SIGNAL_TRAP 5 static void gdb_resume_vcpus(void); static void check_command(int fd); static struct mevent *read_event, *write_event; static cpuset_t vcpus_active, vcpus_suspended, vcpus_waiting; static pthread_mutex_t gdb_lock; static pthread_cond_t idle_vcpus; static bool first_stop, report_next_stop, swbreak_enabled; /* * An I/O buffer contains 'capacity' bytes of room at 'data'. For a * read buffer, 'start' is unused and 'len' contains the number of * valid bytes in the buffer. For a write buffer, 'start' is set to * the index of the next byte in 'data' to send, and 'len' contains * the remaining number of valid bytes to send. */ struct io_buffer { uint8_t *data; size_t capacity; size_t start; size_t len; }; struct breakpoint { uint64_t gpa; uint8_t shadow_inst; TAILQ_ENTRY(breakpoint) link; }; /* * When a vCPU stops to due to an event that should be reported to the * debugger, information about the event is stored in this structure. * The vCPU thread then sets 'stopped_vcpu' if it is not already set * and stops other vCPUs so the event can be reported. The * report_stop() function reports the event for the 'stopped_vcpu' * vCPU. When the debugger resumes execution via continue or step, * the event for 'stopped_vcpu' is cleared. vCPUs will loop in their * event handlers until the associated event is reported or disabled. * * An idle vCPU will have all of the boolean fields set to false. * * When a vCPU is stepped, 'stepping' is set to true when the vCPU is * released to execute the stepped instruction. When the vCPU reports * the stepping trap, 'stepped' is set. * * When a vCPU hits a breakpoint set by the debug server, * 'hit_swbreak' is set to true. */ struct vcpu_state { bool stepping; bool stepped; bool hit_swbreak; }; static struct io_buffer cur_comm, cur_resp; static uint8_t cur_csum; static struct vmctx *ctx; static int cur_fd = -1; static TAILQ_HEAD(, breakpoint) breakpoints; static struct vcpu_state *vcpu_state; static int cur_vcpu, stopped_vcpu; +static bool gdb_active = false; const int gdb_regset[] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_RSP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_ES, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; const int gdb_regsize[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4 }; #ifdef GDB_LOG #include #include static void __printflike(1, 2) debug(const char *fmt, ...) 
{ static FILE *logfile; va_list ap; if (logfile == NULL) { logfile = fopen("/tmp/bhyve_gdb.log", "w"); if (logfile == NULL) return; #ifndef WITHOUT_CAPSICUM if (caph_limit_stream(fileno(logfile), CAPH_WRITE) == -1) { fclose(logfile); logfile = NULL; return; } #endif setlinebuf(logfile); } va_start(ap, fmt); vfprintf(logfile, fmt, ap); va_end(ap); } #else #define debug(...) #endif static void remove_all_sw_breakpoints(void); static int guest_paging_info(int vcpu, struct vm_guest_paging *paging) { uint64_t regs[4]; const int regset[4] = { VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_EFER }; if (vm_get_register_set(ctx, vcpu, nitems(regset), regset, regs) == -1) return (-1); /* * For the debugger, always pretend to be the kernel (CPL 0), * and if long-mode is enabled, always parse addresses as if * in 64-bit mode. */ paging->cr3 = regs[1]; paging->cpl = 0; if (regs[3] & EFER_LMA) paging->cpu_mode = CPU_MODE_64BIT; else if (regs[0] & CR0_PE) paging->cpu_mode = CPU_MODE_PROTECTED; else paging->cpu_mode = CPU_MODE_REAL; if (!(regs[0] & CR0_PG)) paging->paging_mode = PAGING_MODE_FLAT; else if (!(regs[2] & CR4_PAE)) paging->paging_mode = PAGING_MODE_32; else if (regs[3] & EFER_LME) paging->paging_mode = (regs[2] & CR4_LA57) ? PAGING_MODE_64_LA57 : PAGING_MODE_64; else paging->paging_mode = PAGING_MODE_PAE; return (0); } /* * Map a guest virtual address to a physical address (for a given vcpu). * If a guest virtual address is valid, return 1. If the address is * not valid, return 0. If an error occurs obtaining the mapping, * return -1. */ static int guest_vaddr2paddr(int vcpu, uint64_t vaddr, uint64_t *paddr) { struct vm_guest_paging paging; int fault; if (guest_paging_info(vcpu, &paging) == -1) return (-1); /* * Always use PROT_READ. We really care if the VA is * accessible, not if the current vCPU can write. */ if (vm_gla2gpa_nofault(ctx, vcpu, &paging, vaddr, PROT_READ, paddr, &fault) == -1) return (-1); if (fault) return (0); return (1); } static void io_buffer_reset(struct io_buffer *io) { io->start = 0; io->len = 0; } /* Available room for adding data. */ static size_t io_buffer_avail(struct io_buffer *io) { return (io->capacity - (io->start + io->len)); } static uint8_t * io_buffer_head(struct io_buffer *io) { return (io->data + io->start); } static uint8_t * io_buffer_tail(struct io_buffer *io) { return (io->data + io->start + io->len); } static void io_buffer_advance(struct io_buffer *io, size_t amount) { assert(amount <= io->len); io->start += amount; io->len -= amount; } static void io_buffer_consume(struct io_buffer *io, size_t amount) { io_buffer_advance(io, amount); if (io->len == 0) { io->start = 0; return; } /* * XXX: Consider making this move optional and compacting on a * future read() before realloc(). */ memmove(io->data, io_buffer_head(io), io->len); io->start = 0; } static void io_buffer_grow(struct io_buffer *io, size_t newsize) { uint8_t *new_data; size_t avail, new_cap; avail = io_buffer_avail(io); if (newsize <= avail) return; new_cap = io->capacity + (newsize - avail); new_data = realloc(io->data, new_cap); if (new_data == NULL) err(1, "Failed to grow GDB I/O buffer"); io->data = new_data; io->capacity = new_cap; } static bool response_pending(void) { if (cur_resp.start == 0 && cur_resp.len == 0) return (false); if (cur_resp.start + cur_resp.len == 1 && cur_resp.data[0] == '+') return (false); return (true); } static void close_connection(void) { /* * XXX: This triggers a warning because mevent does the close * before the EV_DELETE. 
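	 * (the descriptor is closed before the pending EV_DELETE is
	 * processed, so kevent() reports the delete against an
	 * already-closed descriptor).
	 */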
*/ pthread_mutex_lock(&gdb_lock); mevent_delete(write_event); mevent_delete_close(read_event); write_event = NULL; read_event = NULL; io_buffer_reset(&cur_comm); io_buffer_reset(&cur_resp); cur_fd = -1; remove_all_sw_breakpoints(); /* Clear any pending events. */ memset(vcpu_state, 0, guest_ncpus * sizeof(*vcpu_state)); /* Resume any stopped vCPUs. */ gdb_resume_vcpus(); pthread_mutex_unlock(&gdb_lock); } static uint8_t hex_digit(uint8_t nibble) { if (nibble <= 9) return (nibble + '0'); else return (nibble + 'a' - 10); } static uint8_t parse_digit(uint8_t v) { if (v >= '0' && v <= '9') return (v - '0'); if (v >= 'a' && v <= 'f') return (v - 'a' + 10); if (v >= 'A' && v <= 'F') return (v - 'A' + 10); return (0xF); } /* Parses big-endian hexadecimal. */ static uintmax_t parse_integer(const uint8_t *p, size_t len) { uintmax_t v; v = 0; while (len > 0) { v <<= 4; v |= parse_digit(*p); p++; len--; } return (v); } static uint8_t parse_byte(const uint8_t *p) { return (parse_digit(p[0]) << 4 | parse_digit(p[1])); } static void send_pending_data(int fd) { ssize_t nwritten; if (cur_resp.len == 0) { mevent_disable(write_event); return; } nwritten = write(fd, io_buffer_head(&cur_resp), cur_resp.len); if (nwritten == -1) { warn("Write to GDB socket failed"); close_connection(); } else { io_buffer_advance(&cur_resp, nwritten); if (cur_resp.len == 0) mevent_disable(write_event); else mevent_enable(write_event); } } /* Append a single character to the output buffer. */ static void send_char(uint8_t data) { io_buffer_grow(&cur_resp, 1); *io_buffer_tail(&cur_resp) = data; cur_resp.len++; } /* Append an array of bytes to the output buffer. */ static void send_data(const uint8_t *data, size_t len) { io_buffer_grow(&cur_resp, len); memcpy(io_buffer_tail(&cur_resp), data, len); cur_resp.len += len; } static void format_byte(uint8_t v, uint8_t *buf) { buf[0] = hex_digit(v >> 4); buf[1] = hex_digit(v & 0xf); } /* * Append a single byte (formatted as two hex characters) to the * output buffer. */ static void send_byte(uint8_t v) { uint8_t buf[2]; format_byte(v, buf); send_data(buf, sizeof(buf)); } static void start_packet(void) { send_char('$'); cur_csum = 0; } static void finish_packet(void) { send_char('#'); send_byte(cur_csum); debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); } /* * Append a single character (for the packet payload) and update the * checksum. */ static void append_char(uint8_t v) { send_char(v); cur_csum += v; } /* * Append an array of bytes (for the packet payload) and update the * checksum. 
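 * A remote protocol packet is framed as $<payload>#<checksum>, where
 * the checksum is the modulo-256 sum of the payload bytes written as
 * two hex digits: the payload "OK" goes out as "$OK#9a", since
 * 'O' (0x4f) + 'K' (0x4b) == 0x9a.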
*/ static void append_packet_data(const uint8_t *data, size_t len) { send_data(data, len); while (len > 0) { cur_csum += *data; data++; len--; } } static void append_string(const char *str) { append_packet_data(str, strlen(str)); } static void append_byte(uint8_t v) { uint8_t buf[2]; format_byte(v, buf); append_packet_data(buf, sizeof(buf)); } static void append_unsigned_native(uintmax_t value, size_t len) { size_t i; for (i = 0; i < len; i++) { append_byte(value); value >>= 8; } } static void append_unsigned_be(uintmax_t value, size_t len) { char buf[len * 2]; size_t i; for (i = 0; i < len; i++) { format_byte(value, buf + (len - i - 1) * 2); value >>= 8; } append_packet_data(buf, sizeof(buf)); } static void append_integer(unsigned int value) { if (value == 0) append_char('0'); else append_unsigned_be(value, (fls(value) + 7) / 8); } static void append_asciihex(const char *str) { while (*str != '\0') { append_byte(*str); str++; } } static void send_empty_response(void) { start_packet(); finish_packet(); } static void send_error(int error) { start_packet(); append_char('E'); append_byte(error); finish_packet(); } static void send_ok(void) { start_packet(); append_string("OK"); finish_packet(); } static int parse_threadid(const uint8_t *data, size_t len) { if (len == 1 && *data == '0') return (0); if (len == 2 && memcmp(data, "-1", 2) == 0) return (-1); if (len == 0) return (-2); return (parse_integer(data, len)); } /* * Report the current stop event to the debugger. If the stop is due * to an event triggered on a specific vCPU such as a breakpoint or * stepping trap, stopped_vcpu will be set to the vCPU triggering the * stop. If 'set_cur_vcpu' is true, then cur_vcpu will be updated to * the reporting vCPU for vCPU events. */ static void report_stop(bool set_cur_vcpu) { struct vcpu_state *vs; start_packet(); if (stopped_vcpu == -1) { append_char('S'); append_byte(GDB_SIGNAL_TRAP); } else { vs = &vcpu_state[stopped_vcpu]; if (set_cur_vcpu) cur_vcpu = stopped_vcpu; append_char('T'); append_byte(GDB_SIGNAL_TRAP); append_string("thread:"); append_integer(stopped_vcpu + 1); append_char(';'); if (vs->hit_swbreak) { debug("$vCPU %d reporting swbreak\n", stopped_vcpu); if (swbreak_enabled) append_string("swbreak:;"); } else if (vs->stepped) debug("$vCPU %d reporting step\n", stopped_vcpu); else debug("$vCPU %d reporting ???\n", stopped_vcpu); } finish_packet(); report_next_stop = false; } /* * If this stop is due to a vCPU event, clear that event to mark it as * acknowledged. */ static void discard_stop(void) { struct vcpu_state *vs; if (stopped_vcpu != -1) { vs = &vcpu_state[stopped_vcpu]; vs->hit_swbreak = false; vs->stepped = false; stopped_vcpu = -1; } report_next_stop = true; } static void gdb_finish_suspend_vcpus(void) { if (first_stop) { first_stop = false; stopped_vcpu = -1; } else if (report_next_stop) { assert(!response_pending()); report_stop(true); send_pending_data(cur_fd); } } /* * vCPU threads invoke this function whenever the vCPU enters the * debug server to pause or report an event. vCPU threads wait here * as long as the debug server keeps them suspended. 
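 * The caller must hold gdb_lock; the pthread_cond_wait() in the loop
 * below uses it as the condition variable's mutex.
 */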
*/ static void _gdb_cpu_suspend(int vcpu, bool report_stop) { + if (!gdb_active) + return; debug("$vCPU %d suspending\n", vcpu); CPU_SET(vcpu, &vcpus_waiting); if (report_stop && CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) gdb_finish_suspend_vcpus(); while (CPU_ISSET(vcpu, &vcpus_suspended)) pthread_cond_wait(&idle_vcpus, &gdb_lock); CPU_CLR(vcpu, &vcpus_waiting); debug("$vCPU %d resuming\n", vcpu); } /* * Invoked at the start of a vCPU thread's execution to inform the * debug server about the new thread. */ void gdb_cpu_add(int vcpu) { + if (!gdb_active) + return; debug("$vCPU %d starting\n", vcpu); pthread_mutex_lock(&gdb_lock); assert(vcpu < guest_ncpus); CPU_SET(vcpu, &vcpus_active); if (!TAILQ_EMPTY(&breakpoints)) { vm_set_capability(ctx, vcpu, VM_CAP_BPT_EXIT, 1); debug("$vCPU %d enabled breakpoint exits\n", vcpu); } /* * If a vcpu is added while vcpus are stopped, suspend the new * vcpu so that it will pop back out with a debug exit before * executing the first instruction. */ if (!CPU_EMPTY(&vcpus_suspended)) { CPU_SET(vcpu, &vcpus_suspended); _gdb_cpu_suspend(vcpu, false); } pthread_mutex_unlock(&gdb_lock); } /* * Invoked by vCPU before resuming execution. This enables stepping * if the vCPU is marked as stepping. */ static void gdb_cpu_resume(int vcpu) { struct vcpu_state *vs; int error; vs = &vcpu_state[vcpu]; /* * Any pending event should already be reported before * resuming. */ assert(vs->hit_swbreak == false); assert(vs->stepped == false); if (vs->stepping) { error = vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 1); assert(error == 0); } } /* * Handler for VM_EXITCODE_DEBUG used to suspend a vCPU when the guest * has been suspended due to an event on different vCPU or in response * to a guest-wide suspend such as Ctrl-C or the stop on attach. */ void gdb_cpu_suspend(int vcpu) { pthread_mutex_lock(&gdb_lock); _gdb_cpu_suspend(vcpu, true); gdb_cpu_resume(vcpu); pthread_mutex_unlock(&gdb_lock); } static void gdb_suspend_vcpus(void) { assert(pthread_mutex_isowned_np(&gdb_lock)); debug("suspending all CPUs\n"); vcpus_suspended = vcpus_active; vm_suspend_cpu(ctx, -1); if (CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) gdb_finish_suspend_vcpus(); } /* * Handler for VM_EXITCODE_MTRAP reported when a vCPU single-steps via * the VT-x-specific MTRAP exit. 
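 * (The monitor trap flag forces a VM exit after each single guest
 * instruction, which is what implements single-stepping here.)
 */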
*/ void gdb_cpu_mtrap(int vcpu) { struct vcpu_state *vs; + if (!gdb_active) + return; debug("$vCPU %d MTRAP\n", vcpu); pthread_mutex_lock(&gdb_lock); vs = &vcpu_state[vcpu]; if (vs->stepping) { vs->stepping = false; vs->stepped = true; vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 0); while (vs->stepped) { if (stopped_vcpu == -1) { debug("$vCPU %d reporting step\n", vcpu); stopped_vcpu = vcpu; gdb_suspend_vcpus(); } _gdb_cpu_suspend(vcpu, true); } gdb_cpu_resume(vcpu); } pthread_mutex_unlock(&gdb_lock); } static struct breakpoint * find_breakpoint(uint64_t gpa) { struct breakpoint *bp; TAILQ_FOREACH(bp, &breakpoints, link) { if (bp->gpa == gpa) return (bp); } return (NULL); } void gdb_cpu_breakpoint(int vcpu, struct vm_exit *vmexit) { struct breakpoint *bp; struct vcpu_state *vs; uint64_t gpa; int error; + if (!gdb_active) { + fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); + exit(4); + } pthread_mutex_lock(&gdb_lock); error = guest_vaddr2paddr(vcpu, vmexit->rip, &gpa); assert(error == 1); bp = find_breakpoint(gpa); if (bp != NULL) { vs = &vcpu_state[vcpu]; assert(vs->stepping == false); assert(vs->stepped == false); assert(vs->hit_swbreak == false); vs->hit_swbreak = true; vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, vmexit->rip); for (;;) { if (stopped_vcpu == -1) { debug("$vCPU %d reporting breakpoint at rip %#lx\n", vcpu, vmexit->rip); stopped_vcpu = vcpu; gdb_suspend_vcpus(); } _gdb_cpu_suspend(vcpu, true); if (!vs->hit_swbreak) { /* Breakpoint reported. */ break; } bp = find_breakpoint(gpa); if (bp == NULL) { /* Breakpoint was removed. */ vs->hit_swbreak = false; break; } } gdb_cpu_resume(vcpu); } else { debug("$vCPU %d injecting breakpoint at rip %#lx\n", vcpu, vmexit->rip); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ENTRY_INST_LENGTH, vmexit->u.bpt.inst_length); assert(error == 0); error = vm_inject_exception(ctx, vcpu, IDT_BP, 0, 0, 0); assert(error == 0); } pthread_mutex_unlock(&gdb_lock); } static bool gdb_step_vcpu(int vcpu) { int error, val; debug("$vCPU %d step\n", vcpu); error = vm_get_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, &val); if (error < 0) return (false); discard_stop(); vcpu_state[vcpu].stepping = true; vm_resume_cpu(ctx, vcpu); CPU_CLR(vcpu, &vcpus_suspended); pthread_cond_broadcast(&idle_vcpus); return (true); } static void gdb_resume_vcpus(void) { assert(pthread_mutex_isowned_np(&gdb_lock)); vm_resume_cpu(ctx, -1); debug("resuming all CPUs\n"); CPU_ZERO(&vcpus_suspended); pthread_cond_broadcast(&idle_vcpus); } static void gdb_read_regs(void) { uint64_t regvals[nitems(gdb_regset)]; int i; if (vm_get_register_set(ctx, cur_vcpu, nitems(gdb_regset), gdb_regset, regvals) == -1) { send_error(errno); return; } start_packet(); for (i = 0; i < nitems(regvals); i++) append_unsigned_native(regvals[i], gdb_regsize[i]); finish_packet(); } static void gdb_read_mem(const uint8_t *data, size_t len) { uint64_t gpa, gva, val; uint8_t *cp; size_t resid, todo, bytes; bool started; int error; /* Skip 'm' */ data += 1; len -= 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse length. */ resid = parse_integer(data, len); started = false; while (resid > 0) { error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); if (error == -1) { if (started) finish_packet(); else send_error(errno); return; } if (error == 0) { if (started) finish_packet(); else send_error(EFAULT); return; } /* Read bytes from current page. 
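		 * Transfers are chunked page by page so that the
		 * virtual-to-physical translation above is valid for
		 * the whole chunk.
		 */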
*/ todo = getpagesize() - gpa % getpagesize(); if (todo > resid) todo = resid; cp = paddr_guest2host(ctx, gpa, todo); if (cp != NULL) { /* * If this page is guest RAM, read it a byte * at a time. */ if (!started) { start_packet(); started = true; } while (todo > 0) { append_byte(*cp); cp++; gpa++; gva++; resid--; todo--; } } else { /* * If this page isn't guest RAM, try to handle * it via MMIO. For MMIO requests, use * aligned reads of words when possible. */ while (todo > 0) { if (gpa & 1 || todo == 1) bytes = 1; else if (gpa & 2 || todo == 2) bytes = 2; else bytes = 4; error = read_mem(ctx, cur_vcpu, gpa, &val, bytes); if (error == 0) { if (!started) { start_packet(); started = true; } gpa += bytes; gva += bytes; resid -= bytes; todo -= bytes; while (bytes > 0) { append_byte(val); val >>= 8; bytes--; } } else { if (started) finish_packet(); else send_error(EFAULT); return; } } } assert(resid == 0 || gpa % getpagesize() == 0); } if (!started) start_packet(); finish_packet(); } static void gdb_write_mem(const uint8_t *data, size_t len) { uint64_t gpa, gva, val; uint8_t *cp; size_t resid, todo, bytes; int error; /* Skip 'M' */ data += 1; len -= 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse and consume length. */ cp = memchr(data, ':', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } resid = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Verify the available bytes match the length. */ if (len != resid * 2) { send_error(EINVAL); return; } while (resid > 0) { error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); if (error == -1) { send_error(errno); return; } if (error == 0) { send_error(EFAULT); return; } /* Write bytes to current page. */ todo = getpagesize() - gpa % getpagesize(); if (todo > resid) todo = resid; cp = paddr_guest2host(ctx, gpa, todo); if (cp != NULL) { /* * If this page is guest RAM, write it a byte * at a time. */ while (todo > 0) { assert(len >= 2); *cp = parse_byte(data); data += 2; len -= 2; cp++; gpa++; gva++; resid--; todo--; } } else { /* * If this page isn't guest RAM, try to handle * it via MMIO. For MMIO requests, use * aligned writes of words when possible. */ while (todo > 0) { if (gpa & 1 || todo == 1) { bytes = 1; val = parse_byte(data); } else if (gpa & 2 || todo == 2) { bytes = 2; val = be16toh(parse_integer(data, 4)); } else { bytes = 4; val = be32toh(parse_integer(data, 8)); } error = write_mem(ctx, cur_vcpu, gpa, val, bytes); if (error == 0) { gpa += bytes; gva += bytes; resid -= bytes; todo -= bytes; data += 2 * bytes; len -= 2 * bytes; } else { send_error(EFAULT); return; } } } assert(resid == 0 || gpa % getpagesize() == 0); } assert(len == 0); send_ok(); } static bool set_breakpoint_caps(bool enable) { cpuset_t mask; int vcpu; mask = vcpus_active; while (!CPU_EMPTY(&mask)) { vcpu = CPU_FFS(&mask) - 1; CPU_CLR(vcpu, &mask); if (vm_set_capability(ctx, vcpu, VM_CAP_BPT_EXIT, enable ? 1 : 0) < 0) return (false); debug("$vCPU %d %sabled breakpoint exits\n", vcpu, enable ? 
"en" : "dis"); } return (true); } static void remove_all_sw_breakpoints(void) { struct breakpoint *bp, *nbp; uint8_t *cp; if (TAILQ_EMPTY(&breakpoints)) return; TAILQ_FOREACH_SAFE(bp, &breakpoints, link, nbp) { debug("remove breakpoint at %#lx\n", bp->gpa); cp = paddr_guest2host(ctx, bp->gpa, 1); *cp = bp->shadow_inst; TAILQ_REMOVE(&breakpoints, bp, link); free(bp); } TAILQ_INIT(&breakpoints); set_breakpoint_caps(false); } static void update_sw_breakpoint(uint64_t gva, int kind, bool insert) { struct breakpoint *bp; uint64_t gpa; uint8_t *cp; int error; if (kind != 1) { send_error(EINVAL); return; } error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); if (error == -1) { send_error(errno); return; } if (error == 0) { send_error(EFAULT); return; } cp = paddr_guest2host(ctx, gpa, 1); /* Only permit breakpoints in guest RAM. */ if (cp == NULL) { send_error(EFAULT); return; } /* Find any existing breakpoint. */ bp = find_breakpoint(gpa); /* * Silently ignore duplicate commands since the protocol * requires these packets to be idempotent. */ if (insert) { if (bp == NULL) { if (TAILQ_EMPTY(&breakpoints) && !set_breakpoint_caps(true)) { send_empty_response(); return; } bp = malloc(sizeof(*bp)); bp->gpa = gpa; bp->shadow_inst = *cp; *cp = 0xcc; /* INT 3 */ TAILQ_INSERT_TAIL(&breakpoints, bp, link); debug("new breakpoint at %#lx\n", gpa); } } else { if (bp != NULL) { debug("remove breakpoint at %#lx\n", gpa); *cp = bp->shadow_inst; TAILQ_REMOVE(&breakpoints, bp, link); free(bp); if (TAILQ_EMPTY(&breakpoints)) set_breakpoint_caps(false); } } send_ok(); } static void parse_breakpoint(const uint8_t *data, size_t len) { uint64_t gva; uint8_t *cp; bool insert; int kind, type; insert = data[0] == 'Z'; /* Skip 'Z/z' */ data += 1; len -= 1; /* Parse and consume type. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } type = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse and consume kind. */ cp = memchr(data, ';', len); if (cp == data) { send_error(EINVAL); return; } if (cp != NULL) { /* * We do not advertise support for either the * ConditionalBreakpoints or BreakpointCommands * features, so we should not be getting conditions or * commands from the remote end. */ send_empty_response(); return; } kind = parse_integer(data, len); data += len; len = 0; switch (type) { case 0: update_sw_breakpoint(gva, kind, insert); break; default: send_empty_response(); break; } } static bool command_equals(const uint8_t *data, size_t len, const char *cmd) { if (strlen(cmd) > len) return (false); return (memcmp(data, cmd, strlen(cmd)) == 0); } static void check_features(const uint8_t *data, size_t len) { char *feature, *next_feature, *str, *value; bool supported; str = malloc(len + 1); memcpy(str, data, len); str[len] = '\0'; next_feature = str; while ((feature = strsep(&next_feature, ";")) != NULL) { /* * Null features shouldn't exist, but skip if they * do. */ if (strcmp(feature, "") == 0) continue; /* * Look for the value or supported / not supported * flag. 
		 */
		value = strchr(feature, '=');
		if (value != NULL) {
			*value = '\0';
			value++;
			supported = true;
		} else {
			value = feature + strlen(feature) - 1;
			switch (*value) {
			case '+':
				supported = true;
				break;
			case '-':
				supported = false;
				break;
			default:
				/*
				 * This is really a protocol error,
				 * but we just ignore malformed
				 * features for ease of
				 * implementation.
				 */
				continue;
			}
			value = NULL;
		}

		if (strcmp(feature, "swbreak") == 0)
			swbreak_enabled = supported;
	}
	free(str);

	start_packet();

	/* This is an arbitrary limit. */
	append_string("PacketSize=4096");
	append_string(";swbreak+");
	finish_packet();
}

static void
gdb_query(const uint8_t *data, size_t len)
{

	/*
	 * TODO:
	 * - qSearch
	 */
	if (command_equals(data, len, "qAttached")) {
		start_packet();
		append_char('1');
		finish_packet();
	} else if (command_equals(data, len, "qC")) {
		start_packet();
		append_string("QC");
		append_integer(cur_vcpu + 1);
		finish_packet();
	} else if (command_equals(data, len, "qfThreadInfo")) {
		cpuset_t mask;
		bool first;
		int vcpu;

		if (CPU_EMPTY(&vcpus_active)) {
			send_error(EINVAL);
			return;
		}
		mask = vcpus_active;
		start_packet();
		append_char('m');
		first = true;
		while (!CPU_EMPTY(&mask)) {
			vcpu = CPU_FFS(&mask) - 1;
			CPU_CLR(vcpu, &mask);
			if (first)
				first = false;
			else
				append_char(',');
			append_integer(vcpu + 1);
		}
		finish_packet();
	} else if (command_equals(data, len, "qsThreadInfo")) {
		start_packet();
		append_char('l');
		finish_packet();
	} else if (command_equals(data, len, "qSupported")) {
		data += strlen("qSupported");
		len -= strlen("qSupported");
		check_features(data, len);
	} else if (command_equals(data, len, "qThreadExtraInfo")) {
		char buf[16];
		int tid;

		data += strlen("qThreadExtraInfo");
		len -= strlen("qThreadExtraInfo");
		if (*data != ',') {
			send_error(EINVAL);
			return;
		}
		tid = parse_threadid(data + 1, len - 1);
		if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) {
			send_error(EINVAL);
			return;
		}
		snprintf(buf, sizeof(buf), "vCPU %d", tid - 1);
		start_packet();
		append_asciihex(buf);
		finish_packet();
	} else
		send_empty_response();
}

static void
handle_command(const uint8_t *data, size_t len)
{

	/* Reject packets with a sequence-id. */
	if (len >= 3 && data[0] >= '0' && data[0] <= '9' &&
	    data[1] >= '0' && data[1] <= '9' && data[2] == ':') {
		send_empty_response();
		return;
	}

	switch (*data) {
	case 'c':
		if (len != 1) {
			send_error(EINVAL);
			break;
		}

		discard_stop();
		gdb_resume_vcpus();
		break;
	case 'D':
		send_ok();

		/* TODO: Resume any stopped CPUs. */
		break;
	case 'g': {
		gdb_read_regs();
		break;
	}
	case 'H': {
		int tid;

		if (data[1] != 'g' && data[1] != 'c') {
			send_error(EINVAL);
			break;
		}
		tid = parse_threadid(data + 2, len - 2);
		if (tid == -2) {
			send_error(EINVAL);
			break;
		}

		if (CPU_EMPTY(&vcpus_active)) {
			send_error(EINVAL);
			break;
		}
		if (tid == -1 || tid == 0)
			cur_vcpu = CPU_FFS(&vcpus_active) - 1;
		else if (CPU_ISSET(tid - 1, &vcpus_active))
			cur_vcpu = tid - 1;
		else {
			send_error(EINVAL);
			break;
		}
		send_ok();
		break;
	}
	case 'm':
		gdb_read_mem(data, len);
		break;
	case 'M':
		gdb_write_mem(data, len);
		break;
	case 'T': {
		int tid;

		tid = parse_threadid(data + 1, len - 1);
		if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) {
			send_error(EINVAL);
			return;
		}
		send_ok();
		break;
	}
	case 'q':
		gdb_query(data, len);
		break;
	case 's':
		if (len != 1) {
			send_error(EINVAL);
			break;
		}

		/* Don't send a reply until a stop occurs.
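		 * The reply is sent once the stepped vCPU traps and
		 * gdb_finish_suspend_vcpus() reports the stop.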
*/ if (!gdb_step_vcpu(cur_vcpu)) { send_error(EOPNOTSUPP); break; } break; case 'z': case 'Z': parse_breakpoint(data, len); break; case '?': report_stop(false); break; case 'G': /* TODO */ case 'v': /* Handle 'vCont' */ /* 'vCtrlC' */ case 'p': /* TODO */ case 'P': /* TODO */ case 'Q': /* TODO */ case 't': /* TODO */ case 'X': /* TODO */ default: send_empty_response(); } } /* Check for a valid packet in the command buffer. */ static void check_command(int fd) { uint8_t *head, *hash, *p, sum; size_t avail, plen; for (;;) { avail = cur_comm.len; if (avail == 0) return; head = io_buffer_head(&cur_comm); switch (*head) { case 0x03: debug("<- Ctrl-C\n"); io_buffer_consume(&cur_comm, 1); gdb_suspend_vcpus(); break; case '+': /* ACK of previous response. */ debug("<- +\n"); if (response_pending()) io_buffer_reset(&cur_resp); io_buffer_consume(&cur_comm, 1); if (stopped_vcpu != -1 && report_next_stop) { report_stop(true); send_pending_data(fd); } break; case '-': /* NACK of previous response. */ debug("<- -\n"); if (response_pending()) { cur_resp.len += cur_resp.start; cur_resp.start = 0; if (cur_resp.data[0] == '+') io_buffer_advance(&cur_resp, 1); debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); } io_buffer_consume(&cur_comm, 1); send_pending_data(fd); break; case '$': /* Packet. */ if (response_pending()) { warnx("New GDB command while response in " "progress"); io_buffer_reset(&cur_resp); } /* Is packet complete? */ hash = memchr(head, '#', avail); if (hash == NULL) return; plen = (hash - head + 1) + 2; if (avail < plen) return; debug("<- %.*s\n", (int)plen, head); /* Verify checksum. */ for (sum = 0, p = head + 1; p < hash; p++) sum += *p; if (sum != parse_byte(hash + 1)) { io_buffer_consume(&cur_comm, plen); debug("-> -\n"); send_char('-'); send_pending_data(fd); break; } send_char('+'); handle_command(head + 1, hash - (head + 1)); io_buffer_consume(&cur_comm, plen); if (!response_pending()) debug("-> +\n"); send_pending_data(fd); break; default: /* XXX: Possibly drop connection instead. */ debug("-> %02x\n", *head); io_buffer_consume(&cur_comm, 1); break; } } } static void gdb_readable(int fd, enum ev_type event, void *arg) { ssize_t nread; int pending; if (ioctl(fd, FIONREAD, &pending) == -1) { warn("FIONREAD on GDB socket"); return; } /* * 'pending' might be zero due to EOF. We need to call read * with a non-zero length to detect EOF. */ if (pending == 0) pending = 1; /* Ensure there is room in the command buffer. */ io_buffer_grow(&cur_comm, pending); assert(io_buffer_avail(&cur_comm) >= pending); nread = read(fd, io_buffer_tail(&cur_comm), io_buffer_avail(&cur_comm)); if (nread == 0) { close_connection(); } else if (nread == -1) { if (errno == EAGAIN) return; warn("Read from GDB socket"); close_connection(); } else { cur_comm.len += nread; pthread_mutex_lock(&gdb_lock); check_command(fd); pthread_mutex_unlock(&gdb_lock); } } static void gdb_writable(int fd, enum ev_type event, void *arg) { send_pending_data(fd); } static void new_connection(int fd, enum ev_type event, void *arg) { int optval, s; s = accept4(fd, NULL, NULL, SOCK_NONBLOCK); if (s == -1) { if (arg != NULL) err(1, "Failed accepting initial GDB connection"); /* Silently ignore errors post-startup. 
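		 * A transient accept(2) failure on the listen socket
		 * should not take the guest down once it is running.
		 */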
*/ return; } optval = 1; if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &optval, sizeof(optval)) == -1) { warn("Failed to disable SIGPIPE for GDB connection"); close(s); return; } pthread_mutex_lock(&gdb_lock); if (cur_fd != -1) { close(s); warnx("Ignoring additional GDB connection."); } read_event = mevent_add(s, EVF_READ, gdb_readable, NULL); if (read_event == NULL) { if (arg != NULL) err(1, "Failed to setup initial GDB connection"); pthread_mutex_unlock(&gdb_lock); return; } write_event = mevent_add(s, EVF_WRITE, gdb_writable, NULL); if (write_event == NULL) { if (arg != NULL) err(1, "Failed to setup initial GDB connection"); mevent_delete_close(read_event); read_event = NULL; } cur_fd = s; cur_vcpu = 0; stopped_vcpu = -1; /* Break on attach. */ first_stop = true; report_next_stop = false; gdb_suspend_vcpus(); pthread_mutex_unlock(&gdb_lock); } #ifndef WITHOUT_CAPSICUM void limit_gdb_socket(int s) { cap_rights_t rights; unsigned long ioctls[] = { FIONREAD }; cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE, CAP_SETSOCKOPT, CAP_IOCTL); if (caph_rights_limit(s, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(s, ioctls, nitems(ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); } #endif void init_gdb(struct vmctx *_ctx, int sport, bool wait) { struct sockaddr_in sin; int error, flags, s; debug("==> starting on %d, %swaiting\n", sport, wait ? "" : "not "); error = pthread_mutex_init(&gdb_lock, NULL); if (error != 0) errc(1, error, "gdb mutex init"); error = pthread_cond_init(&idle_vcpus, NULL); if (error != 0) errc(1, error, "gdb cv init"); ctx = _ctx; s = socket(PF_INET, SOCK_STREAM, 0); if (s < 0) err(1, "gdb socket create"); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_ANY); sin.sin_port = htons(sport); if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) err(1, "gdb socket bind"); if (listen(s, 1) < 0) err(1, "gdb socket listen"); stopped_vcpu = -1; TAILQ_INIT(&breakpoints); vcpu_state = calloc(guest_ncpus, sizeof(*vcpu_state)); if (wait) { /* * Set vcpu 0 in vcpus_suspended. This will trigger the * logic in gdb_cpu_add() to suspend the first vcpu before * it starts execution. The vcpu will remain suspended * until a debugger connects. */ CPU_SET(0, &vcpus_suspended); stopped_vcpu = 0; } flags = fcntl(s, F_GETFL); if (fcntl(s, F_SETFL, flags | O_NONBLOCK) == -1) err(1, "Failed to mark gdb socket non-blocking"); #ifndef WITHOUT_CAPSICUM limit_gdb_socket(s); #endif mevent_add(s, EVF_READ, new_connection, NULL); + gdb_active = true; } diff --git a/usr.sbin/bhyve/hda_codec.c b/usr.sbin/bhyve/hda_codec.c index 41e8121ae278..7a6ba345d8ec 100644 --- a/usr.sbin/bhyve/hda_codec.c +++ b/usr.sbin/bhyve/hda_codec.c @@ -1,952 +1,950 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Alex Teaca * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include "pci_hda.h" #include "audio.h" /* * HDA Codec defines */ #define INTEL_VENDORID 0x8086 #define HDA_CODEC_SUBSYSTEM_ID ((INTEL_VENDORID << 16) | 0x01) #define HDA_CODEC_ROOT_NID 0x00 #define HDA_CODEC_FG_NID 0x01 #define HDA_CODEC_AUDIO_OUTPUT_NID 0x02 #define HDA_CODEC_PIN_OUTPUT_NID 0x03 #define HDA_CODEC_AUDIO_INPUT_NID 0x04 #define HDA_CODEC_PIN_INPUT_NID 0x05 #define HDA_CODEC_STREAMS_COUNT 0x02 #define HDA_CODEC_STREAM_OUTPUT 0x00 #define HDA_CODEC_STREAM_INPUT 0x01 #define HDA_CODEC_PARAMS_COUNT 0x14 #define HDA_CODEC_CONN_LIST_COUNT 0x01 #define HDA_CODEC_RESPONSE_EX_UNSOL 0x10 #define HDA_CODEC_RESPONSE_EX_SOL 0x00 #define HDA_CODEC_AMP_NUMSTEPS 0x4a #define HDA_CODEC_SUPP_STREAM_FORMATS_PCM \ (1 << HDA_PARAM_SUPP_STREAM_FORMATS_PCM_SHIFT) #define HDA_CODEC_FMT_BASE_MASK (0x01 << 14) #define HDA_CODEC_FMT_MULT_MASK (0x07 << 11) #define HDA_CODEC_FMT_MULT_2 (0x01 << 11) #define HDA_CODEC_FMT_MULT_3 (0x02 << 11) #define HDA_CODEC_FMT_MULT_4 (0x03 << 11) #define HDA_CODEC_FMT_DIV_MASK 0x07 #define HDA_CODEC_FMT_DIV_SHIFT 8 #define HDA_CODEC_FMT_BITS_MASK (0x07 << 4) #define HDA_CODEC_FMT_BITS_8 (0x00 << 4) #define HDA_CODEC_FMT_BITS_16 (0x01 << 4) #define HDA_CODEC_FMT_BITS_24 (0x03 << 4) #define HDA_CODEC_FMT_BITS_32 (0x04 << 4) #define HDA_CODEC_FMT_CHAN_MASK (0x0f << 0) #define HDA_CODEC_AUDIO_WCAP_OUTPUT \ (0x00 << HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_SHIFT) #define HDA_CODEC_AUDIO_WCAP_INPUT \ (0x01 << HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_SHIFT) #define HDA_CODEC_AUDIO_WCAP_PIN \ (0x04 << HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_SHIFT) #define HDA_CODEC_AUDIO_WCAP_CONN_LIST \ (1 << HDA_PARAM_AUDIO_WIDGET_CAP_CONN_LIST_SHIFT) #define HDA_CODEC_AUDIO_WCAP_FORMAT_OVR \ (1 << HDA_PARAM_AUDIO_WIDGET_CAP_FORMAT_OVR_SHIFT) #define HDA_CODEC_AUDIO_WCAP_AMP_OVR \ (1 << HDA_PARAM_AUDIO_WIDGET_CAP_AMP_OVR_SHIFT) #define HDA_CODEC_AUDIO_WCAP_OUT_AMP \ (1 << HDA_PARAM_AUDIO_WIDGET_CAP_OUT_AMP_SHIFT) #define HDA_CODEC_AUDIO_WCAP_IN_AMP \ (1 << HDA_PARAM_AUDIO_WIDGET_CAP_IN_AMP_SHIFT) #define HDA_CODEC_AUDIO_WCAP_STEREO \ (1 << HDA_PARAM_AUDIO_WIDGET_CAP_STEREO_SHIFT) #define HDA_CODEC_PIN_CAP_OUTPUT \ (1 << HDA_PARAM_PIN_CAP_OUTPUT_CAP_SHIFT) #define HDA_CODEC_PIN_CAP_INPUT \ (1 << HDA_PARAM_PIN_CAP_INPUT_CAP_SHIFT) #define HDA_CODEC_PIN_CAP_PRESENCE_DETECT \ (1 << HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP_SHIFT) #define HDA_CODEC_OUTPUT_AMP_CAP_MUTE_CAP \ (1 << HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP_SHIFT) #define HDA_CODEC_OUTPUT_AMP_CAP_STEPSIZE \ (0x03 << HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE_SHIFT) #define HDA_CODEC_OUTPUT_AMP_CAP_NUMSTEPS \ (HDA_CODEC_AMP_NUMSTEPS << HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS_SHIFT) #define HDA_CODEC_OUTPUT_AMP_CAP_OFFSET \ (HDA_CODEC_AMP_NUMSTEPS << HDA_PARAM_OUTPUT_AMP_CAP_OFFSET_SHIFT) 
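/*
 * With OFFSET equal to NUMSTEPS, the highest amplifier setting (0x4a)
 * corresponds to 0 dB and each step below it attenuates, per the HDA
 * amplifier capability encoding.
 */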
#define HDA_CODEC_SET_AMP_GAIN_MUTE_MUTE 0x80 #define HDA_CODEC_SET_AMP_GAIN_MUTE_GAIN_MASK 0x7f #define HDA_CODEC_PIN_SENSE_PRESENCE_PLUGGED (1 << 31) #define HDA_CODEC_PIN_WIDGET_CTRL_OUT_ENABLE \ (1 << HDA_CMD_GET_PIN_WIDGET_CTRL_OUT_ENABLE_SHIFT) #define HDA_CODEC_PIN_WIDGET_CTRL_IN_ENABLE \ (1 << HDA_CMD_GET_PIN_WIDGET_CTRL_IN_ENABLE_SHIFT) #define HDA_CONFIG_DEFAULTCONF_COLOR_BLACK \ (0x01 << HDA_CONFIG_DEFAULTCONF_COLOR_SHIFT) #define HDA_CONFIG_DEFAULTCONF_COLOR_RED \ (0x05 << HDA_CONFIG_DEFAULTCONF_COLOR_SHIFT) #define HDA_CODEC_BUF_SIZE HDA_FIFO_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) /* * HDA Audio Context data structures */ typedef void (*transfer_func_t)(void *arg); typedef int (*setup_func_t)(void *arg); struct hda_audio_ctxt { char name[64]; uint8_t run; uint8_t started; void *priv; pthread_t tid; pthread_mutex_t mtx; pthread_cond_t cond; setup_func_t do_setup; transfer_func_t do_transfer; }; /* * HDA Audio Context module function declarations */ static void *hda_audio_ctxt_thr(void *arg); static int hda_audio_ctxt_init(struct hda_audio_ctxt *actx, const char *tname, transfer_func_t do_transfer, setup_func_t do_setup, void *priv); static int hda_audio_ctxt_start(struct hda_audio_ctxt *actx); static int hda_audio_ctxt_stop(struct hda_audio_ctxt *actx); /* * HDA Codec data structures */ struct hda_codec_softc; typedef uint32_t (*verb_func_t)(struct hda_codec_softc *sc, uint16_t verb, uint16_t payload); struct hda_codec_stream { uint8_t buf[HDA_CODEC_BUF_SIZE]; uint8_t channel; uint16_t fmt; uint8_t stream; uint8_t left_gain; uint8_t right_gain; uint8_t left_mute; uint8_t right_mute; struct audio *aud; struct hda_audio_ctxt actx; }; struct hda_codec_softc { uint32_t no_nodes; uint32_t subsystem_id; const uint32_t (*get_parameters)[HDA_CODEC_PARAMS_COUNT]; const uint8_t (*conn_list)[HDA_CODEC_CONN_LIST_COUNT]; const uint32_t *conf_default; const uint8_t *pin_ctrl_default; const verb_func_t *verb_handlers; struct hda_codec_inst *hci; struct hda_codec_stream streams[HDA_CODEC_STREAMS_COUNT]; }; /* * HDA Codec module function declarations */ static int hda_codec_init(struct hda_codec_inst *hci, const char *play, - const char *rec, const char *opts); + const char *rec); static int hda_codec_reset(struct hda_codec_inst *hci); static int hda_codec_command(struct hda_codec_inst *hci, uint32_t cmd_data); static int hda_codec_notify(struct hda_codec_inst *hci, uint8_t run, uint8_t stream, uint8_t dir); static int hda_codec_parse_format(uint16_t fmt, struct audio_params *params); static uint32_t hda_codec_audio_output_nid(struct hda_codec_softc *sc, uint16_t verb, uint16_t payload); static void hda_codec_audio_output_do_transfer(void *arg); static int hda_codec_audio_output_do_setup(void *arg); static uint32_t hda_codec_audio_input_nid(struct hda_codec_softc *sc, uint16_t verb, uint16_t payload); static void hda_codec_audio_input_do_transfer(void *arg); static int hda_codec_audio_input_do_setup(void *arg); static uint32_t hda_codec_audio_inout_nid(struct hda_codec_stream *st, uint16_t verb, uint16_t payload); /* * HDA Codec global data */ #define HDA_CODEC_ROOT_DESC \ [HDA_CODEC_ROOT_NID] = { \ [HDA_PARAM_VENDOR_ID] = INTEL_VENDORID, \ [HDA_PARAM_REVISION_ID] = 0xffff, \ /* 1 Subnode, StartNid = 1 */ \ [HDA_PARAM_SUB_NODE_COUNT] = 0x00010001, \ }, \ #define HDA_CODEC_FG_COMMON_DESC \ [HDA_PARAM_FCT_GRP_TYPE] = HDA_PARAM_FCT_GRP_TYPE_NODE_TYPE_AUDIO,\ /* B8 - B32, 8.0 - 192.0kHz */ \ [HDA_PARAM_SUPP_PCM_SIZE_RATE] = (0x1f << 16) | 0x7ff, \ [HDA_PARAM_SUPP_STREAM_FORMATS] = 
HDA_CODEC_SUPP_STREAM_FORMATS_PCM,\ [HDA_PARAM_INPUT_AMP_CAP] = 0x00, /* None */ \ [HDA_PARAM_OUTPUT_AMP_CAP] = 0x00, /* None */ \ [HDA_PARAM_GPIO_COUNT] = 0x00, \ #define HDA_CODEC_FG_OUTPUT_DESC \ [HDA_CODEC_FG_NID] = { \ /* 2 Subnodes, StartNid = 2 */ \ [HDA_PARAM_SUB_NODE_COUNT] = 0x00020002, \ HDA_CODEC_FG_COMMON_DESC \ }, \ #define HDA_CODEC_FG_INPUT_DESC \ [HDA_CODEC_FG_NID] = { \ /* 2 Subnodes, StartNid = 4 */ \ [HDA_PARAM_SUB_NODE_COUNT] = 0x00040002, \ HDA_CODEC_FG_COMMON_DESC \ }, \ #define HDA_CODEC_FG_DUPLEX_DESC \ [HDA_CODEC_FG_NID] = { \ /* 4 Subnodes, StartNid = 2 */ \ [HDA_PARAM_SUB_NODE_COUNT] = 0x00020004, \ HDA_CODEC_FG_COMMON_DESC \ }, \ #define HDA_CODEC_OUTPUT_DESC \ [HDA_CODEC_AUDIO_OUTPUT_NID] = { \ [HDA_PARAM_AUDIO_WIDGET_CAP] = \ HDA_CODEC_AUDIO_WCAP_OUTPUT | \ HDA_CODEC_AUDIO_WCAP_FORMAT_OVR | \ HDA_CODEC_AUDIO_WCAP_AMP_OVR | \ HDA_CODEC_AUDIO_WCAP_OUT_AMP | \ HDA_CODEC_AUDIO_WCAP_STEREO, \ /* B16, 16.0 - 192.0kHz */ \ [HDA_PARAM_SUPP_PCM_SIZE_RATE] = (0x02 << 16) | 0x7fc, \ [HDA_PARAM_SUPP_STREAM_FORMATS] = \ HDA_CODEC_SUPP_STREAM_FORMATS_PCM, \ [HDA_PARAM_INPUT_AMP_CAP] = 0x00, /* None */ \ [HDA_PARAM_CONN_LIST_LENGTH] = 0x00, \ [HDA_PARAM_OUTPUT_AMP_CAP] = \ HDA_CODEC_OUTPUT_AMP_CAP_MUTE_CAP | \ HDA_CODEC_OUTPUT_AMP_CAP_STEPSIZE | \ HDA_CODEC_OUTPUT_AMP_CAP_NUMSTEPS | \ HDA_CODEC_OUTPUT_AMP_CAP_OFFSET, \ }, \ [HDA_CODEC_PIN_OUTPUT_NID] = { \ [HDA_PARAM_AUDIO_WIDGET_CAP] = \ HDA_CODEC_AUDIO_WCAP_PIN | \ HDA_CODEC_AUDIO_WCAP_CONN_LIST | \ HDA_CODEC_AUDIO_WCAP_STEREO, \ [HDA_PARAM_PIN_CAP] = HDA_CODEC_PIN_CAP_OUTPUT | \ HDA_CODEC_PIN_CAP_PRESENCE_DETECT,\ [HDA_PARAM_INPUT_AMP_CAP] = 0x00, /* None */ \ [HDA_PARAM_CONN_LIST_LENGTH] = 0x01, \ [HDA_PARAM_OUTPUT_AMP_CAP] = 0x00, /* None */ \ }, \ #define HDA_CODEC_INPUT_DESC \ [HDA_CODEC_AUDIO_INPUT_NID] = { \ [HDA_PARAM_AUDIO_WIDGET_CAP] = \ HDA_CODEC_AUDIO_WCAP_INPUT | \ HDA_CODEC_AUDIO_WCAP_CONN_LIST | \ HDA_CODEC_AUDIO_WCAP_FORMAT_OVR | \ HDA_CODEC_AUDIO_WCAP_AMP_OVR | \ HDA_CODEC_AUDIO_WCAP_IN_AMP | \ HDA_CODEC_AUDIO_WCAP_STEREO, \ /* B16, 16.0 - 192.0kHz */ \ [HDA_PARAM_SUPP_PCM_SIZE_RATE] = (0x02 << 16) | 0x7fc, \ [HDA_PARAM_SUPP_STREAM_FORMATS] = \ HDA_CODEC_SUPP_STREAM_FORMATS_PCM, \ [HDA_PARAM_OUTPUT_AMP_CAP] = 0x00, /* None */ \ [HDA_PARAM_CONN_LIST_LENGTH] = 0x01, \ [HDA_PARAM_INPUT_AMP_CAP] = \ HDA_CODEC_OUTPUT_AMP_CAP_MUTE_CAP | \ HDA_CODEC_OUTPUT_AMP_CAP_STEPSIZE | \ HDA_CODEC_OUTPUT_AMP_CAP_NUMSTEPS | \ HDA_CODEC_OUTPUT_AMP_CAP_OFFSET, \ }, \ [HDA_CODEC_PIN_INPUT_NID] = { \ [HDA_PARAM_AUDIO_WIDGET_CAP] = \ HDA_CODEC_AUDIO_WCAP_PIN | \ HDA_CODEC_AUDIO_WCAP_STEREO, \ [HDA_PARAM_PIN_CAP] = HDA_CODEC_PIN_CAP_INPUT | \ HDA_CODEC_PIN_CAP_PRESENCE_DETECT, \ [HDA_PARAM_INPUT_AMP_CAP] = 0x00, /* None */ \ [HDA_PARAM_OUTPUT_AMP_CAP] = 0x00, /* None */ \ }, \ static const uint32_t hda_codec_output_parameters[][HDA_CODEC_PARAMS_COUNT] = { HDA_CODEC_ROOT_DESC HDA_CODEC_FG_OUTPUT_DESC HDA_CODEC_OUTPUT_DESC }; static const uint32_t hda_codec_input_parameters[][HDA_CODEC_PARAMS_COUNT] = { HDA_CODEC_ROOT_DESC HDA_CODEC_FG_INPUT_DESC HDA_CODEC_INPUT_DESC }; static const uint32_t hda_codec_duplex_parameters[][HDA_CODEC_PARAMS_COUNT] = { HDA_CODEC_ROOT_DESC HDA_CODEC_FG_DUPLEX_DESC HDA_CODEC_OUTPUT_DESC HDA_CODEC_INPUT_DESC }; #define HDA_CODEC_NODES_COUNT (ARRAY_SIZE(hda_codec_duplex_parameters)) static const uint8_t hda_codec_conn_list[HDA_CODEC_NODES_COUNT][HDA_CODEC_CONN_LIST_COUNT] = { [HDA_CODEC_PIN_OUTPUT_NID] = {HDA_CODEC_AUDIO_OUTPUT_NID}, [HDA_CODEC_AUDIO_INPUT_NID] = {HDA_CODEC_PIN_INPUT_NID}, }; static const 
uint32_t hda_codec_conf_default[HDA_CODEC_NODES_COUNT] = { [HDA_CODEC_PIN_OUTPUT_NID] = \ HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_JACK | HDA_CONFIG_DEFAULTCONF_DEVICE_LINE_OUT | HDA_CONFIG_DEFAULTCONF_COLOR_BLACK | (0x01 << HDA_CONFIG_DEFAULTCONF_ASSOCIATION_SHIFT), [HDA_CODEC_PIN_INPUT_NID] = HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_JACK | HDA_CONFIG_DEFAULTCONF_DEVICE_LINE_IN | HDA_CONFIG_DEFAULTCONF_COLOR_RED | (0x02 << HDA_CONFIG_DEFAULTCONF_ASSOCIATION_SHIFT), }; static const uint8_t hda_codec_pin_ctrl_default[HDA_CODEC_NODES_COUNT] = { [HDA_CODEC_PIN_OUTPUT_NID] = HDA_CODEC_PIN_WIDGET_CTRL_OUT_ENABLE, [HDA_CODEC_PIN_INPUT_NID] = HDA_CODEC_PIN_WIDGET_CTRL_IN_ENABLE, }; static const verb_func_t hda_codec_verb_handlers[HDA_CODEC_NODES_COUNT] = { [HDA_CODEC_AUDIO_OUTPUT_NID] = hda_codec_audio_output_nid, [HDA_CODEC_AUDIO_INPUT_NID] = hda_codec_audio_input_nid, }; /* * HDA Codec module function definitions */ static int hda_codec_init(struct hda_codec_inst *hci, const char *play, - const char *rec, const char *opts) + const char *rec) { struct hda_codec_softc *sc = NULL; struct hda_codec_stream *st = NULL; int err; if (!(play || rec)) return (-1); - DPRINTF("cad: 0x%x opts: %s", hci->cad, opts); - sc = calloc(1, sizeof(*sc)); if (!sc) return (-1); if (play && rec) sc->get_parameters = hda_codec_duplex_parameters; else { if (play) sc->get_parameters = hda_codec_output_parameters; else sc->get_parameters = hda_codec_input_parameters; } sc->subsystem_id = HDA_CODEC_SUBSYSTEM_ID; sc->no_nodes = HDA_CODEC_NODES_COUNT; sc->conn_list = hda_codec_conn_list; sc->conf_default = hda_codec_conf_default; sc->pin_ctrl_default = hda_codec_pin_ctrl_default; sc->verb_handlers = hda_codec_verb_handlers; DPRINTF("HDA Codec nodes: %d", sc->no_nodes); /* * Initialize the Audio Output stream */ if (play) { st = &sc->streams[HDA_CODEC_STREAM_OUTPUT]; err = hda_audio_ctxt_init(&st->actx, "hda-audio-output", hda_codec_audio_output_do_transfer, hda_codec_audio_output_do_setup, sc); assert(!err); st->aud = audio_init(play, 1); if (!st->aud) { DPRINTF("Fail to init the output audio player"); return (-1); } } /* * Initialize the Audio Input stream */ if (rec) { st = &sc->streams[HDA_CODEC_STREAM_INPUT]; err = hda_audio_ctxt_init(&st->actx, "hda-audio-input", hda_codec_audio_input_do_transfer, hda_codec_audio_input_do_setup, sc); assert(!err); st->aud = audio_init(rec, 0); if (!st->aud) { DPRINTF("Fail to init the input audio player"); return (-1); } } sc->hci = hci; hci->priv = sc; return (0); } static int hda_codec_reset(struct hda_codec_inst *hci) { struct hda_ops *hops = NULL; struct hda_codec_softc *sc = NULL; struct hda_codec_stream *st = NULL; int i; assert(hci); hops = hci->hops; assert(hops); sc = (struct hda_codec_softc *)hci->priv; assert(sc); for (i = 0; i < HDA_CODEC_STREAMS_COUNT; i++) { st = &sc->streams[i]; st->left_gain = HDA_CODEC_AMP_NUMSTEPS; st->right_gain = HDA_CODEC_AMP_NUMSTEPS; st->left_mute = HDA_CODEC_SET_AMP_GAIN_MUTE_MUTE; st->right_mute = HDA_CODEC_SET_AMP_GAIN_MUTE_MUTE; } DPRINTF("cad: 0x%x", hci->cad); if (!hops->signal) { DPRINTF("The controller ops does not implement \ the signal function"); return (-1); } return (hops->signal(hci)); } static int hda_codec_command(struct hda_codec_inst *hci, uint32_t cmd_data) { struct hda_codec_softc *sc = NULL; struct hda_ops *hops = NULL; uint8_t cad = 0, nid = 0; uint16_t verb = 0, payload = 0; uint32_t res = 0; /* 4 bits */ cad = (cmd_data >> HDA_CMD_CAD_SHIFT) & 0x0f; /* 8 bits */ nid = (cmd_data >> HDA_CMD_NID_SHIFT) & 0xff; if ((cmd_data & 0x70000) == 
0x70000) { /* 12 bits */ verb = (cmd_data >> HDA_CMD_VERB_12BIT_SHIFT) & 0x0fff; /* 8 bits */ payload = cmd_data & 0xff; } else { /* 4 bits */ verb = (cmd_data >> HDA_CMD_VERB_4BIT_SHIFT) & 0x0f; /* 16 bits */ payload = cmd_data & 0xffff; } assert(cad == hci->cad); assert(hci); hops = hci->hops; assert(hops); sc = (struct hda_codec_softc *)hci->priv; assert(sc); assert(nid < sc->no_nodes); if (!hops->response) { DPRINTF("The controller ops does not implement \ the response function"); return (-1); } switch (verb) { case HDA_CMD_VERB_GET_PARAMETER: res = sc->get_parameters[nid][payload]; break; case HDA_CMD_VERB_GET_CONN_LIST_ENTRY: res = sc->conn_list[nid][0]; break; case HDA_CMD_VERB_GET_PIN_WIDGET_CTRL: res = sc->pin_ctrl_default[nid]; break; case HDA_CMD_VERB_GET_PIN_SENSE: res = HDA_CODEC_PIN_SENSE_PRESENCE_PLUGGED; break; case HDA_CMD_VERB_GET_CONFIGURATION_DEFAULT: res = sc->conf_default[nid]; break; case HDA_CMD_VERB_GET_SUBSYSTEM_ID: res = sc->subsystem_id; break; default: assert(sc->verb_handlers); if (sc->verb_handlers[nid]) res = sc->verb_handlers[nid](sc, verb, payload); else DPRINTF("Unknown VERB: 0x%x", verb); break; } DPRINTF("cad: 0x%x nid: 0x%x verb: 0x%x payload: 0x%x response: 0x%x", cad, nid, verb, payload, res); return (hops->response(hci, res, HDA_CODEC_RESPONSE_EX_SOL)); } static int hda_codec_notify(struct hda_codec_inst *hci, uint8_t run, uint8_t stream, uint8_t dir) { struct hda_codec_softc *sc = NULL; struct hda_codec_stream *st = NULL; struct hda_audio_ctxt *actx = NULL; int i; int err; assert(hci); assert(stream); sc = (struct hda_codec_softc *)hci->priv; assert(sc); i = dir ? HDA_CODEC_STREAM_OUTPUT : HDA_CODEC_STREAM_INPUT; st = &sc->streams[i]; DPRINTF("run: %d, stream: 0x%x, st->stream: 0x%x dir: %d", run, stream, st->stream, dir); if (stream != st->stream) { DPRINTF("Stream not found"); return (0); } actx = &st->actx; if (run) err = hda_audio_ctxt_start(actx); else err = hda_audio_ctxt_stop(actx); return (err); } static int hda_codec_parse_format(uint16_t fmt, struct audio_params *params) { uint8_t div = 0; assert(params); /* Compute the Sample Rate */ params->rate = (fmt & HDA_CODEC_FMT_BASE_MASK) ? 
44100 : 48000; switch (fmt & HDA_CODEC_FMT_MULT_MASK) { case HDA_CODEC_FMT_MULT_2: params->rate *= 2; break; case HDA_CODEC_FMT_MULT_3: params->rate *= 3; break; case HDA_CODEC_FMT_MULT_4: params->rate *= 4; break; } div = (fmt >> HDA_CODEC_FMT_DIV_SHIFT) & HDA_CODEC_FMT_DIV_MASK; params->rate /= (div + 1); /* Compute the Bits per Sample */ switch (fmt & HDA_CODEC_FMT_BITS_MASK) { case HDA_CODEC_FMT_BITS_8: params->format = AFMT_U8; break; case HDA_CODEC_FMT_BITS_16: params->format = AFMT_S16_LE; break; case HDA_CODEC_FMT_BITS_24: params->format = AFMT_S24_LE; break; case HDA_CODEC_FMT_BITS_32: params->format = AFMT_S32_LE; break; default: DPRINTF("Unknown format bits: 0x%x", fmt & HDA_CODEC_FMT_BITS_MASK); return (-1); } /* Compute the Number of Channels */ params->channels = (fmt & HDA_CODEC_FMT_CHAN_MASK) + 1; return (0); } static uint32_t hda_codec_audio_output_nid(struct hda_codec_softc *sc, uint16_t verb, uint16_t payload) { struct hda_codec_stream *st = &sc->streams[HDA_CODEC_STREAM_OUTPUT]; int res; res = hda_codec_audio_inout_nid(st, verb, payload); return (res); } static void hda_codec_audio_output_do_transfer(void *arg) { struct hda_codec_softc *sc = (struct hda_codec_softc *)arg; struct hda_codec_inst *hci = NULL; struct hda_ops *hops = NULL; struct hda_codec_stream *st = NULL; struct audio *aud = NULL; int err; hci = sc->hci; assert(hci); hops = hci->hops; assert(hops); st = &sc->streams[HDA_CODEC_STREAM_OUTPUT]; aud = st->aud; err = hops->transfer(hci, st->stream, 1, st->buf, sizeof(st->buf)); if (err) return; err = audio_playback(aud, st->buf, sizeof(st->buf)); assert(!err); } static int hda_codec_audio_output_do_setup(void *arg) { struct hda_codec_softc *sc = (struct hda_codec_softc *)arg; struct hda_codec_stream *st = NULL; struct audio *aud = NULL; struct audio_params params; int err; st = &sc->streams[HDA_CODEC_STREAM_OUTPUT]; aud = st->aud; err = hda_codec_parse_format(st->fmt, ¶ms); if (err) return (-1); DPRINTF("rate: %d, channels: %d, format: 0x%x", params.rate, params.channels, params.format); return (audio_set_params(aud, ¶ms)); } static uint32_t hda_codec_audio_input_nid(struct hda_codec_softc *sc, uint16_t verb, uint16_t payload) { struct hda_codec_stream *st = &sc->streams[HDA_CODEC_STREAM_INPUT]; int res; res = hda_codec_audio_inout_nid(st, verb, payload); return (res); } static void hda_codec_audio_input_do_transfer(void *arg) { struct hda_codec_softc *sc = (struct hda_codec_softc *)arg; struct hda_codec_inst *hci = NULL; struct hda_ops *hops = NULL; struct hda_codec_stream *st = NULL; struct audio *aud = NULL; int err; hci = sc->hci; assert(hci); hops = hci->hops; assert(hops); st = &sc->streams[HDA_CODEC_STREAM_INPUT]; aud = st->aud; err = audio_record(aud, st->buf, sizeof(st->buf)); assert(!err); hops->transfer(hci, st->stream, 0, st->buf, sizeof(st->buf)); } static int hda_codec_audio_input_do_setup(void *arg) { struct hda_codec_softc *sc = (struct hda_codec_softc *)arg; struct hda_codec_stream *st = NULL; struct audio *aud = NULL; struct audio_params params; int err; st = &sc->streams[HDA_CODEC_STREAM_INPUT]; aud = st->aud; err = hda_codec_parse_format(st->fmt, ¶ms); if (err) return (-1); DPRINTF("rate: %d, channels: %d, format: 0x%x", params.rate, params.channels, params.format); return (audio_set_params(aud, ¶ms)); } static uint32_t hda_codec_audio_inout_nid(struct hda_codec_stream *st, uint16_t verb, uint16_t payload) { uint32_t res = 0; uint8_t mute = 0; uint8_t gain = 0; DPRINTF("%s verb: 0x%x, payload, 0x%x", st->actx.name, verb, payload); switch 
(verb) { case HDA_CMD_VERB_GET_CONV_FMT: res = st->fmt; break; case HDA_CMD_VERB_SET_CONV_FMT: st->fmt = payload; break; case HDA_CMD_VERB_GET_AMP_GAIN_MUTE: if (payload & HDA_CMD_GET_AMP_GAIN_MUTE_LEFT) { res = st->left_gain | st->left_mute; DPRINTF("GET_AMP_GAIN_MUTE_LEFT: 0x%x", res); } else { res = st->right_gain | st->right_mute; DPRINTF("GET_AMP_GAIN_MUTE_RIGHT: 0x%x", res); } break; case HDA_CMD_VERB_SET_AMP_GAIN_MUTE: mute = payload & HDA_CODEC_SET_AMP_GAIN_MUTE_MUTE; gain = payload & HDA_CODEC_SET_AMP_GAIN_MUTE_GAIN_MASK; if (payload & HDA_CMD_SET_AMP_GAIN_MUTE_LEFT) { st->left_mute = mute; st->left_gain = gain; DPRINTF("SET_AMP_GAIN_MUTE_LEFT: \ mute: 0x%x gain: 0x%x", mute, gain); } if (payload & HDA_CMD_SET_AMP_GAIN_MUTE_RIGHT) { st->right_mute = mute; st->right_gain = gain; DPRINTF("SET_AMP_GAIN_MUTE_RIGHT: \ mute: 0x%x gain: 0x%x", mute, gain); } break; case HDA_CMD_VERB_GET_CONV_STREAM_CHAN: res = (st->stream << 4) | st->channel; break; case HDA_CMD_VERB_SET_CONV_STREAM_CHAN: st->channel = payload & 0x0f; st->stream = (payload >> 4) & 0x0f; DPRINTF("st->channel: 0x%x st->stream: 0x%x", st->channel, st->stream); if (!st->stream) hda_audio_ctxt_stop(&st->actx); break; default: DPRINTF("Unknown VERB: 0x%x", verb); break; } return (res); } struct hda_codec_class hda_codec = { .name = "hda_codec", .init = hda_codec_init, .reset = hda_codec_reset, .command = hda_codec_command, .notify = hda_codec_notify, }; HDA_EMUL_SET(hda_codec); /* * HDA Audio Context module function definitions */ static void * hda_audio_ctxt_thr(void *arg) { struct hda_audio_ctxt *actx = arg; DPRINTF("Start Thread: %s", actx->name); pthread_mutex_lock(&actx->mtx); while (1) { while (!actx->run) pthread_cond_wait(&actx->cond, &actx->mtx); actx->do_transfer(actx->priv); } pthread_mutex_unlock(&actx->mtx); pthread_exit(NULL); return (NULL); } static int hda_audio_ctxt_init(struct hda_audio_ctxt *actx, const char *tname, transfer_func_t do_transfer, setup_func_t do_setup, void *priv) { int err; assert(actx); assert(tname); assert(do_transfer); assert(do_setup); assert(priv); memset(actx, 0, sizeof(*actx)); actx->run = 0; actx->do_transfer = do_transfer; actx->do_setup = do_setup; actx->priv = priv; if (strlen(tname) < sizeof(actx->name)) memcpy(actx->name, tname, strlen(tname) + 1); else strcpy(actx->name, "unknown"); err = pthread_mutex_init(&actx->mtx, NULL); assert(!err); err = pthread_cond_init(&actx->cond, NULL); assert(!err); err = pthread_create(&actx->tid, NULL, hda_audio_ctxt_thr, actx); assert(!err); pthread_set_name_np(actx->tid, tname); actx->started = 1; return (0); } static int hda_audio_ctxt_start(struct hda_audio_ctxt *actx) { int err = 0; assert(actx); assert(actx->started); /* The stream is supposed to be stopped */ if (actx->run) return (-1); pthread_mutex_lock(&actx->mtx); err = (* actx->do_setup)(actx->priv); if (!err) { actx->run = 1; pthread_cond_signal(&actx->cond); } pthread_mutex_unlock(&actx->mtx); return (err); } static int hda_audio_ctxt_stop(struct hda_audio_ctxt *actx) { actx->run = 0; return (0); } diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c index b460ee2988cc..1fba90945afb 100644 --- a/usr.sbin/bhyve/inout.c +++ b/usr.sbin/bhyve/inout.c @@ -1,299 +1,301 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" #include "inout.h" SET_DECLARE(inout_port_set, struct inout_port); #define MAX_IOPORTS (1 << 16) #define VERIFY_IOPORT(port, size) \ assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS) static struct { const char *name; int flags; inout_func_t handler; void *arg; } inout_handlers[MAX_IOPORTS]; static int default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { if (in) { switch (bytes) { case 4: *eax = 0xffffffff; break; case 2: *eax = 0xffff; break; case 1: *eax = 0xff; break; } } return (0); } static void register_default_iohandler(int start, int size) { struct inout_port iop; VERIFY_IOPORT(start, size); bzero(&iop, sizeof(iop)); iop.name = "default"; iop.port = start; iop.size = size; iop.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT; iop.handler = default_inout; register_inout(&iop); } int -emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) +emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { int addrsize, bytes, flags, in, port, prot, rep; uint32_t eax, val; inout_func_t handler; void *arg; int error, fault, retval; enum vm_reg_name idxreg; uint64_t gla, index, iterations, count; struct vm_inout_str *vis; struct iovec iov[2]; bytes = vmexit->u.inout.bytes; in = vmexit->u.inout.in; port = vmexit->u.inout.port; assert(port < MAX_IOPORTS); assert(bytes == 1 || bytes == 2 || bytes == 4); handler = inout_handlers[port].handler; - if (strict && handler == default_inout) + if (handler == default_inout && + get_config_bool_default("x86.strictio", false)) return (-1); flags = inout_handlers[port].flags; arg = inout_handlers[port].arg; if (in) { if (!(flags & IOPORT_F_IN)) return (-1); } else { if (!(flags & IOPORT_F_OUT)) return (-1); } retval = 0; if (vmexit->u.inout.string) { vis = &vmexit->u.inout_str; rep = vis->inout.rep; addrsize = vis->addrsize; prot = in ? PROT_WRITE : PROT_READ; assert(addrsize == 2 || addrsize == 4 || addrsize == 8); /* Index register */ idxreg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; index = vis->index & vie_size2mask(addrsize); /* Count register */ count = vis->count & vie_size2mask(addrsize); /* Limit number of back-to-back in/out emulations to 16 */ iterations = MIN(count, 16); while (iterations > 0) { assert(retval == 0); if (vie_calculate_gla(vis->paging.cpu_mode, vis->seg_name, &vis->seg_desc, index, bytes, addrsize, prot, &gla)) { vm_inject_gp(ctx, vcpu); break; } error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, bytes, prot, iov, nitems(iov), &fault); if (error) { retval = -1; /* Unrecoverable error */ break; } else if (fault) { retval = 0; /* Resume guest to handle fault */ break; } if (vie_alignment_check(vis->paging.cpl, bytes, vis->cr0, vis->rflags, gla)) { vm_inject_ac(ctx, vcpu, 0); break; } val = 0; if (!in) vm_copyin(ctx, vcpu, iov, &val, bytes); retval = handler(ctx, vcpu, in, port, bytes, &val, arg); if (retval != 0) break; if (in) vm_copyout(ctx, vcpu, &val, iov, bytes); /* Update index */ if (vis->rflags & PSL_D) index -= bytes; else index += bytes; count--; iterations--; } /* Update index register */ error = vie_update_register(ctx, vcpu, idxreg, index, addrsize); assert(error == 0); /* * Update count register only if the instruction had a repeat * prefix. */ if (rep) { error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX, count, addrsize); assert(error == 0); } /* Restart the instruction if more iterations remain */ if (retval == 0 && count != 0) { error = vm_restart_instruction(ctx, vcpu); assert(error == 0); } } else { eax = vmexit->u.inout.eax; val = eax & vie_size2mask(bytes); retval = handler(ctx, vcpu, in, port, bytes, &val, arg); if (retval == 0 && in) { eax &= ~vie_size2mask(bytes); eax |= val & vie_size2mask(bytes); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); } } return (retval); } void init_inout(void) { struct inout_port **iopp, *iop; /* * Set up the default handler for all ports */ register_default_iohandler(0, MAX_IOPORTS); /* * Overwrite with specified handlers */ SET_FOREACH(iopp, inout_port_set) { iop = *iopp; assert(iop->port < MAX_IOPORTS); inout_handlers[iop->port].name = iop->name; inout_handlers[iop->port].flags = iop->flags; inout_handlers[iop->port].handler = iop->handler; inout_handlers[iop->port].arg = NULL; } } int register_inout(struct inout_port *iop) { int i; VERIFY_IOPORT(iop->port, iop->size); /* * Verify that the new registration is not overwriting an already * allocated i/o range. */ if ((iop->flags & IOPORT_F_DEFAULT) == 0) { for (i = iop->port; i < iop->port + iop->size; i++) { if ((inout_handlers[i].flags & IOPORT_F_DEFAULT) == 0) return (-1); } } for (i = iop->port; i < iop->port + iop->size; i++) { inout_handlers[i].name = iop->name; inout_handlers[i].flags = iop->flags; inout_handlers[i].handler = iop->handler; inout_handlers[i].arg = iop->arg; } return (0); } int unregister_inout(struct inout_port *iop) { VERIFY_IOPORT(iop->port, iop->size); assert(inout_handlers[iop->port].name == iop->name); register_default_iohandler(iop->port, iop->size); return (0); } diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h index 45ad5f4ac654..afbae34c2dd0 100644 --- a/usr.sbin/bhyve/inout.h +++ b/usr.sbin/bhyve/inout.h @@ -1,80 +1,79 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _INOUT_H_ #define _INOUT_H_ #include struct vmctx; struct vm_exit; /* * inout emulation handlers return 0 on success and -1 on failure. */ typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg); struct inout_port { const char *name; int port; int size; int flags; inout_func_t handler; void *arg; }; #define IOPORT_F_IN 0x1 #define IOPORT_F_OUT 0x2 #define IOPORT_F_INOUT (IOPORT_F_IN | IOPORT_F_OUT) /* * The following flags are used internally and must not be used by * device models. */ #define IOPORT_F_DEFAULT 0x80000000 /* claimed by default handler */ #define INOUT_PORT(name, port, flags, handler) \ static struct inout_port __CONCAT(__inout_port, __LINE__) = { \ #name, \ (port), \ 1, \ (flags), \ (handler), \ 0 \ }; \ DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__)) void init_inout(void); -int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit, - int strict); +int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit); int register_inout(struct inout_port *iop); int unregister_inout(struct inout_port *iop); #endif /* _INOUT_H_ */ diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c index 649a6b09cb34..2e9059837011 100644 --- a/usr.sbin/bhyve/mevent.c +++ b/usr.sbin/bhyve/mevent.c @@ -1,492 +1,490 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Micro event library for FreeBSD, designed for a single i/o thread * using kqueue, and having events be persistent by default. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include "mevent.h" #define MEVENT_MAX 64 -extern const char *vmname; - static pthread_t mevent_tid; static int mevent_timid = 43; static int mevent_pipefd[2]; static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; struct mevent { void (*me_func)(int, enum ev_type, void *); #define me_msecs me_fd int me_fd; int me_timid; enum ev_type me_type; void *me_param; int me_cq; int me_state; /* Desired kevent flags. */ int me_closefd; LIST_ENTRY(mevent) me_list; }; static LIST_HEAD(listhead, mevent) global_head, change_head; static void mevent_qlock(void) { pthread_mutex_lock(&mevent_lmutex); } static void mevent_qunlock(void) { pthread_mutex_unlock(&mevent_lmutex); } static void mevent_pipe_read(int fd, enum ev_type type, void *param) { char buf[MEVENT_MAX]; int status; /* * Drain the pipe read side. The fd is non-blocking so this is * safe to do. */ do { status = read(fd, buf, sizeof(buf)); } while (status == MEVENT_MAX); } static void mevent_notify(void) { char c = '\0'; /* * If calling from outside the i/o thread, write a byte on the * pipe to force the i/o thread to exit the blocking kevent call. */ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { write(mevent_pipefd[1], &c, 1); } } static int mevent_kq_filter(struct mevent *mevp) { int retval; retval = 0; if (mevp->me_type == EVF_READ) retval = EVFILT_READ; if (mevp->me_type == EVF_WRITE) retval = EVFILT_WRITE; if (mevp->me_type == EVF_TIMER) retval = EVFILT_TIMER; if (mevp->me_type == EVF_SIGNAL) retval = EVFILT_SIGNAL; return (retval); } static int mevent_kq_flags(struct mevent *mevp) { return (mevp->me_state); } static int mevent_kq_fflags(struct mevent *mevp) { /* XXX nothing yet, perhaps EV_EOF for reads ? */ return (0); } static int mevent_build(int mfd, struct kevent *kev) { struct mevent *mevp, *tmpp; int i; i = 0; mevent_qlock(); LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { if (mevp->me_closefd) { /* * A close of the file descriptor will remove the * event */ close(mevp->me_fd); } else { if (mevp->me_type == EVF_TIMER) { kev[i].ident = mevp->me_timid; kev[i].data = mevp->me_msecs; } else { kev[i].ident = mevp->me_fd; kev[i].data = 0; } kev[i].filter = mevent_kq_filter(mevp); kev[i].flags = mevent_kq_flags(mevp); kev[i].fflags = mevent_kq_fflags(mevp); kev[i].udata = mevp; i++; } mevp->me_cq = 0; LIST_REMOVE(mevp, me_list); if (mevp->me_state & EV_DELETE) { free(mevp); } else { /* * We need to add the event only once, so we can * reset the EV_ADD bit after it has been propagated * to the kevent() arguments the first time. 
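 * As an illustrative aside (the names fd, callback and arg are
 * hypothetical), a caller typically queues work from another
 * thread with
 *
 *	evp = mevent_add(fd, EVF_READ, callback, arg);
 *	...
 *	mevent_disable(evp);
 *
 * and this function then turns each queued mevent into a single
 * kevent entry for the kqueue.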
*/ mevp->me_state &= ~EV_ADD; LIST_INSERT_HEAD(&global_head, mevp, me_list); } assert(i < MEVENT_MAX); } mevent_qunlock(); return (i); } static void mevent_handle(struct kevent *kev, int numev) { struct mevent *mevp; int i; for (i = 0; i < numev; i++) { mevp = kev[i].udata; /* XXX check for EV_ERROR ? */ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); } } static struct mevent * mevent_add_state(int tfd, enum ev_type type, void (*func)(int, enum ev_type, void *), void *param, int state) { struct mevent *lp, *mevp; if (tfd < 0 || func == NULL) { return (NULL); } mevp = NULL; mevent_qlock(); /* * Verify that the fd/type tuple is not present in any list */ LIST_FOREACH(lp, &global_head, me_list) { if (type != EVF_TIMER && lp->me_fd == tfd && lp->me_type == type) { goto exit; } } LIST_FOREACH(lp, &change_head, me_list) { if (type != EVF_TIMER && lp->me_fd == tfd && lp->me_type == type) { goto exit; } } /* * Allocate an entry, populate it, and add it to the change list. */ mevp = calloc(1, sizeof(struct mevent)); if (mevp == NULL) { goto exit; } if (type == EVF_TIMER) { mevp->me_msecs = tfd; mevp->me_timid = mevent_timid++; } else mevp->me_fd = tfd; mevp->me_type = type; mevp->me_func = func; mevp->me_param = param; LIST_INSERT_HEAD(&change_head, mevp, me_list); mevp->me_cq = 1; mevp->me_state = state; mevent_notify(); exit: mevent_qunlock(); return (mevp); } struct mevent * mevent_add(int tfd, enum ev_type type, void (*func)(int, enum ev_type, void *), void *param) { return (mevent_add_state(tfd, type, func, param, EV_ADD)); } struct mevent * mevent_add_disabled(int tfd, enum ev_type type, void (*func)(int, enum ev_type, void *), void *param) { return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE)); } static int mevent_update(struct mevent *evp, bool enable) { int newstate; mevent_qlock(); /* * It's not possible to enable/disable a deleted event */ assert((evp->me_state & EV_DELETE) == 0); newstate = evp->me_state; if (enable) { newstate |= EV_ENABLE; newstate &= ~EV_DISABLE; } else { newstate |= EV_DISABLE; newstate &= ~EV_ENABLE; } /* * No update needed if state isn't changing */ if (evp->me_state != newstate) { evp->me_state = newstate; /* * Place the entry onto the changed list if not * already there. */ if (evp->me_cq == 0) { evp->me_cq = 1; LIST_REMOVE(evp, me_list); LIST_INSERT_HEAD(&change_head, evp, me_list); mevent_notify(); } } mevent_qunlock(); return (0); } int mevent_enable(struct mevent *evp) { return (mevent_update(evp, true)); } int mevent_disable(struct mevent *evp) { return (mevent_update(evp, false)); } static int mevent_delete_event(struct mevent *evp, int closefd) { mevent_qlock(); /* * Place the entry onto the changed list if not already there, and * mark as to be deleted. 
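 * The mevent itself is freed later, by mevent_build(), once the
 * EV_DELETE state has been handed to kevent().  A caller that also
 * wants the descriptor closed uses, for example,
 *
 *	mevent_delete_close(evp);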
*/ if (evp->me_cq == 0) { evp->me_cq = 1; LIST_REMOVE(evp, me_list); LIST_INSERT_HEAD(&change_head, evp, me_list); mevent_notify(); } evp->me_state = EV_DELETE; if (closefd) evp->me_closefd = 1; mevent_qunlock(); return (0); } int mevent_delete(struct mevent *evp) { return (mevent_delete_event(evp, 0)); } int mevent_delete_close(struct mevent *evp) { return (mevent_delete_event(evp, 1)); } static void mevent_set_name(void) { pthread_set_name_np(mevent_tid, "mevent"); } void mevent_dispatch(void) { struct kevent changelist[MEVENT_MAX]; struct kevent eventlist[MEVENT_MAX]; struct mevent *pipev; int mfd; int numev; int ret; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif mevent_tid = pthread_self(); mevent_set_name(); mfd = kqueue(); assert(mfd > 0); #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_KQUEUE); if (caph_rights_limit(mfd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Open the pipe that will be used for other threads to force * the blocking kqueue call to exit by writing to it. Set the * descriptor to non-blocking. */ ret = pipe(mevent_pipefd); if (ret < 0) { perror("pipe"); exit(0); } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Add internal event handler for the pipe write fd */ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); assert(pipev != NULL); for (;;) { /* * Build changelist if required. * XXX the changelist can be put into the blocking call * to eliminate the extra syscall. Currently better for * debug. */ numev = mevent_build(mfd, changelist); if (numev) { ret = kevent(mfd, changelist, numev, NULL, 0, NULL); if (ret == -1) { perror("Error return from kevent change"); } } /* * Block awaiting events */ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); if (ret == -1 && errno != EINTR) { perror("Error return from kevent monitor"); } /* * Handle reported events */ mevent_handle(eventlist, ret); } } diff --git a/usr.sbin/bhyve/mevent_test.c b/usr.sbin/bhyve/mevent_test.c index 1a9c168da6de..3a7aac98a65c 100644 --- a/usr.sbin/bhyve/mevent_test.c +++ b/usr.sbin/bhyve/mevent_test.c @@ -1,258 +1,256 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Test program for the micro event library. Set up a simple TCP echo * service. * * cc mevent_test.c mevent.c -lpthread */ #include #include #include #include #include #include #include #include #include #include #include "mevent.h" #define TEST_PORT 4321 static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER; static struct mevent *tevp; -char *vmname = "test vm"; - #define MEVENT_ECHO /* Number of timer events to capture */ #define TEVSZ 4096 uint64_t tevbuf[TEVSZ]; static void timer_print(void) { uint64_t min, max, diff, sum, tsc_freq; size_t len; int j; min = UINT64_MAX; max = 0; sum = 0; len = sizeof(tsc_freq); sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0); for (j = 1; j < TEVSZ; j++) { /* Convert a tsc diff into microseconds */ diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq; sum += diff; if (min > diff) min = diff; if (max < diff) max = diff; } printf("timers done: usecs, min %ld, max %ld, mean %ld\n", min, max, sum/(TEVSZ - 1)); } static void timer_callback(int fd, enum ev_type type, void *param) { static int i; if (i >= TEVSZ) abort(); tevbuf[i++] = rdtsc(); if (i == TEVSZ) { mevent_delete(tevp); timer_print(); } } #ifdef MEVENT_ECHO struct esync { pthread_mutex_t e_mt; pthread_cond_t e_cond; }; static void echoer_callback(int fd, enum ev_type type, void *param) { struct esync *sync = param; pthread_mutex_lock(&sync->e_mt); pthread_cond_signal(&sync->e_cond); pthread_mutex_unlock(&sync->e_mt); } static void * echoer(void *param) { struct esync sync; struct mevent *mev; char buf[128]; int fd = (int)(uintptr_t) param; int len; pthread_mutex_init(&sync.e_mt, NULL); pthread_cond_init(&sync.e_cond, NULL); pthread_mutex_lock(&sync.e_mt); mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); if (mev == NULL) { printf("Could not allocate echoer event\n"); exit(4); } while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { len = read(fd, buf, sizeof(buf)); if (len > 0) { write(fd, buf, len); write(0, buf, len); } else { break; } } mevent_delete_close(mev); pthread_mutex_unlock(&sync.e_mt); pthread_mutex_destroy(&sync.e_mt); pthread_cond_destroy(&sync.e_cond); return (NULL); } #else static void * echoer(void *param) { char buf[128]; int fd = (int)(uintptr_t) param; int len; while ((len = read(fd, buf, sizeof(buf))) > 0) { write(1, buf, len); } return (NULL); } #endif /* MEVENT_ECHO */ static void acceptor_callback(int fd, enum ev_type type, void *param) { pthread_mutex_lock(&accept_mutex); pthread_cond_signal(&accept_condvar); pthread_mutex_unlock(&accept_mutex); } static void * acceptor(void *param) { struct sockaddr_in sin; pthread_t tid; int news; int s; static int first; if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { perror("cannot create socket"); exit(4); } sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_ANY); sin.sin_port = htons(TEST_PORT); if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { perror("cannot bind 
socket"); exit(4); } if (listen(s, 1) < 0) { perror("cannot listen socket"); exit(4); } (void) mevent_add(s, EVF_READ, acceptor_callback, NULL); pthread_mutex_lock(&accept_mutex); while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) { news = accept(s, NULL, NULL); if (news < 0) { perror("accept error"); } else { static int first = 1; if (first) { /* * Start a timer */ first = 0; tevp = mevent_add(1, EVF_TIMER, timer_callback, NULL); } printf("incoming connection, spawning thread\n"); pthread_create(&tid, NULL, echoer, (void *)(uintptr_t)news); } } return (NULL); } main() { pthread_t tid; pthread_create(&tid, NULL, acceptor, NULL); mevent_dispatch(); } diff --git a/usr.sbin/bhyve/net_backends.c b/usr.sbin/bhyve/net_backends.c index 884ffb824111..30c26aea458b 100644 --- a/usr.sbin/bhyve/net_backends.c +++ b/usr.sbin/bhyve/net_backends.c @@ -1,1108 +1,1094 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 Vincenzo Maffione * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * This file implements multiple network backends (tap, netmap, ...), * to be used by network frontends such as virtio-net and e1000. * The API to access the backend (e.g. send/receive packets, negotiate * features) is exported by net_backends.h. */ #include __FBSDID("$FreeBSD$"); #include /* u_short etc */ #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #define NETMAP_WITH_LIBS #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NETGRAPH #include #include #include #endif +#include "config.h" #include "debug.h" #include "iov.h" #include "mevent.h" #include "net_backends.h" +#include "pci_emul.h" #include /* * Each network backend registers a set of function pointers that are * used to implement the net backends API. * This might need to be exposed if we implement backends in separate files. */ struct net_backend { const char *prefix; /* prefix matching this backend */ /* * Routines used to initialize and cleanup the resources needed * by a backend. The cleanup function is used internally, * and should not be called by the frontend. 
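 * As an illustrative sketch (the "null" names are hypothetical), a
 * new backend fills in these callbacks and registers itself on the
 * linker set, mirroring tap_backend below:
 *
 *	static struct net_backend null_backend = {
 *		.prefix = "null",
 *		.priv_size = 0,
 *		.init = null_init,
 *		.cleanup = null_cleanup,
 *		...
 *	};
 *	DATA_SET(net_backend_set, null_backend);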
*/ int (*init)(struct net_backend *be, const char *devname, - const char *opts, net_be_rxeof_t cb, void *param); + nvlist_t *nvl, net_be_rxeof_t cb, void *param); void (*cleanup)(struct net_backend *be); /* * Called to serve a guest transmit request. The scatter-gather * vector provided by the caller has 'iovcnt' elements and contains * the packet to send. */ ssize_t (*send)(struct net_backend *be, const struct iovec *iov, int iovcnt); /* * Get the length of the next packet that can be received from * the backend. If no packets are currently available, this * function returns 0. */ ssize_t (*peek_recvlen)(struct net_backend *be); /* * Called to receive a packet from the backend. When the function * returns a positive value 'len', the scatter-gather vector * provided by the caller contains a packet with such length. * The function returns 0 if the backend doesn't have a new packet to * receive. */ ssize_t (*recv)(struct net_backend *be, const struct iovec *iov, int iovcnt); /* * Ask the backend to enable or disable receive operation in the * backend. On return from a disable operation, it is guaranteed * that the receive callback won't be called until receive is * enabled again. Note however that it is up to the caller to make * sure that netbe_recv() is not currently being executed by another * thread. */ void (*recv_enable)(struct net_backend *be); void (*recv_disable)(struct net_backend *be); /* * Ask the backend for the virtio-net features it is able to * support. Possible features are TSO, UFO and checksum offloading * in both rx and tx direction and for both IPv4 and IPv6. */ uint64_t (*get_cap)(struct net_backend *be); /* * Tell the backend to enable/disable the specified virtio-net * features (capabilities). */ int (*set_cap)(struct net_backend *be, uint64_t features, unsigned int vnet_hdr_len); struct pci_vtnet_softc *sc; int fd; /* * Length of the virtio-net header used by the backend and the * frontend, respectively. A zero value means that the header * is not used. */ unsigned int be_vnet_hdr_len; unsigned int fe_vnet_hdr_len; /* Size of backend-specific private data. */ size_t priv_size; /* Room for backend-specific data. */ char opaque[0]; }; SET_DECLARE(net_backend_set, struct net_backend); #define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr) #define WPRINTF(params) PRINTLN params /* * The tap backend */ struct tap_priv { struct mevent *mevp; /* * A bounce buffer that allows us to implement the peek_recvlen * callback. In the future we may get the same information from * the kevent data. 
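 * Concretely: tap_peek_recvlen() may read() the next packet into
 * bbuf and record its length in bbuflen, so that a subsequent
 * tap_recv() drains the bounce buffer instead of the descriptor.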
*/ char bbuf[1 << 16]; ssize_t bbuflen; }; static void tap_cleanup(struct net_backend *be) { struct tap_priv *priv = (struct tap_priv *)be->opaque; if (priv->mevp) { mevent_delete(priv->mevp); } if (be->fd != -1) { close(be->fd); be->fd = -1; } } static int tap_init(struct net_backend *be, const char *devname, - const char *opts, net_be_rxeof_t cb, void *param) + nvlist_t *nvl, net_be_rxeof_t cb, void *param) { struct tap_priv *priv = (struct tap_priv *)be->opaque; char tbuf[80]; int opt = 1; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif if (cb == NULL) { WPRINTF(("TAP backend requires non-NULL callback")); return (-1); } strcpy(tbuf, "/dev/"); strlcat(tbuf, devname, sizeof(tbuf)); be->fd = open(tbuf, O_RDWR); if (be->fd == -1) { WPRINTF(("open of tap device %s failed", tbuf)); goto error; } /* * Set non-blocking and register for read * notifications with the event loop */ if (ioctl(be->fd, FIONBIO, &opt) < 0) { WPRINTF(("tap device O_NONBLOCK failed")); goto error; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(be->fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif memset(priv->bbuf, 0, sizeof(priv->bbuf)); priv->bbuflen = 0; priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); if (priv->mevp == NULL) { WPRINTF(("Could not register event")); goto error; } return (0); error: tap_cleanup(be); return (-1); } /* * Called to send a buffer chain out to the tap device */ static ssize_t tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) { return (writev(be->fd, iov, iovcnt)); } static ssize_t tap_peek_recvlen(struct net_backend *be) { struct tap_priv *priv = (struct tap_priv *)be->opaque; ssize_t ret; if (priv->bbuflen > 0) { /* * We already have a packet in the bounce buffer. * Just return its length. */ return priv->bbuflen; } /* * Read the next packet (if any) into the bounce buffer, so * that we get to know its length and we can return that * to the caller. */ ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf)); if (ret < 0 && errno == EWOULDBLOCK) { return (0); } if (ret > 0) priv->bbuflen = ret; return (ret); } static ssize_t tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) { struct tap_priv *priv = (struct tap_priv *)be->opaque; ssize_t ret; if (priv->bbuflen > 0) { /* * A packet is available in the bounce buffer, so * we read it from there. */ ret = buf_to_iov(priv->bbuf, priv->bbuflen, iov, iovcnt, 0); /* Mark the bounce buffer as empty. */ priv->bbuflen = 0; return (ret); } ret = readv(be->fd, iov, iovcnt); if (ret < 0 && errno == EWOULDBLOCK) { return (0); } return (ret); } static void tap_recv_enable(struct net_backend *be) { struct tap_priv *priv = (struct tap_priv *)be->opaque; mevent_enable(priv->mevp); } static void tap_recv_disable(struct net_backend *be) { struct tap_priv *priv = (struct tap_priv *)be->opaque; mevent_disable(priv->mevp); } static uint64_t tap_get_cap(struct net_backend *be) { return (0); /* no capabilities for now */ } static int tap_set_cap(struct net_backend *be, uint64_t features, unsigned vnet_hdr_len) { return ((features || vnet_hdr_len) ? 
-1 : 0); } static struct net_backend tap_backend = { .prefix = "tap", .priv_size = sizeof(struct tap_priv), .init = tap_init, .cleanup = tap_cleanup, .send = tap_send, .peek_recvlen = tap_peek_recvlen, .recv = tap_recv, .recv_enable = tap_recv_enable, .recv_disable = tap_recv_disable, .get_cap = tap_get_cap, .set_cap = tap_set_cap, }; /* A clone of the tap backend, with a different prefix. */ static struct net_backend vmnet_backend = { .prefix = "vmnet", .priv_size = sizeof(struct tap_priv), .init = tap_init, .cleanup = tap_cleanup, .send = tap_send, .peek_recvlen = tap_peek_recvlen, .recv = tap_recv, .recv_enable = tap_recv_enable, .recv_disable = tap_recv_disable, .get_cap = tap_get_cap, .set_cap = tap_set_cap, }; DATA_SET(net_backend_set, tap_backend); DATA_SET(net_backend_set, vmnet_backend); #ifdef NETGRAPH /* * Netgraph backend */ #define NG_SBUF_MAX_SIZE (4 * 1024 * 1024) static int ng_init(struct net_backend *be, const char *devname, - const char *opts, net_be_rxeof_t cb, void *param) + nvlist_t *nvl, net_be_rxeof_t cb, void *param) { struct tap_priv *p = (struct tap_priv *)be->opaque; struct ngm_connect ngc; - char *ngopts, *tofree; - char nodename[NG_NODESIZ]; + const char *value, *nodename; int sbsz; int ctrl_sock; int flags; - int path_provided; - int peerhook_provided; - int socket_provided; unsigned long maxsbsz; size_t msbsz; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif if (cb == NULL) { WPRINTF(("Netgraph backend requires non-NULL callback")); return (-1); } be->fd = -1; memset(&ngc, 0, sizeof(ngc)); - strncpy(ngc.ourhook, "vmlink", NG_HOOKSIZ - 1); - - tofree = ngopts = strdup(opts); - - if (ngopts == NULL) { - WPRINTF(("strdup error")); - return (-1); - } - - socket_provided = 0; - path_provided = 0; - peerhook_provided = 0; - - while (ngopts != NULL) { - char *value = ngopts; - char *key; - - key = strsep(&value, "="); - if (value == NULL) - break; - ngopts = value; - (void) strsep(&ngopts, ","); - - if (strcmp(key, "socket") == 0) { - strncpy(nodename, value, NG_NODESIZ - 1); - socket_provided = 1; - } else if (strcmp(key, "path") == 0) { - strncpy(ngc.path, value, NG_PATHSIZ - 1); - path_provided = 1; - } else if (strcmp(key, "hook") == 0) { - strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1); - } else if (strcmp(key, "peerhook") == 0) { - strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1); - peerhook_provided = 1; - } - } - - free(tofree); - - if (!path_provided) { + value = get_config_value_node(nvl, "path"); + if (value == NULL) { WPRINTF(("path must be provided")); return (-1); } + strncpy(ngc.path, value, NG_PATHSIZ - 1); - if (!peerhook_provided) { + value = get_config_value_node(nvl, "hook"); + if (value == NULL) + value = "vmlink"; + strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1); + + value = get_config_value_node(nvl, "peerhook"); + if (value == NULL) { WPRINTF(("peer hook must be provided")); return (-1); } + strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1); - if (NgMkSockNode(socket_provided ? 
nodename : NULL, + nodename = get_config_value_node(nvl, "socket"); + if (NgMkSockNode(nodename, &ctrl_sock, &be->fd) < 0) { WPRINTF(("can't get Netgraph sockets")); return (-1); } if (NgSendMsg(ctrl_sock, ".", NGM_GENERIC_COOKIE, NGM_CONNECT, &ngc, sizeof(ngc)) < 0) { WPRINTF(("can't connect to node")); close(ctrl_sock); goto error; } close(ctrl_sock); flags = fcntl(be->fd, F_GETFL); if (flags < 0) { WPRINTF(("can't get socket flags")); goto error; } if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) { WPRINTF(("can't set O_NONBLOCK flag")); goto error; } /* * The default ng_socket(4) buffer's size is too low. * Calculate the minimum value between NG_SBUF_MAX_SIZE * and kern.ipc.maxsockbuf. */ msbsz = sizeof(maxsbsz); if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz, NULL, 0) < 0) { WPRINTF(("can't get 'kern.ipc.maxsockbuf' value")); goto error; } /* * We can't set the socket buffer size to kern.ipc.maxsockbuf value, * as it takes into account the mbuf(9) overhead. */ maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES); sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz); if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz, sizeof(sbsz)) < 0) { WPRINTF(("can't set TX buffer size")); goto error; } if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz, sizeof(sbsz)) < 0) { WPRINTF(("can't set RX buffer size")); goto error; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(be->fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif memset(p->bbuf, 0, sizeof(p->bbuf)); p->bbuflen = 0; p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); if (p->mevp == NULL) { WPRINTF(("Could not register event")); goto error; } return (0); error: tap_cleanup(be); return (-1); } static struct net_backend ng_backend = { .prefix = "netgraph", .priv_size = sizeof(struct tap_priv), .init = ng_init, .cleanup = tap_cleanup, .send = tap_send, .peek_recvlen = tap_peek_recvlen, .recv = tap_recv, .recv_enable = tap_recv_enable, .recv_disable = tap_recv_disable, .get_cap = tap_get_cap, .set_cap = tap_set_cap, }; DATA_SET(net_backend_set, ng_backend); #endif /* NETGRAPH */ /* * The netmap backend */ /* The virtio-net features supported by netmap. 
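 * A frontend negotiates a subset of these through
 * netbe_get_cap()/netbe_set_cap(); as a sketch (the 'wanted' mask
 * is hypothetical):
 *
 *	caps = netbe_get_cap(be) & wanted;
 *	netbe_set_cap(be, caps, VNET_HDR_LEN);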
*/ #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \ VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \ VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \ VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO) struct netmap_priv { char ifname[IFNAMSIZ]; struct nm_desc *nmd; uint16_t memid; struct netmap_ring *rx; struct netmap_ring *tx; struct mevent *mevp; net_be_rxeof_t cb; void *cb_param; }; static void nmreq_init(struct nmreq *req, char *ifname) { memset(req, 0, sizeof(*req)); strlcpy(req->nr_name, ifname, sizeof(req->nr_name)); req->nr_version = NETMAP_API; } static int netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len) { int err; struct nmreq req; struct netmap_priv *priv = (struct netmap_priv *)be->opaque; nmreq_init(&req, priv->ifname); req.nr_cmd = NETMAP_BDG_VNET_HDR; req.nr_arg1 = vnet_hdr_len; err = ioctl(be->fd, NIOCREGIF, &req); if (err) { WPRINTF(("Unable to set vnet header length %d", vnet_hdr_len)); return (err); } be->be_vnet_hdr_len = vnet_hdr_len; return (0); } static int netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len) { int prev_hdr_len = be->be_vnet_hdr_len; int ret; if (vnet_hdr_len == prev_hdr_len) { return (1); } ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len); if (ret) { return (0); } netmap_set_vnet_hdr_len(be, prev_hdr_len); return (1); } static uint64_t netmap_get_cap(struct net_backend *be) { return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ? NETMAP_FEATURES : 0); } static int netmap_set_cap(struct net_backend *be, uint64_t features, unsigned vnet_hdr_len) { return (netmap_set_vnet_hdr_len(be, vnet_hdr_len)); } static int netmap_init(struct net_backend *be, const char *devname, - const char *opts, net_be_rxeof_t cb, void *param) + nvlist_t *nvl, net_be_rxeof_t cb, void *param) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; strlcpy(priv->ifname, devname, sizeof(priv->ifname)); priv->ifname[sizeof(priv->ifname) - 1] = '\0'; priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL); if (priv->nmd == NULL) { WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)", devname, strerror(errno))); free(priv); return (-1); } priv->memid = priv->nmd->req.nr_arg2; priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0); priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0); priv->cb = cb; priv->cb_param = param; be->fd = priv->nmd->fd; priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); if (priv->mevp == NULL) { WPRINTF(("Could not register event")); return (-1); } return (0); } static void netmap_cleanup(struct net_backend *be) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; if (priv->mevp) { mevent_delete(priv->mevp); } if (priv->nmd) { nm_close(priv->nmd); } be->fd = -1; } static ssize_t netmap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; struct netmap_ring *ring; ssize_t totlen = 0; int nm_buf_size; int nm_buf_len; uint32_t head; void *nm_buf; int j; ring = priv->tx; head = ring->head; if (head == ring->tail) { WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt))); goto txsync; } nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); nm_buf_size = ring->nr_buf_size; nm_buf_len = 0; for (j = 0; j < iovcnt; j++) { int iov_frag_size = iov[j].iov_len; void *iov_frag_buf = iov[j].iov_base; totlen += iov_frag_size; /* * Split each iovec fragment over more netmap slots, if * necessary. */ for (;;) { int copylen; copylen = iov_frag_size < nm_buf_size ? 
iov_frag_size : nm_buf_size; memcpy(nm_buf, iov_frag_buf, copylen); iov_frag_buf += copylen; iov_frag_size -= copylen; nm_buf += copylen; nm_buf_size -= copylen; nm_buf_len += copylen; if (iov_frag_size == 0) { break; } ring->slot[head].len = nm_buf_len; ring->slot[head].flags = NS_MOREFRAG; head = nm_ring_next(ring, head); if (head == ring->tail) { /* * We ran out of netmap slots while * splitting the iovec fragments. */ WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt))); goto txsync; } nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); nm_buf_size = ring->nr_buf_size; nm_buf_len = 0; } } /* Complete the last slot, which must not have NS_MOREFRAG set. */ ring->slot[head].len = nm_buf_len; ring->slot[head].flags = 0; head = nm_ring_next(ring, head); /* Now update ring->head and ring->cur. */ ring->head = ring->cur = head; txsync: ioctl(be->fd, NIOCTXSYNC, NULL); return (totlen); } static ssize_t netmap_peek_recvlen(struct net_backend *be) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; struct netmap_ring *ring = priv->rx; uint32_t head = ring->head; ssize_t totlen = 0; while (head != ring->tail) { struct netmap_slot *slot = ring->slot + head; totlen += slot->len; if ((slot->flags & NS_MOREFRAG) == 0) break; head = nm_ring_next(ring, head); } return (totlen); } static ssize_t netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; struct netmap_slot *slot = NULL; struct netmap_ring *ring; void *iov_frag_buf; int iov_frag_size; ssize_t totlen = 0; uint32_t head; assert(iovcnt); ring = priv->rx; head = ring->head; iov_frag_buf = iov->iov_base; iov_frag_size = iov->iov_len; do { int nm_buf_len; void *nm_buf; if (head == ring->tail) { return (0); } slot = ring->slot + head; nm_buf = NETMAP_BUF(ring, slot->buf_idx); nm_buf_len = slot->len; for (;;) { int copylen = nm_buf_len < iov_frag_size ? nm_buf_len : iov_frag_size; memcpy(iov_frag_buf, nm_buf, copylen); nm_buf += copylen; nm_buf_len -= copylen; iov_frag_buf += copylen; iov_frag_size -= copylen; totlen += copylen; if (nm_buf_len == 0) { break; } iov++; iovcnt--; if (iovcnt == 0) { /* No space to receive. */ WPRINTF(("Short iov, drop %zd bytes", totlen)); return (-ENOSPC); } iov_frag_buf = iov->iov_base; iov_frag_size = iov->iov_len; } head = nm_ring_next(ring, head); } while (slot->flags & NS_MOREFRAG); /* Release slots to netmap. */ ring->head = ring->cur = head; return (totlen); } static void netmap_recv_enable(struct net_backend *be) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; mevent_enable(priv->mevp); } static void netmap_recv_disable(struct net_backend *be) { struct netmap_priv *priv = (struct netmap_priv *)be->opaque; mevent_disable(priv->mevp); } static struct net_backend netmap_backend = { .prefix = "netmap", .priv_size = sizeof(struct netmap_priv), .init = netmap_init, .cleanup = netmap_cleanup, .send = netmap_send, .peek_recvlen = netmap_peek_recvlen, .recv = netmap_recv, .recv_enable = netmap_recv_enable, .recv_disable = netmap_recv_disable, .get_cap = netmap_get_cap, .set_cap = netmap_set_cap, }; /* A clone of the netmap backend, with a different prefix. 
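 * netbe_init() selects a backend by prefix match, so a device name
 * such as "vale0:1" (an example) picks this clone while reusing all
 * of the netmap callbacks.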
*/ static struct net_backend vale_backend = { .prefix = "vale", .priv_size = sizeof(struct netmap_priv), .init = netmap_init, .cleanup = netmap_cleanup, .send = netmap_send, .peek_recvlen = netmap_peek_recvlen, .recv = netmap_recv, .recv_enable = netmap_recv_enable, .recv_disable = netmap_recv_disable, .get_cap = netmap_get_cap, .set_cap = netmap_set_cap, }; DATA_SET(net_backend_set, netmap_backend); DATA_SET(net_backend_set, vale_backend); +int +netbe_legacy_config(nvlist_t *nvl, const char *opts) +{ + char *backend, *cp; + + if (opts == NULL) + return (0); + + cp = strchr(opts, ','); + if (cp == NULL) { + set_config_value_node(nvl, "backend", opts); + return (0); + } + backend = strndup(opts, cp - opts); + set_config_value_node(nvl, "backend", backend); + free(backend); + return (pci_parse_legacy_config(nvl, cp + 1)); +} + /* * Initialize a backend and attach to the frontend. * This is called during frontend initialization. - * @pbe is a pointer to the backend to be initialized + * @ret is a pointer to the backend to be initialized * @devname is the backend-name as supplied on the command line, * e.g. -s 2:0,frontend-name,backend-name[,other-args] * @cb is the receive callback supplied by the frontend, * and it is invoked in the event loop when a receive * event is generated in the hypervisor, * @param is a pointer to the frontend, and normally used as * the argument for the callback. */ int -netbe_init(struct net_backend **ret, const char *opts, net_be_rxeof_t cb, +netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb, void *param) { struct net_backend **pbe, *nbe, *tbe = NULL; + const char *value; char *devname; - char *options; int err; - devname = options = strdup(opts); - - if (devname == NULL) { + value = get_config_value_node(nvl, "backend"); + if (value == NULL) { return (-1); } - - devname = strsep(&options, ","); + devname = strdup(value); /* * Find the network backend that matches the user-provided * device name. net_backend_set is built using a linker set. */ SET_FOREACH(pbe, net_backend_set) { if (strncmp(devname, (*pbe)->prefix, strlen((*pbe)->prefix)) == 0) { tbe = *pbe; assert(tbe->init != NULL); assert(tbe->cleanup != NULL); assert(tbe->send != NULL); assert(tbe->recv != NULL); assert(tbe->get_cap != NULL); assert(tbe->set_cap != NULL); break; } } *ret = NULL; if (tbe == NULL) { free(devname); return (EINVAL); } nbe = calloc(1, sizeof(*nbe) + tbe->priv_size); *nbe = *tbe; /* copy the template */ nbe->fd = -1; nbe->sc = param; nbe->be_vnet_hdr_len = 0; nbe->fe_vnet_hdr_len = 0; /* Initialize the backend. */ - err = nbe->init(nbe, devname, options, cb, param); + err = nbe->init(nbe, devname, nvl, cb, param); if (err) { free(devname); free(nbe); return (err); } *ret = nbe; free(devname); return (0); } void netbe_cleanup(struct net_backend *be) { if (be != NULL) { be->cleanup(be); free(be); } } uint64_t netbe_get_cap(struct net_backend *be) { assert(be != NULL); return (be->get_cap(be)); } int netbe_set_cap(struct net_backend *be, uint64_t features, unsigned vnet_hdr_len) { int ret; assert(be != NULL); /* There are only three valid lengths, i.e., 0, 10 and 12. 
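 * VNET_HDR_LEN is sizeof(struct virtio_net_rxhdr), i.e. 12 bytes;
 * the 10-byte variant is the same header without the trailing
 * vrh_bufs field, used when VIRTIO_NET_F_MRG_RXBUF has not been
 * negotiated, and 0 means the header is not used at all.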
*/ if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) return (-1); be->fe_vnet_hdr_len = vnet_hdr_len; ret = be->set_cap(be, features, vnet_hdr_len); assert(be->be_vnet_hdr_len == 0 || be->be_vnet_hdr_len == be->fe_vnet_hdr_len); return (ret); } ssize_t netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) { return (be->send(be, iov, iovcnt)); } ssize_t netbe_peek_recvlen(struct net_backend *be) { return (be->peek_recvlen(be)); } /* * Try to read a packet from the backend, without blocking. * If no packets are available, return 0. In case of success, return * the length of the packet just read. Return -1 in case of errors. */ ssize_t netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) { return (be->recv(be, iov, iovcnt)); } /* * Read a packet from the backend and discard it. * Returns the size of the discarded packet or zero if no packet was available. * A negative error code is returned in case of read error. */ ssize_t netbe_rx_discard(struct net_backend *be) { /* * MP note: the dummybuf is only used to discard frames, * so there is no need for it to be per-vtnet or locked. * We only make it large enough for TSO-sized segment. */ static uint8_t dummybuf[65536 + 64]; struct iovec iov; iov.iov_base = dummybuf; iov.iov_len = sizeof(dummybuf); return netbe_recv(be, &iov, 1); } void netbe_rx_disable(struct net_backend *be) { return be->recv_disable(be); } void netbe_rx_enable(struct net_backend *be) { return be->recv_enable(be); } size_t netbe_get_vnet_hdr_len(struct net_backend *be) { return (be->be_vnet_hdr_len); } diff --git a/usr.sbin/bhyve/net_backends.h b/usr.sbin/bhyve/net_backends.h index b55437fc7bbc..bc7834546bc1 100644 --- a/usr.sbin/bhyve/net_backends.h +++ b/usr.sbin/bhyve/net_backends.h @@ -1,95 +1,96 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 Vincenzo Maffione * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __NET_BACKENDS_H__ #define __NET_BACKENDS_H__ #include /* Opaque type representing a network backend. */ typedef struct net_backend net_backend_t; /* Interface between network frontends and the network backends. 
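 * Typical frontend usage, as a sketch (rx_cb and sc are
 * hypothetical, error handling omitted):
 *
 *	net_backend_t *be;
 *	if (netbe_init(&be, nvl, rx_cb, sc) != 0)
 *		return (-1);
 *	netbe_rx_enable(be);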
*/ typedef void (*net_be_rxeof_t)(int, enum ev_type, void *param); -int netbe_init(net_backend_t **be, const char *opts, net_be_rxeof_t cb, +int netbe_init(net_backend_t **be, nvlist_t *nvl, net_be_rxeof_t cb, void *param); +int netbe_legacy_config(nvlist_t *nvl, const char *opts); void netbe_cleanup(net_backend_t *be); uint64_t netbe_get_cap(net_backend_t *be); int netbe_set_cap(net_backend_t *be, uint64_t cap, unsigned vnet_hdr_len); size_t netbe_get_vnet_hdr_len(net_backend_t *be); ssize_t netbe_send(net_backend_t *be, const struct iovec *iov, int iovcnt); ssize_t netbe_peek_recvlen(net_backend_t *be); ssize_t netbe_recv(net_backend_t *be, const struct iovec *iov, int iovcnt); ssize_t netbe_rx_discard(net_backend_t *be); void netbe_rx_disable(net_backend_t *be); void netbe_rx_enable(net_backend_t *be); /* * Network device capabilities taken from the VirtIO standard. * Despite the name, these capabilities can be used by different frontents * (virtio-net, ptnet) and supported by different backends (netmap, tap, ...). */ #define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ #define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ #define VIRTIO_NET_F_MTU (1 << 3) /* initial MTU advice */ #define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ #define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ #define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ #define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ #define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ #define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ #define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ #define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ #define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ #define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ #define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ #define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ #define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ #define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ #define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ #define VIRTIO_NET_F_GUEST_ANNOUNCE \ (1 << 21) /* guest can send gratuitous pkts */ #define VIRTIO_NET_F_MQ (1 << 22) /* host supports multiple VQ pairs */ /* * Fixed network header size */ struct virtio_net_rxhdr { uint8_t vrh_flags; uint8_t vrh_gso_type; uint16_t vrh_hdr_len; uint16_t vrh_gso_size; uint16_t vrh_csum_start; uint16_t vrh_csum_offset; uint16_t vrh_bufs; } __packed; #endif /* __NET_BACKENDS_H__ */ diff --git a/usr.sbin/bhyve/net_utils.c b/usr.sbin/bhyve/net_utils.c index d602cac3eb0a..8088fa89a50f 100644 --- a/usr.sbin/bhyve/net_utils.c +++ b/usr.sbin/bhyve/net_utils.c @@ -1,122 +1,123 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "net_utils.h" int -net_parsemac(char *mac_str, uint8_t *mac_addr) +net_parsemac(const char *mac_str, uint8_t *mac_addr) { struct ether_addr *ea; char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; if (mac_str == NULL) return (EINVAL); ea = ether_aton(mac_str); if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { EPRINTLN("Invalid MAC %s", mac_str); return (EINVAL); } else memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); return (0); } int net_parsemtu(const char *mtu_str, unsigned long *mtu) { char *end; unsigned long val; assert(mtu_str != NULL); if (*mtu_str == '-') goto err; val = strtoul(mtu_str, &end, 0); if (*end != '\0') goto err; if (val == ULONG_MAX) return (ERANGE); if (val == 0 && errno == EINVAL) return (EINVAL); *mtu = val; return (0); err: errno = EINVAL; return (EINVAL); } void net_genmac(struct pci_devinst *pi, uint8_t *macaddr) { /* * The default MAC address is the standard NetApp OUI of 00-a0-98, * followed by an MD5 of the PCI slot/func number and dev name */ MD5_CTX mdctx; unsigned char digest[16]; char nstr[80]; snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, - pi->pi_func, vmname); + pi->pi_func, get_config_value("name")); MD5Init(&mdctx); MD5Update(&mdctx, nstr, (unsigned int)strlen(nstr)); MD5Final(digest, &mdctx); macaddr[0] = 0x00; macaddr[1] = 0xa0; macaddr[2] = 0x98; macaddr[3] = digest[0]; macaddr[4] = digest[1]; macaddr[5] = digest[2]; } diff --git a/usr.sbin/bhyve/net_utils.h b/usr.sbin/bhyve/net_utils.h index 1ca20ddb74ab..3dd44572588c 100644 --- a/usr.sbin/bhyve/net_utils.h +++ b/usr.sbin/bhyve/net_utils.h @@ -1,40 +1,40 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019 Vincenzo Maffione * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NET_UTILS_H_ #define _NET_UTILS_H_ #include #include "pci_emul.h" void net_genmac(struct pci_devinst *pi, uint8_t *macaddr); -int net_parsemac(char *mac_str, uint8_t *mac_addr); +int net_parsemac(const char *mac_str, uint8_t *mac_addr); int net_parsemtu(const char *mtu_str, unsigned long *mtu); #endif /* _NET_UTILS_H_ */ diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 64c1fe0b90ca..38ec87c34eed 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -1,2840 +1,2869 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Zhixiang Yu * Copyright (c) 2015-2016 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" +#include "debug.h" #include "pci_emul.h" #include "ahci.h" #include "block_if.h" #define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ enum sata_fis_type { FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ }; /* * SCSI opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define INQUIRY 0x12 #define START_STOP_UNIT 0x1B #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define POSITION_TO_ELEMENT 0x2B #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A #define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE /* * SCSI mode page codes */ #define MODEPAGE_RW_ERROR_RECOVERY 0x01 #define MODEPAGE_CD_CAPABILITIES 0x2A /* * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 #define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* * Debug printf */ #ifdef AHCI_DEBUG static FILE *dbg; #define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) #else #define DPRINTF(format, arg...) #endif #define WPRINTF(format, arg...) 
printf(format, ##arg) #define AHCI_PORT_IDENT 20 + 1 struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; STAILQ_ENTRY(ahci_ioreq) io_flist; TAILQ_ENTRY(ahci_ioreq) io_blist; uint8_t *cfis; uint32_t len; uint32_t done; int slot; int more; int readop; }; struct ahci_port { struct blockif_ctxt *bctx; struct pci_ahci_softc *pr_sc; struct ata_params ata_ident; uint8_t *cmd_lst; uint8_t *rfis; int port; int atapi; int reset; int waitforclear; int mult_sectors; uint8_t xfermode; uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; u_int ccs; uint32_t pending; uint32_t clb; uint32_t clbu; uint32_t fb; uint32_t fbu; uint32_t is; uint32_t ie; uint32_t cmd; uint32_t unused0; uint32_t tfd; uint32_t sig; uint32_t ssts; uint32_t sctl; uint32_t serr; uint32_t sact; uint32_t ci; uint32_t sntf; uint32_t fbs; /* * i/o request info */ struct ahci_ioreq *ioreq; int ioqsz; STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; }; struct ahci_cmd_hdr { uint16_t flags; uint16_t prdtl; uint32_t prdbc; uint64_t ctba; uint32_t reserved[4]; }; struct ahci_prdt_entry { uint64_t dba; uint32_t reserved; #define DBCMASK 0x3fffff uint32_t dbc; }; struct pci_ahci_softc { struct pci_devinst *asc_pi; pthread_mutex_t mtx; int ports; uint32_t cap; uint32_t ghc; uint32_t is; uint32_t pi; uint32_t vs; uint32_t ccc_ctl; uint32_t ccc_pts; uint32_t em_loc; uint32_t em_ctl; uint32_t cap2; uint32_t bohc; uint32_t lintr; struct ahci_port port[MAX_PORTS]; }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) static void ahci_handle_port(struct ahci_port *p); static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; buf[0] = (lba / 75) / 60; buf[1] = (lba / 75) % 60; buf[2] = lba % 75; } /* * Generate HBA interrupts on global IS register write. */ static void ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { struct pci_devinst *pi = sc->asc_pi; struct ahci_port *p; int i, nmsg; uint32_t mmask; /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { p = &sc->port[i]; if (p->is & p->ie) sc->is |= (1 << i); } DPRINTF("%s(%08x) %08x", __func__, mask, sc->is); /* If there is nothing enabled -- clear legacy interrupt and exit. */ if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { if (sc->lintr) { pci_lintr_deassert(pi); sc->lintr = 0; } return; } /* If there is anything and no MSI -- assert legacy interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (nmsg == 0) { if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } return; } /* Assert respective MSIs for ports that were touched. */ for (i = 0; i < nmsg; i++) { if (sc->ports <= nmsg || i < nmsg - 1) mmask = 1 << i; else mmask = 0xffffffff << i; if (sc->is & mask && mmask & mask) pci_generate_msi(pi, i); } } /* * Generate HBA interrupt on specific port event. */ static void ahci_port_intr(struct ahci_port *p) { struct pci_ahci_softc *sc = p->pr_sc; struct pci_devinst *pi = sc->asc_pi; int nmsg; DPRINTF("%s(%d) %08x/%08x %08x", __func__, p->port, p->is, p->ie, sc->is); /* If there is nothing enabled -- we are done. */ if ((p->is & p->ie) == 0) return; /* In case of non-shared MSI always generate interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (sc->ports <= nmsg || p->port < nmsg - 1) { sc->is |= (1 << p->port); if ((sc->ghc & AHCI_GHC_IE) == 0) return; pci_generate_msi(pi, p->port); return; } /* If IS for this port is already set -- do nothing. */ if (sc->is & (1 << p->port)) return; sc->is |= (1 << p->port); /* If interrupts are enabled -- generate one. 
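 *
 * Ports without a dedicated MSI vector share the last available
 * message, so one MSI may stand for several ports and the guest has
 * to consult IS to learn which of them need service.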
*/ if ((sc->ghc & AHCI_GHC_IE) == 0) return; if (nmsg > 0) { pci_generate_msi(pi, nmsg - 1); } else if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } } static void ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) { int offset, len, irq; if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) return; switch (ft) { case FIS_TYPE_REGD2H: offset = 0x40; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d", ft); return; } if (fis[2] & ATA_S_ERROR) { p->waitforclear = 1; irq |= AHCI_P_IX_TFE; } memcpy(p->rfis + offset, fis, len); if (irq) { if (~p->is & irq) { p->is |= irq; ahci_port_intr(p); } } } static void ahci_write_fis_piosetup(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_PIOSETUP; ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); } static void ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; tfd &= 0x77; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_SETDEVBITS; fis[1] = (1 << 6); fis[2] = tfd; fis[3] = error; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = slot; p->err_cfis[2] = tfd; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else { *(uint32_t *)(fis + 4) = (1 << slot); p->sact &= ~(1 << slot); } p->tfd &= ~0x77; p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } static void ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[20]; uint8_t error; error = (tfd >> 8) & 0xff; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = (1 << 6); fis[2] = tfd & 0xff; fis[3] = error; fis[4] = cfis[4]; fis[5] = cfis[5]; fis[6] = cfis[6]; fis[7] = cfis[7]; fis[8] = cfis[8]; fis[9] = cfis[9]; fis[10] = cfis[10]; fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = 0x80; p->err_cfis[2] = tfd & 0xff; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) { uint8_t fis[20]; p->tfd = ATA_S_READY | ATA_S_DSC; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = 0; /* No interrupt */ fis[2] = p->tfd; /* Status */ fis[3] = 0; /* No error */ p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[3] = 1; fis[4] = 1; if (p->atapi) { fis[5] = 0x14; fis[6] = 0xeb; } fis[12] = 1; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_check_stopped(struct ahci_port *p) { /* * If we are no longer processing the command list and nothing * is in-flight, clear the running bit, the current command * slot, the command issue and active bits. */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; p->waitforclear = 0; } } } static void ahci_port_stop(struct ahci_port *p) { struct ahci_ioreq *aior; uint8_t *cfis; int slot; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); TAILQ_FOREACH(aior, &p->iobhd, io_blist) { /* * Try to cancel the outstanding blockif request. 
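 *
 * blockif_cancel() fails for a request the I/O thread has already
 * picked up; such a request is left on the busy list and will be
 * reaped by its completion callback as usual.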
*/ error = blockif_cancel(p->bctx, &aior->io_req); if (error != 0) continue; slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); /* * This command is now done. */ p->pending &= ~(1 << slot); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); } ahci_check_stopped(p); } static void ahci_port_reset(struct ahci_port *pr) { pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; pr->mult_sectors = 128; if (!pr->bctx) { pr->ssts = ATA_SS_DET_NO_DEVICE; pr->sig = 0xFFFFFFFF; pr->tfd = 0x7F; return; } pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; if (pr->sctl & ATA_SC_SPD_MASK) pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); else pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; pr->tfd |= ATA_S_READY; } else pr->sig = PxSIG_ATAPI; ahci_write_reset_fis_d2h(pr); } static void ahci_reset(struct pci_ahci_softc *sc) { int i; sc->ghc = AHCI_GHC_AE; sc->is = 0; if (sc->lintr) { pci_lintr_deassert(sc->asc_pi); sc->lintr = 0; } for (i = 0; i < sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); if (sc->port[i].bctx) sc->port[i].cmd |= AHCI_P_CMD_CPS; sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } static void ata_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i ^ 1] = *src++; else dest[i ^ 1] = ' '; } } static void atapi_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i] = *src++; else dest[i] = ' '; } } /* * Build up the iovec based on the PRDT, 'done' and 'len'. */ static void ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, struct ahci_prdt_entry *prdt, uint16_t prdtl) { struct blockif_req *breq = &aior->io_req; int i, j, skip, todo, left, extra; uint32_t dbcsz; /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ skip = aior->done; left = aior->len - aior->done; todo = 0; for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; i++, prdt++) { dbcsz = (prdt->dbc & DBCMASK) + 1; /* Skip already done part of the PRDT */ if (dbcsz <= skip) { skip -= dbcsz; continue; } dbcsz -= skip; if (dbcsz > left) dbcsz = left; breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba + skip, dbcsz); breq->br_iov[j].iov_len = dbcsz; todo += dbcsz; left -= dbcsz; skip = 0; j++; } /* If we got limited by IOV length, round I/O down to sector size. 
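 *
 * The bytes trimmed here are not lost: 'done' only advances by what
 * fit, so 'more' stays set and the completion callback re-enters the
 * handler to issue the remainder as a follow-up request.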
*/ if (j == BLOCKIF_IOV_MAX) { extra = todo % blockif_sectsz(p->bctx); todo -= extra; assert(todo > 0); while (extra > 0) { if (breq->br_iov[j - 1].iov_len > extra) { breq->br_iov[j - 1].iov_len -= extra; break; } extra -= breq->br_iov[j - 1].iov_len; j--; } } breq->br_iovcnt = j; breq->br_resid = todo; aior->done += todo; aior->more = (aior->done < aior->len && i < prdtl); } static void ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; int err, first, ncq, readop; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; first = (done == 0); if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[11] << 8 | cfis[3]; if (!len) len = 65536; ncq = 1; } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[13] << 8 | cfis[12]; if (!len) len = 65536; } else { lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | (cfis[5] << 8) | cfis[4]; len = cfis[12]; if (!len) len = 256; } lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); /* Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = readop; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); } static void ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_ioreq *aior; struct blockif_req *breq; int err; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = 0; aior->done = 0; aior->more = 0; breq = &aior->io_req; /* * Mark this command in-flight. 
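 * The pending bitmask keeps ahci_handle_port() from issuing the same
 * slot twice: it only considers slots in p->ci & ~p->pending.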
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_flush(p->bctx, breq); assert(err == 0); } static inline void read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; void *to; int i, len; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; to = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(to, ptr, sublen); len -= sublen; to += sublen; prdt++; } } static void ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; uint8_t *entry; uint64_t elba; uint32_t len, elen; int err, first, ncq; uint8_t buf[512]; first = (done == 0); if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { len = (uint16_t)cfis[13] << 8 | cfis[12]; len *= 512; ncq = 0; } else { /* ATA_SEND_FPDMA_QUEUED */ len = (uint16_t)cfis[11] << 8 | cfis[3]; len *= 512; ncq = 1; } read_prdt(p, slot, cfis, buf, sizeof(buf)); next: entry = &buf[done]; elba = ((uint64_t)entry[5] << 40) | ((uint64_t)entry[4] << 32) | ((uint64_t)entry[3] << 24) | ((uint64_t)entry[2] << 16) | ((uint64_t)entry[1] << 8) | entry[0]; elen = (uint16_t)entry[7] << 8 | entry[6]; done += 8; if (elen == 0) { if (done >= len) { if (ncq) { if (first) ahci_write_fis_d2h_ncq(p, slot); ahci_write_fis_sdb(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } else { ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } p->pending &= ~(1 << slot); ahci_check_stopped(p); if (!first) ahci_handle_port(p); return; } goto next; } /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->more = (len != done); breq = &aior->io_req; breq->br_offset = elba * blockif_sectsz(p->bctx); breq->br_resid = elen * blockif_sectsz(p->bctx); /* * Mark this command in-flight. 
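 * For TRIM, 'done' tracks the offset into the DSM range buffer rather
 * than a byte count: each 8-byte range entry is submitted as its own
 * blockif_delete(), and the completion callback re-enters this
 * function to pick up the next entry.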
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); err = blockif_delete(p->bctx, breq); assert(err == 0); } static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; void *from; int i, len; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; from = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; prdt++; } hdr->prdbc = size - len; } static void ahci_checksum(uint8_t *buf, int size) { int i; uint8_t sum = 0; for (i = 0; i < size - 1; i++) sum += buf[i]; buf[size - 1] = 0x100 - sum; } static void ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; uint32_t buf[128]; uint8_t *buf8 = (uint8_t *)buf; uint16_t *buf16 = (uint16_t *)buf; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } memset(buf, 0, sizeof(buf)); if (cfis[4] == 0x00) { /* Log directory */ buf16[0x00] = 1; /* Version -- 1 */ buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); ahci_checksum(buf8, sizeof(buf)); } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { buf[0x00] = 1; /* SFQ DSM supported */ buf[0x01] = 1; /* SFQ DSM TRIM supported */ } } else { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } if (cfis[2] == ATA_READ_LOG_EXT) ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void*)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void ata_identify_init(struct ahci_port* p, int atapi) { struct ata_params* ata_ident = &p->ata_ident; if (atapi) { ata_ident->config = ATA_PROTO_ATAPI | ATA_ATAPI_TYPE_CDROM | ATA_ATAPI_REMOVABLE | ATA_DRQ_FAST; ata_ident->capabilities1 = ATA_SUPPORT_LBA | ATA_SUPPORT_DMA; ata_ident->capabilities2 = (1 << 14 | 1); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; ata_ident->obsolete62 = 0x3f; ata_ident->mwdmamodes = 7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); ata_ident->satacapabilities2 = ((p->ssts & 
ATA_SS_SPD_MASK) >> 3); ata_ident->satasupport = ATA_SUPPORT_NCQ_STREAM; ata_ident->version_major = 0x3f0; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->support.command2 = (1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } else { uint64_t sectors; int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; ro = blockif_is_ro(p->bctx); candelete = blockif_candelete(p->bctx); sectsz = blockif_sectsz(p->bctx); sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); blockif_psectsz(p->bctx, &psectsz, &psectoff); ata_ident->config = ATA_DRQ_FAST; ata_ident->cylinders = cyl; ata_ident->heads = heads; ata_ident->sectors = sech; ata_ident->sectors_intr = (0x8000 | 128); ata_ident->tcg = 0; ata_ident->capabilities1 = ATA_SUPPORT_DMA | ATA_SUPPORT_LBA | ATA_SUPPORT_IORDY; ata_ident->capabilities2 = (1 << 14); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; if (p->mult_sectors) ata_ident->multi = (ATA_MULTI_VALID | p->mult_sectors); if (sectors <= 0x0fffffff) { ata_ident->lba_size_1 = sectors; ata_ident->lba_size_2 = (sectors >> 16); } else { ata_ident->lba_size_1 = 0xffff; ata_ident->lba_size_2 = 0x0fff; } ata_ident->mwdmamodes = 0x7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 0x3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->support3 = 0; ata_ident->queue = 31; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | ATA_SUPPORT_NCQ); ata_ident->satacapabilities2 = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | (p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->version_major = 0x3f0; ata_ident->version_minor = 0x28; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->support.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->enabled.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->lba_size48_1 = sectors; ata_ident->lba_size48_2 = (sectors >> 16); ata_ident->lba_size48_3 = (sectors >> 32); ata_ident->lba_size48_4 = (sectors >> 48); if (candelete && !ro) { ata_ident->support3 |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; ata_ident->max_dsm_blocks = 1; ata_ident->support_dsm = ATA_SUPPORT_DSM_TRIM; } ata_ident->pss = ATA_PSS_VALID_VALUE; ata_ident->lsalign = 0x4000; if (psectsz > sectsz) { ata_ident->pss |= ATA_PSS_MULTLS; ata_ident->pss |= ffsl(psectsz / sectsz) - 1; ata_ident->lsalign |= (psectoff / sectsz); } if (sectsz > 512) { ata_ident->pss |= ATA_PSS_LSSABOVE512; ata_ident->lss_1 = sectsz / 2; ata_ident->lss_2 = ((sectsz / 2) >> 16); } ata_ident->support2 = 
(ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->enabled2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } ahci_checksum((uint8_t*)ata_ident, sizeof(struct ata_params)); } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[36]; uint8_t *acmd; int len; uint32_t tfd; acmd = cfis + 0x40; if (acmd[1] & 1) { /* VPD */ if (acmd[2] == 0) { /* Supported VPD pages */ buf[0] = 0x05; buf[1] = 0; buf[2] = 0; buf[3] = 1; buf[4] = 0; len = 4 + buf[3]; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } } else { buf[0] = 0x05; buf[1] = 0x80; buf[2] = 0x00; buf[3] = 0x21; buf[4] = 31; buf[5] = 0; buf[6] = 0; buf[7] = 0; atapi_string(buf + 8, "BHYVE", 8); atapi_string(buf + 16, "BHYVE DVD-ROM", 16); atapi_string(buf + 32, "001", 4); len = sizeof(buf); } if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, len); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[8]; uint64_t sectors; sectors = blockif_size(p->bctx) / 2048; be32enc(buf, sectors - 1); be32enc(buf + 4, 2048); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint8_t format; int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); format = acmd[9] >> 6; switch (format) { case 0: { int msf, size; uint64_t sectors; uint8_t start_track, buf[20], *bp; msf = (acmd[1] >> 1) & 1; start_track = acmd[6]; if (start_track > 1 && start_track != 0xaa) { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } bp = buf + 2; *bp++ = 1; *bp++ = 1; if (start_track <= 1) { *bp++ = 0; *bp++ = 0x14; *bp++ = 1; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } } *bp++ = 0; *bp++ = 0x14; *bp++ = 0xaa; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 1: { uint8_t buf[12]; memset(buf, 0, sizeof(buf)); buf[1] = 0xa; buf[2] = 0x1; buf[3] = 0x1; if (len > sizeof(buf)) len = sizeof(buf); write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 2: { int msf, size; uint64_t sectors; uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) 
& 1; bp = buf + 2; *bp++ = 1; *bp++ = 1; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa1; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa2; *bp++ = 0; *bp++ = 0; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } default: { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); break; } } } static void atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[16]; memset(buf, 0, sizeof(buf)); buf[3] = 8; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; uint8_t *acmd; uint64_t lba; uint32_t len; int err; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); else len = be32dec(acmd + 6); if (len == 0) { cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } lba *= 2048; len *= 2048; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = 1; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. 
*/ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_read(p->bctx, breq); assert(err == 0); } static void atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[64]; uint8_t *acmd; int len; acmd = cfis + 0x40; len = acmd[4]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, len); buf[0] = 0x70 | (1 << 7); buf[2] = p->sense_key; buf[7] = 10; buf[12] = p->asc; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd = cfis + 0x40; uint32_t tfd; switch (acmd[4] & 3) { case 0: case 1: case 3: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; tfd = ATA_S_READY | ATA_S_DSC; break; case 2: /* TODO eject media */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x53; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; uint8_t pc, code; int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); pc = acmd[2] >> 6; code = acmd[2] & 0x3f; switch (pc) { case 0: switch (code) { case MODEPAGE_RW_ERROR_RECOVERY: { uint8_t buf[16]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 16 - 2); buf[2] = 0x70; buf[8] = 0x01; buf[9] = 16 - 10; buf[11] = 0x05; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } case MODEPAGE_CD_CAPABILITIES: { uint8_t buf[30]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 30 - 2); buf[2] = 0x70; buf[8] = 0x2A; buf[9] = 30 - 10; buf[10] = 0x08; buf[12] = 0x71; be16enc(&buf[18], 2); be16enc(&buf[20], 512); write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } default: goto error; break; } break; case 3: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x39; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; error: case 1: case 2: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_get_event_status_notification(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; acmd = cfis + 0x40; /* we don't support asynchronous operation */ if (!(acmd[1] & 1)) { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } else { uint8_t buf[8]; int len; len = be16dec(acmd + 7); if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 8 - 2); buf[2] = 0x04; buf[3] = 0x10; buf[5] = 0x02; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; acmd = cfis + 0x40; #ifdef AHCI_DEBUG { int i; DPRINTF("ACMD:"); for (i = 0; i < 16; i++) DPRINTF("%02x ", acmd[i]); DPRINTF(""); } #endif switch (acmd[0]) { case TEST_UNIT_READY: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case INQUIRY: atapi_inquiry(p, slot, cfis); break; case READ_CAPACITY: atapi_read_capacity(p, slot, cfis); break; case PREVENT_ALLOW: /* 
TODO */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case READ_TOC: atapi_read_toc(p, slot, cfis); break; case REPORT_LUNS: atapi_report_luns(p, slot, cfis); break; case READ_10: case READ_12: atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); break; case START_STOP_UNIT: atapi_start_stop_unit(p, slot, cfis); break; case MODE_SENSE_10: atapi_mode_sense(p, slot, cfis); break; case GET_EVENT_STATUS_NOTIFICATION: atapi_get_event_status_notification(p, slot, cfis); break; default: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x20; ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); break; case ATA_SETFEATURES: { switch (cfis[3]) { case ATA_SF_ENAB_SATA_SF: switch (cfis[12]) { case ATA_SATA_SF_AN: p->tfd = ATA_S_DSC | ATA_S_READY; break; default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } break; case ATA_SF_ENAB_WCACHE: case ATA_SF_DIS_WCACHE: case ATA_SF_ENAB_RCACHE: case ATA_SF_DIS_RCACHE: p->tfd = ATA_S_DSC | ATA_S_READY; break; case ATA_SF_SETXFER: { switch (cfis[12] & 0xf8) { case ATA_PIO: case ATA_PIO0: break; case ATA_WDMA0: case ATA_UDMA0: p->xfermode = (cfis[12] & 0x7); break; } p->tfd = ATA_S_DSC | ATA_S_READY; break; } default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; } case ATA_SET_MULTI: if (cfis[12] != 0 && (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); } else { p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; case ATA_READ: case ATA_WRITE: case ATA_READ48: case ATA_WRITE48: case ATA_READ_MUL: case ATA_WRITE_MUL: case ATA_READ_MUL48: case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; case ATA_DATA_SET_MANAGEMENT: if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && cfis[13] == 0 && cfis[12] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_SEND_FPDMA_QUEUED: if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && cfis[11] == 0 && cfis[3] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_READ_LOG_EXT: case ATA_READ_LOG_DMA_EXT: ahci_handle_read_log(p, slot, cfis); break; case ATA_SECURITY_FREEZE_LOCK: case ATA_SMART_CMD: case ATA_NOP: ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_CHECK_POWER_MODE: cfis[12] = 0xff; /* always on */ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: case ATA_READ_VERIFY: case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case 
ATA_ATAPI_IDENTIFY: handle_atapi_identify(p, slot, cfis); break; case ATA_PACKET_CMD: if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: WPRINTF("Unsupported cmd:%02x", cfis[2]); ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; #ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; #endif struct pci_ahci_softc *sc; uint8_t *cfis; #ifdef AHCI_DEBUG int cfl, i; #endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); #ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; #endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); #ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); DPRINTF("cfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) DPRINTF(""); DPRINTF("%02x ", cfis[i]); } DPRINTF(""); for (i = 0; i < hdr->prdtl; i++) { DPRINTF("%d@%08"PRIx64"", prdt->dbc & 0x3fffff, prdt->dba); prdt++; } #endif if (cfis[0] != FIS_TYPE_REGH2D) { WPRINTF("Not a H2D FIS:%02x", cfis[0]); return; } if (cfis[1] & 0x80) { ahci_handle_cmd(p, slot, cfis); } else { if (cfis[15] & (1 << 2)) p->reset = 1; else if (p->reset) { p->reset = 0; ahci_port_reset(p); } p->ci &= ~(1 << slot); } } static void ahci_handle_port(struct ahci_port *p) { if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that * are already in-flight. Stop if device is busy or in error. */ for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) break; if (p->waitforclear) break; if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; ahci_handle_slot(p, p->ccs); } } } /* * blockif callback routine - this runs in the context of the blockif * i/o thread, so the mutex needs to be acquired. */ static void ata_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; int slot, ncq, dsm; DPRINTF("%s %d", __func__, err); ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; if (cfis[2] == ATA_DATA_SET_MANAGEMENT || (cfis[2] == ATA_SEND_FPDMA_QUEUED && (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) dsm = 1; pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { if (dsm) ahci_handle_dsm_trim(p, slot, cfis, aior->done); else ahci_handle_rw(p, slot, cfis, aior->done); goto out; } if (!err) tfd = ATA_S_READY | ATA_S_DSC; else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; if (ncq) ahci_write_fis_sdb(p, slot, cfis, tfd); else ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. 
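 * Clearing the pending bit may unblock other queued slots, so the
 * port is rescanned before the lock is dropped.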
*/ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void atapi_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; int slot; DPRINTF("%s %d", __func__, err); aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { atapi_read(p, slot, cfis, aior->done); goto out; } if (!err) { tfd = ATA_S_READY | ATA_S_DSC; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void pci_ahci_ioreq_init(struct ahci_port *pr) { struct ahci_ioreq *vr; int i; pr->ioqsz = blockif_queuesz(pr->bctx); pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); STAILQ_INIT(&pr->iofhd); /* * Add all i/o request entries to the free queue */ for (i = 0; i < pr->ioqsz; i++) { vr = &pr->ioreq[i]; vr->io_pr = pr; if (!pr->atapi) vr->io_req.br_callback = ata_ioreq_cb; else vr->io_req.br_callback = atapi_ioreq_cb; vr->io_req.br_param = vr; STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); } TAILQ_INIT(&pr->iobhd); } static void pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; struct ahci_port *p = &sc->port[port]; DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"", port, offset, value); switch (offset) { case AHCI_P_CLB: p->clb = value; break; case AHCI_P_CLBU: p->clbu = value; break; case AHCI_P_FB: p->fb = value; break; case AHCI_P_FBU: p->fbu = value; break; case AHCI_P_IS: p->is &= ~value; ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; ahci_port_intr(p); break; case AHCI_P_CMD: { p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { uint64_t clb; p->cmd |= AHCI_P_CMD_CR; clb = (uint64_t)p->clbu << 32 | p->clb; p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, AHCI_CL_SIZE * AHCI_MAX_SLOTS); } if (value & AHCI_P_CMD_FRE) { uint64_t fb; p->cmd |= AHCI_P_CMD_FR; fb = (uint64_t)p->fbu << 32 | p->fb; /* we don't support FBSCP, so rfis size is 256Bytes */ p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); } else { p->cmd &= ~AHCI_P_CMD_FR; } if (value & AHCI_P_CMD_CLO) { p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } if (value & AHCI_P_CMD_ICC_MASK) { p->cmd &= ~AHCI_P_CMD_ICC_MASK; } 
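	/*
	 * Setting ST may have enabled command processing; rescan the
	 * port so any commands already posted in PxCI get issued.
	 */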
ahci_handle_port(p); break; } case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"", offset); break; case AHCI_P_SCTL: p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); } break; case AHCI_P_SERR: p->serr &= ~value; break; case AHCI_P_SACT: p->sact |= value; break; case AHCI_P_CI: p->ci |= value; ahci_handle_port(p); break; case AHCI_P_SNTF: case AHCI_P_FBS: default: break; } } static void pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"", offset, value); switch (offset) { case AHCI_CAP: case AHCI_PI: case AHCI_VS: case AHCI_CAP2: DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"", offset); break; case AHCI_GHC: if (value & AHCI_GHC_HR) { ahci_reset(sc); break; } if (value & AHCI_GHC_IE) sc->ghc |= AHCI_GHC_IE; else sc->ghc &= ~AHCI_GHC_IE; ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; ahci_generate_intr(sc, value); break; default: break; } } static void pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); if (offset < AHCI_OFFSET) pci_ahci_host_write(sc, offset, value); else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) pci_ahci_port_write(sc, offset, value); else WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"", offset); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; switch (offset) { case AHCI_CAP: case AHCI_GHC: case AHCI_IS: case AHCI_PI: case AHCI_VS: case AHCI_CCCC: case AHCI_CCCP: case AHCI_EM_LOC: case AHCI_EM_CTL: case AHCI_CAP2: { uint32_t *p = &sc->cap; p += (offset - AHCI_CAP) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x", offset, value); return (value); } static uint64_t pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; switch (offset) { case AHCI_P_CLB: case AHCI_P_CLBU: case AHCI_P_FB: case AHCI_P_FBU: case AHCI_P_IS: case AHCI_P_IE: case AHCI_P_CMD: case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: case AHCI_P_SCTL: case AHCI_P_SERR: case AHCI_P_SACT: case AHCI_P_CI: case AHCI_P_SNTF: case AHCI_P_FBS: { uint32_t *p= &sc->port[port].clb; p += (offset - AHCI_P_CLB) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x", port, offset, value); return value; } static uint64_t pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; uint64_t offset; uint32_t value; assert(baridx == 5); assert(size == 1 || size == 2 || size == 4); assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"", regoff); } value >>= 8 * (regoff & 0x3); pthread_mutex_unlock(&sc->mtx); return (value); } 
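/*
 * As a rough sketch (hypothetical paths), the legacy option string
 *
 *     -s 4,ahci,hd:/vm/disk.img,cd:/vm/install.iso
 *
 * ends up, after the generic -s parsing plus pci_ahci_legacy_config()
 * below, as:
 *
 *     pci.0.4.0.device="ahci"
 *     pci.0.4.0.port.0.type="hd"
 *     pci.0.4.0.port.0.path="/vm/disk.img"
 *     pci.0.4.0.port.1.type="cd"
 *     pci.0.4.0.port.1.path="/vm/install.iso"
 *
 * where the "path" values come from blockif_legacy_config(), which
 * treats the first per-port option as the backing file path.
 */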
+/* + * Each AHCI controller has a "port" node which contains nodes for + * each port named after the decimal number of the port (no leading + * zeroes). Port nodes contain a "type" ("hd" or "cd"), as well as + * options for blockif. For example: + * + * pci.0.1.0 + * .device="ahci" + * .port + * .0 + * .type="hd" + * .path="/path/to/image" + */ +static int +pci_ahci_legacy_config_port(nvlist_t *nvl, int port, const char *type, + const char *opts) +{ + char node_name[sizeof("XX")]; + nvlist_t *port_nvl; + + snprintf(node_name, sizeof(node_name), "%d", port); + port_nvl = create_relative_config_node(nvl, node_name); + set_config_value_node(port_nvl, "type", type); + return (blockif_legacy_config(port_nvl, opts)); +} + static int -pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) +pci_ahci_legacy_config(nvlist_t *nvl, const char *opts) +{ + nvlist_t *ports_nvl; + const char *type; + char *next, *next2, *str, *tofree; + int p, ret; + + if (opts == NULL) + return (0); + + ports_nvl = create_relative_config_node(nvl, "port"); + ret = 1; + tofree = str = strdup(opts); + for (p = 0; p < MAX_PORTS && str != NULL; p++, str = next) { + /* Identify and cut off type of present port. */ + if (strncmp(str, "hd:", 3) == 0) { + type = "hd"; + str += 3; + } else if (strncmp(str, "cd:", 3) == 0) { + type = "cd"; + str += 3; + } else + type = NULL; + + /* Find and cut off the next port options. */ + next = strstr(str, ",hd:"); + next2 = strstr(str, ",cd:"); + if (next == NULL || (next2 != NULL && next2 < next)) + next = next2; + if (next != NULL) { + next[0] = 0; + next++; + } + + if (str[0] == 0) + continue; + + if (type == NULL) { + EPRINTLN("Missing or invalid type for port %d: \"%s\"", + p, str); + goto out; + } + + if (pci_ahci_legacy_config_port(ports_nvl, p, type, str) != 0) + goto out; + } + ret = 0; +out: + free(tofree); + return (ret); +} + +static int +pci_ahci_cd_legacy_config(nvlist_t *nvl, const char *opts) +{ + nvlist_t *ports_nvl; + + ports_nvl = create_relative_config_node(nvl, "port"); + return (pci_ahci_legacy_config_port(ports_nvl, 0, "cd", opts)); +} + +static int +pci_ahci_hd_legacy_config(nvlist_t *nvl, const char *opts) +{ + nvlist_t *ports_nvl; + + ports_nvl = create_relative_config_node(nvl, "port"); + return (pci_ahci_legacy_config_port(ports_nvl, 0, "hd", opts)); +} + +static int +pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { char bident[sizeof("XX:XX:XX")]; + char node_name[sizeof("XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; - int ret, slots, p; + int atapi, ret, slots, p; MD5_CTX mdctx; u_char digest[16]; - char *next, *next2; - char *bopt, *uopt, *xopts, *config; - FILE* fp; - size_t block_len; - int comma, optpos; + const char *path, *type, *value; + nvlist_t *ports_nvl, *port_nvl; ret = 0; #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; pthread_mutex_init(&sc->mtx, NULL); sc->ports = 0; sc->pi = 0; slots = 32; - for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) { + ports_nvl = find_relative_config_node(nvl, "port"); + for (p = 0; p < MAX_PORTS; p++) { struct ata_params *ata_ident = &sc->port[p].ata_ident; - memset(ata_ident, 0, sizeof(struct ata_params)); - - /* Identify and cut off type of present port. 
*/ - if (strncmp(opts, "hd:", 3) == 0) { - atapi = 0; - opts += 3; - } else if (strncmp(opts, "cd:", 3) == 0) { - atapi = 1; - opts += 3; - } + char ident[AHCI_PORT_IDENT]; - /* Find and cut off the next port options. */ - next = strstr(opts, ",hd:"); - next2 = strstr(opts, ",cd:"); - if (next == NULL || (next2 != NULL && next2 < next)) - next = next2; - if (next != NULL) { - next[0] = 0; - next++; - } - - if (opts[0] == 0) + snprintf(node_name, sizeof(node_name), "%d", p); + port_nvl = find_relative_config_node(ports_nvl, node_name); + if (port_nvl == NULL) continue; - uopt = strdup(opts); - bopt = NULL; - fp = open_memstream(&bopt, &block_len); - comma = 0; - optpos = 0; - - for (xopts = strtok(uopt, ","); - xopts != NULL; - xopts = strtok(NULL, ",")) { - - /* First option assume as block filename. */ - if (optpos == 0) { - /* - * Create an identifier for the backing file. - * Use parts of the md5 sum of the filename - */ - char ident[AHCI_PORT_IDENT]; - MD5Init(&mdctx); - MD5Update(&mdctx, opts, strlen(opts)); - MD5Final(digest, &mdctx); - snprintf(ident, AHCI_PORT_IDENT, - "BHYVE-%02X%02X-%02X%02X-%02X%02X", - digest[0], digest[1], digest[2], digest[3], digest[4], - digest[5]); - ata_string((uint8_t*)&ata_ident->serial, ident, 20); - ata_string((uint8_t*)&ata_ident->revision, "001", 8); - if (atapi) { - ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DVD ROM", 40); - } - else { - ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DISK", 40); - } - } - - if ((config = strchr(xopts, '=')) != NULL) { - *config++ = '\0'; - if (!strcmp("nmrr", xopts)) { - ata_ident->media_rotation_rate = atoi(config); - } - else if (!strcmp("ser", xopts)) { - ata_string((uint8_t*)(&ata_ident->serial), config, 20); - } - else if (!strcmp("rev", xopts)) { - ata_string((uint8_t*)(&ata_ident->revision), config, 8); - } - else if (!strcmp("model", xopts)) { - ata_string((uint8_t*)(&ata_ident->model), config, 40); - } - else { - /* Pass all other options to blockif_open. */ - *--config = '='; - fprintf(fp, "%s%s", comma ? "," : "", xopts); - comma = 1; - } - } - else { - /* Pass all other options to blockif_open. */ - fprintf(fp, "%s%s", comma ? "," : "", xopts); - comma = 1; - } - optpos++; - } - free(uopt); - fclose(fp); + type = get_config_value_node(port_nvl, "type"); + if (type == NULL) + continue; - DPRINTF("%s\n", bopt); + if (strcmp(type, "hd") == 0) + atapi = 0; + else + atapi = 1; /* * Attempt to open the backing image. Use the PCI slot/func * and the port number for the identifier string. */ snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot, pi->pi_func, p); - bctxt = blockif_open(bopt, bident); - free(bopt); + bctxt = blockif_open(port_nvl, bident); if (bctxt == NULL) { sc->ports = p; ret = 1; goto open_fail; } sc->port[p].bctx = bctxt; sc->port[p].pr_sc = sc; sc->port[p].port = p; sc->port[p].atapi = atapi; + /* + * Create an identifier for the backing file. 
+ * Use parts of the md5 sum of the filename + */ + path = get_config_value_node(port_nvl, "path"); + MD5Init(&mdctx); + MD5Update(&mdctx, path, strlen(path)); + MD5Final(digest, &mdctx); + snprintf(ident, AHCI_PORT_IDENT, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], + digest[5]); + + memset(ata_ident, 0, sizeof(struct ata_params)); + ata_string((uint8_t*)&ata_ident->serial, ident, 20); + ata_string((uint8_t*)&ata_ident->revision, "001", 8); + if (atapi) + ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DVD ROM", 40); + else + ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DISK", 40); + value = get_config_value_node(port_nvl, "nmrr"); + if (value != NULL) + ata_ident->media_rotation_rate = atoi(value); + value = get_config_value_node(port_nvl, "ser"); + if (value != NULL) + ata_string((uint8_t*)(&ata_ident->serial), value, 20); + value = get_config_value_node(port_nvl, "rev"); + if (value != NULL) + ata_string((uint8_t*)(&ata_ident->revision), value, 8); + value = get_config_value_node(port_nvl, "model"); + if (value != NULL) + ata_string((uint8_t*)(&ata_ident->model), value, 40); ata_identify_init(&sc->port[p], atapi); /* * Allocate blockif request structures and add them * to the free list */ pci_ahci_ioreq_init(&sc->port[p]); sc->pi |= (1 << p); if (sc->port[p].ioqsz < slots) slots = sc->port[p].ioqsz; } sc->ports = p; /* Intel ICH8 AHCI */ --slots; if (sc->ports < DEF_PORTS) sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); p = MIN(sc->ports, 16); p = flsl(p) - ((p & (p - 1)) ? 0 : 1); pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); pci_lintr_request(pi); open_fail: if (ret) { for (p = 0; p < sc->ports; p++) { if (sc->port[p].bctx != NULL) blockif_close(sc->port[p].bctx); } free(sc); } return (ret); } -static int -pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) -{ - - return (pci_ahci_init(ctx, pi, opts, 0)); -} - -static int -pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) -{ - - return (pci_ahci_init(ctx, pi, opts, 1)); -} - #ifdef BHYVE_SNAPSHOT static int pci_ahci_snapshot_save_queues(struct ahci_port *port, struct vm_snapshot_meta *meta) { int ret; int idx; struct ahci_ioreq *ioreq; STAILQ_FOREACH(ioreq, &port->iofhd, io_flist) { idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq); SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); } idx = -1; SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); TAILQ_FOREACH(ioreq, &port->iobhd, io_blist) { idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq); SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); /* * Snapshot only the busy requests; other requests are * not valid. 
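 * Free-list entries carry no in-flight guest data, so only their
 * slot indices are recorded above; busy requests additionally have
 * their blockif request captured below.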
*/ ret = blockif_snapshot_req(&ioreq->io_req, meta); if (ret != 0) { fprintf(stderr, "%s: failed to snapshot req\r\n", __func__); goto done; } } idx = -1; SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); done: return (ret); } static int pci_ahci_snapshot_restore_queues(struct ahci_port *port, struct vm_snapshot_meta *meta) { int ret; int idx; struct ahci_ioreq *ioreq; /* Empty the free queue before restoring. */ while (!STAILQ_EMPTY(&port->iofhd)) STAILQ_REMOVE_HEAD(&port->iofhd, io_flist); /* Restore the free queue. */ while (1) { SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); if (idx == -1) break; STAILQ_INSERT_TAIL(&port->iofhd, &port->ioreq[idx], io_flist); } /* Restore the busy queue. */ while (1) { SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); if (idx == -1) break; ioreq = &port->ioreq[idx]; TAILQ_INSERT_TAIL(&port->iobhd, ioreq, io_blist); /* * Restore only the busy requests; other requests are * not valid. */ ret = blockif_snapshot_req(&ioreq->io_req, meta); if (ret != 0) { fprintf(stderr, "%s: failed to restore request\r\n", __func__); goto done; } /* Re-enqueue the requests in the block interface. */ if (ioreq->readop) ret = blockif_read(port->bctx, &ioreq->io_req); else ret = blockif_write(port->bctx, &ioreq->io_req); if (ret != 0) { fprintf(stderr, "%s: failed to re-enqueue request\r\n", __func__); goto done; } } done: return (ret); } static int pci_ahci_snapshot(struct vm_snapshot_meta *meta) { int i, j, ret; void *bctx; struct pci_devinst *pi; struct pci_ahci_softc *sc; struct ahci_port *port; struct ahci_cmd_hdr *hdr; struct ahci_ioreq *ioreq; pi = meta->dev_data; sc = pi->pi_arg; /* TODO: add mtx lock/unlock */ SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done); for (i = 0; i < MAX_PORTS; i++) { port = &sc->port[i]; if (meta->op == VM_SNAPSHOT_SAVE) bctx = port->bctx; SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done); /* Mostly for restore; save is ensured by the lines above. 
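 * A port must have a backing device in the snapshot exactly when it
 * has one in the running configuration; any mismatch aborts the
 * restore with EINVAL.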
*/ if (((bctx == NULL) && (port->bctx != NULL)) || ((bctx != NULL) && (port->bctx == NULL))) { fprintf(stderr, "%s: ports not matching\r\n", __func__); ret = EINVAL; goto done; } if (port->bctx == NULL) continue; if (port->port != i) { fprintf(stderr, "%s: ports not matching: " "actual: %d expected: %d\r\n", __func__, port->port, i); ret = EINVAL; goto done; } SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->cmd_lst, AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->rfis, 256, false, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ata_ident, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done); for (j = 0; j < port->ioqsz; j++) { ioreq = &port->ioreq[j]; /* blockif_req snapshot done only for busy requests. */ hdr = (struct ahci_cmd_hdr *)(port->cmd_lst + ioreq->slot * AHCI_CL_SIZE); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ioreq->cfis, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry), false, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(ioreq->len, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(ioreq->done, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(ioreq->slot, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(ioreq->more, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(ioreq->readop, meta, ret, done); } /* Perform save / restore specific operations. 
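 * Any other snapshot operation is rejected with EINVAL; the port's
 * blockif state is processed afterwards for both save and restore.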
*/ if (meta->op == VM_SNAPSHOT_SAVE) { ret = pci_ahci_snapshot_save_queues(port, meta); if (ret != 0) goto done; } else if (meta->op == VM_SNAPSHOT_RESTORE) { ret = pci_ahci_snapshot_restore_queues(port, meta); if (ret != 0) goto done; } else { ret = EINVAL; goto done; } ret = blockif_snapshot(port->bctx, meta); if (ret != 0) { fprintf(stderr, "%s: failed to restore blockif\r\n", __func__); goto done; } } done: return (ret); } static int pci_ahci_pause(struct vmctx *ctx, struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_pause(bctxt); } return (0); } static int pci_ahci_resume(struct vmctx *ctx, struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_resume(bctxt); } return (0); } #endif /* * Use separate emulation names to distinguish drive and atapi devices */ struct pci_devemu pci_de_ahci = { .pe_emu = "ahci", - .pe_init = pci_ahci_hd_init, + .pe_init = pci_ahci_init, + .pe_legacy_config = pci_ahci_legacy_config, .pe_barwrite = pci_ahci_write, .pe_barread = pci_ahci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_ahci_snapshot, .pe_pause = pci_ahci_pause, .pe_resume = pci_ahci_resume, #endif }; PCI_EMUL_SET(pci_de_ahci); struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", - .pe_init = pci_ahci_hd_init, - .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read, -#ifdef BHYVE_SNAPSHOT - .pe_snapshot = pci_ahci_snapshot, - .pe_pause = pci_ahci_pause, - .pe_resume = pci_ahci_resume, -#endif + .pe_legacy_config = pci_ahci_hd_legacy_config, + .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_hd); struct pci_devemu pci_de_ahci_cd = { .pe_emu = "ahci-cd", - .pe_init = pci_ahci_atapi_init, - .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read, -#ifdef BHYVE_SNAPSHOT - .pe_snapshot = pci_ahci_snapshot, - .pe_pause = pci_ahci_pause, - .pe_resume = pci_ahci_resume, -#endif + .pe_legacy_config = pci_ahci_cd_legacy_config, + .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr.sbin/bhyve/pci_e82545.c b/usr.sbin/bhyve/pci_e82545.c index 2d09c024f258..d0b4fb9e466f 100644 --- a/usr.sbin/bhyve/pci_e82545.c +++ b/usr.sbin/bhyve/pci_e82545.c @@ -1,2550 +1,2513 @@ /* * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Alexander Motin * Copyright (c) 2015 Peter Grehan * Copyright (c) 2013 Jeremiah Lott, Avere Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "e1000_regs.h" #include "e1000_defines.h" #include "mii.h" #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "pci_emul.h" #include "mevent.h" #include "net_utils.h" #include "net_backends.h" /* Hardware/register definitions XXX: move some to common code. */ #define E82545_VENDOR_ID_INTEL 0x8086 #define E82545_DEV_ID_82545EM_COPPER 0x100F #define E82545_SUBDEV_ID 0x1008 #define E82545_REVISION_4 4 #define E82545_MDIC_DATA_MASK 0x0000FFFF #define E82545_MDIC_OP_MASK 0x0c000000 #define E82545_MDIC_IE 0x20000000 #define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */ #define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */ #define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */ #define E82545_BAR_REGISTER 0 #define E82545_BAR_REGISTER_LEN (128*1024) #define E82545_BAR_FLASH 1 #define E82545_BAR_FLASH_LEN (64*1024) #define E82545_BAR_IO 2 #define E82545_BAR_IO_LEN 8 #define E82545_IOADDR 0x00000000 #define E82545_IODATA 0x00000004 #define E82545_IO_REGISTER_MAX 0x0001FFFF #define E82545_IO_FLASH_BASE 0x00080000 #define E82545_IO_FLASH_MAX 0x000FFFFF #define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2)) #define E82545_RAR_MAX 15 #define E82545_MTA_MAX 127 #define E82545_VFTA_MAX 127 /* Slightly modified from the driver versions, hardcoded for 3 opcode bits, * followed by 6 address bits. * TODO: make opcode bits and addr bits configurable? * NVM Commands - Microwire */ #define E82545_NVM_OPCODE_BITS 3 #define E82545_NVM_ADDR_BITS 6 #define E82545_NVM_DATA_BITS 16 #define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS) #define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1) #define E82545_NVM_OPCODE_MASK \ (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS) #define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */ #define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */ #define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */ #define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */ #define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128K */ #define E1000_ICR_SRPD 0x00010000 /* This is an arbitrary number. There is no hard limit on the chip. 
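 * It bounds the per-transmit iovec arrays; a frame spanning more
 * descriptors than this is dropped with a warning rather than
 * truncated.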
*/ #define I82545_MAX_TXSEGS 64 /* Legacy receive descriptor */ struct e1000_rx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ uint16_t length; /* Length of data DMAed into data buffer */ uint16_t csum; /* Packet checksum */ uint8_t status; /* Descriptor status */ uint8_t errors; /* Descriptor Errors */ uint16_t special; }; /* Transmit descriptor types */ #define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000) #define E1000_TXD_TYP_L (0) #define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C) #define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D) /* Legacy transmit descriptor */ struct e1000_tx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t cso; /* Checksum offset */ uint8_t cmd; /* Descriptor control */ } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t css; /* Checksum start */ uint16_t special; } fields; } upper; }; /* Context descriptor */ struct e1000_context_desc { union { uint32_t ip_config; struct { uint8_t ipcss; /* IP checksum start */ uint8_t ipcso; /* IP checksum offset */ uint16_t ipcse; /* IP checksum end */ } ip_fields; } lower_setup; union { uint32_t tcp_config; struct { uint8_t tucss; /* TCP checksum start */ uint8_t tucso; /* TCP checksum offset */ uint16_t tucse; /* TCP checksum end */ } tcp_fields; } upper_setup; uint32_t cmd_and_length; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t hdr_len; /* Header length */ uint16_t mss; /* Maximum segment size */ } fields; } tcp_seg_setup; }; /* Data descriptor */ struct e1000_data_desc { uint64_t buffer_addr; /* Address of the descriptor's buffer address */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t typ_len_ext; uint8_t cmd; } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t popts; /* Packet Options */ uint16_t special; } fields; } upper; }; union e1000_tx_udesc { struct e1000_tx_desc td; struct e1000_context_desc cd; struct e1000_data_desc dd; }; /* Tx checksum info for a packet. */ struct ck_info { int ck_valid; /* ck_info is valid */ uint8_t ck_start; /* start byte of cksum calcuation */ uint8_t ck_off; /* offset of cksum insertion */ uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */ }; /* * Debug printf */ static int e82545_debug = 0; #define WPRINTF(msg,params...) PRINTLN("e82545: " msg, params) #define DPRINTF(msg,params...) 
if (e82545_debug) WPRINTF(msg, params) #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) /* s/w representation of the RAL/RAH regs */ struct eth_uni { int eu_valid; int eu_addrsel; struct ether_addr eu_eth; }; struct e82545_softc { struct pci_devinst *esc_pi; struct vmctx *esc_ctx; struct mevent *esc_mevpitr; pthread_mutex_t esc_mtx; struct ether_addr esc_mac; net_backend_t *esc_be; /* General */ uint32_t esc_CTRL; /* x0000 device ctl */ uint32_t esc_FCAL; /* x0028 flow ctl addr lo */ uint32_t esc_FCAH; /* x002C flow ctl addr hi */ uint32_t esc_FCT; /* x0030 flow ctl type */ uint32_t esc_VET; /* x0038 VLAN eth type */ uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */ uint32_t esc_LEDCTL; /* x0E00 LED control */ uint32_t esc_PBA; /* x1000 pkt buffer allocation */ /* Interrupt control */ int esc_irq_asserted; uint32_t esc_ICR; /* x00C0 cause read/clear */ uint32_t esc_ITR; /* x00C4 intr throttling */ uint32_t esc_ICS; /* x00C8 cause set */ uint32_t esc_IMS; /* x00D0 mask set/read */ uint32_t esc_IMC; /* x00D8 mask clear */ /* Transmit */ union e1000_tx_udesc *esc_txdesc; struct e1000_context_desc esc_txctx; pthread_t esc_tx_tid; pthread_cond_t esc_tx_cond; int esc_tx_enabled; int esc_tx_active; uint32_t esc_TXCW; /* x0178 transmit config */ uint32_t esc_TCTL; /* x0400 transmit ctl */ uint32_t esc_TIPG; /* x0410 inter-packet gap */ uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */ uint64_t esc_tdba; /* verified 64-bit desc table addr */ uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */ uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */ uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */ uint16_t esc_TDH; /* x3810 desc table head idx */ uint16_t esc_TDHr; /* internal read version of TDH */ uint16_t esc_TDT; /* x3818 desc table tail idx */ uint32_t esc_TIDV; /* x3820 intr delay */ uint32_t esc_TXDCTL; /* x3828 desc control */ uint32_t esc_TADV; /* x382C intr absolute delay */ /* L2 frame acceptance */ struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */ uint32_t esc_fmcast[128]; /* Multicast filter bit-match */ uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */ /* Receive */ struct e1000_rx_desc *esc_rxdesc; pthread_cond_t esc_rx_cond; int esc_rx_enabled; int esc_rx_active; int esc_rx_loopback; uint32_t esc_RCTL; /* x0100 receive ctl */ uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */ uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */ uint64_t esc_rdba; /* verified 64-bit desc table addr */ uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */ uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/ uint32_t esc_RDLEN; /* x2808 #descriptors */ uint16_t esc_RDH; /* x2810 desc table head idx */ uint16_t esc_RDT; /* x2818 desc table tail idx */ uint32_t esc_RDTR; /* x2820 intr delay */ uint32_t esc_RXDCTL; /* x2828 desc control */ uint32_t esc_RADV; /* x282C intr absolute delay */ uint32_t esc_RSRPD; /* x2C00 recv small packet detect */ uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */ /* IO Port register access */ uint32_t io_addr; /* Shadow copy of MDIC */ uint32_t mdi_control; /* Shadow copy of EECD */ uint32_t eeprom_control; /* Latest NVM in/out */ uint16_t nvm_data; uint16_t nvm_opaddr; /* stats */ uint32_t missed_pkt_count; /* dropped for no room in rx queue */ uint32_t pkt_rx_by_size[6]; uint32_t pkt_tx_by_size[6]; uint32_t good_pkt_rx_count; uint32_t bcast_pkt_rx_count; uint32_t mcast_pkt_rx_count; uint32_t good_pkt_tx_count; uint32_t bcast_pkt_tx_count; uint32_t mcast_pkt_tx_count; uint32_t oversize_rx_count; 
uint32_t tso_tx_count; uint64_t good_octets_rx; uint64_t good_octets_tx; uint64_t missed_octets; /* counts missed and oversized */ uint8_t nvm_bits:6; /* number of bits remaining in/out */ uint8_t nvm_mode:2; #define E82545_NVM_MODE_OPADDR 0x0 #define E82545_NVM_MODE_DATAIN 0x1 #define E82545_NVM_MODE_DATAOUT 0x2 /* EEPROM data */ uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; }; static void e82545_reset(struct e82545_softc *sc, int dev); static void e82545_rx_enable(struct e82545_softc *sc); static void e82545_rx_disable(struct e82545_softc *sc); static void e82545_rx_callback(int fd, enum ev_type type, void *param); static void e82545_tx_start(struct e82545_softc *sc); static void e82545_tx_enable(struct e82545_softc *sc); static void e82545_tx_disable(struct e82545_softc *sc); static inline int e82545_size_stat_index(uint32_t size) { if (size <= 64) { return 0; } else if (size >= 1024) { return 5; } else { /* should be 1-4 */ return (ffs(size) - 6); } } static void e82545_init_eeprom(struct e82545_softc *sc) { uint16_t checksum, i; /* mac addr */ sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) | (((uint16_t)sc->esc_mac.octet[1]) << 8); sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) | (((uint16_t)sc->esc_mac.octet[3]) << 8); sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) | (((uint16_t)sc->esc_mac.octet[5]) << 8); /* pci ids */ sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID; sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL; sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER; sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL; /* fill in the checksum */ checksum = 0; for (i = 0; i < NVM_CHECKSUM_REG; i++) { checksum += sc->eeprom_data[i]; } checksum = NVM_SUM - checksum; sc->eeprom_data[NVM_CHECKSUM_REG] = checksum; DPRINTF("eeprom checksum: 0x%x", checksum); } static void e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr, uint8_t phy_addr, uint32_t data) { DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x", reg_addr, phy_addr, data); } static uint32_t e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr, uint8_t phy_addr) { //DPRINTF("Read mdi reg:0x%x phy:0x%x", reg_addr, phy_addr); switch (reg_addr) { case PHY_STATUS: return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | MII_SR_AUTONEG_COMPLETE); case PHY_AUTONEG_ADV: return NWAY_AR_SELECTOR_FIELD; case PHY_LP_ABILITY: return 0; case PHY_1000T_STATUS: return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS | SR_1000T_LOCAL_RX_STATUS); case PHY_ID1: return (M88E1011_I_PHY_ID >> 16) & 0xFFFF; case PHY_ID2: return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF; default: DPRINTF("Unknown mdi read reg:0x%x phy:0x%x", reg_addr, phy_addr); return 0; } /* not reached */ } static void e82545_eecd_strobe(struct e82545_softc *sc) { /* Microwire state machine */ /* DPRINTF("eeprom state machine srtobe " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data);*/ if (sc->nvm_bits == 0) { DPRINTF("eeprom state machine not expecting data! " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); return; } sc->nvm_bits--; if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) { /* shifting out */ if (sc->nvm_data & 0x8000) { sc->eeprom_control |= E1000_EECD_DO; } else { sc->eeprom_control &= ~E1000_EECD_DO; } sc->nvm_data <<= 1; if (sc->nvm_bits == 0) { /* read done, back to opcode mode. 
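 * The Microwire state machine rearms for the next 3-bit opcode and
 * 6-bit address once all 16 data bits have been shifted out.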
*/ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) { /* shifting in */ sc->nvm_data <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_data |= 1; } if (sc->nvm_bits == 0) { /* eeprom write */ uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; if (op != E82545_NVM_OPCODE_WRITE) { DPRINTF("Illegal eeprom write op 0x%x", sc->nvm_opaddr); } else if (addr >= E82545_NVM_EEPROM_SIZE) { DPRINTF("Illegal eeprom write addr 0x%x", sc->nvm_opaddr); } else { DPRINTF("eeprom write eeprom[0x%x] = 0x%x", addr, sc->nvm_data); sc->eeprom_data[addr] = sc->nvm_data; } /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) { sc->nvm_opaddr <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_opaddr |= 1; } if (sc->nvm_bits == 0) { uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; switch (op) { case E82545_NVM_OPCODE_EWEN: DPRINTF("eeprom write enable: 0x%x", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; break; case E82545_NVM_OPCODE_READ: { uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; sc->nvm_mode = E82545_NVM_MODE_DATAOUT; sc->nvm_bits = E82545_NVM_DATA_BITS; if (addr < E82545_NVM_EEPROM_SIZE) { sc->nvm_data = sc->eeprom_data[addr]; DPRINTF("eeprom read: eeprom[0x%x] = 0x%x", addr, sc->nvm_data); } else { DPRINTF("eeprom illegal read: 0x%x", sc->nvm_opaddr); sc->nvm_data = 0; } break; } case E82545_NVM_OPCODE_WRITE: sc->nvm_mode = E82545_NVM_MODE_DATAIN; sc->nvm_bits = E82545_NVM_DATA_BITS; sc->nvm_data = 0; break; default: DPRINTF("eeprom unknown op: 0x%x", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } } else { DPRINTF("eeprom state machine wrong state! " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); } } static void e82545_itr_callback(int fd, enum ev_type type, void *param) { uint32_t new; struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); new = sc->esc_ICR & sc->esc_IMS; if (new && !sc->esc_irq_asserted) { DPRINTF("itr callback: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); } else { mevent_delete(sc->esc_mevpitr); sc->esc_mevpitr = NULL; } pthread_mutex_unlock(&sc->esc_mtx); } static void e82545_icr_assert(struct e82545_softc *sc, uint32_t bits) { uint32_t new; DPRINTF("icr assert: 0x%x", bits); /* * An interrupt is only generated if bits are set that * aren't already in the ICR, these bits are unmasked, * and there isn't an interrupt already pending. 
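 * ITR counts 256 ns units, so (esc_ITR + 3905) / 3906 below rounds
 * the throttling interval up to whole milliseconds for mevent_add().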
*/ new = bits & ~sc->esc_ICR & sc->esc_IMS; sc->esc_ICR |= bits; if (new == 0) { DPRINTF("icr assert: masked %x, ims %x", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("icr assert: throttled %x, ims %x", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("icr assert: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_ims_change(struct e82545_softc *sc, uint32_t bits) { uint32_t new; /* * Changing the mask may allow previously asserted * but masked interrupt requests to generate an interrupt. */ new = bits & sc->esc_ICR & ~sc->esc_IMS; sc->esc_IMS |= bits; if (new == 0) { DPRINTF("ims change: masked %x, ims %x", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("ims change: throttled %x, ims %x", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("ims change: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits) { DPRINTF("icr deassert: 0x%x", bits); sc->esc_ICR &= ~bits; /* * If there are no longer any interrupt sources and there * was an asserted interrupt, clear it */ if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { DPRINTF("icr deassert: lintr deassert %x", bits); pci_lintr_deassert(sc->esc_pi); sc->esc_irq_asserted = 0; } } static void e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value) { DPRINTF("intr_write: off %x, val %x", offset, value); switch (offset) { case E1000_ICR: e82545_icr_deassert(sc, value); break; case E1000_ITR: sc->esc_ITR = value; break; case E1000_ICS: sc->esc_ICS = value; /* not used: store for debug */ e82545_icr_assert(sc, value); break; case E1000_IMS: e82545_ims_change(sc, value); break; case E1000_IMC: sc->esc_IMC = value; /* for debug */ sc->esc_IMS &= ~value; // XXX clear interrupts if all ICR bits now masked // and interrupt was pending ? break; default: break; } } static uint32_t e82545_intr_read(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; retval = 0; DPRINTF("intr_read: off %x", offset); switch (offset) { case E1000_ICR: retval = sc->esc_ICR; sc->esc_ICR = 0; e82545_icr_deassert(sc, ~0); break; case E1000_ITR: retval = sc->esc_ITR; break; case E1000_ICS: /* write-only register */ break; case E1000_IMS: retval = sc->esc_IMS; break; case E1000_IMC: /* write-only register */ break; default: break; } return (retval); } static void e82545_devctl(struct e82545_softc *sc, uint32_t val) { sc->esc_CTRL = val & ~E1000_CTRL_RST; if (val & E1000_CTRL_RST) { DPRINTF("e1k: s/w reset, ctl %x", val); e82545_reset(sc, 1); } /* XXX check for phy reset ? */ } static void e82545_rx_update_rdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 | sc->esc_RDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, sc->esc_rdba, sc->esc_RDLEN); } static void e82545_rx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN); /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */ sc->esc_RCTL = val & ~0xF9204c01; DPRINTF("rx_ctl - %s RCTL %x, val %x", on ? 
"on" : "off", sc->esc_RCTL, val); /* state change requested */ if (on != sc->esc_rx_enabled) { if (on) { /* Catch disallowed/unimplemented settings */ //assert(!(val & E1000_RCTL_LBM_TCVR)); if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) { sc->esc_rx_loopback = 1; } else { sc->esc_rx_loopback = 0; } e82545_rx_update_rdba(sc); e82545_rx_enable(sc); } else { e82545_rx_disable(sc); sc->esc_rx_loopback = 0; sc->esc_rdba = 0; sc->esc_rxdesc = NULL; } } } static void e82545_tx_update_tdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba, sc->esc_TDLEN); } static void e82545_tx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN); /* ignore TCTL_EN settings that don't change state */ if (on == sc->esc_tx_enabled) return; if (on) { e82545_tx_update_tdba(sc); e82545_tx_enable(sc); } else { e82545_tx_disable(sc); sc->esc_tdba = 0; sc->esc_txdesc = NULL; } /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */ sc->esc_TCTL = val & ~0xFE800005; } int e82545_bufsz(uint32_t rctl) { switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) { case (E1000_RCTL_SZ_2048): return (2048); case (E1000_RCTL_SZ_1024): return (1024); case (E1000_RCTL_SZ_512): return (512); case (E1000_RCTL_SZ_256): return (256); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096); } return (256); /* Forbidden value. */ } /* XXX one packet at a time until this is debugged */ static void e82545_rx_callback(int fd, enum ev_type type, void *param) { struct e82545_softc *sc = param; struct e1000_rx_desc *rxd; struct iovec vec[64]; int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size; uint32_t cause = 0; uint16_t *tp, tag, head; pthread_mutex_lock(&sc->esc_mtx); DPRINTF("rx_run: head %x, tail %x", sc->esc_RDH, sc->esc_RDT); if (!sc->esc_rx_enabled || sc->esc_rx_loopback) { DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped", sc->esc_rx_enabled, sc->esc_rx_loopback); while (netbe_rx_discard(sc->esc_be) > 0) { } goto done1; } bufsz = e82545_bufsz(sc->esc_RCTL); maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522; maxpktdesc = (maxpktsz + bufsz - 1) / bufsz; size = sc->esc_RDLEN / 16; head = sc->esc_RDH; left = (size + sc->esc_RDT - head) % size; if (left < maxpktdesc) { DPRINTF("rx overflow (%d < %d) -- packet(s) dropped", left, maxpktdesc); while (netbe_rx_discard(sc->esc_be) > 0) { } goto done1; } sc->esc_rx_active = 1; pthread_mutex_unlock(&sc->esc_mtx); for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) { /* Grab rx descriptor pointed to by the head pointer */ for (i = 0; i < maxpktdesc; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; vec[i].iov_base = paddr_guest2host(sc->esc_ctx, rxd->buffer_addr, bufsz); vec[i].iov_len = bufsz; } len = netbe_recv(sc->esc_be, vec, maxpktdesc); if (len <= 0) { DPRINTF("netbe_recv() returned %d", len); goto done; } /* * Adjust the packet length based on whether the CRC needs * to be stripped or if the packet is less than the minimum * eth packet size. */ if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) len = ETHER_MIN_LEN - ETHER_CRC_LEN; if (!(sc->esc_RCTL & E1000_RCTL_SECRC)) len += ETHER_CRC_LEN; n = (len + bufsz - 1) / bufsz; DPRINTF("packet read %d bytes, %d segs, head %d", len, n, head); /* Apply VLAN filter. 
*/ tp = (uint16_t *)vec[0].iov_base + 6; if ((sc->esc_RCTL & E1000_RCTL_VFE) && (ntohs(tp[0]) == sc->esc_VET)) { tag = ntohs(tp[1]) & 0x0fff; if ((sc->esc_fvlan[tag >> 5] & (1 << (tag & 0x1f))) != 0) { DPRINTF("known VLAN %d", tag); } else { DPRINTF("unknown VLAN %d", tag); n = 0; continue; } } /* Update all consumed descriptors. */ for (i = 0; i < n - 1; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; rxd->status = E1000_RXD_STAT_DD; } rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = len % bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; /* XXX signal no checksum for now */ rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM | E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD; /* Schedule receive interrupts. */ if (len <= sc->esc_RSRPD) { cause |= E1000_ICR_SRPD | E1000_ICR_RXT0; } else { /* XXX: RDRT and RADV timers should be here. */ cause |= E1000_ICR_RXT0; } head = (head + n) % size; left -= n; } done: pthread_mutex_lock(&sc->esc_mtx); sc->esc_rx_active = 0; if (sc->esc_rx_enabled == 0) pthread_cond_signal(&sc->esc_rx_cond); sc->esc_RDH = head; /* Respect E1000_RCTL_RDMTS */ left = (size + sc->esc_RDT - head) % size; if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1))) cause |= E1000_ICR_RXDMT0; /* Assert all accumulated interrupts. */ if (cause != 0) e82545_icr_assert(sc, cause); done1: DPRINTF("rx_run done: head %x, tail %x", sc->esc_RDH, sc->esc_RDT); pthread_mutex_unlock(&sc->esc_mtx); } static uint16_t e82545_carry(uint32_t sum) { sum = (sum & 0xFFFF) + (sum >> 16); if (sum > 0xFFFF) sum -= 0xFFFF; return (sum); } static uint16_t e82545_buf_checksum(uint8_t *buf, int len) { int i; uint32_t sum = 0; /* Checksum all the pairs of bytes first... */ for (i = 0; i < (len & ~1U); i += 2) sum += *((u_int16_t *)(buf + i)); /* * If there's a single byte left over, checksum it, too. * Network byte order is big-endian, so the remaining byte is * the high byte. */ if (i < len) sum += htons(buf[i] << 8); return (e82545_carry(sum)); } static uint16_t e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len) { int now, odd; uint32_t sum = 0, s; /* Skip completely unneeded vectors. */ while (iovcnt > 0 && iov->iov_len <= off && off > 0) { off -= iov->iov_len; iov++; iovcnt--; } /* Calculate checksum of requested range. */ odd = 0; while (len > 0 && iovcnt > 0) { now = MIN(len, iov->iov_len - off); s = e82545_buf_checksum(iov->iov_base + off, now); sum += odd ? (s << 8) : s; odd ^= (now & 1); len -= now; off = 0; iov++; iovcnt--; } return (e82545_carry(sum)); } /* * Return the transmit descriptor type. */ int e82545_txdesc_type(uint32_t lower) { int type; type = 0; if (lower & E1000_TXD_CMD_DEXT) type = lower & E1000_TXD_MASK; return (type); } static void e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck) { uint16_t cksum; int cklen; DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d", iovcnt, ck->ck_start, ck->ck_off, ck->ck_len); cklen = ck->ck_len ? 
ck->ck_len - ck->ck_start + 1 : INT_MAX; cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen); *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum; } static void e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt) { if (sc->esc_be == NULL) return; (void) netbe_send(sc->esc_be, iov, iovcnt); } static void e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, int *tdwb) { union e1000_tx_udesc *dsc; for ( ; head != tail; head = (head + 1) % dsize) { dsc = &sc->esc_txdesc[head]; if (dsc->td.lower.data & E1000_TXD_CMD_RS) { dsc->td.upper.data |= E1000_TXD_STAT_DD; *tdwb = 1; } } } static int e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, uint16_t *rhead, int *tdwb) { uint8_t *hdr, *hdrp; struct iovec iovb[I82545_MAX_TXSEGS + 2]; struct iovec tiov[I82545_MAX_TXSEGS + 2]; struct e1000_context_desc *cd; struct ck_info ckinfo[2]; struct iovec *iov; union e1000_tx_udesc *dsc; int desc, dtype, len, ntype, iovcnt, tlen, tcp, tso; int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff; unsigned hdrlen, vlen; uint32_t tcpsum, tcpseq; uint16_t ipcs, tcpcs, ipid, ohead; ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0; iovcnt = 0; tlen = 0; ntype = 0; tso = 0; ohead = head; /* iovb[0/1] may be used for writable copy of headers. */ iov = &iovb[2]; for (desc = 0; ; desc++, head = (head + 1) % dsize) { if (head == tail) { *rhead = head; return (0); } dsc = &sc->esc_txdesc[head]; dtype = e82545_txdesc_type(dsc->td.lower.data); if (desc == 0) { switch (dtype) { case E1000_TXD_TYP_C: DPRINTF("tx ctxt desc idx %d: %016jx " "%08x%08x", head, dsc->td.buffer_addr, dsc->td.upper.data, dsc->td.lower.data); /* Save context and return */ sc->esc_txctx = dsc->cd; goto done; case E1000_TXD_TYP_L: DPRINTF("tx legacy desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); /* * legacy cksum start valid in first descriptor */ ntype = dtype; ckinfo[0].ck_start = dsc->td.upper.fields.css; break; case E1000_TXD_TYP_D: DPRINTF("tx data desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); ntype = dtype; break; default: break; } } else { /* Descriptor type must be consistent */ assert(dtype == ntype); DPRINTF("tx next desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); } len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length : dsc->dd.lower.data & 0xFFFFF; if (len > 0) { /* Strip checksum supplied by guest. */ if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 && (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0) len -= 2; tlen += len; if (iovcnt < I82545_MAX_TXSEGS) { iov[iovcnt].iov_base = paddr_guest2host( sc->esc_ctx, dsc->td.buffer_addr, len); iov[iovcnt].iov_len = len; } iovcnt++; } /* * Pull out info that is valid in the final descriptor * and exit descriptor loop. 
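 * For data descriptors the checksum fields come from the saved
 * context descriptor, and TSE here selects the TSO path for the
 * whole chain; legacy descriptors carry their checksum offset inline.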
*/ if (dsc->td.lower.data & E1000_TXD_CMD_EOP) { if (dtype == E1000_TXD_TYP_L) { if (dsc->td.lower.data & E1000_TXD_CMD_IC) { ckinfo[0].ck_valid = 1; ckinfo[0].ck_off = dsc->td.lower.flags.cso; ckinfo[0].ck_len = 0; } } else { cd = &sc->esc_txctx; if (dsc->dd.lower.data & E1000_TXD_CMD_TSE) tso = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM) ckinfo[0].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM || tso) { ckinfo[0].ck_start = cd->lower_setup.ip_fields.ipcss; ckinfo[0].ck_off = cd->lower_setup.ip_fields.ipcso; ckinfo[0].ck_len = cd->lower_setup.ip_fields.ipcse; } if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM) ckinfo[1].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM || tso) { ckinfo[1].ck_start = cd->upper_setup.tcp_fields.tucss; ckinfo[1].ck_off = cd->upper_setup.tcp_fields.tucso; ckinfo[1].ck_len = cd->upper_setup.tcp_fields.tucse; } } break; } } if (iovcnt > I82545_MAX_TXSEGS) { WPRINTF("tx too many descriptors (%d > %d) -- dropped", iovcnt, I82545_MAX_TXSEGS); goto done; } hdrlen = vlen = 0; /* Estimate writable space for VLAN header insertion. */ if ((sc->esc_CTRL & E1000_CTRL_VME) && (dsc->td.lower.data & E1000_TXD_CMD_VLE)) { hdrlen = ETHER_ADDR_LEN*2; vlen = ETHER_VLAN_ENCAP_LEN; } if (!tso) { /* Estimate required writable space for checksums. */ if (ckinfo[0].ck_valid) hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2); if (ckinfo[1].ck_valid) hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2); /* Round up writable space to the first vector. */ if (hdrlen != 0 && iov[0].iov_len > hdrlen && iov[0].iov_len < hdrlen + 100) hdrlen = iov[0].iov_len; } else { /* In case of TSO header length provided by software. */ hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len; /* * Cap the header length at 240 based on 7.2.4.5 of * the Intel 82576EB (Rev 2.63) datasheet. */ if (hdrlen > 240) { WPRINTF("TSO hdrlen too large: %d", hdrlen); goto done; } /* * If VLAN insertion is requested, ensure the header * at least holds the amount of data copied during * VLAN insertion below. * * XXX: Realistic packets will include a full Ethernet * header before the IP header at ckinfo[0].ck_start, * but this check is sufficient to prevent * out-of-bounds access below. */ if (vlen != 0 && hdrlen < ETHER_ADDR_LEN*2) { WPRINTF("TSO hdrlen too small for vlan insertion " "(%d vs %d) -- dropped", hdrlen, ETHER_ADDR_LEN*2); goto done; } /* * Ensure that the header length covers the used fields * in the IP and TCP headers as well as the IP and TCP * checksums. The following fields are accessed below: * * Header | Field | Offset | Length * -------+-------+--------+------- * IPv4 | len | 2 | 2 * IPv4 | ID | 4 | 2 * IPv6 | len | 4 | 2 * TCP | seq # | 4 | 4 * TCP | flags | 13 | 1 * UDP | len | 4 | 4 */ if (hdrlen < ckinfo[0].ck_start + 6 || hdrlen < ckinfo[0].ck_off + 2) { WPRINTF("TSO hdrlen too small for IP fields (%d) " "-- dropped", hdrlen); goto done; } if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) { if (hdrlen < ckinfo[1].ck_start + 14 || (ckinfo[1].ck_valid && hdrlen < ckinfo[1].ck_off + 2)) { WPRINTF("TSO hdrlen too small for TCP fields " "(%d) -- dropped", hdrlen); goto done; } } else { if (hdrlen < ckinfo[1].ck_start + 8) { WPRINTF("TSO hdrlen too small for UDP fields " "(%d) -- dropped", hdrlen); goto done; } } } /* Allocate, fill and prepend writable header vector. 
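 * The header bytes are copied from the guest buffers into a stack
 * allocation so checksum and VLAN fields can be patched without
 * writing to guest memory.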
*/ if (hdrlen != 0) { hdr = __builtin_alloca(hdrlen + vlen); hdr += vlen; for (left = hdrlen, hdrp = hdr; left > 0; left -= now, hdrp += now) { now = MIN(left, iov->iov_len); memcpy(hdrp, iov->iov_base, now); iov->iov_base += now; iov->iov_len -= now; if (iov->iov_len == 0) { iov++; iovcnt--; } } iov--; iovcnt++; iov->iov_base = hdr; iov->iov_len = hdrlen; } else hdr = NULL; /* Insert VLAN tag. */ if (vlen != 0) { hdr -= ETHER_VLAN_ENCAP_LEN; memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2); hdrlen += ETHER_VLAN_ENCAP_LEN; hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8; hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff; hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8; hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff; iov->iov_base = hdr; iov->iov_len += ETHER_VLAN_ENCAP_LEN; /* Correct checksum offsets after VLAN tag insertion. */ ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[0].ck_len != 0) ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[1].ck_len != 0) ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN; } /* Simple non-TSO case. */ if (!tso) { /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]); if (ckinfo[1].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]); e82545_transmit_backend(sc, iov, iovcnt); goto done; } /* Doing TSO. */ tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0; mss = sc->esc_txctx.tcp_seg_setup.fields.mss; paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff); DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs", tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt); ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]); tcpseq = 0; if (tcp) tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]); ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off]; tcpcs = 0; if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */ tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off]; pv = 1; pvoff = 0; for (seg = 0, left = paylen; left > 0; seg++, left -= now) { now = MIN(left, mss); /* Construct IOVs for the segment. */ /* Include whole original header. */ tiov[0].iov_base = hdr; tiov[0].iov_len = hdrlen; tiovcnt = 1; /* Include respective part of payload IOV. */ for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) { nnow = MIN(nleft, iov[pv].iov_len - pvoff); tiov[tiovcnt].iov_base = iov[pv].iov_base + pvoff; tiov[tiovcnt++].iov_len = nnow; if (pvoff + nnow == iov[pv].iov_len) { pv++; pvoff = 0; } else pvoff += nnow; } DPRINTF("tx segment %d %d+%d bytes %d iovs", seg, hdrlen, now, tiovcnt); /* Update IP header. */ if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) { /* IPv4 -- set length and ID */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] = htons(hdrlen - ckinfo[0].ck_start + now); *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(ipid + seg); } else { /* IPv6 -- set length */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(hdrlen - ckinfo[0].ck_start - 40 + now); } /* Update pseudo-header checksum. */ tcpsum = tcpcs; tcpsum += htons(hdrlen - ckinfo[1].ck_start + now); /* Update TCP/UDP headers. */ if (tcp) { /* Update sequence number and FIN/PUSH flags. */ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = htonl(tcpseq + paylen - left); if (now < left) { hdr[ckinfo[1].ck_start + 13] &= ~(TH_FIN | TH_PUSH); } } else { /* Update payload length. 
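 * For UDP the length field holds the UDP header plus this segment's
 * payload, i.e. everything from ck_start to the end of the segment.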
*/ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = hdrlen - ckinfo[1].ck_start + now; } /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) { *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs; e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]); } if (ckinfo[1].ck_valid) { *(uint16_t *)&hdr[ckinfo[1].ck_off] = e82545_carry(tcpsum); e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]); } e82545_transmit_backend(sc, tiov, tiovcnt); } done: head = (head + 1) % dsize; e82545_transmit_done(sc, ohead, head, dsize, tdwb); *rhead = head; return (desc + 1); } static void e82545_tx_run(struct e82545_softc *sc) { uint32_t cause; uint16_t head, rhead, tail, size; int lim, tdwb, sent; head = sc->esc_TDH; tail = sc->esc_TDT; size = sc->esc_TDLEN / 16; DPRINTF("tx_run: head %x, rhead %x, tail %x", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); pthread_mutex_unlock(&sc->esc_mtx); rhead = head; tdwb = 0; for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) { sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb); if (sent == 0) break; head = rhead; } pthread_mutex_lock(&sc->esc_mtx); sc->esc_TDH = head; sc->esc_TDHr = rhead; cause = 0; if (tdwb) cause |= E1000_ICR_TXDW; if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT) cause |= E1000_ICR_TXQE; if (cause) e82545_icr_assert(sc, cause); DPRINTF("tx_run done: head %x, rhead %x, tail %x", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); } static _Noreturn void * e82545_tx_thread(void *param) { struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); for (;;) { while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) { if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT) break; sc->esc_tx_active = 0; if (sc->esc_tx_enabled == 0) pthread_cond_signal(&sc->esc_tx_cond); pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } sc->esc_tx_active = 1; /* Process some tx descriptors. Lock dropped inside. 
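 * e82545_tx_run() drops esc_mtx while it walks the descriptor ring
 * so the guest can keep writing TDT, then retakes it to publish
 * TDH/TDHr and assert any accumulated interrupt causes.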
*/ e82545_tx_run(sc); } } static void e82545_tx_start(struct e82545_softc *sc) { if (sc->esc_tx_active == 0) pthread_cond_signal(&sc->esc_tx_cond); } static void e82545_tx_enable(struct e82545_softc *sc) { sc->esc_tx_enabled = 1; } static void e82545_tx_disable(struct e82545_softc *sc) { sc->esc_tx_enabled = 0; while (sc->esc_tx_active) pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } static void e82545_rx_enable(struct e82545_softc *sc) { sc->esc_rx_enabled = 1; } static void e82545_rx_disable(struct e82545_softc *sc) { sc->esc_rx_enabled = 0; while (sc->esc_rx_active) pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx); } static void e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) { struct eth_uni *eu; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV); eu->eu_addrsel = (wval >> 16) & 0x3; eu->eu_eth.octet[5] = wval >> 8; eu->eu_eth.octet[4] = wval; } else { /* RAL */ eu->eu_eth.octet[3] = wval >> 24; eu->eu_eth.octet[2] = wval >> 16; eu->eu_eth.octet[1] = wval >> 8; eu->eu_eth.octet[0] = wval; } } static uint32_t e82545_read_ra(struct e82545_softc *sc, int reg) { struct eth_uni *eu; uint32_t retval; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ retval = (eu->eu_valid << 31) | (eu->eu_addrsel << 16) | (eu->eu_eth.octet[5] << 8) | eu->eu_eth.octet[4]; } else { /* RAL */ retval = (eu->eu_eth.octet[3] << 24) | (eu->eu_eth.octet[2] << 16) | (eu->eu_eth.octet[1] << 8) | eu->eu_eth.octet[0]; } return (retval); } static void e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value) { int ridx; if (offset & 0x3) { DPRINTF("Unaligned register write offset:0x%x value:0x%x", offset, value); return; } DPRINTF("Register write: 0x%x value: 0x%x", offset, value); switch (offset) { case E1000_CTRL: case E1000_CTRL_DUP: e82545_devctl(sc, value); break; case E1000_FCAL: sc->esc_FCAL = value; break; case E1000_FCAH: sc->esc_FCAH = value & ~0xFFFF0000; break; case E1000_FCT: sc->esc_FCT = value & ~0xFFFF0000; break; case E1000_VET: sc->esc_VET = value & ~0xFFFF0000; break; case E1000_FCTTV: sc->esc_FCTTV = value & ~0xFFFF0000; break; case E1000_LEDCTL: sc->esc_LEDCTL = value & ~0x30303000; break; case E1000_PBA: sc->esc_PBA = value & 0x0000FF80; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: e82545_intr_write(sc, offset, value); break; case E1000_RCTL: e82545_rx_ctl(sc, value); break; case E1000_FCRTL: sc->esc_FCRTL = value & ~0xFFFF0007; break; case E1000_FCRTH: sc->esc_FCRTH = value & ~0xFFFF0007; break; case E1000_RDBAL(0): sc->esc_RDBAL = value & ~0xF; if (sc->esc_rx_enabled) { /* Apparently legal: update cached address */ e82545_rx_update_rdba(sc); } break; case E1000_RDBAH(0): assert(!sc->esc_rx_enabled); sc->esc_RDBAH = value; break; case E1000_RDLEN(0): assert(!sc->esc_rx_enabled); sc->esc_RDLEN = value & ~0xFFF0007F; break; case E1000_RDH(0): /* XXX should only ever be zero ? Range check ? */ sc->esc_RDH = value; break; case E1000_RDT(0): /* XXX if this opens up the rx ring, do something ? 
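 * The receive thread re-reads RDT at the start of each pass, so
 * buffers posted here are picked up on the next backend event.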
*/ sc->esc_RDT = value; break; case E1000_RDTR: /* ignore FPD bit 31 */ sc->esc_RDTR = value & ~0xFFFF0000; break; case E1000_RXDCTL(0): sc->esc_RXDCTL = value & ~0xFEC0C0C0; break; case E1000_RADV: sc->esc_RADV = value & ~0xFFFF0000; break; case E1000_RSRPD: sc->esc_RSRPD = value & ~0xFFFFF000; break; case E1000_RXCSUM: sc->esc_RXCSUM = value & ~0xFFFFF800; break; case E1000_TXCW: sc->esc_TXCW = value & ~0x3FFF0000; break; case E1000_TCTL: e82545_tx_ctl(sc, value); break; case E1000_TIPG: sc->esc_TIPG = value; break; case E1000_AIT: sc->esc_AIT = value; break; case E1000_TDBAL(0): sc->esc_TDBAL = value & ~0xF; if (sc->esc_tx_enabled) e82545_tx_update_tdba(sc); break; case E1000_TDBAH(0): sc->esc_TDBAH = value; if (sc->esc_tx_enabled) e82545_tx_update_tdba(sc); break; case E1000_TDLEN(0): sc->esc_TDLEN = value & ~0xFFF0007F; if (sc->esc_tx_enabled) e82545_tx_update_tdba(sc); break; case E1000_TDH(0): //assert(!sc->esc_tx_enabled); /* XXX should only ever be zero ? Range check ? */ sc->esc_TDHr = sc->esc_TDH = value; break; case E1000_TDT(0): /* XXX range check ? */ sc->esc_TDT = value; if (sc->esc_tx_enabled) e82545_tx_start(sc); break; case E1000_TIDV: sc->esc_TIDV = value & ~0xFFFF0000; break; case E1000_TXDCTL(0): //assert(!sc->esc_tx_enabled); sc->esc_TXDCTL = value & ~0xC0C0C0; break; case E1000_TADV: sc->esc_TADV = value & ~0xFFFF0000; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; e82545_write_ra(sc, ridx, value); break; case E1000_MTA ... (E1000_MTA + (127*4)): sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value; break; case E1000_EECD: { //DPRINTF("EECD write 0x%x -> 0x%x", sc->eeprom_control, value); /* edge triggered low->high */ uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ? 0 : (value & E1000_EECD_SK)); uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS| E1000_EECD_DI|E1000_EECD_REQ); sc->eeprom_control &= ~eecd_mask; sc->eeprom_control |= (value & eecd_mask); /* grant/revoke immediately */ if (value & E1000_EECD_REQ) { sc->eeprom_control |= E1000_EECD_GNT; } else { sc->eeprom_control &= ~E1000_EECD_GNT; } if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) { e82545_eecd_strobe(sc); } return; } case E1000_MDIC: { uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >> E1000_MDIC_REG_SHIFT); uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT); sc->mdi_control = (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST)); if ((value & E1000_MDIC_READY) != 0) { DPRINTF("Incorrect MDIC ready bit: 0x%x", value); return; } switch (value & E82545_MDIC_OP_MASK) { case E1000_MDIC_OP_READ: sc->mdi_control &= ~E82545_MDIC_DATA_MASK; sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr); break; case E1000_MDIC_OP_WRITE: e82545_write_mdi(sc, reg_addr, phy_addr, value & E82545_MDIC_DATA_MASK); break; default: DPRINTF("Unknown MDIC op: 0x%x", value); return; } /* TODO: barrier? 
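 *
 * A barrier is most likely unnecessary: both the MMIO and the
 * I/O-window paths take esc_mtx before reaching
 * e82545_read_register()/e82545_write_register(), so the guest
 * cannot observe READY before the data field is in place. For
 * reference, a guest PHY access is composed roughly as (sketch,
 * using the masks and shifts above):
 *
 *	mdic = data | (reg << E1000_MDIC_REG_SHIFT) |
 *	    (phy << E1000_MDIC_PHY_SHIFT) | E1000_MDIC_OP_WRITE;
 *	write E1000_MDIC, then poll until (mdic & E1000_MDIC_READY);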
*/ sc->mdi_control |= E1000_MDIC_READY; if (value & E82545_MDIC_IE) { // TODO: generate interrupt } return; } case E1000_MANC: case E1000_STATUS: return; default: DPRINTF("Unknown write register: 0x%x value:%x", offset, value); return; } } static uint32_t e82545_read_register(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; int ridx; if (offset & 0x3) { DPRINTF("Unaligned register read offset:0x%x", offset); return 0; } DPRINTF("Register read: 0x%x", offset); switch (offset) { case E1000_CTRL: retval = sc->esc_CTRL; break; case E1000_STATUS: retval = E1000_STATUS_FD | E1000_STATUS_LU | E1000_STATUS_SPEED_1000; break; case E1000_FCAL: retval = sc->esc_FCAL; break; case E1000_FCAH: retval = sc->esc_FCAH; break; case E1000_FCT: retval = sc->esc_FCT; break; case E1000_VET: retval = sc->esc_VET; break; case E1000_FCTTV: retval = sc->esc_FCTTV; break; case E1000_LEDCTL: retval = sc->esc_LEDCTL; break; case E1000_PBA: retval = sc->esc_PBA; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: retval = e82545_intr_read(sc, offset); break; case E1000_RCTL: retval = sc->esc_RCTL; break; case E1000_FCRTL: retval = sc->esc_FCRTL; break; case E1000_FCRTH: retval = sc->esc_FCRTH; break; case E1000_RDBAL(0): retval = sc->esc_RDBAL; break; case E1000_RDBAH(0): retval = sc->esc_RDBAH; break; case E1000_RDLEN(0): retval = sc->esc_RDLEN; break; case E1000_RDH(0): retval = sc->esc_RDH; break; case E1000_RDT(0): retval = sc->esc_RDT; break; case E1000_RDTR: retval = sc->esc_RDTR; break; case E1000_RXDCTL(0): retval = sc->esc_RXDCTL; break; case E1000_RADV: retval = sc->esc_RADV; break; case E1000_RSRPD: retval = sc->esc_RSRPD; break; case E1000_RXCSUM: retval = sc->esc_RXCSUM; break; case E1000_TXCW: retval = sc->esc_TXCW; break; case E1000_TCTL: retval = sc->esc_TCTL; break; case E1000_TIPG: retval = sc->esc_TIPG; break; case E1000_AIT: retval = sc->esc_AIT; break; case E1000_TDBAL(0): retval = sc->esc_TDBAL; break; case E1000_TDBAH(0): retval = sc->esc_TDBAH; break; case E1000_TDLEN(0): retval = sc->esc_TDLEN; break; case E1000_TDH(0): retval = sc->esc_TDH; break; case E1000_TDT(0): retval = sc->esc_TDT; break; case E1000_TIDV: retval = sc->esc_TIDV; break; case E1000_TXDCTL(0): retval = sc->esc_TXDCTL; break; case E1000_TADV: retval = sc->esc_TADV; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; retval = e82545_read_ra(sc, ridx); break; case E1000_MTA ... (E1000_MTA + (127*4)): retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2]; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2]; break; case E1000_EECD: //DPRINTF("EECD read %x", sc->eeprom_control); retval = sc->eeprom_control; break; case E1000_MDIC: retval = sc->mdi_control; break; case E1000_MANC: retval = 0; break; /* stats that we emulate. 
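 *
 * These registers read back free-running counters kept in the softc
 * by the rx/tx paths: pkt_rx_by_size[]/pkt_tx_by_size[] are bucketed
 * by frame length, and the 64-bit octet counts are returned through
 * the low/high register pairs (GORCL/GORCH, GOTCL/GOTCH).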
*/ case E1000_MPC: retval = sc->missed_pkt_count; break; case E1000_PRC64: retval = sc->pkt_rx_by_size[0]; break; case E1000_PRC127: retval = sc->pkt_rx_by_size[1]; break; case E1000_PRC255: retval = sc->pkt_rx_by_size[2]; break; case E1000_PRC511: retval = sc->pkt_rx_by_size[3]; break; case E1000_PRC1023: retval = sc->pkt_rx_by_size[4]; break; case E1000_PRC1522: retval = sc->pkt_rx_by_size[5]; break; case E1000_GPRC: retval = sc->good_pkt_rx_count; break; case E1000_BPRC: retval = sc->bcast_pkt_rx_count; break; case E1000_MPRC: retval = sc->mcast_pkt_rx_count; break; case E1000_GPTC: case E1000_TPT: retval = sc->good_pkt_tx_count; break; case E1000_GORCL: retval = (uint32_t)sc->good_octets_rx; break; case E1000_GORCH: retval = (uint32_t)(sc->good_octets_rx >> 32); break; case E1000_TOTL: case E1000_GOTCL: retval = (uint32_t)sc->good_octets_tx; break; case E1000_TOTH: case E1000_GOTCH: retval = (uint32_t)(sc->good_octets_tx >> 32); break; case E1000_ROC: retval = sc->oversize_rx_count; break; case E1000_TORL: retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets); break; case E1000_TORH: retval = (uint32_t)((sc->good_octets_rx + sc->missed_octets) >> 32); break; case E1000_TPR: retval = sc->good_pkt_rx_count + sc->missed_pkt_count + sc->oversize_rx_count; break; case E1000_PTC64: retval = sc->pkt_tx_by_size[0]; break; case E1000_PTC127: retval = sc->pkt_tx_by_size[1]; break; case E1000_PTC255: retval = sc->pkt_tx_by_size[2]; break; case E1000_PTC511: retval = sc->pkt_tx_by_size[3]; break; case E1000_PTC1023: retval = sc->pkt_tx_by_size[4]; break; case E1000_PTC1522: retval = sc->pkt_tx_by_size[5]; break; case E1000_MPTC: retval = sc->mcast_pkt_tx_count; break; case E1000_BPTC: retval = sc->bcast_pkt_tx_count; break; case E1000_TSCTC: retval = sc->tso_tx_count; break; /* stats that are always 0. 
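 *
 * These are error, collision and flow-control counters; the virtual
 * link never corrupts, collides or exchanges PAUSE frames, so zero
 * is the accurate value to report.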
*/ case E1000_CRCERRS: case E1000_ALGNERRC: case E1000_SYMERRS: case E1000_RXERRC: case E1000_SCC: case E1000_ECOL: case E1000_MCC: case E1000_LATECOL: case E1000_COLC: case E1000_DC: case E1000_TNCRS: case E1000_SEC: case E1000_CEXTERR: case E1000_RLEC: case E1000_XONRXC: case E1000_XONTXC: case E1000_XOFFRXC: case E1000_XOFFTXC: case E1000_FCRUC: case E1000_RNBC: case E1000_RUC: case E1000_RFC: case E1000_RJC: case E1000_MGTPRC: case E1000_MGTPDC: case E1000_MGTPTC: case E1000_TSCTFC: retval = 0; break; default: DPRINTF("Unknown read register: 0x%x", offset); retval = 0; break; } return (retval); } static void e82545_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct e82545_softc *sc; //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d", baridx, offset, value, size); sc = pi->pi_arg; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr write sz:%d value:0x%lx", size, value); } else sc->io_addr = (uint32_t)value; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data write size:%d value:0x%lx", size, value); } else if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io write addr:0x%x value:0x%lx", sc->io_addr, value); } else e82545_write_register(sc, sc->io_addr, (uint32_t)value); break; default: DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d", offset, value, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx", size, offset, value); } else e82545_write_register(sc, (uint32_t)offset, (uint32_t)value); break; default: DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d", baridx, offset, value, size); } pthread_mutex_unlock(&sc->esc_mtx); } static uint64_t e82545_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct e82545_softc *sc; uint64_t retval; //DPRINTF("Read bar:%d offset:0x%lx size:%d", baridx, offset, size); sc = pi->pi_arg; retval = 0; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr read sz:%d", size); } else retval = sc->io_addr; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data read sz:%d", size); } if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io read addr:0x%x", sc->io_addr); } else retval = e82545_read_register(sc, sc->io_addr); break; default: DPRINTF("Unknown io bar read offset:0x%lx size:%d", offset, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register read size:%d offset:0x%lx", size, offset); } else retval = e82545_read_register(sc, (uint32_t)offset); break; default: DPRINTF("Unknown read bar:%d offset:0x%lx size:%d", baridx, offset, size); break; } pthread_mutex_unlock(&sc->esc_mtx); return (retval); } static void e82545_reset(struct e82545_softc *sc, int drvr) { int i; e82545_rx_disable(sc); e82545_tx_disable(sc); /* clear outstanding interrupts */ if (sc->esc_irq_asserted) pci_lintr_deassert(sc->esc_pi); /* misc */ if (!drvr) { sc->esc_FCAL = 0; sc->esc_FCAH = 0; sc->esc_FCT = 0; sc->esc_VET = 0; sc->esc_FCTTV = 0; } sc->esc_LEDCTL = 0x07061302; sc->esc_PBA = 0x00100030; /* start nvm in opcode mode. 
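 *
 * The guest accesses the EEPROM by bit-banging EECD: while CS is
 * asserted, each SK strobe shifts in one opcode/address bit (see
 * e82545_eecd_strobe()); once E82545_NVM_OPADDR_BITS bits have been
 * clocked in, the state machine leaves this mode for the data phase
 * of the read or write.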
*/ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN; e82545_init_eeprom(sc); /* interrupt */ sc->esc_ICR = 0; sc->esc_ITR = 250; sc->esc_ICS = 0; sc->esc_IMS = 0; sc->esc_IMC = 0; /* L2 filters */ if (!drvr) { memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan)); memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast)); memset(sc->esc_uni, 0, sizeof(sc->esc_uni)); /* XXX not necessary on 82545 ?? */ sc->esc_uni[0].eu_valid = 1; memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet, ETHER_ADDR_LEN); } else { /* Clear RAH valid bits */ for (i = 0; i < 16; i++) sc->esc_uni[i].eu_valid = 0; } /* receive */ if (!drvr) { sc->esc_RDBAL = 0; sc->esc_RDBAH = 0; } sc->esc_RCTL = 0; sc->esc_FCRTL = 0; sc->esc_FCRTH = 0; sc->esc_RDLEN = 0; sc->esc_RDH = 0; sc->esc_RDT = 0; sc->esc_RDTR = 0; sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */ sc->esc_RADV = 0; sc->esc_RXCSUM = 0; /* transmit */ if (!drvr) { sc->esc_TDBAL = 0; sc->esc_TDBAH = 0; sc->esc_TIPG = 0; sc->esc_AIT = 0; sc->esc_TIDV = 0; sc->esc_TADV = 0; } sc->esc_tdba = 0; sc->esc_txdesc = NULL; sc->esc_TXCW = 0; sc->esc_TCTL = 0; sc->esc_TDLEN = 0; sc->esc_TDT = 0; sc->esc_TDHr = sc->esc_TDH = 0; sc->esc_TXDCTL = 0; } static int -e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +e82545_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { char nstr[80]; struct e82545_softc *sc; - char *optscopy; - char *vtopts; - int mac_provided; - - DPRINTF("Loading with options: %s", opts); + const char *mac; + int err; /* Setup our softc */ sc = calloc(1, sizeof(*sc)); pi->pi_arg = sc; sc->esc_pi = pi; sc->esc_ctx = ctx; pthread_mutex_init(&sc->esc_mtx, NULL); pthread_cond_init(&sc->esc_rx_cond, NULL); pthread_cond_init(&sc->esc_tx_cond, NULL); pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc); snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->esc_tx_tid, nstr); pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER); pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1); /* TODO: this card also supports msi, but the freebsd driver for it * does not, so I have not implemented it. */ pci_lintr_request(pi); pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32, E82545_BAR_REGISTER_LEN); pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32, E82545_BAR_FLASH_LEN); pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO, E82545_BAR_IO_LEN); - /* - * Attempt to open the net backend and read the MAC address - * if specified. Copied from virtio-net, slightly modified. - */ - mac_provided = 0; - sc->esc_be = NULL; - if (opts != NULL) { - int err = 0; - - optscopy = vtopts = strdup(opts); - (void) strsep(&vtopts, ","); - - /* - * Parse the list of options in the form - * key1=value1,...,keyN=valueN. 
- */ - while (vtopts != NULL) { - char *value = vtopts; - char *key; - - key = strsep(&value, "="); - if (value == NULL) - break; - vtopts = value; - (void) strsep(&vtopts, ","); - - if (strcmp(key, "mac") == 0) { - err = net_parsemac(value, sc->esc_mac.octet); - if (err) - break; - mac_provided = 1; - } - } - - free(optscopy); - + mac = get_config_value_node(nvl, "mac"); + if (mac != NULL) { + err = net_parsemac(mac, sc->esc_mac.octet); if (err) { free(sc); return (err); } - - err = netbe_init(&sc->esc_be, opts, e82545_rx_callback, sc); - if (err) { - free(sc); - return (err); - } - } - - if (!mac_provided) { + } else net_genmac(pi, sc->esc_mac.octet); + + err = netbe_init(&sc->esc_be, nvl, e82545_rx_callback, sc); + if (err) { + free(sc); + return (err); } netbe_rx_enable(sc->esc_be); /* H/w initiated reset */ e82545_reset(sc, 0); return (0); } #ifdef BHYVE_SNAPSHOT static int e82545_snapshot(struct vm_snapshot_meta *meta) { int i; int ret; struct e82545_softc *sc; struct pci_devinst *pi; uint64_t bitmap_value; pi = meta->dev_data; sc = pi->pi_arg; /* esc_mevp and esc_mevpitr should be reinitialized at init. */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_mac, meta, ret, done); /* General */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_CTRL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_VET, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCTTV, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_LEDCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_PBA, meta, ret, done); /* Interrupt control */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_irq_asserted, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICR, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_ITR, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICS, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMS, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMC, meta, ret, done); /* * Transmit * * The fields in the unions are in superposition to access certain * bytes in the larger uint variables. * e.g., ip_config = [ipcss|ipcso|ipcse0|ipcse1] */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.lower_setup.ip_config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.upper_setup.tcp_config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.cmd_and_length, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.tcp_seg_setup.data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_active, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXCW, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIPG, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_AIT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_tdba, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDLEN, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDHr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIDV, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXDCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TADV, meta, ret, done); /* Has dependency on esc_TDLEN; reorder of fields from struct.
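 * The guest-physical ring address below is translated to a host
 * pointer using esc_TDLEN as the mapping length, so the length must
 * already have been restored; that is why the order here differs
 * from the struct declaration.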
*/ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_txdesc, sc->esc_TDLEN, true, meta, ret, done); /* L2 frame acceptance */ for (i = 0; i < nitems(sc->esc_uni); i++) { SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_valid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_addrsel, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_eth, meta, ret, done); } SNAPSHOT_BUF_OR_LEAVE(sc->esc_fmcast, sizeof(sc->esc_fmcast), meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->esc_fvlan, sizeof(sc->esc_fvlan), meta, ret, done); /* Receive */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_active, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_loopback, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_rdba, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDLEN, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDTR, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXDCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RADV, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RSRPD, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXCSUM, meta, ret, done); /* Has dependency on esc_RDLEN; reorder of fields from struct. */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_rxdesc, sc->esc_RDLEN, true, meta, ret, done); /* IO Port register access */ SNAPSHOT_VAR_OR_LEAVE(sc->io_addr, meta, ret, done); /* Shadow copy of MDIC */ SNAPSHOT_VAR_OR_LEAVE(sc->mdi_control, meta, ret, done); /* Shadow copy of EECD */ SNAPSHOT_VAR_OR_LEAVE(sc->eeprom_control, meta, ret, done); /* Latest NVM in/out */ SNAPSHOT_VAR_OR_LEAVE(sc->nvm_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->nvm_opaddr, meta, ret, done); /* Stats */ SNAPSHOT_VAR_OR_LEAVE(sc->missed_pkt_count, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->pkt_rx_by_size, sizeof(sc->pkt_rx_by_size), meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->pkt_tx_by_size, sizeof(sc->pkt_tx_by_size), meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->oversize_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->tso_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_rx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_tx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->missed_octets, meta, ret, done); if (meta->op == VM_SNAPSHOT_SAVE) bitmap_value = sc->nvm_bits; SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) sc->nvm_bits = bitmap_value; if (meta->op == VM_SNAPSHOT_SAVE) bitmap_value = sc->nvm_bits; SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) sc->nvm_bits = bitmap_value; /* EEPROM data */ SNAPSHOT_BUF_OR_LEAVE(sc->eeprom_data, sizeof(sc->eeprom_data), meta, ret, done); done: return (ret); } #endif struct pci_devemu pci_de_e82545 = { .pe_emu = "e1000", .pe_init = e82545_init, + .pe_legacy_config =
netbe_legacy_config, .pe_barwrite = e82545_write, .pe_barread = e82545_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = e82545_snapshot, #endif }; PCI_EMUL_SET(pci_de_e82545); diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 803ab0be38bb..e498b14ef700 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -1,2345 +1,2420 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi.h" #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "inout.h" #include "ioapic.h" #include "mem.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) struct funcinfo { - char *fi_name; - char *fi_param; + nvlist_t *fi_config; + struct pci_devemu *fi_pde; struct pci_devinst *fi_devi; }; struct intxinfo { int ii_count; int ii_pirq_pin; int ii_ioapic_irq; }; struct slotinfo { struct intxinfo si_intpins[4]; struct funcinfo si_funcs[MAXFUNCS]; }; struct businfo { uint16_t iobase, iolimit; /* I/O window */ uint32_t membase32, memlimit32; /* mmio window below 4GB */ uint64_t membase64, memlimit64; /* mmio window above 4GB */ struct slotinfo slotinfo[MAXSLOTS]; }; static struct businfo *pci_businfo[MAXBUSES]; SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; static uint64_t pci_emul_memlim64; #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); #define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE -static struct pci_devemu *pci_emul_finddev(char *name); +static struct pci_devemu *pci_emul_finddev(const char *name); static void pci_lintr_route(struct pci_devinst *pi); static void 
pci_lintr_update(struct pci_devinst *pi); static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); static __inline void CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) { if (bytes == 1) pci_set_cfgdata8(pi, coff, val); else if (bytes == 2) pci_set_cfgdata16(pi, coff, val); else pci_set_cfgdata32(pi, coff, val); } static __inline uint32_t CFGREAD(struct pci_devinst *pi, int coff, int bytes) { if (bytes == 1) return (pci_get_cfgdata8(pi, coff)); else if (bytes == 2) return (pci_get_cfgdata16(pi, coff)); else return (pci_get_cfgdata32(pi, coff)); } /* * I/O access */ /* * Slot options are in the form: * * <bus>:<slot>:<func>,<emul>[,<config>] * <slot>[:<func>],<emul>[,<config>] * * slot is 0..31 * func is 0..7 * emul is a string describing the type of PCI device e.g. virtio-net * config is an optional string, depending on the device, that can be * used for configuration. * Examples are: * 1,virtio-net,tap0 * 3:0,dummy */ static void pci_parse_slot_usage(char *aopt) { EPRINTLN("Invalid PCI slot info field \"%s\"", aopt); } +/* + * Helper function to parse a list of comma-separated options where + * each option is formatted as "name[=value]". If no value is + * provided, the option is treated as a boolean and is given a value + * of true. + */ +int +pci_parse_legacy_config(nvlist_t *nvl, const char *opt) +{ + char *config, *name, *tofree, *value; + + if (opt == NULL) + return (0); + + config = tofree = strdup(opt); + while ((name = strsep(&config, ",")) != NULL) { + value = strchr(name, '='); + if (value != NULL) { + *value = '\0'; + value++; + set_config_value_node(nvl, name, value); + } else + set_config_bool_node(nvl, name, true); + } + free(tofree); + return (0); +} + +/* + * PCI device configuration is stored in MIBs that encode the device's + * location: + * + * pci.<bus>.<slot>.<func> + * + * Where "bus", "slot", and "func" are all decimal values without + * leading zeroes. Each valid device must have a "device" node which + * identifies the driver model of the device. + * + * Device backends can provide a parser for the "config" string. If + * a custom parser is not provided, pci_parse_legacy_config() is used + * to parse the string.
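+ *
+ * For example, a legacy option such as "-s 2:0,e1000,tap0" ends up
+ * (via the device's legacy parser, netbe_legacy_config() in this
+ * case) as nodes along the lines of:
+ *
+ *	pci.0.2.0.device=e1000
+ *	pci.0.2.0.backend=tap0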
+ */ int pci_parse_slot(char *opt) { - struct businfo *bi; - struct slotinfo *si; + char node_name[sizeof("pci.XXX.XX.X")]; + struct pci_devemu *pde; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; + nvlist_t *nvl; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* <bus>:<slot>:<func> */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* <slot>:<func> */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* <slot> */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } - if (pci_businfo[bnum] == NULL) - pci_businfo[bnum] = calloc(1, sizeof(struct businfo)); - - bi = pci_businfo[bnum]; - si = &bi->slotinfo[snum]; - - if (si->si_funcs[fnum].fi_name != NULL) { - EPRINTLN("pci slot %d:%d already occupied!", - snum, fnum); + pde = pci_emul_finddev(emul); + if (pde == NULL) { + EPRINTLN("pci slot %d:%d:%d: unknown device \"%s\"", bnum, snum, + fnum, emul); goto done; } - if (pci_emul_finddev(emul) == NULL) { - EPRINTLN("pci slot %d:%d: unknown device \"%s\"", - snum, fnum, emul); + snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bnum, snum, + fnum); + nvl = find_config_node(node_name); + if (nvl != NULL) { + EPRINTLN("pci slot %d:%d:%d already occupied!", bnum, snum, + fnum); goto done; } + nvl = create_config_node(node_name); + if (pde->pe_alias != NULL) + set_config_value_node(nvl, "device", pde->pe_alias); + else + set_config_value_node(nvl, "device", pde->pe_emu); - error = 0; - si->si_funcs[fnum].fi_name = emul; - si->si_funcs[fnum].fi_param = config; - + if (pde->pe_legacy_config != NULL) + error = pde->pe_legacy_config(nvl, config); + else + error = pci_parse_legacy_config(nvl, config); done: - if (error) - free(str); - + free(str); return (error); } void pci_print_supported_devices() { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; printf("%s\n", pdp->pe_emu); } } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accommodate reads from * ddb.
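 * Each table entry is MSIX_TABLE_ENTRY_SIZE (16) bytes: the 64-bit
 * message address, the 32-bit message data and the 32-bit vector
 * control word. Reads must still be naturally aligned within an
 * entry, as checked below.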
*/ if (size != 1 && size != 4 && size != 8) return (retval); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned reads */ if ((msix_entry_offset % size) != 0) { return (retval); } tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index < pi->pi_msix.table_count) { /* valid MSI-X Table access */ dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 1) retval = *((uint8_t *)dest); else if (size == 4) retval = *((uint32_t *)dest); else retval = *((uint64_t *)dest); } else if (pci_valid_pba_offset(pi, offset)) { /* return 0 for PBA access */ retval = 0; } return (retval); } int pci_msix_table_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.table_bar); else return (-1); } int pci_msix_pba_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.pba_bar); else return (-1); } static int pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int i; for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && port >= pdi->pi_bar[i].addr && port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, offset, bytes); else (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, bytes, *eax); return (0); } } return (-1); } static int pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int) arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || pdi->pi_bar[bidx].type == PCIBAR_MEM64); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_WRITE) { if (size == 8) { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, 4, *val & 0xffffffff); (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4, 4, *val >> 32); } else { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val); } } else { if (size == 8) { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, 4); *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset + 4, 4) << 32; } else { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size); } } return (0); } static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { uint64_t base; assert((size & (size - 1)) == 0); /* must be a power of 2 */ base = roundup2(*baseptr, size); if (base + size <= limit) { *addr = base; *baseptr = base + size; return (0); } else return (-1); } /* * Register (or unregister) the MMIO or I/O region associated with the BAR * register 'idx' of an emulated pci device. 
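 *
 * Registering installs the range in the global inout or mem-range
 * tables, so guest accesses to it are steered to the device's
 * pe_barread/pe_barwrite callbacks via pci_emul_io_handler() or
 * pci_emul_mem_handler() above.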
*/ static void modify_bar_registration(struct pci_devinst *pi, int idx, int registration) { int error; struct inout_port iop; struct mem_range mr; switch (pi->pi_bar[idx].type) { case PCIBAR_IO: bzero(&iop, sizeof(struct inout_port)); iop.name = pi->pi_name; iop.port = pi->pi_bar[idx].addr; iop.size = pi->pi_bar[idx].size; if (registration) { iop.flags = IOPORT_F_INOUT; iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); } else error = unregister_inout(&iop); break; case PCIBAR_MEM32: case PCIBAR_MEM64: bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_mem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); break; default: error = EINVAL; break; } assert(error == 0); } static void unregister_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 0); } static void register_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 1); } /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_PORTEN); } /* Are we decoding memory accesses for the emulated pci device? */ static int memen(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_MEMEN); } /* * Update the MMIO or I/O address that is decoded by the BAR register. * * If the pci device has enabled the address space decoding then intercept * the address range decoded by the BAR register. */ static void update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; if (pi->pi_bar[idx].type == PCIBAR_IO) decode = porten(pi); else decode = memen(pi); if (decode) unregister_bar(pi, idx); switch (type) { case PCIBAR_IO: case PCIBAR_MEM32: pi->pi_bar[idx].addr = addr; break; case PCIBAR_MEM64: pi->pi_bar[idx].addr &= ~0xffffffffUL; pi->pi_bar[idx].addr |= addr; break; case PCIBAR_MEMHI64: pi->pi_bar[idx].addr &= 0xffffffff; pi->pi_bar[idx].addr |= addr; break; default: assert(0); } if (decode) register_bar(pi, idx); } int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { int error; uint64_t *baseptr, limit, addr, mask, lobits, bar; uint16_t cmd, enbit; assert(idx >= 0 && idx <= PCI_BARMAX); if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ /* Enforce minimum BAR sizes required by the PCI standard */ if (type == PCIBAR_IO) { if (size < 4) size = 4; } else { if (size < 16) size = 16; } switch (type) { case PCIBAR_NONE: baseptr = NULL; addr = mask = lobits = enbit = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; enbit = PCIM_CMD_PORTEN; break; case PCIBAR_MEM64: /* * XXX * Some drivers do not work well if the 64-bit BAR is allocated * above 4GB. Allow for this by allocating small requests under * 4GB unless the allocation size is larger than some arbitrary * number (128MB currently).
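 * The practical effect is that only large, typically prefetchable
 * BARs (e.g. a framebuffer) are placed in the 64-bit window above
 * 4GB, while small BARs stay below 4GB for the benefit of such
 * drivers.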
*/ if (size > 128 * 1024 * 1024) { baseptr = &pci_emul_membase64; limit = pci_emul_memlim64; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } enbit = PCIM_CMD_MEMEN; break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; enbit = PCIM_CMD_MEMEN; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); } if (baseptr != NULL) { error = pci_emul_alloc_resource(baseptr, limit, size, &addr); if (error != 0) return (error); } pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); if (type == PCIBAR_MEM64) { assert(idx + 1 <= PCI_BARMAX); pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND); if ((cmd & enbit) != enbit) pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit); register_bar(pdi, idx); return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * -pci_emul_finddev(char *name) +pci_emul_finddev(const char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pdi->pi_lintr.pirq_pin = 0; pdi->pi_lintr.ioapic_irq = 0; pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); - err = (*pde->pe_init)(ctx, pdi, fi->fi_param); + err = (*pde->pe_init)(ctx, pdi, fi->fi_config); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && 
msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. * Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } static void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off; off = offset - capoff; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } static void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. 
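 * Only the MSI enable bit and the multiple-message-enable field are
 * guest-writable; all other control bits retain the values set when
 * the capability was populated.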
*/ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; } CFGWRITE(pi, offset, val, bytes); msgctrl = pci_get_cfgdata16(pi, capoff + 2); addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } void pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; bzero(&pciecap, sizeof(pciecap)); /* * Use the integrated endpoint type for endpoints on a root complex bus. * * NB: bhyve currently only supports a single PCI bus that is the root * complex bus, so all endpoints are integrated. */ if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0)) type = PCIEM_TYPE_ROOT_INT_EP; pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | type; if (type != PCIEM_TYPE_ROOT_INT_EP) { pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ } err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the * config space. A capoff parameter of zero will force a search for the * offset and type. */ void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val, uint8_t capoff, int capid) { uint8_t nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; if (capoff == 0) { /* Find the capability that we want to update */ capoff = CAP_START_OFFSET; while (1) { nextoff = pci_get_cfgdata8(pi, capoff + 1); if (nextoff == 0) break; if (offset >= capoff && offset < nextoff) break; capoff = nextoff; } assert(offset >= capoff); capid = pci_get_cfgdata8(pi, capoff); } /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. */ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. 
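 * This matches real hardware behavior, where a read claimed by no
 * device terminates in a master abort and the host bridge returns
 * all ones.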
*/ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } static int pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int bytes, uint64_t *val, void *arg1, long arg2) { int bus, slot, func, coff, in; coff = addr & 0xfff; func = (addr >> 12) & 0x7; slot = (addr >> 15) & 0x1f; bus = (addr >> 20) & 0xff; in = (dir == MEM_F_READ); if (in) *val = ~0UL; pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); return (0); } uint64_t pci_ecfg_base(void) { return (PCI_EMUL_ECFG_BASE); } #define BUSIO_ROUNDUP 32 #define BUSMEM_ROUNDUP (1024 * 1024) int init_pci(struct vmctx *ctx) { + char node_name[sizeof("pci.XXX.XX.X")]; struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; + nvlist_t *nvl; + const char *emul; size_t lowmem; uint64_t cpu_maxphysaddr, pci_emul_memresv64; u_int regs[4]; int bus, slot, func, error; pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = vm_get_lowmem_limit(ctx); do_cpuid(0x80000008, regs); cpu_maxphysaddr = 1ULL << (regs[0] & 0xff); if (cpu_maxphysaddr > VM_MAXUSER_ADDRESS_LA48) cpu_maxphysaddr = VM_MAXUSER_ADDRESS_LA48; pci_emul_memresv64 = cpu_maxphysaddr / 4; /* * Max power of 2 that is less than * cpu_maxphysaddr - pci_emul_memresv64. */ pci_emul_membase64 = 1ULL << (flsl(cpu_maxphysaddr - pci_emul_memresv64) - 1); pci_emul_memlim64 = cpu_maxphysaddr; for (bus = 0; bus < MAXBUSES; bus++) { - if ((bi = pci_businfo[bus]) == NULL) + snprintf(node_name, sizeof(node_name), "pci.%d", bus); + nvl = find_config_node(node_name); + if (nvl == NULL) continue; + pci_businfo[bus] = calloc(1, sizeof(struct businfo)); + bi = pci_businfo[bus]; + /* * Keep track of the i/o and memory resources allocated to * this bus. */ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; - if (fi->fi_name == NULL) + snprintf(node_name, sizeof(node_name), + "pci.%d.%d.%d", bus, slot, func); + nvl = find_config_node(node_name); + if (nvl == NULL) continue; - pde = pci_emul_finddev(fi->fi_name); - assert(pde != NULL); + + fi->fi_config = nvl; + emul = get_config_value_node(nvl, "device"); + if (emul == NULL) { + EPRINTLN("pci slot %d:%d:%d: missing " + "\"device\" value", bus, slot, func); + return (EINVAL); + } + pde = pci_emul_finddev(emul); + if (pde == NULL) { + EPRINTLN("pci slot %d:%d:%d: unknown " + "device \"%s\"", bus, slot, func, + emul); + return (EINVAL); + } + if (pde->pe_alias != NULL) { + EPRINTLN("pci slot %d:%d:%d: legacy " + "device \"%s\", use \"%s\" instead", + bus, slot, func, emul, + pde->pe_alias); + return (EINVAL); + } + fi->fi_pde = pde; error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs.
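 * Rounding the windows up by BUSIO_ROUNDUP/BUSMEM_ROUNDUP leaves a
 * gap between the resources of consecutive buses, so a guest can
 * move a BAR within its own bus window without colliding with the
 * next bus.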
*/ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. */ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } lpc_pirq_routed(); /* * The guest physical memory map looks like the following: * [0, lowmem) guest system memory * [lowmem, lowmem_limit) memory hole (may be absent) * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) * [0xE0000000, 0xF0000000) PCI extended config window * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) */ /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. */ lowmem = vm_get_lowmem_size(ctx); bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI hole"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = lowmem; mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; mr.handler = pci_emul_fallback_handler; error = register_mem_fallback(&mr); assert(error == 0); /* PCI extended config space */ bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI ECFG"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = PCI_EMUL_ECFG_BASE; mr.size = PCI_EMUL_ECFG_SIZE; mr.handler = pci_emul_ecfg_handler; error = register_mem(&mr); assert(error == 0); return (0); } static void pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { char *name; name = lpc_pirq_name(pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. */ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int count, func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. 
*/ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit - bi->iobase); dsdt_line(" ,, , TypeStatic)"); /* mmio window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); count = pci_count_lintr(bus); if (count != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && 
pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); done: dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. */ si = &bi->slotinfo[pi->pi_slot]; bestpin = 0; bestcount = si->si_intpins[0].ii_count; for (pin = 1; pin < 4; pin++) { if (si->si_intpins[pin].ii_count < bestcount) { bestpin = pin; bestcount = si->si_intpins[pin].ii_count; } } si->si_intpins[bestpin].ii_count++; pi->pi_lintr.pin = bestpin + 1; pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); } static void pci_lintr_route(struct pci_devinst *pi) { struct businfo *bi; struct intxinfo *ii; if (pi->pi_lintr.pin == 0) return; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; /* * Attempt to allocate an I/O APIC pin for this intpin if one * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi); assert(ii->ii_ioapic_irq > 0); /* * Attempt to allocate a PIRQ pin for this intpin if one is * not yet assigned. 
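 * Both the I/O APIC IRQ and the PIRQ pin are cached in the slot's
 * intxinfo, so every function that shares this intpin receives the
 * same routing.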
 */
	if (ii->ii_pirq_pin == 0)
		ii->ii_pirq_pin = pirq_alloc_pin(pi);
	assert(ii->ii_pirq_pin > 0);

	pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq;
	pi->pi_lintr.pirq_pin = ii->ii_pirq_pin;
	pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin));
}

void
pci_lintr_assert(struct pci_devinst *pi)
{

	assert(pi->pi_lintr.pin > 0);

	pthread_mutex_lock(&pi->pi_lintr.lock);
	if (pi->pi_lintr.state == IDLE) {
		if (pci_lintr_permitted(pi)) {
			pi->pi_lintr.state = ASSERTED;
			pci_irq_assert(pi);
		} else
			pi->pi_lintr.state = PENDING;
	}
	pthread_mutex_unlock(&pi->pi_lintr.lock);
}

void
pci_lintr_deassert(struct pci_devinst *pi)
{

	assert(pi->pi_lintr.pin > 0);

	pthread_mutex_lock(&pi->pi_lintr.lock);
	if (pi->pi_lintr.state == ASSERTED) {
		pi->pi_lintr.state = IDLE;
		pci_irq_deassert(pi);
	} else if (pi->pi_lintr.state == PENDING)
		pi->pi_lintr.state = IDLE;
	pthread_mutex_unlock(&pi->pi_lintr.lock);
}

static void
pci_lintr_update(struct pci_devinst *pi)
{

	pthread_mutex_lock(&pi->pi_lintr.lock);
	if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) {
		pci_irq_deassert(pi);
		pi->pi_lintr.state = PENDING;
	} else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) {
		pi->pi_lintr.state = ASSERTED;
		pci_irq_assert(pi);
	}
	pthread_mutex_unlock(&pi->pi_lintr.lock);
}

int
pci_count_lintr(int bus)
{
	int count, slot, pin;
	struct slotinfo *slotinfo;

	count = 0;
	if (pci_businfo[bus] != NULL) {
		for (slot = 0; slot < MAXSLOTS; slot++) {
			slotinfo = &pci_businfo[bus]->slotinfo[slot];
			for (pin = 0; pin < 4; pin++) {
				if (slotinfo->si_intpins[pin].ii_count != 0)
					count++;
			}
		}
	}
	return (count);
}

void
pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg)
{
	struct businfo *bi;
	struct slotinfo *si;
	struct intxinfo *ii;
	int slot, pin;

	if ((bi = pci_businfo[bus]) == NULL)
		return;

	for (slot = 0; slot < MAXSLOTS; slot++) {
		si = &bi->slotinfo[slot];
		for (pin = 0; pin < 4; pin++) {
			ii = &si->si_intpins[pin];
			if (ii->ii_count != 0)
				cb(bus, slot, pin + 1, ii->ii_pirq_pin,
				    ii->ii_ioapic_irq, arg);
		}
	}
}

/*
 * Return 1 if the emulated device in 'slot' is a multi-function device.
 * Return 0 otherwise.
 */
static int
pci_emul_is_mfdev(int bus, int slot)
{
	struct businfo *bi;
	struct slotinfo *si;
	int f, numfuncs;

	numfuncs = 0;
	if ((bi = pci_businfo[bus]) != NULL) {
		si = &bi->slotinfo[slot];
		for (f = 0; f < MAXFUNCS; f++) {
			if (si->si_funcs[f].fi_devi != NULL) {
				numfuncs++;
			}
		}
	}
	return (numfuncs > 1);
}

/*
 * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on
 * whether or not a multi-function device is being emulated in the pci 'slot'.
 */
static void
pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
{
	int mfdev;

	if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) {
		mfdev = pci_emul_is_mfdev(bus, slot);
		switch (bytes) {
		case 1:
		case 2:
			*rv &= ~PCIM_MFDEV;
			if (mfdev) {
				*rv |= PCIM_MFDEV;
			}
			break;
		case 4:
			*rv &= ~(PCIM_MFDEV << 16);
			if (mfdev) {
				*rv |= (PCIM_MFDEV << 16);
			}
			break;
		}
	}
}

/*
 * Update device state in response to changes to the PCI command
 * register.
 */
void
pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old)
{
	int i;
	uint16_t changed, new;

	new = pci_get_cfgdata16(pi, PCIR_COMMAND);
	changed = old ^ new;

	/*
	 * If the MMIO or I/O address space decoding has changed then
	 * register/unregister all BARs that decode that address space.
	 */
	for (i = 0; i <= PCI_BARMAX; i++) {
		switch (pi->pi_bar[i].type) {
		case PCIBAR_NONE:
		case PCIBAR_MEMHI64:
			break;
		case PCIBAR_IO:
			/* I/O address space decoding changed? */
			if (changed & PCIM_CMD_PORTEN) {
				if (new & PCIM_CMD_PORTEN)
					register_bar(pi, i);
				else
					unregister_bar(pi, i);
			}
			break;
		case PCIBAR_MEM32:
		case PCIBAR_MEM64:
			/* MMIO address space decoding changed? */
			if (changed & PCIM_CMD_MEMEN) {
				if (new & PCIM_CMD_MEMEN)
					register_bar(pi, i);
				else
					unregister_bar(pi, i);
			}
			break;
		default:
			assert(0);
		}
	}

	/*
	 * If INTx has been unmasked and is pending, assert the
	 * interrupt.
	 */
	pci_lintr_update(pi);
}

static void
pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes)
{
	int rshift;
	uint32_t cmd, old, readonly;

	cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);	/* stash old value */

	/*
	 * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3.
	 *
	 * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are
	 * 'write 1 to clear'. However these bits are not set to '1' by
	 * any device emulation so it is simpler to treat them as readonly.
	 */
	rshift = (coff & 0x3) * 8;
	readonly = 0xFFFFF880 >> rshift;

	old = CFGREAD(pi, coff, bytes);
	new &= ~readonly;
	new |= (old & readonly);

	CFGWRITE(pi, coff, new, bytes);			/* update config */

	pci_emul_cmd_changed(pi, cmd);
}

static void
pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
    int coff, int bytes, uint32_t *eax)
{
	struct businfo *bi;
	struct slotinfo *si;
	struct pci_devinst *pi;
	struct pci_devemu *pe;
	int idx, needcfg;
	uint64_t addr, bar, mask;

	if ((bi = pci_businfo[bus]) != NULL) {
		si = &bi->slotinfo[slot];
		pi = si->si_funcs[func].fi_devi;
	} else
		pi = NULL;

	/*
	 * Just return if there is no device at this slot:func or if the
	 * guest is doing an un-aligned access.
	 */
	if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
	    (coff & (bytes - 1)) != 0) {
		if (in)
			*eax = 0xffffffff;
		return;
	}

	/*
	 * Ignore all writes beyond the standard config space and return all
	 * ones on reads.
	 */
	if (coff >= PCI_REGMAX + 1) {
		if (in) {
			*eax = 0xffffffff;
			/*
			 * Extended capabilities begin at offset 256 in config
			 * space. Absence of extended capabilities is signaled
			 * with all 0s in the extended capability header at
			 * offset 256.
			 */
			if (coff <= PCI_REGMAX + 4)
				*eax = 0x00000000;
		}
		return;
	}

	pe = pi->pi_d;

	/*
	 * Config read
	 */
	if (in) {
		/* Let the device emulation override the default handler */
		if (pe->pe_cfgread != NULL) {
			needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes,
			    eax);
		} else {
			needcfg = 1;
		}

		if (needcfg)
			*eax = CFGREAD(pi, coff, bytes);

		pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
	} else {
		/* Let the device emulation override the default handler */
		if (pe->pe_cfgwrite != NULL &&
		    (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
			return;

		/*
		 * Special handling for write to BAR registers
		 */
		if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
			/*
			 * Ignore writes to BAR registers that are not
			 * 4-byte aligned.
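One contract in pci_cfgrw() is easy to miss: pe_cfgread returns nonzero ("needcfg") when the generic CFGREAD() should still run, and pe_cfgwrite returns 0 to claim a write outright. A sketch of a read override for a hypothetical vendor register (offset 0x40 and the value are made up):

/*
 * Sketch of the pe_cfgread contract: fill *retval and return 0 to
 * claim the access, or return nonzero to fall through to CFGREAD().
 */
static int
pci_mydev_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int offset, int bytes, uint32_t *retval)
{
	if (offset == 0x40 && bytes == 4) {
		*retval = 0xcafe0001;	/* hypothetical vendor register */
		return (0);		/* handled; skip the default read */
	}
	return (1);			/* let pci_cfgrw() read pi_cfgdata */
}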
*/ if (bytes != 4 || (coff & 0x3) != 0) return; idx = (coff - PCIR_BAR(0)) / 4; mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *eax & mask; addr &= 0xffff; bar = addr | PCIM_BAR_IO_SPACE; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = ((uint64_t)*eax << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax, 0, 0); } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { pci_emul_cmdsts_write(pi, coff, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } } } static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; static int pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { uint32_t x; if (bytes != 4) { if (in) *eax = (bytes == 2) ? 0xffff : 0xff; return (0); } if (in) { x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; if (cfgenable) x |= CONF1_ENABLE; *eax = x; } else { x = *eax; cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; cfgoff = x & PCI_REGMAX; cfgfunc = (x >> 8) & PCI_FUNCMAX; cfgslot = (x >> 11) & PCI_SLOTMAX; cfgbus = (x >> 16) & PCI_BUSMAX; } return (0); } INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); static int pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { int coff; assert(bytes == 1 || bytes == 2 || bytes == 4); coff = cfgoff + (port - CONF1_DATA_PORT); if (cfgenable) { pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax); } else { /* Ignore accesses to cfgdata if not enabled by cfgaddr */ if (in) *eax = 0xffffffff; } return (0); } INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #ifdef BHYVE_SNAPSHOT /* * Saves/restores PCI device emulated state. Returns 0 on success. 
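For reference, the dword decoded above is the classic PCI configuration mechanism #1 address that guests write to port 0xCF8 before touching 0xCFC. A helper that builds it, mirroring the decode in pci_emul_cfgaddr() (the helper itself is illustrative; guests compute this on their side):

/*
 * Compose a config mechanism #1 address word, the inverse of the
 * decode performed by pci_emul_cfgaddr() above.
 */
static inline uint32_t
conf1_addr(int bus, int slot, int func, int reg)
{
	return (CONF1_ENABLE | (bus << 16) | (slot << 11) | (func << 8) |
	    (reg & PCI_REGMAX));
}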
*/ static int pci_snapshot_pci_dev(struct vm_snapshot_meta *meta) { struct pci_devinst *pi; int i; int ret; pi = meta->dev_data; SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_page_offset, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata), meta, ret, done); for (i = 0; i < nitems(pi->pi_bar); i++) { SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done); } /* Restore MSI-X table. */ for (i = 0; i < pi->pi_msix.table_count; i++) { SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control, meta, ret, done); } done: return (ret); } static int pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde, struct pci_devinst **pdi) { struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; int bus, slot, func; assert(dev_name != NULL); assert(pde != NULL); assert(pdi != NULL); for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; - if (fi->fi_name == NULL) + if (fi->fi_pde == NULL) continue; - if (strcmp(dev_name, fi->fi_name)) + if (strcmp(dev_name, fi->fi_pde->pe_emu) != 0) continue; - *pde = pci_emul_finddev(fi->fi_name); - assert(*pde != NULL); - + *pde = fi->fi_pde; *pdi = fi->fi_devi; return (0); } } } return (EINVAL); } int pci_snapshot(struct vm_snapshot_meta *meta) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(meta->dev_name != NULL); ret = pci_find_slotted_dev(meta->dev_name, &pde, &pdi); if (ret != 0) { fprintf(stderr, "%s: no such name: %s\r\n", __func__, meta->dev_name); memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); return (0); } meta->dev_data = pdi; if (pde->pe_snapshot == NULL) { fprintf(stderr, "%s: not implemented yet for: %s\r\n", __func__, meta->dev_name); return (-1); } ret = pci_snapshot_pci_dev(meta); if (ret != 0) { fprintf(stderr, "%s: failed to snapshot pci dev\r\n", __func__); return (-1); } ret = (*pde->pe_snapshot)(meta); return (ret); } int pci_pause(struct vmctx *ctx, const char *dev_name) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(dev_name != NULL); ret = pci_find_slotted_dev(dev_name, &pde, &pdi); if (ret != 0) { /* * It is possible to call this function without * checking that the device is inserted first. */ fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); return (0); } if (pde->pe_pause == NULL) { /* The pause/resume functionality is optional. 
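Generic PCI state is saved by pci_snapshot_pci_dev() above; device-specific state comes from the pe_snapshot hook that pci_snapshot() invokes afterwards. A minimal hedged sketch (mydev_softc and its csr field are hypothetical):

/*
 * Hedged sketch of a pe_snapshot hook.  pci_snapshot() stores the
 * device instance in meta->dev_data before invoking it.
 */
static int
pci_mydev_snapshot(struct vm_snapshot_meta *meta)
{
	struct pci_devinst *pi = meta->dev_data;
	struct mydev_softc *sc = pi->pi_arg;
	int ret;

	SNAPSHOT_VAR_OR_LEAVE(sc->csr, meta, ret, done);
done:
	return (ret);
}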
*/ fprintf(stderr, "%s: not implemented for: %s\n", __func__, dev_name); return (0); } return (*pde->pe_pause)(ctx, pdi); } int pci_resume(struct vmctx *ctx, const char *dev_name) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(dev_name != NULL); ret = pci_find_slotted_dev(dev_name, &pde, &pdi); if (ret != 0) { /* * It is possible to call this function without * checking that the device is inserted first. */ fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); return (0); } if (pde->pe_resume == NULL) { /* The pause/resume functionality is optional. */ fprintf(stderr, "%s: not implemented for: %s\n", __func__, dev_name); return (0); } return (*pde->pe_resume)(ctx, pdi); } #endif #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { uint8_t ioregs[DIOSZ]; uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int -pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } i = baridx - 1; /* 'memregs' index */ if (size == 1) { sc->memregs[i][offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } /* * magic interrupt ?? 
*/ } if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; int i; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } i = baridx - 1; /* 'memregs' index */ if (size == 1) { value = sc->memregs[i][offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } #ifdef BHYVE_SNAPSHOT int pci_emul_snapshot(struct vm_snapshot_meta *meta) { return (0); } #endif struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, .pe_barread = pci_emul_dior, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_emul_snapshot, #endif }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index 3e9e95a74b47..2dbed07dfc14 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -1,299 +1,304 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
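pci_dummy above is the smallest complete example of the registration pattern: fill in a struct pci_devemu and add it to the linker set with PCI_EMUL_SET(). With this patch the init hook takes the device's nvlist config node rather than an option string; a skeleton for a hypothetical device ("mydev" and the IDs are made up):

/*
 * Skeleton of a new-style emulation.  The nvlist is the device's
 * config node, populated from -s/-o command line options or a -k
 * config file.
 */
static int
pci_mydev_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
{
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1234);	/* made-up IDs */
	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x5678);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_OTHER);
	return (pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, 4096));
}

struct pci_devemu pci_mydev = {
	.pe_emu =	"mydev",
	.pe_init =	pci_mydev_init,
};
PCI_EMUL_SET(pci_mydev);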
* * $FreeBSD$ */ #ifndef _PCI_EMUL_H_ #define _PCI_EMUL_H_ #include #include #include +#include #include #include #include #define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ struct vmctx; struct pci_devinst; struct memory_region; struct vm_snapshot_meta; struct pci_devemu { char *pe_emu; /* Name of device emulation */ /* instance creation */ int (*pe_init)(struct vmctx *, struct pci_devinst *, - char *opts); + nvlist_t *); + int (*pe_legacy_config)(nvlist_t *, const char *); + const char *pe_alias; /* ACPI DSDT enumeration */ void (*pe_write_dsdt)(struct pci_devinst *); /* config space read/write callbacks */ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t val); int (*pe_cfgread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t *retval); /* BAR read/write callbacks */ void (*pe_barwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); /* Save/restore device state */ int (*pe_snapshot)(struct vm_snapshot_meta *meta); int (*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi); int (*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi); + }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); enum pcibar_type { PCIBAR_NONE, PCIBAR_IO, PCIBAR_MEM32, PCIBAR_MEM64, PCIBAR_MEMHI64 }; struct pcibar { enum pcibar_type type; /* io or memory */ uint64_t size; uint64_t addr; }; #define PI_NAMESZ 40 struct msix_table_entry { uint64_t addr; uint32_t msg_data; uint32_t vector_control; } __packed; /* * In case the structure is modified to hold extra information, use a define * for the size that should be emulated. 
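The two new members deserve a note: pe_legacy_config lets a device translate its old -s option string into config variables, and pe_alias registers an alternate emulation name. Devices with ordinary key=value option strings can rely on the generic pci_parse_legacy_config() declared later in this header; a hedged sketch of a custom translator for a device whose legacy string was one bare value (set_config_value_node() is assumed to be the config.c helper that stores a string under the device's nvlist node):

/*
 * Hedged sketch of a pe_legacy_config hook for a legacy option string
 * that was a single bare value (e.g. "-s N,mydev,/dev/foo").
 */
static int
pci_mydev_legacy_config(nvlist_t *nvl, const char *opts)
{
	if (opts != NULL)
		set_config_value_node(nvl, "path", opts);
	return (0);
}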
*/ #define MSIX_TABLE_ENTRY_SIZE 16 #define MAX_MSIX_TABLE_ENTRIES 2048 #define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) enum lintr_stat { IDLE, ASSERTED, PENDING }; struct pci_devinst { struct pci_devemu *pi_d; struct vmctx *pi_vmctx; uint8_t pi_bus, pi_slot, pi_func; char pi_name[PI_NAMESZ]; int pi_bar_getsize; int pi_prevcap; int pi_capend; struct { int8_t pin; enum lintr_stat state; int pirq_pin; int ioapic_irq; pthread_mutex_t lock; } pi_lintr; struct { int enabled; uint64_t addr; uint64_t msg_data; int maxmsgnum; } pi_msi; struct { int enabled; int table_bar; int pba_bar; uint32_t table_offset; int table_count; uint32_t pba_offset; int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ void *pba_page; int pba_page_offset; } pi_msix; void *pi_arg; /* devemu-private data */ u_char pi_cfgdata[PCI_REGMAX + 1]; struct pcibar pi_bar[PCI_BARMAX + 1]; }; struct msicap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t addrlo; uint32_t addrhi; uint16_t msgdata; } __packed; static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed"); struct msixcap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t table_info; /* bar index and offset within it */ uint32_t pba_info; /* bar index and offset within it */ } __packed; static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed"); struct pciecap { uint8_t capid; uint8_t nextptr; uint16_t pcie_capabilities; uint32_t dev_capabilities; /* all devices */ uint16_t dev_control; uint16_t dev_status; uint32_t link_capabilities; /* devices with links */ uint16_t link_control; uint16_t link_status; uint32_t slot_capabilities; /* ports with slots */ uint16_t slot_control; uint16_t slot_status; uint16_t root_control; /* root ports */ uint16_t root_capabilities; uint32_t root_status; uint32_t dev_capabilities2; /* all devices */ uint16_t dev_control2; uint16_t dev_status2; uint32_t link_capabilities2; /* devices with links */ uint16_t link_control2; uint16_t link_status2; uint32_t slot_capabilities2; /* ports with slots */ uint16_t slot_control2; uint16_t slot_status2; } __packed; static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed"); typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, int ioapic_irq, void *arg); int init_pci(struct vmctx *ctx); void pci_callback(void); int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val, uint8_t capoff, int capid); void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old); void pci_generate_msi(struct pci_devinst *pi, int msgnum); void pci_generate_msix(struct pci_devinst *pi, int msgnum); void pci_lintr_assert(struct pci_devinst *pi); void pci_lintr_deassert(struct pci_devinst *pi); void pci_lintr_request(struct pci_devinst *pi); int pci_msi_enabled(struct pci_devinst *pi); int pci_msix_enabled(struct pci_devinst *pi); int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); int pci_msi_maxmsgnum(struct pci_devinst *pi); +int pci_parse_legacy_config(nvlist_t *nvl, const char *opt); int pci_parse_slot(char *opt); void pci_print_supported_devices(); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int 
pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value); uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); int pci_count_lintr(int bus); void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); void pci_write_dsdt(void); uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); #ifdef BHYVE_SNAPSHOT int pci_snapshot(struct vm_snapshot_meta *meta); int pci_pause(struct vmctx *ctx, const char *dev_name); int pci_resume(struct vmctx *ctx, const char *dev_name); #endif static __inline void pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) { assert(offset <= PCI_REGMAX); *(uint8_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); *(uint16_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); *(uint32_t *)(pi->pi_cfgdata + offset) = val; } static __inline uint8_t pci_get_cfgdata8(struct pci_devinst *pi, int offset) { assert(offset <= PCI_REGMAX); return (*(uint8_t *)(pi->pi_cfgdata + offset)); } static __inline uint16_t pci_get_cfgdata16(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); return (*(uint16_t *)(pi->pi_cfgdata + offset)); } static __inline uint32_t pci_get_cfgdata32(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); return (*(uint32_t *)(pi->pi_cfgdata + offset)); } #endif /* _PCI_EMUL_H_ */ diff --git a/usr.sbin/bhyve/pci_fbuf.c b/usr.sbin/bhyve/pci_fbuf.c index 0bd740a0908c..215ce742edfb 100644 --- a/usr.sbin/bhyve/pci_fbuf.c +++ b/usr.sbin/bhyve/pci_fbuf.c @@ -1,466 +1,464 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Nahanni Systems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "bhyvegc.h" #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "console.h" #include "inout.h" #include "pci_emul.h" #include "rfb.h" #include "vga.h" /* * bhyve Framebuffer device emulation. * BAR0 points to the current mode information. * BAR1 is the 32-bit framebuffer address. * * -s ,fbuf,wait,vga=on|io|off,rfb=:port,w=width,h=height */ static int fbuf_debug = 1; #define DEBUG_INFO 1 #define DEBUG_VERBOSE 4 #define DPRINTF(level, params) if (level <= fbuf_debug) PRINTLN params #define KB (1024UL) #define MB (1024 * 1024UL) #define DMEMSZ 128 #define FB_SIZE (16*MB) #define COLS_MAX 1920 #define ROWS_MAX 1200 #define COLS_DEFAULT 1024 #define ROWS_DEFAULT 768 #define COLS_MIN 640 #define ROWS_MIN 480 struct pci_fbuf_softc { struct pci_devinst *fsc_pi; struct { uint32_t fbsize; uint16_t width; uint16_t height; uint16_t depth; uint16_t refreshrate; uint8_t reserved[116]; } __packed memregs; /* rfb server */ char *rfb_host; char *rfb_password; int rfb_port; int rfb_wait; int vga_enabled; int vga_full; uint32_t fbaddr; char *fb_base; uint16_t gc_width; uint16_t gc_height; void *vgasc; struct bhyvegc_image *gc_image; }; static struct pci_fbuf_softc *fbuf_sc; #define PCI_FBUF_MSI_MSGS 4 -static void -pci_fbuf_usage(char *opt) -{ - - EPRINTLN("Invalid fbuf emulation option \"%s\"", opt); - EPRINTLN("fbuf: {wait,}{vga=on|io|off,}rfb=:port" - "{,w=width}{,h=height}"); -} - static void pci_fbuf_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_fbuf_softc *sc; uint8_t *p; assert(baridx == 0); sc = pi->pi_arg; DPRINTF(DEBUG_VERBOSE, ("fbuf wr: offset 0x%lx, size: %d, value: 0x%lx", offset, size, value)); if (offset + size > DMEMSZ) { printf("fbuf: write too large, offset %ld size %d\n", offset, size); return; } p = (uint8_t *)&sc->memregs + offset; switch (size) { case 1: *p = value; break; case 2: *(uint16_t *)p = value; break; case 4: *(uint32_t *)p = value; break; case 8: *(uint64_t *)p = value; break; default: printf("fbuf: write unknown size %d\n", size); break; } if (!sc->gc_image->vgamode && sc->memregs.width == 0 && sc->memregs.height == 0) { DPRINTF(DEBUG_INFO, ("switching to VGA mode")); sc->gc_image->vgamode = 1; sc->gc_width = 0; sc->gc_height = 0; } else if (sc->gc_image->vgamode && sc->memregs.width != 0 && sc->memregs.height != 0) { DPRINTF(DEBUG_INFO, ("switching to VESA mode")); sc->gc_image->vgamode = 0; } } uint64_t pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_fbuf_softc *sc; uint8_t *p; uint64_t value; assert(baridx == 0); sc = pi->pi_arg; if (offset + size > DMEMSZ) { printf("fbuf: read too large, offset %ld size %d\n", offset, size); return (0); } p = (uint8_t *)&sc->memregs + offset; value = 0; switch (size) { case 1: value = *p; break; case 2: value = *(uint16_t *)p; break; case 4: value = *(uint32_t *)p; break; case 8: value = *(uint64_t *)p; break; default: printf("fbuf: read unknown size %d\n", size); break; } DPRINTF(DEBUG_VERBOSE, ("fbuf rd: offset 0x%lx, size: %d, value: 0x%lx", offset, size, value)); return (value); } static int -pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) +pci_fbuf_parse_config(struct pci_fbuf_softc *sc, nvlist_t *nvl) { - char *uopts, *uoptsbak, *xopts, *config; - char *tmpstr; - int ret; - - ret = 0; - uoptsbak = uopts = 
strdup(opts); - while ((xopts = strsep(&uopts, ",")) != NULL) { - if (strcmp(xopts, "wait") == 0) { - sc->rfb_wait = 1; - continue; - } - - if ((config = strchr(xopts, '=')) == NULL) { - pci_fbuf_usage(xopts); - ret = -1; - goto done; - } - - *config++ = '\0'; - - DPRINTF(DEBUG_VERBOSE, ("pci_fbuf option %s = %s", - xopts, config)); - - if (!strcmp(xopts, "tcp") || !strcmp(xopts, "rfb")) { - /* - * IPv4 -- host-ip:port - * IPv6 -- [host-ip%zone]:port - * XXX for now port is mandatory. - */ - tmpstr = strsep(&config, "]"); - if (config) { - if (tmpstr[0] == '[') - tmpstr++; - sc->rfb_host = strdup(tmpstr); - if (config[0] == ':') - config++; - else { - pci_fbuf_usage(xopts); - ret = -1; - goto done; - } - sc->rfb_port = atoi(config); - } else { - config = tmpstr; - tmpstr = strsep(&config, ":"); - if (!config) - sc->rfb_port = atoi(tmpstr); - else { - sc->rfb_port = atoi(config); - sc->rfb_host = strdup(tmpstr); + const char *value; + char *cp; + + sc->rfb_wait = get_config_bool_node_default(nvl, "wait", false); + + /* Prefer "rfb" to "tcp". */ + value = get_config_value_node(nvl, "rfb"); + if (value == NULL) + value = get_config_value_node(nvl, "tcp"); + if (value != NULL) { + /* + * IPv4 -- host-ip:port + * IPv6 -- [host-ip%zone]:port + * XXX for now port is mandatory for IPv4. + */ + if (value[0] == '[') { + cp = strchr(value + 1, ']'); + if (cp == NULL || cp == value + 1) { + EPRINTLN("fbuf: Invalid IPv6 address: \"%s\"", + value); + return (-1); + } + sc->rfb_host = strndup(value + 1, cp - (value + 1)); + cp++; + if (*cp == ':') { + cp++; + if (*cp == '\0') { + EPRINTLN( + "fbuf: Missing port number: \"%s\"", + value); + return (-1); } + sc->rfb_port = atoi(cp); + } else if (*cp != '\0') { + EPRINTLN("fbuf: Invalid IPv6 address: \"%s\"", + value); + return (-1); } - } else if (!strcmp(xopts, "vga")) { - if (!strcmp(config, "off")) { - sc->vga_enabled = 0; - } else if (!strcmp(config, "io")) { - sc->vga_enabled = 1; - sc->vga_full = 0; - } else if (!strcmp(config, "on")) { - sc->vga_enabled = 1; - sc->vga_full = 1; + } else { + cp = strchr(value, ':'); + if (cp == NULL) { + sc->rfb_port = atoi(value); } else { - pci_fbuf_usage(xopts); - ret = -1; - goto done; + sc->rfb_host = strndup(value, cp - value); + cp++; + if (*cp == '\0') { + EPRINTLN( + "fbuf: Missing port number: \"%s\"", + value); + return (-1); + } + sc->rfb_port = atoi(cp); } - } else if (!strcmp(xopts, "w")) { - sc->memregs.width = atoi(config); - if (sc->memregs.width > COLS_MAX) { - pci_fbuf_usage(xopts); - ret = -1; - goto done; - } else if (sc->memregs.width == 0) - sc->memregs.width = 1920; - } else if (!strcmp(xopts, "h")) { - sc->memregs.height = atoi(config); - if (sc->memregs.height > ROWS_MAX) { - pci_fbuf_usage(xopts); - ret = -1; - goto done; - } else if (sc->memregs.height == 0) - sc->memregs.height = 1080; - } else if (!strcmp(xopts, "password")) { - sc->rfb_password = strdup(config); + } + } + + value = get_config_value_node(nvl, "vga"); + if (value != NULL) { + if (strcmp(value, "off") == 0) { + sc->vga_enabled = 0; + } else if (strcmp(value, "io") == 0) { + sc->vga_enabled = 1; + sc->vga_full = 0; + } else if (strcmp(value, "on") == 0) { + sc->vga_enabled = 1; + sc->vga_full = 1; } else { - pci_fbuf_usage(xopts); - ret = -1; - goto done; + EPRINTLN("fbuf: Invalid vga setting: \"%s\"", value); + return (-1); } } -done: - free(uoptsbak); - return (ret); + value = get_config_value_node(nvl, "w"); + if (value != NULL) { + sc->memregs.width = atoi(value); + if (sc->memregs.width > COLS_MAX) { + EPRINTLN("fbuf: 
width %d too large", sc->memregs.width); + return (-1); + } + if (sc->memregs.width == 0) + sc->memregs.width = 1920; + } + + value = get_config_value_node(nvl, "h"); + if (value != NULL) { + sc->memregs.height = atoi(value); + if (sc->memregs.height > ROWS_MAX) { + EPRINTLN("fbuf: height %d too large", + sc->memregs.height); + return (-1); + } + if (sc->memregs.height == 0) + sc->memregs.height = 1080; + } + + value = get_config_value_node(nvl, "password"); + if (value != NULL) + sc->rfb_password = strdup(value); + + return (0); } extern void vga_render(struct bhyvegc *gc, void *arg); void pci_fbuf_render(struct bhyvegc *gc, void *arg) { struct pci_fbuf_softc *sc; sc = arg; if (sc->vga_full && sc->gc_image->vgamode) { /* TODO: mode switching to vga and vesa should use the special * EFI-bhyve protocol port. */ vga_render(gc, sc->vgasc); return; } if (sc->gc_width != sc->memregs.width || sc->gc_height != sc->memregs.height) { bhyvegc_resize(gc, sc->memregs.width, sc->memregs.height); sc->gc_width = sc->memregs.width; sc->gc_height = sc->memregs.height; } return; } static int -pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { int error, prot; struct pci_fbuf_softc *sc; if (fbuf_sc != NULL) { EPRINTLN("Only one frame buffer device is allowed."); return (-1); } sc = calloc(1, sizeof(struct pci_fbuf_softc)); pi->pi_arg = sc; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE); assert(error == 0); error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS); assert(error == 0); sc->fbaddr = pi->pi_bar[1].addr; sc->memregs.fbsize = FB_SIZE; sc->memregs.width = COLS_DEFAULT; sc->memregs.height = ROWS_DEFAULT; sc->memregs.depth = 32; sc->vga_enabled = 1; sc->vga_full = 0; sc->fsc_pi = pi; - error = pci_fbuf_parse_opts(sc, opts); + error = pci_fbuf_parse_config(sc, nvl); if (error != 0) goto done; /* XXX until VGA rendering is enabled */ if (sc->vga_full != 0) { EPRINTLN("pci_fbuf: VGA rendering not enabled"); goto done; } sc->fb_base = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE); if (sc->fb_base == MAP_FAILED) { error = -1; goto done; } DPRINTF(DEBUG_INFO, ("fbuf frame buffer base: %p [sz %lu]", sc->fb_base, FB_SIZE)); /* * Map the framebuffer into the guest address space. * XXX This may fail if the BAR is different than a prior * run. In this case flag the error. This will be fixed * when a change_memseg api is available. 
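The accessor pattern above recurs in every converted device model: get_config_bool_node_default() for flags, get_config_value_node() returning NULL for unset strings, and strdup() when a value must outlive the config tree. Condensed into a hypothetical parser (mydev_softc and the variable names are illustrative):

/*
 * Condensed illustration of the nvlist accessor pattern used by
 * these conversions.
 */
static int
mydev_parse_config(struct mydev_softc *sc, nvlist_t *nvl)
{
	const char *value;

	sc->verbose = get_config_bool_node_default(nvl, "verbose", false);
	value = get_config_value_node(nvl, "path");
	if (value != NULL)
		sc->path = strdup(value);	/* the nvlist owns 'value' */
	return (0);
}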
*/ prot = PROT_READ | PROT_WRITE; if (vm_mmap_memseg(ctx, sc->fbaddr, VM_FRAMEBUFFER, 0, FB_SIZE, prot) != 0) { EPRINTLN("pci_fbuf: mapseg failed - try deleting VM and restarting"); error = -1; goto done; } console_init(sc->memregs.width, sc->memregs.height, sc->fb_base); console_fb_register(pci_fbuf_render, sc); if (sc->vga_enabled) sc->vgasc = vga_init(!sc->vga_full); sc->gc_image = console_get_image(); fbuf_sc = sc; memset((void *)sc->fb_base, 0, FB_SIZE); error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, sc->rfb_password); done: if (error) free(sc); return (error); } #ifdef BHYVE_SNAPSHOT static int pci_fbuf_snapshot(struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_BUF_OR_LEAVE(fbuf_sc->fb_base, FB_SIZE, meta, ret, err); err: return (ret); } #endif struct pci_devemu pci_fbuf = { .pe_emu = "fbuf", .pe_init = pci_fbuf_init, .pe_barwrite = pci_fbuf_write, .pe_barread = pci_fbuf_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_fbuf_snapshot, #endif }; PCI_EMUL_SET(pci_fbuf); diff --git a/usr.sbin/bhyve/pci_hda.c b/usr.sbin/bhyve/pci_hda.c index 86e46a550aec..7491944fedd0 100644 --- a/usr.sbin/bhyve/pci_hda.c +++ b/usr.sbin/bhyve/pci_hda.c @@ -1,1331 +1,1292 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Alex Teaca * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
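A note on the mapping set up here: the framebuffer is a named device-memory segment rather than ordinary guest RAM, so the host keeps a permanent writable view of the very pages the guest sees behind BAR 1 and no copying is needed on screen updates. A hedged condensation of the two calls involved, using FB_SIZE and VM_FRAMEBUFFER as above:

/*
 * Hedged sketch of the devmem pattern in pci_fbuf_init(): create the
 * named segment once, then map it at the BAR's guest-physical address.
 */
static char *
map_framebuffer(struct vmctx *ctx, uint64_t gpa)
{
	char *base;

	base = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE);
	if (base == MAP_FAILED)
		return (NULL);
	if (vm_mmap_memseg(ctx, gpa, VM_FRAMEBUFFER, 0, FB_SIZE,
	    PROT_READ | PROT_WRITE) != 0)
		return (NULL);
	return (base);
}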
* */ #include __FBSDID("$FreeBSD$"); #include #include "pci_hda.h" #include "bhyverun.h" +#include "config.h" #include "pci_emul.h" #include "hdac_reg.h" /* * HDA defines */ #define PCIR_HDCTL 0x40 #define INTEL_VENDORID 0x8086 #define HDA_INTEL_82801G 0x27d8 #define HDA_IOSS_NO 0x08 #define HDA_OSS_NO 0x04 #define HDA_ISS_NO 0x04 #define HDA_CODEC_MAX 0x0f #define HDA_LAST_OFFSET \ (0x2084 + ((HDA_ISS_NO) * 0x20) + ((HDA_OSS_NO) * 0x20)) #define HDA_SET_REG_TABLE_SZ \ (0x80 + ((HDA_ISS_NO) * 0x20) + ((HDA_OSS_NO) * 0x20)) #define HDA_CORB_ENTRY_LEN 0x04 #define HDA_RIRB_ENTRY_LEN 0x08 #define HDA_BDL_ENTRY_LEN 0x10 #define HDA_DMA_PIB_ENTRY_LEN 0x08 #define HDA_STREAM_TAGS_CNT 0x10 #define HDA_STREAM_REGS_BASE 0x80 #define HDA_STREAM_REGS_LEN 0x20 #define HDA_DMA_ACCESS_LEN (sizeof(uint32_t)) #define HDA_BDL_MAX_LEN 0x0100 #define HDAC_SDSTS_FIFORDY (1 << 5) #define HDA_RIRBSTS_IRQ_MASK (HDAC_RIRBSTS_RINTFL | HDAC_RIRBSTS_RIRBOIS) #define HDA_STATESTS_IRQ_MASK ((1 << HDA_CODEC_MAX) - 1) #define HDA_SDSTS_IRQ_MASK \ (HDAC_SDSTS_DESE | HDAC_SDSTS_FIFOE | HDAC_SDSTS_BCIS) /* * HDA data structures */ struct hda_softc; typedef void (*hda_set_reg_handler)(struct hda_softc *sc, uint32_t offset, uint32_t old); struct hda_bdle { uint32_t addrl; uint32_t addrh; uint32_t len; uint32_t ioc; } __packed; struct hda_bdle_desc { void *addr; uint8_t ioc; uint32_t len; }; struct hda_codec_cmd_ctl { char *name; void *dma_vaddr; uint8_t run; uint16_t rp; uint16_t size; uint16_t wp; }; struct hda_stream_desc { uint8_t dir; uint8_t run; uint8_t stream; /* bp is the no. of bytes transferred in the current bdle */ uint32_t bp; /* be is the no. of bdles transferred in the bdl */ uint32_t be; uint32_t bdl_cnt; struct hda_bdle_desc bdl[HDA_BDL_MAX_LEN]; }; struct hda_softc { struct pci_devinst *pci_dev; uint32_t regs[HDA_LAST_OFFSET]; uint8_t lintr; uint8_t rirb_cnt; uint64_t wall_clock_start; struct hda_codec_cmd_ctl corb; struct hda_codec_cmd_ctl rirb; uint8_t codecs_no; struct hda_codec_inst *codecs[HDA_CODEC_MAX]; /* Base Address of the DMA Position Buffer */ void *dma_pib_vaddr; struct hda_stream_desc streams[HDA_IOSS_NO]; /* 2 tables for output and input */ uint8_t stream_map[2][HDA_STREAM_TAGS_CNT]; }; /* * HDA module function declarations */ static inline void hda_set_reg_by_offset(struct hda_softc *sc, uint32_t offset, uint32_t value); static inline uint32_t hda_get_reg_by_offset(struct hda_softc *sc, uint32_t offset); static inline void hda_set_field_by_offset(struct hda_softc *sc, uint32_t offset, uint32_t mask, uint32_t value); -static uint8_t hda_parse_config(const char *opts, const char *key, char *val); -static struct hda_softc *hda_init(const char *opts); +static struct hda_softc *hda_init(nvlist_t *nvl); static void hda_update_intr(struct hda_softc *sc); static void hda_response_interrupt(struct hda_softc *sc); static int hda_codec_constructor(struct hda_softc *sc, - struct hda_codec_class *codec, const char *play, const char *rec, - const char *opts); + struct hda_codec_class *codec, const char *play, const char *rec); static struct hda_codec_class *hda_find_codec_class(const char *name); static int hda_send_command(struct hda_softc *sc, uint32_t verb); static int hda_notify_codecs(struct hda_softc *sc, uint8_t run, uint8_t stream, uint8_t dir); static void hda_reset(struct hda_softc *sc); static void hda_reset_regs(struct hda_softc *sc); static void hda_stream_reset(struct hda_softc *sc, uint8_t stream_ind); static int hda_stream_start(struct hda_softc *sc, uint8_t stream_ind); static int 
hda_stream_stop(struct hda_softc *sc, uint8_t stream_ind); static uint32_t hda_read(struct hda_softc *sc, uint32_t offset); static int hda_write(struct hda_softc *sc, uint32_t offset, uint8_t size, uint32_t value); static inline void hda_print_cmd_ctl_data(struct hda_codec_cmd_ctl *p); static int hda_corb_start(struct hda_softc *sc); static int hda_corb_run(struct hda_softc *sc); static int hda_rirb_start(struct hda_softc *sc); static void *hda_dma_get_vaddr(struct hda_softc *sc, uint64_t dma_paddr, size_t len); static void hda_dma_st_dword(void *dma_vaddr, uint32_t data); static uint32_t hda_dma_ld_dword(void *dma_vaddr); static inline uint8_t hda_get_stream_by_offsets(uint32_t offset, uint8_t reg_offset); static inline uint32_t hda_get_offset_stream(uint8_t stream_ind); static void hda_set_gctl(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_statests(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_corbwp(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_corbctl(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_rirbctl(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_rirbsts(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_dpiblbase(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_sdctl(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_sdctl2(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_sdsts(struct hda_softc *sc, uint32_t offset, uint32_t old); static int hda_signal_state_change(struct hda_codec_inst *hci); static int hda_response(struct hda_codec_inst *hci, uint32_t response, uint8_t unsol); static int hda_transfer(struct hda_codec_inst *hci, uint8_t stream, uint8_t dir, void *buf, size_t count); static void hda_set_pib(struct hda_softc *sc, uint8_t stream_ind, uint32_t pib); static uint64_t hda_get_clock_ns(void); /* * PCI HDA function declarations */ -static int pci_hda_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts); +static int pci_hda_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl); static void pci_hda_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); static uint64_t pci_hda_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); /* * HDA global data */ static const hda_set_reg_handler hda_set_reg_table[] = { [HDAC_GCTL] = hda_set_gctl, [HDAC_STATESTS] = hda_set_statests, [HDAC_CORBWP] = hda_set_corbwp, [HDAC_CORBCTL] = hda_set_corbctl, [HDAC_RIRBCTL] = hda_set_rirbctl, [HDAC_RIRBSTS] = hda_set_rirbsts, [HDAC_DPIBLBASE] = hda_set_dpiblbase, #define HDAC_ISTREAM(n, iss, oss) \ [_HDAC_ISDCTL(n, iss, oss)] = hda_set_sdctl, \ [_HDAC_ISDCTL(n, iss, oss) + 2] = hda_set_sdctl2, \ [_HDAC_ISDSTS(n, iss, oss)] = hda_set_sdsts, \ #define HDAC_OSTREAM(n, iss, oss) \ [_HDAC_OSDCTL(n, iss, oss)] = hda_set_sdctl, \ [_HDAC_OSDCTL(n, iss, oss) + 2] = hda_set_sdctl2, \ [_HDAC_OSDSTS(n, iss, oss)] = hda_set_sdsts, \ HDAC_ISTREAM(0, HDA_ISS_NO, HDA_OSS_NO) HDAC_ISTREAM(1, HDA_ISS_NO, HDA_OSS_NO) HDAC_ISTREAM(2, HDA_ISS_NO, HDA_OSS_NO) HDAC_ISTREAM(3, HDA_ISS_NO, HDA_OSS_NO) HDAC_OSTREAM(0, HDA_ISS_NO, HDA_OSS_NO) HDAC_OSTREAM(1, HDA_ISS_NO, HDA_OSS_NO) HDAC_OSTREAM(2, HDA_ISS_NO, HDA_OSS_NO) HDAC_OSTREAM(3, HDA_ISS_NO, HDA_OSS_NO) [HDA_SET_REG_TABLE_SZ] = NULL, }; static const uint16_t hda_corb_sizes[] = { [HDAC_CORBSIZE_CORBSIZE_2] = 2, 
[HDAC_CORBSIZE_CORBSIZE_16] = 16, [HDAC_CORBSIZE_CORBSIZE_256] = 256, [HDAC_CORBSIZE_CORBSIZE_MASK] = 0, }; static const uint16_t hda_rirb_sizes[] = { [HDAC_RIRBSIZE_RIRBSIZE_2] = 2, [HDAC_RIRBSIZE_RIRBSIZE_16] = 16, [HDAC_RIRBSIZE_RIRBSIZE_256] = 256, [HDAC_RIRBSIZE_RIRBSIZE_MASK] = 0, }; static struct hda_ops hops = { .signal = hda_signal_state_change, .response = hda_response, .transfer = hda_transfer, }; struct pci_devemu pci_de_hda = { .pe_emu = "hda", .pe_init = pci_hda_init, .pe_barwrite = pci_hda_write, .pe_barread = pci_hda_read }; PCI_EMUL_SET(pci_de_hda); SET_DECLARE(hda_codec_class_set, struct hda_codec_class); #if DEBUG_HDA == 1 FILE *dbg; #endif /* * HDA module function definitions */ static inline void hda_set_reg_by_offset(struct hda_softc *sc, uint32_t offset, uint32_t value) { assert(offset < HDA_LAST_OFFSET); sc->regs[offset] = value; } static inline uint32_t hda_get_reg_by_offset(struct hda_softc *sc, uint32_t offset) { assert(offset < HDA_LAST_OFFSET); return sc->regs[offset]; } static inline void hda_set_field_by_offset(struct hda_softc *sc, uint32_t offset, uint32_t mask, uint32_t value) { uint32_t reg_value = 0; reg_value = hda_get_reg_by_offset(sc, offset); reg_value &= ~mask; reg_value |= (value & mask); hda_set_reg_by_offset(sc, offset, reg_value); } -static uint8_t -hda_parse_config(const char *opts, const char *key, char *val) -{ - char buf[64]; - char *s = buf; - char *tmp = NULL; - size_t len; - int i; - - if (!opts) - return (0); - - len = strlen(opts); - if (len >= sizeof(buf)) { - DPRINTF("Opts too big"); - return (0); - } - - DPRINTF("opts: %s", opts); - - strcpy(buf, opts); - - for (i = 0; i < len; i++) - if (buf[i] == ',') { - buf[i] = 0; - tmp = buf + i + 1; - break; - } - - if (!memcmp(s, key, strlen(key))) { - strncpy(val, s + strlen(key), 64); - return (1); - } - - if (!tmp) - return (0); - - s = tmp; - if (!memcmp(s, key, strlen(key))) { - strncpy(val, s + strlen(key), 64); - return (1); - } - - return (0); -} - static struct hda_softc * -hda_init(const char *opts) +hda_init(nvlist_t *nvl) { struct hda_softc *sc = NULL; struct hda_codec_class *codec = NULL; - char play[64]; - char rec[64]; - int err, p, r; + const char *value; + char *play; + char *rec; + int err; #if DEBUG_HDA == 1 dbg = fopen("/tmp/bhyve_hda.log", "w+"); #endif - DPRINTF("opts: %s", opts); - sc = calloc(1, sizeof(*sc)); if (!sc) return (NULL); hda_reset_regs(sc); /* - * TODO search all the codecs declared in opts + * TODO search all configured codecs * For now we play with one single codec */ codec = hda_find_codec_class("hda_codec"); if (codec) { - p = hda_parse_config(opts, "play=", play); - r = hda_parse_config(opts, "rec=", rec); + value = get_config_value_node(nvl, "play"); + if (value == NULL) + play = NULL; + else + play = strdup(value); + value = get_config_value_node(nvl, "rec"); + if (value == NULL) + rec = NULL; + else + rec = strdup(value); DPRINTF("play: %s rec: %s", play, rec); - if (p | r) { - err = hda_codec_constructor(sc, codec, p ? \ - play : NULL, r ? 
rec : NULL, NULL); + if (play != NULL || rec != NULL) { + err = hda_codec_constructor(sc, codec, play, rec); assert(!err); } + free(play); + free(rec); } return (sc); } static void hda_update_intr(struct hda_softc *sc) { struct pci_devinst *pi = sc->pci_dev; uint32_t intctl = hda_get_reg_by_offset(sc, HDAC_INTCTL); uint32_t intsts = 0; uint32_t sdsts = 0; uint32_t rirbsts = 0; uint32_t wakeen = 0; uint32_t statests = 0; uint32_t off = 0; int i; /* update the CIS bits */ rirbsts = hda_get_reg_by_offset(sc, HDAC_RIRBSTS); if (rirbsts & (HDAC_RIRBSTS_RINTFL | HDAC_RIRBSTS_RIRBOIS)) intsts |= HDAC_INTSTS_CIS; wakeen = hda_get_reg_by_offset(sc, HDAC_WAKEEN); statests = hda_get_reg_by_offset(sc, HDAC_STATESTS); if (statests & wakeen) intsts |= HDAC_INTSTS_CIS; /* update the SIS bits */ for (i = 0; i < HDA_IOSS_NO; i++) { off = hda_get_offset_stream(i); sdsts = hda_get_reg_by_offset(sc, off + HDAC_SDSTS); if (sdsts & HDAC_SDSTS_BCIS) intsts |= (1 << i); } /* update the GIS bit */ if (intsts) intsts |= HDAC_INTSTS_GIS; hda_set_reg_by_offset(sc, HDAC_INTSTS, intsts); if ((intctl & HDAC_INTCTL_GIE) && ((intsts & \ ~HDAC_INTSTS_GIS) & intctl)) { if (!sc->lintr) { pci_lintr_assert(pi); sc->lintr = 1; } } else { if (sc->lintr) { pci_lintr_deassert(pi); sc->lintr = 0; } } } static void hda_response_interrupt(struct hda_softc *sc) { uint8_t rirbctl = hda_get_reg_by_offset(sc, HDAC_RIRBCTL); if ((rirbctl & HDAC_RIRBCTL_RINTCTL) && sc->rirb_cnt) { sc->rirb_cnt = 0; hda_set_field_by_offset(sc, HDAC_RIRBSTS, HDAC_RIRBSTS_RINTFL, HDAC_RIRBSTS_RINTFL); hda_update_intr(sc); } } static int hda_codec_constructor(struct hda_softc *sc, struct hda_codec_class *codec, - const char *play, const char *rec, const char *opts) + const char *play, const char *rec) { struct hda_codec_inst *hci = NULL; if (sc->codecs_no >= HDA_CODEC_MAX) return (-1); hci = calloc(1, sizeof(struct hda_codec_inst)); if (!hci) return (-1); hci->hda = sc; hci->hops = &hops; hci->cad = sc->codecs_no; hci->codec = codec; sc->codecs[sc->codecs_no++] = hci; if (!codec->init) { DPRINTF("This codec does not implement the init function"); return (-1); } - return (codec->init(hci, play, rec, opts)); + return (codec->init(hci, play, rec)); } static struct hda_codec_class * hda_find_codec_class(const char *name) { struct hda_codec_class **pdpp = NULL, *pdp = NULL; SET_FOREACH(pdpp, hda_codec_class_set) { pdp = *pdpp; if (!strcmp(pdp->name, name)) { return (pdp); } } return (NULL); } static int hda_send_command(struct hda_softc *sc, uint32_t verb) { struct hda_codec_inst *hci = NULL; struct hda_codec_class *codec = NULL; uint8_t cad = (verb >> HDA_CMD_CAD_SHIFT) & 0x0f; hci = sc->codecs[cad]; if (!hci) return (-1); DPRINTF("cad: 0x%x verb: 0x%x", cad, verb); codec = hci->codec; assert(codec); if (!codec->command) { DPRINTF("This codec does not implement the command function"); return (-1); } return (codec->command(hci, verb)); } static int hda_notify_codecs(struct hda_softc *sc, uint8_t run, uint8_t stream, uint8_t dir) { struct hda_codec_inst *hci = NULL; struct hda_codec_class *codec = NULL; int err; int i; /* Notify each codec */ for (i = 0; i < sc->codecs_no; i++) { hci = sc->codecs[i]; assert(hci); codec = hci->codec; assert(codec); if (codec->notify) { err = codec->notify(hci, run, stream, dir); if (!err) break; } } return (i == sc->codecs_no ? 
(-1) : 0); } static void hda_reset(struct hda_softc *sc) { int i; struct hda_codec_inst *hci = NULL; struct hda_codec_class *codec = NULL; hda_reset_regs(sc); /* Reset each codec */ for (i = 0; i < sc->codecs_no; i++) { hci = sc->codecs[i]; assert(hci); codec = hci->codec; assert(codec); if (codec->reset) codec->reset(hci); } sc->wall_clock_start = hda_get_clock_ns(); } static void hda_reset_regs(struct hda_softc *sc) { uint32_t off = 0; uint8_t i; DPRINTF("Reset the HDA controller registers ..."); memset(sc->regs, 0, sizeof(sc->regs)); hda_set_reg_by_offset(sc, HDAC_GCAP, HDAC_GCAP_64OK | (HDA_ISS_NO << HDAC_GCAP_ISS_SHIFT) | (HDA_OSS_NO << HDAC_GCAP_OSS_SHIFT)); hda_set_reg_by_offset(sc, HDAC_VMAJ, 0x01); hda_set_reg_by_offset(sc, HDAC_OUTPAY, 0x3c); hda_set_reg_by_offset(sc, HDAC_INPAY, 0x1d); hda_set_reg_by_offset(sc, HDAC_CORBSIZE, HDAC_CORBSIZE_CORBSZCAP_256 | HDAC_CORBSIZE_CORBSIZE_256); hda_set_reg_by_offset(sc, HDAC_RIRBSIZE, HDAC_RIRBSIZE_RIRBSZCAP_256 | HDAC_RIRBSIZE_RIRBSIZE_256); for (i = 0; i < HDA_IOSS_NO; i++) { off = hda_get_offset_stream(i); hda_set_reg_by_offset(sc, off + HDAC_SDFIFOS, HDA_FIFO_SIZE); } } static void hda_stream_reset(struct hda_softc *sc, uint8_t stream_ind) { struct hda_stream_desc *st = &sc->streams[stream_ind]; uint32_t off = hda_get_offset_stream(stream_ind); DPRINTF("Reset the HDA stream: 0x%x", stream_ind); /* Reset the Stream Descriptor registers */ memset(sc->regs + HDA_STREAM_REGS_BASE + off, 0, HDA_STREAM_REGS_LEN); /* Reset the Stream Descriptor */ memset(st, 0, sizeof(*st)); hda_set_field_by_offset(sc, off + HDAC_SDSTS, HDAC_SDSTS_FIFORDY, HDAC_SDSTS_FIFORDY); hda_set_field_by_offset(sc, off + HDAC_SDCTL0, HDAC_SDCTL_SRST, HDAC_SDCTL_SRST); } static int hda_stream_start(struct hda_softc *sc, uint8_t stream_ind) { struct hda_stream_desc *st = &sc->streams[stream_ind]; struct hda_bdle_desc *bdle_desc = NULL; struct hda_bdle *bdle = NULL; uint32_t lvi = 0; uint32_t bdl_cnt = 0; uint64_t bdpl = 0; uint64_t bdpu = 0; uint64_t bdl_paddr = 0; void *bdl_vaddr = NULL; uint32_t bdle_sz = 0; uint64_t bdle_addrl = 0; uint64_t bdle_addrh = 0; uint64_t bdle_paddr = 0; void *bdle_vaddr = NULL; uint32_t off = hda_get_offset_stream(stream_ind); uint32_t sdctl = 0; uint8_t strm = 0; uint8_t dir = 0; int i; assert(!st->run); lvi = hda_get_reg_by_offset(sc, off + HDAC_SDLVI); bdpl = hda_get_reg_by_offset(sc, off + HDAC_SDBDPL); bdpu = hda_get_reg_by_offset(sc, off + HDAC_SDBDPU); bdl_cnt = lvi + 1; assert(bdl_cnt <= HDA_BDL_MAX_LEN); bdl_paddr = bdpl | (bdpu << 32); bdl_vaddr = hda_dma_get_vaddr(sc, bdl_paddr, HDA_BDL_ENTRY_LEN * bdl_cnt); if (!bdl_vaddr) { DPRINTF("Fail to get the guest virtual address"); return (-1); } DPRINTF("stream: 0x%x bdl_cnt: 0x%x bdl_paddr: 0x%lx", stream_ind, bdl_cnt, bdl_paddr); st->bdl_cnt = bdl_cnt; bdle = (struct hda_bdle *)bdl_vaddr; for (i = 0; i < bdl_cnt; i++, bdle++) { bdle_sz = bdle->len; assert(!(bdle_sz % HDA_DMA_ACCESS_LEN)); bdle_addrl = bdle->addrl; bdle_addrh = bdle->addrh; bdle_paddr = bdle_addrl | (bdle_addrh << 32); bdle_vaddr = hda_dma_get_vaddr(sc, bdle_paddr, bdle_sz); if (!bdle_vaddr) { DPRINTF("Fail to get the guest virtual address"); return (-1); } bdle_desc = &st->bdl[i]; bdle_desc->addr = bdle_vaddr; bdle_desc->len = bdle_sz; bdle_desc->ioc = bdle->ioc; DPRINTF("bdle: 0x%x bdle_sz: 0x%x", i, bdle_sz); } sdctl = hda_get_reg_by_offset(sc, off + HDAC_SDCTL0); strm = (sdctl >> 20) & 0x0f; dir = stream_ind >= HDA_ISS_NO; DPRINTF("strm: 0x%x, dir: 0x%x", strm, dir); sc->stream_map[dir][strm] = stream_ind; 
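Each BDL entry was just pulled straight out of guest memory; the 64-bit buffer address is reassembled from the split addrl/addrh words. A worked example of that arithmetic with made-up values:

/*
 * Worked example of the BDL entry math used in hda_stream_start(),
 * for an entry whose buffer lives at guest-physical 0x120000000.
 */
static void
hda_bdle_example(void)
{
	struct hda_bdle ex = {
		.addrl = 0x20000000,	/* low 32 bits of the address */
		.addrh = 0x00000001,	/* high 32 bits */
		.len = 0x1000,		/* multiple of HDA_DMA_ACCESS_LEN */
		.ioc = 1,		/* raise BCIS when entry completes */
	};
	uint64_t paddr;

	paddr = (uint64_t)ex.addrl | ((uint64_t)ex.addrh << 32);
	assert(paddr == 0x120000000UL);
	/* hda_dma_get_vaddr() translates paddr for host access */
}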
st->stream = strm; st->dir = dir; st->bp = 0; st->be = 0; hda_set_pib(sc, stream_ind, 0); st->run = 1; hda_notify_codecs(sc, 1, strm, dir); return (0); } static int hda_stream_stop(struct hda_softc *sc, uint8_t stream_ind) { struct hda_stream_desc *st = &sc->streams[stream_ind]; uint8_t strm = st->stream; uint8_t dir = st->dir; DPRINTF("stream: 0x%x, strm: 0x%x, dir: 0x%x", stream_ind, strm, dir); st->run = 0; hda_notify_codecs(sc, 0, strm, dir); return (0); } static uint32_t hda_read(struct hda_softc *sc, uint32_t offset) { if (offset == HDAC_WALCLK) return (24 * (hda_get_clock_ns() - \ sc->wall_clock_start) / 1000); return (hda_get_reg_by_offset(sc, offset)); } static int hda_write(struct hda_softc *sc, uint32_t offset, uint8_t size, uint32_t value) { uint32_t old = hda_get_reg_by_offset(sc, offset); uint32_t masks[] = {0x00000000, 0x000000ff, 0x0000ffff, 0x00ffffff, 0xffffffff}; hda_set_reg_handler set_reg_handler = hda_set_reg_table[offset]; hda_set_field_by_offset(sc, offset, masks[size], value); if (set_reg_handler) set_reg_handler(sc, offset, old); return (0); } static inline void hda_print_cmd_ctl_data(struct hda_codec_cmd_ctl *p) { #if DEBUG_HDA == 1 char *name = p->name; #endif DPRINTF("%s size: %d", name, p->size); DPRINTF("%s dma_vaddr: %p", name, p->dma_vaddr); DPRINTF("%s wp: 0x%x", name, p->wp); DPRINTF("%s rp: 0x%x", name, p->rp); } static int hda_corb_start(struct hda_softc *sc) { struct hda_codec_cmd_ctl *corb = &sc->corb; uint8_t corbsize = 0; uint64_t corblbase = 0; uint64_t corbubase = 0; uint64_t corbpaddr = 0; corb->name = "CORB"; corbsize = hda_get_reg_by_offset(sc, HDAC_CORBSIZE) & \ HDAC_CORBSIZE_CORBSIZE_MASK; corb->size = hda_corb_sizes[corbsize]; if (!corb->size) { DPRINTF("Invalid corb size"); return (-1); } corblbase = hda_get_reg_by_offset(sc, HDAC_CORBLBASE); corbubase = hda_get_reg_by_offset(sc, HDAC_CORBUBASE); corbpaddr = corblbase | (corbubase << 32); DPRINTF("CORB dma_paddr: %p", (void *)corbpaddr); corb->dma_vaddr = hda_dma_get_vaddr(sc, corbpaddr, HDA_CORB_ENTRY_LEN * corb->size); if (!corb->dma_vaddr) { DPRINTF("Fail to get the guest virtual address"); return (-1); } corb->wp = hda_get_reg_by_offset(sc, HDAC_CORBWP); corb->rp = hda_get_reg_by_offset(sc, HDAC_CORBRP); corb->run = 1; hda_print_cmd_ctl_data(corb); return (0); } static int hda_corb_run(struct hda_softc *sc) { struct hda_codec_cmd_ctl *corb = &sc->corb; uint32_t verb = 0; int err; corb->wp = hda_get_reg_by_offset(sc, HDAC_CORBWP); while (corb->rp != corb->wp && corb->run) { corb->rp++; corb->rp %= corb->size; verb = hda_dma_ld_dword(corb->dma_vaddr + \ HDA_CORB_ENTRY_LEN * corb->rp); err = hda_send_command(sc, verb); assert(!err); } hda_set_reg_by_offset(sc, HDAC_CORBRP, corb->rp); if (corb->run) hda_response_interrupt(sc); return (0); } static int hda_rirb_start(struct hda_softc *sc) { struct hda_codec_cmd_ctl *rirb = &sc->rirb; uint8_t rirbsize = 0; uint64_t rirblbase = 0; uint64_t rirbubase = 0; uint64_t rirbpaddr = 0; rirb->name = "RIRB"; rirbsize = hda_get_reg_by_offset(sc, HDAC_RIRBSIZE) & \ HDAC_RIRBSIZE_RIRBSIZE_MASK; rirb->size = hda_rirb_sizes[rirbsize]; if (!rirb->size) { DPRINTF("Invalid rirb size"); return (-1); } rirblbase = hda_get_reg_by_offset(sc, HDAC_RIRBLBASE); rirbubase = hda_get_reg_by_offset(sc, HDAC_RIRBUBASE); rirbpaddr = rirblbase | (rirbubase << 32); DPRINTF("RIRB dma_paddr: %p", (void *)rirbpaddr); rirb->dma_vaddr = hda_dma_get_vaddr(sc, rirbpaddr, HDA_RIRB_ENTRY_LEN * rirb->size); if (!rirb->dma_vaddr) { DPRINTF("Fail to get the guest virtual address"); 
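The CORB and RIRB share the same ring arithmetic: read/write pointers advance modulo the programmed size, and each slot sits at a fixed stride from the DMA base (HDA_CORB_ENTRY_LEN, 4 bytes per verb; HDA_RIRB_ENTRY_LEN, 8 per response). A sketch mirroring the consumption step in hda_corb_run():

/*
 * Sketch of the ring step shared by the command rings: advance the
 * read pointer modulo the ring size and load the verb at that slot.
 */
static uint32_t
corb_next_verb(struct hda_codec_cmd_ctl *corb)
{
	corb->rp = (corb->rp + 1) % corb->size;
	return (hda_dma_ld_dword(corb->dma_vaddr +
	    HDA_CORB_ENTRY_LEN * corb->rp));
}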
return (-1); } rirb->wp = hda_get_reg_by_offset(sc, HDAC_RIRBWP); rirb->rp = 0x0000; rirb->run = 1; hda_print_cmd_ctl_data(rirb); return (0); } static void * hda_dma_get_vaddr(struct hda_softc *sc, uint64_t dma_paddr, size_t len) { struct pci_devinst *pi = sc->pci_dev; assert(pi); return (paddr_guest2host(pi->pi_vmctx, (uintptr_t)dma_paddr, len)); } static void hda_dma_st_dword(void *dma_vaddr, uint32_t data) { *(uint32_t*)dma_vaddr = data; } static uint32_t hda_dma_ld_dword(void *dma_vaddr) { return (*(uint32_t*)dma_vaddr); } static inline uint8_t hda_get_stream_by_offsets(uint32_t offset, uint8_t reg_offset) { uint8_t stream_ind = (offset - reg_offset) >> 5; assert(stream_ind < HDA_IOSS_NO); return (stream_ind); } static inline uint32_t hda_get_offset_stream(uint8_t stream_ind) { return (stream_ind << 5); } static void hda_set_gctl(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); if (!(value & HDAC_GCTL_CRST)) { hda_reset(sc); } } static void hda_set_statests(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); hda_set_reg_by_offset(sc, offset, old); /* clear the corresponding bits written by the software (guest) */ hda_set_field_by_offset(sc, offset, value & HDA_STATESTS_IRQ_MASK, 0); hda_update_intr(sc); } static void hda_set_corbwp(struct hda_softc *sc, uint32_t offset, uint32_t old) { hda_corb_run(sc); } static void hda_set_corbctl(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); int err; struct hda_codec_cmd_ctl *corb = NULL; if (value & HDAC_CORBCTL_CORBRUN) { if (!(old & HDAC_CORBCTL_CORBRUN)) { err = hda_corb_start(sc); assert(!err); } } else { corb = &sc->corb; memset(corb, 0, sizeof(*corb)); } hda_corb_run(sc); } static void hda_set_rirbctl(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); int err; struct hda_codec_cmd_ctl *rirb = NULL; if (value & HDAC_RIRBCTL_RIRBDMAEN) { err = hda_rirb_start(sc); assert(!err); } else { rirb = &sc->rirb; memset(rirb, 0, sizeof(*rirb)); } } static void hda_set_rirbsts(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); hda_set_reg_by_offset(sc, offset, old); /* clear the corresponding bits written by the software (guest) */ hda_set_field_by_offset(sc, offset, value & HDA_RIRBSTS_IRQ_MASK, 0); hda_update_intr(sc); } static void hda_set_dpiblbase(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); uint64_t dpiblbase = 0; uint64_t dpibubase = 0; uint64_t dpibpaddr = 0; if ((value & HDAC_DPLBASE_DPLBASE_DMAPBE) != (old & \ HDAC_DPLBASE_DPLBASE_DMAPBE)) { if (value & HDAC_DPLBASE_DPLBASE_DMAPBE) { dpiblbase = value & HDAC_DPLBASE_DPLBASE_MASK; dpibubase = hda_get_reg_by_offset(sc, HDAC_DPIBUBASE); dpibpaddr = dpiblbase | (dpibubase << 32); DPRINTF("DMA Position In Buffer dma_paddr: %p", (void *)dpibpaddr); sc->dma_pib_vaddr = hda_dma_get_vaddr(sc, dpibpaddr, HDA_DMA_PIB_ENTRY_LEN * HDA_IOSS_NO); if (!sc->dma_pib_vaddr) { DPRINTF("Fail to get the guest \ virtual address"); assert(0); } } else { DPRINTF("DMA Position In Buffer Reset"); sc->dma_pib_vaddr = NULL; } } } static void hda_set_sdctl(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint8_t stream_ind = hda_get_stream_by_offsets(offset, HDAC_SDCTL0); uint32_t value = hda_get_reg_by_offset(sc, offset); int err; DPRINTF("stream_ind: 0x%x old: 0x%x value: 
0x%x", stream_ind, old, value); if (value & HDAC_SDCTL_SRST) { hda_stream_reset(sc, stream_ind); } if ((value & HDAC_SDCTL_RUN) != (old & HDAC_SDCTL_RUN)) { if (value & HDAC_SDCTL_RUN) { err = hda_stream_start(sc, stream_ind); assert(!err); } else { err = hda_stream_stop(sc, stream_ind); assert(!err); } } } static void hda_set_sdctl2(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); hda_set_field_by_offset(sc, offset - 2, 0x00ff0000, value << 16); } static void hda_set_sdsts(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); hda_set_reg_by_offset(sc, offset, old); /* clear the corresponding bits written by the software (guest) */ hda_set_field_by_offset(sc, offset, value & HDA_SDSTS_IRQ_MASK, 0); hda_update_intr(sc); } static int hda_signal_state_change(struct hda_codec_inst *hci) { struct hda_softc *sc = NULL; uint32_t sdiwake = 0; assert(hci); assert(hci->hda); DPRINTF("cad: 0x%x", hci->cad); sc = hci->hda; sdiwake = 1 << hci->cad; hda_set_field_by_offset(sc, HDAC_STATESTS, sdiwake, sdiwake); hda_update_intr(sc); return (0); } static int hda_response(struct hda_codec_inst *hci, uint32_t response, uint8_t unsol) { struct hda_softc *sc = NULL; struct hda_codec_cmd_ctl *rirb = NULL; uint32_t response_ex = 0; uint8_t rintcnt = 0; assert(hci); assert(hci->cad <= HDA_CODEC_MAX); response_ex = hci->cad | unsol; sc = hci->hda; assert(sc); rirb = &sc->rirb; if (rirb->run) { rirb->wp++; rirb->wp %= rirb->size; hda_dma_st_dword(rirb->dma_vaddr + HDA_RIRB_ENTRY_LEN * \ rirb->wp, response); hda_dma_st_dword(rirb->dma_vaddr + HDA_RIRB_ENTRY_LEN * \ rirb->wp + 0x04, response_ex); hda_set_reg_by_offset(sc, HDAC_RIRBWP, rirb->wp); sc->rirb_cnt++; } rintcnt = hda_get_reg_by_offset(sc, HDAC_RINTCNT); if (sc->rirb_cnt == rintcnt) hda_response_interrupt(sc); return (0); } static int hda_transfer(struct hda_codec_inst *hci, uint8_t stream, uint8_t dir, void *buf, size_t count) { struct hda_softc *sc = NULL; struct hda_stream_desc *st = NULL; struct hda_bdle_desc *bdl = NULL; struct hda_bdle_desc *bdle_desc = NULL; uint8_t stream_ind = 0; uint32_t lpib = 0; uint32_t off = 0; size_t left = 0; uint8_t irq = 0; assert(hci); assert(hci->hda); assert(buf); assert(!(count % HDA_DMA_ACCESS_LEN)); if (!stream) { DPRINTF("Invalid stream"); return (-1); } sc = hci->hda; assert(stream < HDA_STREAM_TAGS_CNT); stream_ind = sc->stream_map[dir][stream]; if (!dir) assert(stream_ind < HDA_ISS_NO); else assert(stream_ind >= HDA_ISS_NO && stream_ind < HDA_IOSS_NO); st = &sc->streams[stream_ind]; if (!st->run) { DPRINTF("Stream 0x%x stopped", stream); return (-1); } assert(st->stream == stream); off = hda_get_offset_stream(stream_ind); lpib = hda_get_reg_by_offset(sc, off + HDAC_SDLPIB); bdl = st->bdl; assert(st->be < st->bdl_cnt); assert(st->bp < bdl[st->be].len); left = count; while (left) { bdle_desc = &bdl[st->be]; if (dir) *(uint32_t *)buf = \ hda_dma_ld_dword(bdle_desc->addr + st->bp); else hda_dma_st_dword(bdle_desc->addr + st->bp, *(uint32_t *)buf); buf += HDA_DMA_ACCESS_LEN; st->bp += HDA_DMA_ACCESS_LEN; lpib += HDA_DMA_ACCESS_LEN; left -= HDA_DMA_ACCESS_LEN; if (st->bp == bdle_desc->len) { st->bp = 0; if (bdle_desc->ioc) irq = 1; st->be++; if (st->be == st->bdl_cnt) { st->be = 0; lpib = 0; } bdle_desc = &bdl[st->be]; } } hda_set_pib(sc, stream_ind, lpib); if (irq) { hda_set_field_by_offset(sc, off + HDAC_SDSTS, HDAC_SDSTS_BCIS, HDAC_SDSTS_BCIS); hda_update_intr(sc); } return (0); } static void 
hda_set_pib(struct hda_softc *sc, uint8_t stream_ind, uint32_t pib) { uint32_t off = hda_get_offset_stream(stream_ind); hda_set_reg_by_offset(sc, off + HDAC_SDLPIB, pib); /* LPIB Alias */ hda_set_reg_by_offset(sc, 0x2000 + off + HDAC_SDLPIB, pib); if (sc->dma_pib_vaddr) *(uint32_t *)(sc->dma_pib_vaddr + stream_ind * \ HDA_DMA_PIB_ENTRY_LEN) = pib; } static uint64_t hda_get_clock_ns(void) { struct timespec ts; int err; err = clock_gettime(CLOCK_MONOTONIC, &ts); assert(!err); return (ts.tv_sec * 1000000000LL + ts.tv_nsec); } /* * PCI HDA function definitions */ static int -pci_hda_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_hda_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct hda_softc *sc = NULL; assert(ctx != NULL); assert(pi != NULL); pci_set_cfgdata16(pi, PCIR_VENDOR, INTEL_VENDORID); pci_set_cfgdata16(pi, PCIR_DEVICE, HDA_INTEL_82801G); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_MULTIMEDIA_HDA); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_MULTIMEDIA); /* select the Intel HDA mode */ pci_set_cfgdata8(pi, PCIR_HDCTL, 0x01); /* allocate one BAR register for the Memory address offsets */ pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, HDA_LAST_OFFSET); /* allocate an IRQ pin for our slot */ pci_lintr_request(pi); - sc = hda_init(opts); + sc = hda_init(nvl); if (!sc) return (-1); sc->pci_dev = pi; pi->pi_arg = sc; return (0); } static void pci_hda_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct hda_softc *sc = pi->pi_arg; int err; assert(sc); assert(baridx == 0); assert(size <= 4); DPRINTF("offset: 0x%lx value: 0x%lx", offset, value); err = hda_write(sc, offset, size, value); assert(!err); } static uint64_t pci_hda_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct hda_softc *sc = pi->pi_arg; uint64_t value = 0; assert(sc); assert(baridx == 0); assert(size <= 4); value = hda_read(sc, offset); DPRINTF("offset: 0x%lx value: 0x%lx", offset, value); return (value); } diff --git a/usr.sbin/bhyve/pci_hda.h b/usr.sbin/bhyve/pci_hda.h index 7b6bba92e496..65a85f6d603d 100644 --- a/usr.sbin/bhyve/pci_hda.h +++ b/usr.sbin/bhyve/pci_hda.h @@ -1,92 +1,92 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Alex Teaca * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _HDA_EMUL_H_ #define _HDA_EMUL_H_ #include #include #include #include #include #include #include #include #include "hda_reg.h" /* * HDA Debug Log */ #define DEBUG_HDA 1 #if DEBUG_HDA == 1 extern FILE *dbg; #define DPRINTF(fmt, arg...) \ do {fprintf(dbg, "%s-%d: " fmt "\n", __func__, __LINE__, ##arg); \ fflush(dbg); } while (0) #else #define DPRINTF(fmt, arg...) #endif #define HDA_FIFO_SIZE 0x100 struct hda_softc; struct hda_codec_class; struct hda_codec_inst { uint8_t cad; struct hda_codec_class *codec; struct hda_softc *hda; struct hda_ops *hops; void *priv; }; struct hda_codec_class { char *name; int (*init)(struct hda_codec_inst *hci, const char *play, - const char *rec, const char *opts); + const char *rec); int (*reset)(struct hda_codec_inst *hci); int (*command)(struct hda_codec_inst *hci, uint32_t cmd_data); int (*notify)(struct hda_codec_inst *hci, uint8_t run, uint8_t stream, uint8_t dir); }; struct hda_ops { int (*signal)(struct hda_codec_inst *hci); int (*response)(struct hda_codec_inst *hci, uint32_t response, uint8_t unsol); int (*transfer)(struct hda_codec_inst *hci, uint8_t stream, uint8_t dir, void *buf, size_t count); }; #define HDA_EMUL_SET(x) DATA_SET(hda_codec_class_set, x); #endif /* _HDA_EMUL_H_ */ diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c index 559496a9fee0..7099474eaf92 100644 --- a/usr.sbin/bhyve/pci_hostbridge.c +++ b/usr.sbin/bhyve/pci_hostbridge.c @@ -1,72 +1,88 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
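A hedged sketch of how a codec registers against the interface declared in pci_hda.h above, now that the per-codec opts parameter is gone from the init hook; the "mycodec" names are hypothetical:

	static int
	mycodec_init(struct hda_codec_inst *hci, const char *play, const char *rec)
	{
		return (0);	// open the play/rec backends here
	}
	static struct hda_codec_class mycodec = {
		.name = "mycodec",
		.init = mycodec_init,
	};
	HDA_EMUL_SET(mycodec);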
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include + +#include "config.h" #include "pci_emul.h" static int -pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { + const char *value; + u_int vendor, device; + + vendor = 0x1275; /* NetApp */ + device = 0x1275; /* NetApp */ + + value = get_config_value_node(nvl, "vendor"); + if (value != NULL) + vendor = strtol(value, NULL, 0); + value = get_config_value_node(nvl, "device"); + if (value != NULL) + device = strtol(value, NULL, 0); /* config space */ - pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */ - pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */ + pci_set_cfgdata16(pi, PCIR_VENDOR, vendor); + pci_set_cfgdata16(pi, PCIR_DEVICE, device); pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); return (0); } static int -pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_amd_hostbridge_legacy_config(nvlist_t *nvl, const char *opts) { - (void) pci_hostbridge_init(ctx, pi, opts); - pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */ - pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */ + + set_config_value_node(nvl, "vendor", "0x1022"); /* AMD */ + set_config_value_node(nvl, "device", "0x7432"); /* made up */ return (0); } struct pci_devemu pci_de_amd_hostbridge = { .pe_emu = "amd_hostbridge", - .pe_init = pci_amd_hostbridge_init, + .pe_legacy_config = pci_amd_hostbridge_legacy_config, + .pe_alias = "hostbridge", }; PCI_EMUL_SET(pci_de_amd_hostbridge); struct pci_devemu pci_de_hostbridge = { .pe_emu = "hostbridge", .pe_init = pci_hostbridge_init, }; PCI_EMUL_SET(pci_de_hostbridge); diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c index aa5997848f24..8cc536fef112 100644 --- a/usr.sbin/bhyve/pci_lpc.c +++ b/usr.sbin/bhyve/pci_lpc.c @@ -1,510 +1,516 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Neel Natu * Copyright (c) 2013 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
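Since pci_hostbridge_init() above parses both IDs with strtol(value, NULL, 0), hex and decimal strings are accepted, and amd_hostbridge survives only as a legacy shim that seeds two config values before aliasing to the generic hostbridge. A hedged illustration with made-up IDs:

	set_config_value_node(nvl, "vendor", "0x8086");
	set_config_value_node(nvl, "device", "4498");	// same value as 0x1192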
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "acpi.h" #include "debug.h" #include "bootrom.h" +#include "config.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "pctestdev.h" #include "uart_emul.h" #define IO_ICU1 0x20 #define IO_ICU2 0xA0 SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt); SET_DECLARE(lpc_sysres_set, struct lpc_sysres); #define ELCR_PORT 0x4d0 SYSRES_IO(ELCR_PORT, 2); #define IO_TIMER1_PORT 0x40 #define NMISC_PORT 0x61 SYSRES_IO(NMISC_PORT, 1); static struct pci_devinst *lpc_bridge; -static const char *romfile; - #define LPC_UART_NUM 4 static struct lpc_uart_softc { struct uart_softc *uart_softc; - const char *opts; int iobase; int irq; int enabled; } lpc_uart_softc[LPC_UART_NUM]; -static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2", "COM3", "COM4" }; +static const char *lpc_uart_names[LPC_UART_NUM] = { + "com1", "com2", "com3", "com4" +}; -static bool pctestdev_present; +static const char *lpc_uart_acpi_names[LPC_UART_NUM] = { + "COM1", "COM2", "COM3", "COM4" +}; /* * LPC device configuration is in the following form: * [,] * For e.g. "com1,stdio" or "bootrom,/var/romfile" */ int lpc_device_parse(const char *opts) { int unit, error; - char *str, *cpy, *lpcdev; + char *str, *cpy, *lpcdev, *node_name; error = -1; str = cpy = strdup(opts); lpcdev = strsep(&str, ","); if (lpcdev != NULL) { if (strcasecmp(lpcdev, "bootrom") == 0) { - romfile = str; + set_config_value("lpc.bootrom", str); error = 0; goto done; } for (unit = 0; unit < LPC_UART_NUM; unit++) { if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { - lpc_uart_softc[unit].opts = str; + asprintf(&node_name, "lpc.%s.path", + lpc_uart_names[unit]); + set_config_value(node_name, str); + free(node_name); error = 0; goto done; } } if (strcasecmp(lpcdev, pctestdev_getname()) == 0) { - if (pctestdev_present) { - EPRINTLN("More than one %s device conf is " - "specified; only one is allowed.", - pctestdev_getname()); - } else if (pctestdev_parse(str) == 0) { - pctestdev_present = true; - error = 0; - free(cpy); - goto done; - } + asprintf(&node_name, "lpc.%s", pctestdev_getname()); + set_config_bool(node_name, true); + free(node_name); + error = 0; + goto done; } } done: - if (error) - free(cpy); + free(cpy); return (error); } void lpc_print_supported_devices() { size_t i; printf("bootrom\n"); for (i = 0; i < LPC_UART_NUM; i++) printf("%s\n", lpc_uart_names[i]); printf("%s\n", pctestdev_getname()); } const char * lpc_bootrom(void) { - return (romfile); + return (get_config_value("lpc.bootrom")); } static void lpc_uart_intr_assert(void *arg) { struct lpc_uart_softc *sc = arg; assert(sc->irq >= 0); vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq); } static void lpc_uart_intr_deassert(void *arg) { /* * The COM devices on the LPC bus generate edge triggered interrupts, * so nothing more to do here. 
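With this change lpc_device_parse() above is purely a translator from the legacy -l syntax into configuration keys, roughly (taking pctestdev_getname() to return "pc-testdev"):

	-l bootrom,/path/to/rom  ->  set_config_value("lpc.bootrom", "/path/to/rom")
	-l com1,stdio            ->  set_config_value("lpc.com1.path", "stdio")
	-l pc-testdev            ->  set_config_bool("lpc.pc-testdev", true)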
*/ } static int lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { int offset; struct lpc_uart_softc *sc = arg; offset = port - sc->iobase; switch (bytes) { case 1: if (in) *eax = uart_read(sc->uart_softc, offset); else uart_write(sc->uart_softc, offset, *eax); break; case 2: if (in) { *eax = uart_read(sc->uart_softc, offset); *eax |= uart_read(sc->uart_softc, offset + 1) << 8; } else { uart_write(sc->uart_softc, offset, *eax); uart_write(sc->uart_softc, offset + 1, *eax >> 8); } break; default: return (-1); } return (0); } static int lpc_init(struct vmctx *ctx) { struct lpc_uart_softc *sc; struct inout_port iop; - const char *name; + const char *backend, *name, *romfile; + char *node_name; int unit, error; + romfile = get_config_value("lpc.bootrom"); if (romfile != NULL) { error = bootrom_loadrom(ctx, romfile); if (error) return (error); } /* COM1 and COM2 */ for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; name = lpc_uart_names[unit]; if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) { EPRINTLN("Unable to allocate resources for " "LPC device %s", name); return (-1); } pci_irq_reserve(sc->irq); sc->uart_softc = uart_init(lpc_uart_intr_assert, lpc_uart_intr_deassert, sc); - if (uart_set_backend(sc->uart_softc, sc->opts) != 0) { + asprintf(&node_name, "lpc.%s.path", name); + backend = get_config_value(node_name); + free(node_name); + if (uart_set_backend(sc->uart_softc, backend) != 0) { EPRINTLN("Unable to initialize backend '%s' " - "for LPC device %s", sc->opts, name); + "for LPC device %s", backend, name); return (-1); } bzero(&iop, sizeof(struct inout_port)); iop.name = name; iop.port = sc->iobase; iop.size = UART_IO_BAR_SIZE; iop.flags = IOPORT_F_INOUT; iop.handler = lpc_uart_io_handler; iop.arg = sc; error = register_inout(&iop); assert(error == 0); sc->enabled = 1; } /* pc-testdev */ - if (pctestdev_present) { + asprintf(&node_name, "lpc.%s", pctestdev_getname()); + if (get_config_bool_default(node_name, false)) { error = pctestdev_init(ctx); if (error) return (error); } + free(node_name); return (0); } static void pci_lpc_write_dsdt(struct pci_devinst *pi) { struct lpc_dsdt **ldpp, *ldp; dsdt_line(""); dsdt_line("Device (ISA)"); dsdt_line("{"); dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); dsdt_line(" {"); dsdt_line(" Offset (0x60),"); dsdt_line(" PIRA, 8,"); dsdt_line(" PIRB, 8,"); dsdt_line(" PIRC, 8,"); dsdt_line(" PIRD, 8,"); dsdt_line(" Offset (0x68),"); dsdt_line(" PIRE, 8,"); dsdt_line(" PIRF, 8,"); dsdt_line(" PIRG, 8,"); dsdt_line(" PIRH, 8"); dsdt_line(" }"); dsdt_line(""); dsdt_indent(1); SET_FOREACH(ldpp, lpc_dsdt_set) { ldp = *ldpp; ldp->handler(); } dsdt_line(""); dsdt_line("Device (PIC)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_ICU1, 2); dsdt_fixed_ioport(IO_ICU2, 2); dsdt_fixed_irq(2); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_line(""); dsdt_line("Device (TIMR)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_TIMER1_PORT, 4); dsdt_fixed_irq(0); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_unindent(1); dsdt_line("}"); } static void pci_lpc_sysres_dsdt(void) { struct lpc_sysres 
**lspp, *lsp; dsdt_line(""); dsdt_line("Device (SIO)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); SET_FOREACH(lspp, lpc_sysres_set) { lsp = *lspp; switch (lsp->type) { case LPC_SYSRES_IO: dsdt_fixed_ioport(lsp->base, lsp->length); break; case LPC_SYSRES_MEM: dsdt_fixed_mem32(lsp->base, lsp->length); break; } } dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } LPC_DSDT(pci_lpc_sysres_dsdt); static void pci_lpc_uart_dsdt(void) { struct lpc_uart_softc *sc; int unit; for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; if (!sc->enabled) continue; dsdt_line(""); - dsdt_line("Device (%s)", lpc_uart_names[unit]); + dsdt_line("Device (%s)", lpc_uart_acpi_names[unit]); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))"); dsdt_line(" Name (_UID, %d)", unit + 1); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE); dsdt_fixed_irq(sc->irq); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } } LPC_DSDT(pci_lpc_uart_dsdt); static int pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int pirq_pin; if (bytes == 1) { pirq_pin = 0; if (coff >= 0x60 && coff <= 0x63) pirq_pin = coff - 0x60 + 1; if (coff >= 0x68 && coff <= 0x6b) pirq_pin = coff - 0x68 + 5; if (pirq_pin != 0) { pirq_write(ctx, pirq_pin, val); pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); return (0); } } return (-1); } static void pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { } static uint64_t pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { return (0); } #define LPC_DEV 0x7000 #define LPC_VENDOR 0x8086 static int -pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { /* * Do not allow more than one LPC bridge to be configured. */ if (lpc_bridge != NULL) { EPRINTLN("Only one LPC bridge is allowed."); return (-1); } /* * Enforce that the LPC can only be configured on bus 0. This * simplifies the ACPI DSDT because it can provide a decode for * all legacy i/o ports behind bus 0. 
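Worked example of the PIRQ routing registers handled by pci_lpc_cfgwrite() above, which live at config offsets 0x60-0x63 (pins 1-4) and 0x68-0x6b (pins 5-8):

	int coff = 0x69;		// one-byte guest config write
	int pirq_pin = coff - 0x68 + 5;	// == pin 6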
*/ if (pi->pi_bus != 0) { EPRINTLN("LPC bridge can be present only on bus 0."); return (-1); } if (lpc_init(ctx) != 0) return (-1); /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV); pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); lpc_bridge = pi; return (0); } char * lpc_pirq_name(int pin) { char *name; if (lpc_bridge == NULL) return (NULL); asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1); return (name); } void lpc_pirq_routed(void) { int pin; if (lpc_bridge == NULL) return; for (pin = 0; pin < 4; pin++) pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1)); for (pin = 0; pin < 4; pin++) pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); } #ifdef BHYVE_SNAPSHOT static int pci_lpc_snapshot(struct vm_snapshot_meta *meta) { int unit, ret; struct uart_softc *sc; for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = lpc_uart_softc[unit].uart_softc; ret = uart_snapshot(sc, meta); if (ret != 0) goto done; } done: return (ret); } #endif struct pci_devemu pci_de_lpc = { .pe_emu = "lpc", .pe_init = pci_lpc_init, .pe_write_dsdt = pci_lpc_write_dsdt, .pe_cfgwrite = pci_lpc_cfgwrite, .pe_barwrite = pci_lpc_write, .pe_barread = pci_lpc_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_lpc_snapshot, #endif }; PCI_EMUL_SET(pci_de_lpc); diff --git a/usr.sbin/bhyve/pci_nvme.c b/usr.sbin/bhyve/pci_nvme.c index 24f401630d6d..0abc0415a1d8 100644 --- a/usr.sbin/bhyve/pci_nvme.c +++ b/usr.sbin/bhyve/pci_nvme.c @@ -1,2819 +1,2811 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017 Shunsuke Mie * Copyright (c) 2018 Leon Dang * Copyright (c) 2020 Chuck Tuffli * * Function crc16 Copyright (c) 2017, Fedor Uporov * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * bhyve PCIe-NVMe device emulation. 
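A typical legacy invocation of this device, using the options documented just below; the image path and serial number are illustrative:

	-s 4,nvme,/path/to/disk.img,maxq=4,qsz=64,ser=NVME0001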
* * options: * -s ,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm= * * accepted devpath: * /dev/blockdev * /path/to/image * ram=size_in_MiB * * maxq = max number of queues * qsz = max elements in each queue * ioslots = max number of concurrent io requests * sectsz = sector size (defaults to blockif sector size) * ser = serial number (20-chars max) * eui64 = IEEE Extended Unique Identifier (8 byte value) * dsm = DataSet Management support. Option is one of auto, enable,disable * */ /* TODO: - create async event for smart and log - intr coalesce */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "block_if.h" +#include "config.h" #include "debug.h" #include "pci_emul.h" static int nvme_debug = 0; #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) /* defaults; can be overridden */ #define NVME_MSIX_BAR 4 #define NVME_IOSLOTS 8 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ #define NVME_MMIO_SPACE_MIN (1 << 14) #define NVME_QUEUES 16 #define NVME_MAX_QENTRIES 2048 /* Memory Page size Minimum reported in CAP register */ #define NVME_MPSMIN 0 /* MPSMIN converted to bytes */ #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) #define NVME_MDTS 9 /* Note the + 1 allows for the initial descriptor to not be page aligned */ #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) /* This is a synthetic status code to indicate there is no status */ #define NVME_NO_STATUS 0xffff #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) /* helpers */ /* Convert a zero-based value into a one-based value */ #define ONE_BASED(zero) ((zero) + 1) /* Convert a one-based value into a zero-based value */ #define ZERO_BASED(one) ((one) - 1) /* Encode number of SQ's and CQ's for Set/Get Features */ #define NVME_FEATURE_NUM_QUEUES(sc) \ (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) enum nvme_controller_register_offsets { NVME_CR_CAP_LOW = 0x00, NVME_CR_CAP_HI = 0x04, NVME_CR_VS = 0x08, NVME_CR_INTMS = 0x0c, NVME_CR_INTMC = 0x10, NVME_CR_CC = 0x14, NVME_CR_CSTS = 0x1c, NVME_CR_NSSR = 0x20, NVME_CR_AQA = 0x24, NVME_CR_ASQ_LOW = 0x28, NVME_CR_ASQ_HI = 0x2c, NVME_CR_ACQ_LOW = 0x30, NVME_CR_ACQ_HI = 0x34, }; enum nvme_cmd_cdw11 { NVME_CMD_CDW11_PC = 0x0001, NVME_CMD_CDW11_IEN = 0x0002, NVME_CMD_CDW11_IV = 0xFFFF0000, }; enum nvme_copy_dir { NVME_COPY_TO_PRP, NVME_COPY_FROM_PRP, }; #define NVME_CQ_INTEN 0x01 #define NVME_CQ_INTCOAL 0x02 struct nvme_completion_queue { struct nvme_completion *qbase; pthread_mutex_t mtx; uint32_t size; uint16_t tail; /* nvme progress */ uint16_t head; /* guest progress */ uint16_t intr_vec; uint32_t intr_en; }; struct nvme_submission_queue { struct nvme_command *qbase; pthread_mutex_t mtx; uint32_t size; uint16_t head; /* nvme progress */ uint16_t tail; /* guest progress */ uint16_t cqid; /* completion queue id */ int qpriority; }; enum nvme_storage_type { NVME_STOR_BLOCKIF = 0, NVME_STOR_RAM = 1, }; struct pci_nvme_blockstore { enum nvme_storage_type type; void *ctx; uint64_t size; uint32_t sectsz; uint32_t sectsz_bits; uint64_t eui64; uint32_t deallocate:1; }; /* * Calculate the number of additional page 
descriptors for guest IO requests * based on the advertised Max Data Transfer (MDTS) and given the number of * default iovec's in a struct blockif_req. * * Note the + 1 allows for the initial descriptor to not be page aligned. */ #define MDTS_PAD_SIZE \ NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 0 struct pci_nvme_ioreq { struct pci_nvme_softc *sc; STAILQ_ENTRY(pci_nvme_ioreq) link; struct nvme_submission_queue *nvme_sq; uint16_t sqid; /* command information */ uint16_t opc; uint16_t cid; uint32_t nsid; uint64_t prev_gpaddr; size_t prev_size; size_t bytes; struct blockif_req io_req; struct iovec iovpadding[MDTS_PAD_SIZE]; }; enum nvme_dsm_type { /* Dataset Management bit in ONCS reflects backing storage capability */ NVME_DATASET_MANAGEMENT_AUTO, /* Unconditionally set Dataset Management bit in ONCS */ NVME_DATASET_MANAGEMENT_ENABLE, /* Unconditionally clear Dataset Management bit in ONCS */ NVME_DATASET_MANAGEMENT_DISABLE, }; struct pci_nvme_softc; struct nvme_feature_obj; typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); struct nvme_feature_obj { uint32_t cdw11; nvme_feature_cb set; nvme_feature_cb get; bool namespace_specific; }; #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) struct pci_nvme_aer { STAILQ_ENTRY(pci_nvme_aer) link; uint16_t cid; /* Command ID of the submitted AER */ }; struct pci_nvme_softc { struct pci_devinst *nsc_pi; pthread_mutex_t mtx; struct nvme_registers regs; struct nvme_namespace_data nsdata; struct nvme_controller_data ctrldata; struct nvme_error_information_entry err_log; struct nvme_health_information_page health_log; struct nvme_firmware_page fw_log; struct pci_nvme_blockstore nvstore; uint16_t max_qentries; /* max entries per queue */ uint32_t max_queues; /* max number of IO SQ's or CQ's */ uint32_t num_cqueues; uint32_t num_squeues; bool num_q_is_set; /* Has host set Number of Queues */ struct pci_nvme_ioreq *ioreqs; STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ uint32_t pending_ios; uint32_t ioslots; sem_t iosemlock; /* * Memory mapped Submission and Completion queues * Each array includes both Admin and IO queues */ struct nvme_completion_queue *compl_queues; struct nvme_submission_queue *submit_queues; struct nvme_feature_obj feat[NVME_FID_MAX]; enum nvme_dsm_type dataset_management; /* Accounting for SMART data */ __uint128_t read_data_units; __uint128_t write_data_units; __uint128_t read_commands; __uint128_t write_commands; uint32_t read_dunits_remainder; uint32_t write_dunits_remainder; STAILQ_HEAD(, pci_nvme_aer) aer_list; uint32_t aer_count; }; static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); static void pci_nvme_io_done(struct blockif_req *, int); /* Controller Configuration utils */ #define NVME_CC_GET_EN(cc) \ ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) #define NVME_CC_GET_CSS(cc) \ ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) #define NVME_CC_GET_SHN(cc) \ ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) #define NVME_CC_GET_IOSQES(cc) \ ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) #define NVME_CC_GET_IOCQES(cc) \ ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) #define NVME_CC_WRITE_MASK \ ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ (NVME_CC_REG_IOCQES_MASK 
<< NVME_CC_REG_IOCQES_SHIFT)) #define NVME_CC_NEN_WRITE_MASK \ ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) /* Controller Status utils */ #define NVME_CSTS_GET_RDY(sts) \ ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) /* Completion Queue status word utils */ #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) #define NVME_STATUS_MASK \ ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ NVME_CTRLR_DATA_ONCS_DSM_SHIFT) static void nvme_feature_invalid_cb(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_num_queues(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_iv_config(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static __inline void cpywithpad(char *dst, size_t dst_size, const char *src, char pad) { size_t len; len = strnlen(src, dst_size); memset(dst, pad, dst_size); memcpy(dst, src, len); } static __inline void pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) { *status &= ~NVME_STATUS_MASK; *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; } static __inline void pci_nvme_status_genc(uint16_t *status, uint16_t code) { pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); } /* * Initialize the requested number or IO Submission and Completion Queues. * Admin queues are allocated implicitly. 
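For example, a guest Set Features (Number of Queues) request for more than NVME_QUEUES (16) I/O submission queues is clamped to 16 below, and the + 1 in each calloc() reserves slot 0 for the admin queue.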
*/ static void pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) { uint32_t i; /* * Allocate and initialize the Submission Queues */ if (nsq > NVME_QUEUES) { WPRINTF("%s: clamping number of SQ from %u to %u", __func__, nsq, NVME_QUEUES); nsq = NVME_QUEUES; } sc->num_squeues = nsq; sc->submit_queues = calloc(sc->num_squeues + 1, sizeof(struct nvme_submission_queue)); if (sc->submit_queues == NULL) { WPRINTF("%s: SQ allocation failed", __func__); sc->num_squeues = 0; } else { struct nvme_submission_queue *sq = sc->submit_queues; for (i = 0; i < sc->num_squeues; i++) pthread_mutex_init(&sq[i].mtx, NULL); } /* * Allocate and initialize the Completion Queues */ if (ncq > NVME_QUEUES) { WPRINTF("%s: clamping number of CQ from %u to %u", __func__, ncq, NVME_QUEUES); ncq = NVME_QUEUES; } sc->num_cqueues = ncq; sc->compl_queues = calloc(sc->num_cqueues + 1, sizeof(struct nvme_completion_queue)); if (sc->compl_queues == NULL) { WPRINTF("%s: CQ allocation failed", __func__); sc->num_cqueues = 0; } else { struct nvme_completion_queue *cq = sc->compl_queues; for (i = 0; i < sc->num_cqueues; i++) pthread_mutex_init(&cq[i].mtx, NULL); } } static void pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; cd->vid = 0xFB5D; cd->ssvid = 0x0000; cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); /* Num of submission commands that we can handle at a time (2^rab) */ cd->rab = 4; /* FreeBSD OUI */ cd->ieee[0] = 0x58; cd->ieee[1] = 0x9c; cd->ieee[2] = 0xfc; cd->mic = 0; cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ cd->ver = 0x00010300; cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; cd->acl = 2; cd->aerl = 4; /* Advertise 1, Read-only firmware slot */ cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK | (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); cd->lpa = 0; /* TODO: support some simple things like SMART */ cd->elpe = 0; /* max error log page entries */ cd->npss = 1; /* number of power states support */ /* Warning Composite Temperature Threshold */ cd->wctemp = 0x0157; cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); cd->nn = 1; /* number of namespaces */ cd->oncs = 0; switch (sc->dataset_management) { case NVME_DATASET_MANAGEMENT_AUTO: if (sc->nvstore.deallocate) cd->oncs |= NVME_ONCS_DSM; break; case NVME_DATASET_MANAGEMENT_ENABLE: cd->oncs |= NVME_ONCS_DSM; break; default: break; } cd->fna = 0x03; cd->power_state[0].mp = 10; } /* * Calculate the CRC-16 of the given buffer * See copyright attribution at top of file */ static uint16_t crc16(uint16_t crc, const void *buffer, unsigned int len) { const unsigned char *cp = buffer; /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). 
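In this file crc16() has a single caller: pci_nvme_init_nsdata() below uses it to seed a default EUI-64, where the checksum of "<vmname><bus><slot><func>" is OR'd into OUI_FREEBSD_NVME_LOW, shifted left 16 bits, and combined with the NSID.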
*/ static uint16_t const crc16_table[256] = { 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 }; while (len--) crc = (((crc >> 8) & 0xffU) ^ crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; return crc; } static void pci_nvme_init_nsdata(struct pci_nvme_softc *sc, struct nvme_namespace_data *nd, uint32_t nsid, struct pci_nvme_blockstore *nvstore) { /* Get capacity and block size information from backing store */ nd->nsze = nvstore->size / nvstore->sectsz; nd->ncap = nd->nsze; nd->nuse = nd->nsze; if (nvstore->type == NVME_STOR_BLOCKIF) nvstore->deallocate = blockif_candelete(nvstore->ctx); nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 
1 LBA Format) */ nd->flbas = 0; /* Create an EUI-64 if user did not provide one */ if (nvstore->eui64 == 0) { char *data = NULL; uint64_t eui64 = nvstore->eui64; - asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus, - sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + asprintf(&data, "%s%u%u%u", get_config_value("name"), + sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, + sc->nsc_pi->pi_func); if (data != NULL) { eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); free(data); } nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); } be64enc(nd->eui64, nvstore->eui64); /* LBA data-sz = 2^lbads */ nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; } static void pci_nvme_init_logpages(struct pci_nvme_softc *sc) { memset(&sc->err_log, 0, sizeof(sc->err_log)); memset(&sc->health_log, 0, sizeof(sc->health_log)); memset(&sc->fw_log, 0, sizeof(sc->fw_log)); /* Set read/write remainder to round up according to spec */ sc->read_dunits_remainder = 999; sc->write_dunits_remainder = 999; /* Set nominal Health values checked by implementations */ sc->health_log.temperature = 310; sc->health_log.available_spare = 100; sc->health_log.available_spare_threshold = 10; } static void pci_nvme_init_features(struct pci_nvme_softc *sc) { sc->feat[0].set = nvme_feature_invalid_cb; sc->feat[0].get = nvme_feature_invalid_cb; sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true; sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true; sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues; sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set = nvme_feature_iv_config; sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get = nvme_feature_invalid_cb; sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get = nvme_feature_invalid_cb; } static void pci_nvme_aer_init(struct pci_nvme_softc *sc) { STAILQ_INIT(&sc->aer_list); sc->aer_count = 0; } static void pci_nvme_aer_destroy(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer = NULL; while (!STAILQ_EMPTY(&sc->aer_list)) { aer = STAILQ_FIRST(&sc->aer_list); STAILQ_REMOVE_HEAD(&sc->aer_list, link); free(aer); } pci_nvme_aer_init(sc); } static bool pci_nvme_aer_available(struct pci_nvme_softc *sc) { return (!STAILQ_EMPTY(&sc->aer_list)); } static bool pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; /* AERL is a zero based value while aer_count is one's based */ return (sc->aer_count == (cd->aerl + 1)); } /* * Add an Async Event Request * * Stores an AER to be returned later if the Controller needs to notify the * host of an event. * Note that while the NVMe spec doesn't require Controllers to return AER's * in order, this implementation does preserve the order. */ static int pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) { struct pci_nvme_aer *aer = NULL; if (pci_nvme_aer_limit_reached(sc)) return (-1); aer = calloc(1, sizeof(struct pci_nvme_aer)); if (aer == NULL) return (-1); sc->aer_count++; /* Save the Command ID for use in the completion message */ aer->cid = cid; STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); return (0); } /* * Get an Async Event Request structure * * Returns a pointer to an AER previously submitted by the host or NULL if * no AER's exist. Caller is responsible for freeing the returned struct. 
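Worked example of the limit check above: pci_nvme_init_ctrldata() sets aerl = 4 and AERL is zero-based, so pci_nvme_aer_limit_reached() permits at most 4 + 1 = 5 outstanding Asynchronous Event Requests.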
*/ static struct pci_nvme_aer * pci_nvme_aer_get(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer = NULL; aer = STAILQ_FIRST(&sc->aer_list); if (aer != NULL) { STAILQ_REMOVE_HEAD(&sc->aer_list, link); sc->aer_count--; } return (aer); } static void pci_nvme_reset_locked(struct pci_nvme_softc *sc) { uint32_t i; DPRINTF("%s", __func__); sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | (1 << NVME_CAP_LO_REG_CQR_SHIFT) | (60 << NVME_CAP_LO_REG_TO_SHIFT); sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; sc->regs.vs = 0x00010300; /* NVMe v1.3 */ sc->regs.cc = 0; sc->regs.csts = 0; assert(sc->submit_queues != NULL); for (i = 0; i < sc->num_squeues + 1; i++) { sc->submit_queues[i].qbase = NULL; sc->submit_queues[i].size = 0; sc->submit_queues[i].cqid = 0; sc->submit_queues[i].tail = 0; sc->submit_queues[i].head = 0; } assert(sc->compl_queues != NULL); for (i = 0; i < sc->num_cqueues + 1; i++) { sc->compl_queues[i].qbase = NULL; sc->compl_queues[i].size = 0; sc->compl_queues[i].tail = 0; sc->compl_queues[i].head = 0; } sc->num_q_is_set = false; pci_nvme_aer_destroy(sc); } static void pci_nvme_reset(struct pci_nvme_softc *sc) { pthread_mutex_lock(&sc->mtx); pci_nvme_reset_locked(sc); pthread_mutex_unlock(&sc->mtx); } static void pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) { uint16_t acqs, asqs; DPRINTF("%s", __func__); asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; sc->submit_queues[0].size = asqs; sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, sizeof(struct nvme_command) * asqs); DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", __func__, sc->regs.asq, sc->submit_queues[0].qbase); acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & NVME_AQA_REG_ACQS_MASK) + 1; sc->compl_queues[0].size = acqs; sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, sizeof(struct nvme_completion) * acqs); sc->compl_queues[0].intr_en = NVME_CQ_INTEN; DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", __func__, sc->regs.acq, sc->compl_queues[0].qbase); } static int nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, size_t len, enum nvme_copy_dir dir) { uint8_t *p; size_t bytes; if (len > (8 * 1024)) { return (-1); } /* Copy from the start of prp1 to the end of the physical page */ bytes = PAGE_SIZE - (prp1 & PAGE_MASK); bytes = MIN(bytes, len); p = vm_map_gpa(ctx, prp1, bytes); if (p == NULL) { return (-1); } if (dir == NVME_COPY_TO_PRP) memcpy(p, b, bytes); else memcpy(b, p, bytes); b += bytes; len -= bytes; if (len == 0) { return (0); } len = MIN(len, PAGE_SIZE); p = vm_map_gpa(ctx, prp2, len); if (p == NULL) { return (-1); } if (dir == NVME_COPY_TO_PRP) memcpy(p, b, len); else memcpy(b, p, len); return (0); } /* * Write a Completion Queue Entry update * * Write the completion and update the doorbell value */ static void pci_nvme_cq_update(struct pci_nvme_softc *sc, struct nvme_completion_queue *cq, uint32_t cdw0, uint16_t cid, uint16_t sqid, uint16_t status) { struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; struct nvme_completion *cqe; assert(cq->qbase != NULL); pthread_mutex_lock(&cq->mtx); cqe = &cq->qbase[cq->tail]; /* Flip the phase bit */ status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; cqe->cdw0 = cdw0; cqe->sqhd = sq->head; cqe->sqid = sqid; cqe->cid = cid; cqe->status = status; cq->tail++; if (cq->tail >= cq->size) { cq->tail = 0; } pthread_mutex_unlock(&cq->mtx); } static int nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct 
nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); if (qid == 0 || qid > sc->num_squeues || (sc->submit_queues[qid].qbase == NULL)) { WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", __func__, qid, sc->num_squeues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } sc->submit_queues[qid].qbase = NULL; sc->submit_queues[qid].cqid = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { if (command->cdw11 & NVME_CMD_CDW11_PC) { uint16_t qid = command->cdw10 & 0xffff; struct nvme_submission_queue *nsq; if ((qid == 0) || (qid > sc->num_squeues) || (sc->submit_queues[qid].qbase != NULL)) { WPRINTF("%s queue index %u > num_squeues %u", __func__, qid, sc->num_squeues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } nsq = &sc->submit_queues[qid]; nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { /* * Queues must specify at least two entries * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec */ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); return (1); } nsq->head = nsq->tail = 0; nsq->cqid = (command->cdw11 >> 16) & 0xffff; if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } if (sc->compl_queues[nsq->cqid].qbase == NULL) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_COMPLETION_QUEUE_INVALID); return (1); } nsq->qpriority = (command->cdw11 >> 1) & 0x03; nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)nsq->size); DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, qid, nsq->size, nsq->qbase, nsq->cqid); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); DPRINTF("%s completed creating IOSQ qid %u", __func__, qid); } else { /* * Guest sent non-cont submission queue request. * This setting is unsupported by this emulation. 
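For reference, a well-formed Create I/O Submission Queue command as parsed above: cdw10 = 0x003F0001 selects QID 1 with a zero-based size field of 0x3F (64 entries after ONE_BASED()), and cdw11 = 0x00010001 sets the PC bit and binds the queue to CQID 1.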
*/ WPRINTF("%s unsupported non-contig (list-based) " "create i/o submission queue", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } return (1); } static int nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; uint16_t sqid; DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); if (qid == 0 || qid > sc->num_cqueues || (sc->compl_queues[qid].qbase == NULL)) { WPRINTF("%s queue index %u / num_cqueues %u", __func__, qid, sc->num_cqueues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } /* Deleting an Active CQ is an error */ for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) if (sc->submit_queues[sqid].cqid == qid) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_DELETION); return (1); } sc->compl_queues[qid].qbase = NULL; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { struct nvme_completion_queue *ncq; uint16_t qid = command->cdw10 & 0xffff; /* Only support Physically Contiguous queues */ if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { WPRINTF("%s unsupported non-contig (list-based) " "create i/o completion queue", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if ((qid == 0) || (qid > sc->num_cqueues) || (sc->compl_queues[qid].qbase != NULL)) { WPRINTF("%s queue index %u > num_cqueues %u", __func__, qid, sc->num_cqueues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } ncq = &sc->compl_queues[qid]; ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; if (ncq->intr_vec > (sc->max_queues + 1)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_INTERRUPT_VECTOR); return (1); } ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { /* * Queues must specify at least two entries * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec */ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); return (1); } ncq->head = ncq->tail = 0; ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)ncq->size); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint32_t logsize; uint8_t logpage = command->cdw10 & 0xFF; DPRINTF("%s log page %u len %u", __func__, logpage, logsize); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); /* * Command specifies the number of dwords to return in fields NUMDU * and NUMDL. This is a zero-based value. 
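Worked example: cdw10 = 0x003F0002 (log page 0x02, NUMDL = 0x3F) with cdw11 = 0 requests (0x3F + 1) = 64 dwords, i.e. 256 bytes, and the copy below is capped at MIN(logsize, sizeof(sc->health_log)).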
*/ logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; logsize *= sizeof(uint32_t); switch (logpage) { case NVME_LOG_ERROR: nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->err_log, MIN(logsize, sizeof(sc->err_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_HEALTH_INFORMATION: pthread_mutex_lock(&sc->mtx); memcpy(&sc->health_log.data_units_read, &sc->read_data_units, sizeof(sc->health_log.data_units_read)); memcpy(&sc->health_log.data_units_written, &sc->write_data_units, sizeof(sc->health_log.data_units_written)); memcpy(&sc->health_log.host_read_commands, &sc->read_commands, sizeof(sc->health_log.host_read_commands)); memcpy(&sc->health_log.host_write_commands, &sc->write_commands, sizeof(sc->health_log.host_write_commands)); pthread_mutex_unlock(&sc->mtx); nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->health_log, MIN(logsize, sizeof(sc->health_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_FIRMWARE_SLOT: nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->fw_log, MIN(logsize, sizeof(sc->fw_log)), NVME_COPY_TO_PRP); break; default: DPRINTF("%s get log page %x command not supported", __func__, logpage); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_LOG_PAGE); } return (1); } static int nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { void *dest; uint16_t status; DPRINTF("%s identify 0x%x nsid 0x%x", __func__, command->cdw10 & 0xFF, command->nsid); pci_nvme_status_genc(&status, NVME_SC_SUCCESS); switch (command->cdw10 & 0xFF) { case 0x00: /* return Identify Namespace data structure */ nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), NVME_COPY_TO_PRP); break; case 0x01: /* return Identify Controller data structure */ nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->ctrldata, sizeof(sc->ctrldata), NVME_COPY_TO_PRP); break; case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); /* All unused entries shall be zero */ bzero(dest, sizeof(uint32_t) * 1024); ((uint32_t *)dest)[0] = 1; break; case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ if (command->nsid != 1) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); break; } dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); /* All bytes after the descriptor shall be zero */ bzero(dest, sizeof(uint32_t) * 1024); /* Return NIDT=1 (i.e. 
EUI64) descriptor */ ((uint8_t *)dest)[0] = 1; ((uint8_t *)dest)[1] = sizeof(uint64_t); bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); break; default: DPRINTF("%s unsupported identify command requested 0x%x", __func__, command->cdw10 & 0xFF); pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); break; } compl->status = status; return (1); } static const char * nvme_fid_to_name(uint8_t fid) { const char *name; switch (fid) { case NVME_FEAT_ARBITRATION: name = "Arbitration"; break; case NVME_FEAT_POWER_MANAGEMENT: name = "Power Management"; break; case NVME_FEAT_LBA_RANGE_TYPE: name = "LBA Range Type"; break; case NVME_FEAT_TEMPERATURE_THRESHOLD: name = "Temperature Threshold"; break; case NVME_FEAT_ERROR_RECOVERY: name = "Error Recovery"; break; case NVME_FEAT_VOLATILE_WRITE_CACHE: name = "Volatile Write Cache"; break; case NVME_FEAT_NUMBER_OF_QUEUES: name = "Number of Queues"; break; case NVME_FEAT_INTERRUPT_COALESCING: name = "Interrupt Coalescing"; break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: name = "Interrupt Vector Configuration"; break; case NVME_FEAT_WRITE_ATOMICITY: name = "Write Atomicity Normal"; break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: name = "Asynchronous Event Configuration"; break; case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: name = "Autonomous Power State Transition"; break; case NVME_FEAT_HOST_MEMORY_BUFFER: name = "Host Memory Buffer"; break; case NVME_FEAT_TIMESTAMP: name = "Timestamp"; break; case NVME_FEAT_KEEP_ALIVE_TIMER: name = "Keep Alive Timer"; break; case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: name = "Host Controlled Thermal Management"; break; case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: name = "Non-Operation Power State Config"; break; case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: name = "Read Recovery Level Config"; break; case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: name = "Predictable Latency Mode Config"; break; case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: name = "Predictable Latency Mode Window"; break; case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: name = "LBA Status Information Report Interval"; break; case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: name = "Host Behavior Support"; break; case NVME_FEAT_SANITIZE_CONFIG: name = "Sanitize Config"; break; case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: name = "Endurance Group Event Configuration"; break; case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: name = "Software Progress Marker"; break; case NVME_FEAT_HOST_IDENTIFIER: name = "Host Identifier"; break; case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: name = "Reservation Notification Mask"; break; case NVME_FEAT_RESERVATION_PERSISTENCE: name = "Reservation Persistence"; break; case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: name = "Namespace Write Protection Config"; break; default: name = "Unknown"; break; } return (name); } static void nvme_feature_invalid_cb(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat, struct nvme_command *command, struct nvme_completion *compl) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } static void nvme_feature_iv_config(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat, struct nvme_command *command, struct nvme_completion *compl) { uint32_t i; uint32_t cdw11 = command->cdw11; uint16_t iv; bool cd; pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); iv = cdw11 & 0xffff; cd = cdw11 & (1 << 16); if (iv > (sc->max_queues + 1)) { return; } /* No Interrupt Coalescing (i.e. 
not Coalescing Disable) for Admin Q */ if ((iv == 0) && !cd) return; /* Requested Interrupt Vector must be used by a CQ */ for (i = 0; i < sc->num_cqueues + 1; i++) { if (sc->compl_queues[i].intr_vec == iv) { pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); } } } static void nvme_feature_num_queues(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat, struct nvme_command *command, struct nvme_completion *compl) { uint16_t nqr; /* Number of Queues Requested */ if (sc->num_q_is_set) { WPRINTF("%s: Number of Queues already set", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_COMMAND_SEQUENCE_ERROR); return; } nqr = command->cdw11 & 0xFFFF; if (nqr == 0xffff) { WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } sc->num_squeues = ONE_BASED(nqr); if (sc->num_squeues > sc->max_queues) { DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, sc->max_queues); sc->num_squeues = sc->max_queues; } nqr = (command->cdw11 >> 16) & 0xFFFF; if (nqr == 0xffff) { WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } sc->num_cqueues = ONE_BASED(nqr); if (sc->num_cqueues > sc->max_queues) { DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, sc->max_queues); sc->num_cqueues = sc->max_queues; } /* Patch the command value which will be saved on callback's return */ command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); sc->num_q_is_set = true; } static int nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, struct nvme_completion *compl) { struct nvme_feature_obj *feat; uint32_t nsid = command->nsid; uint8_t fid = command->cdw10 & 0xFF; DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); if (fid >= NVME_FID_MAX) { DPRINTF("%s invalid feature 0x%x", __func__, fid); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } feat = &sc->feat[fid]; if (!feat->namespace_specific && !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_NS_SPECIFIC); return (1); } compl->cdw0 = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); if (feat->set) feat->set(sc, feat, command, compl); if (compl->status == NVME_SC_SUCCESS) feat->cdw11 = command->cdw11; return (0); } static int nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { struct nvme_feature_obj *feat; uint8_t fid = command->cdw10 & 0xFF; DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); if (fid >= NVME_FID_MAX) { DPRINTF("%s invalid feature 0x%x", __func__, fid); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } compl->cdw0 = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); feat = &sc->feat[fid]; if (feat->get) { feat->get(sc, feat, command, compl); } if (compl->status == NVME_SC_SUCCESS) { compl->cdw0 = feat->cdw11; } return (0); } static int nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint8_t ses, lbaf, pi; /* Only supports Secure Erase Setting - User Data Erase */ ses = (command->cdw10 >> 9) & 0x7; if (ses > 0x1) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } /* Only supports a single LBA Format */ lbaf = command->cdw10 & 0xf; if (lbaf != 0) { pci_nvme_status_tc(&compl->status, 
NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_FORMAT); return (1); } /* Doesn't support Protection Information */ pi = (command->cdw10 >> 5) & 0x7; if (pi != 0) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (sc->nvstore.type == NVME_STOR_RAM) { if (sc->nvstore.ctx) free(sc->nvstore.ctx); sc->nvstore.ctx = calloc(1, sc->nvstore.size); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); } else { struct pci_nvme_ioreq *req; int err; req = pci_nvme_get_ioreq(sc); if (req == NULL) { pci_nvme_status_genc(&compl->status, NVME_SC_INTERNAL_DEVICE_ERROR); WPRINTF("%s: unable to allocate IO req", __func__); return (1); } req->nvme_sq = &sc->submit_queues[0]; req->sqid = 0; req->opc = command->opc; req->cid = command->cid; req->nsid = command->nsid; req->io_req.br_offset = 0; req->io_req.br_resid = sc->nvstore.size; req->io_req.br_callback = pci_nvme_io_done; err = blockif_delete(sc->nvstore.ctx, &req->io_req); if (err) { pci_nvme_status_genc(&compl->status, NVME_SC_INTERNAL_DEVICE_ERROR); pci_nvme_release_ioreq(sc, req); } } return (1); } static int nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { DPRINTF("%s submission queue %u, command ID 0x%x", __func__, command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); /* TODO: search for the command ID and abort it */ compl->cdw0 = 1; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_async_event_req(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { DPRINTF("%s async event request 0x%x", __func__, command->cdw11); /* Don't exceed the Async Event Request Limit (AERL). */ if (pci_nvme_aer_limit_reached(sc)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); return (1); } if (pci_nvme_aer_add(sc, command->cid)) { pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, NVME_SC_INTERNAL_DEVICE_ERROR); return (1); } /* * Raise events when they happen based on the Set Features cmd. * These events happen asynchronously, so a successful completion * is only posted once an event matching the request has occurred.
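* Returning NVME_NO_STATUS below leaves the request pending: no completion queue entry is posted until such an event fires.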
*/ compl->status = NVME_NO_STATUS; return (0); } static void pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) { struct nvme_completion compl; struct nvme_command *cmd; struct nvme_submission_queue *sq; struct nvme_completion_queue *cq; uint16_t sqhead; DPRINTF("%s index %u", __func__, (uint32_t)value); sq = &sc->submit_queues[0]; cq = &sc->compl_queues[0]; pthread_mutex_lock(&sq->mtx); sqhead = sq->head; DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); while (sqhead != atomic_load_acq_short(&sq->tail)) { cmd = &(sq->qbase)[sqhead]; compl.cdw0 = 0; compl.status = 0; switch (cmd->opc) { case NVME_OPC_DELETE_IO_SQ: DPRINTF("%s command DELETE_IO_SQ", __func__); nvme_opc_delete_io_sq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_SQ: DPRINTF("%s command CREATE_IO_SQ", __func__); nvme_opc_create_io_sq(sc, cmd, &compl); break; case NVME_OPC_DELETE_IO_CQ: DPRINTF("%s command DELETE_IO_CQ", __func__); nvme_opc_delete_io_cq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_CQ: DPRINTF("%s command CREATE_IO_CQ", __func__); nvme_opc_create_io_cq(sc, cmd, &compl); break; case NVME_OPC_GET_LOG_PAGE: DPRINTF("%s command GET_LOG_PAGE", __func__); nvme_opc_get_log_page(sc, cmd, &compl); break; case NVME_OPC_IDENTIFY: DPRINTF("%s command IDENTIFY", __func__); nvme_opc_identify(sc, cmd, &compl); break; case NVME_OPC_ABORT: DPRINTF("%s command ABORT", __func__); nvme_opc_abort(sc, cmd, &compl); break; case NVME_OPC_SET_FEATURES: DPRINTF("%s command SET_FEATURES", __func__); nvme_opc_set_features(sc, cmd, &compl); break; case NVME_OPC_GET_FEATURES: DPRINTF("%s command GET_FEATURES", __func__); nvme_opc_get_features(sc, cmd, &compl); break; case NVME_OPC_FIRMWARE_ACTIVATE: DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); pci_nvme_status_tc(&compl.status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_FIRMWARE_SLOT); break; case NVME_OPC_ASYNC_EVENT_REQUEST: DPRINTF("%s command ASYNC_EVENT_REQ", __func__); nvme_opc_async_event_req(sc, cmd, &compl); break; case NVME_OPC_FORMAT_NVM: DPRINTF("%s command FORMAT_NVM", __func__); if ((sc->ctrldata.oacs & (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); break; /* the error completion is posted below */ } compl.status = NVME_NO_STATUS; nvme_opc_format_nvm(sc, cmd, &compl); break; default: DPRINTF("0x%x command is not implemented", cmd->opc); pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); } sqhead = (sqhead + 1) % sq->size; if (NVME_COMPLETION_VALID(compl)) { pci_nvme_cq_update(sc, &sc->compl_queues[0], compl.cdw0, cmd->cid, 0, /* SQID */ compl.status); } } DPRINTF("setting sqhead %u", sqhead); sq->head = sqhead; if (cq->head != cq->tail) pci_generate_msix(sc->nsc_pi, 0); pthread_mutex_unlock(&sq->mtx); } /* * Update the Write and Read statistics reported in SMART data * * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up. * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
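* For example, a 1 MiB write adds 2,048 blocks: 999 + 2,048 = 3,047, so the update loop below credits 3 data units and leaves a remainder of 47, matching the 2,001 - 3,000 bucket above.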
*/ static void pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, size_t bytes, uint16_t status) { pthread_mutex_lock(&sc->mtx); switch (opc) { case NVME_OPC_WRITE: sc->write_commands++; if (status != NVME_SC_SUCCESS) break; sc->write_dunits_remainder += (bytes / 512); while (sc->write_dunits_remainder >= 1000) { sc->write_data_units++; sc->write_dunits_remainder -= 1000; } break; case NVME_OPC_READ: sc->read_commands++; if (status != NVME_SC_SUCCESS) break; sc->read_dunits_remainder += (bytes / 512); while (sc->read_dunits_remainder >= 1000) { sc->read_data_units++; sc->read_dunits_remainder -= 1000; } break; default: DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); break; } pthread_mutex_unlock(&sc->mtx); } /* * Check if the combination of Starting LBA (slba) and Number of Logical * Blocks (nlb) exceeds the range of the underlying storage. * * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores * the capacity in bytes as a uint64_t, care must be taken to avoid integer * overflow. */ static bool pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, uint32_t nlb) { size_t offset, bytes; /* Overflow check of multiplying Starting LBA by the sector size */ if (slba >> (64 - nvstore->sectsz_bits)) return (true); offset = slba << nvstore->sectsz_bits; bytes = nlb << nvstore->sectsz_bits; /* Overflow check of Number of Logical Blocks */ if ((nvstore->size - offset) < bytes) return (true); return (false); } static int pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, int do_write, uint64_t lba) { int iovidx; if (req == NULL) return (-1); if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { return (-1); } /* concatenate contig block-iovs to minimize number of iovs */ if ((req->prev_gpaddr + req->prev_size) == gpaddr) { iovidx = req->io_req.br_iovcnt - 1; req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, req->prev_gpaddr, size); req->prev_size += size; req->io_req.br_resid += size; req->io_req.br_iov[iovidx].iov_len = req->prev_size; } else { iovidx = req->io_req.br_iovcnt; if (iovidx == 0) { req->io_req.br_offset = lba; req->io_req.br_resid = 0; req->io_req.br_param = req; } req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, gpaddr, size); req->io_req.br_iov[iovidx].iov_len = size; req->prev_gpaddr = gpaddr; req->prev_size = size; req->io_req.br_resid += size; req->io_req.br_iovcnt++; } return (0); } static void pci_nvme_set_completion(struct pci_nvme_softc *sc, struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint32_t cdw0, uint16_t status) { struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), NVME_STATUS_GET_SC(status)); pci_nvme_cq_update(sc, cq, 0, /* CDW0 */ cid, sqid, status); if (cq->head != cq->tail) { if (cq->intr_en & NVME_CQ_INTEN) { pci_generate_msix(sc->nsc_pi, cq->intr_vec); } else { DPRINTF("%s: CQ%u interrupt disabled", __func__, sq->cqid); } } } static void pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) { req->sc = NULL; req->nvme_sq = NULL; req->sqid = 0; pthread_mutex_lock(&sc->mtx); STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); sc->pending_ios--; /* when no more IO pending, can set to ready if device reset/enabled */ if (sc->pending_ios == 0 && NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 
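/* i.e. the guest re-enabled the controller while I/O was still draining; the last retiring request completes the transition to ready */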
sc->regs.csts |= NVME_CSTS_RDY; pthread_mutex_unlock(&sc->mtx); sem_post(&sc->iosemlock); } static struct pci_nvme_ioreq * pci_nvme_get_ioreq(struct pci_nvme_softc *sc) { struct pci_nvme_ioreq *req = NULL; sem_wait(&sc->iosemlock); pthread_mutex_lock(&sc->mtx); req = STAILQ_FIRST(&sc->ioreqs_free); assert(req != NULL); STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); req->sc = sc; sc->pending_ios++; pthread_mutex_unlock(&sc->mtx); req->io_req.br_iovcnt = 0; req->io_req.br_offset = 0; req->io_req.br_resid = 0; req->io_req.br_param = req; req->prev_gpaddr = 0; req->prev_size = 0; return (req); } static void pci_nvme_io_done(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; struct nvme_submission_queue *sq = req->nvme_sq; uint16_t code, status; DPRINTF("%s error %d %s", __func__, err, strerror(err)); /* TODO return correct error */ code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; pci_nvme_status_genc(&status, code); pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); pci_nvme_stats_write_read_update(req->sc, req->opc, req->bytes, status); pci_nvme_release_ioreq(req->sc, req); } /* * Implements the Flush command. The specification states: * If a volatile write cache is not present, Flush commands complete * successfully and have no effect * in the description of the Volatile Write Cache (VWC) field of the Identify * Controller data. Therefore, set status to Success if the command is * not supported (i.e. RAM or as indicated by the blockif). */ static bool nvme_opc_flush(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { bool pending = false; if (nvstore->type == NVME_STOR_RAM) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); } else { int err; req->io_req.br_callback = pci_nvme_io_done; err = blockif_flush(nvstore->ctx, &req->io_req); switch (err) { case 0: pending = true; break; case EOPNOTSUPP: pci_nvme_status_genc(status, NVME_SC_SUCCESS); break; default: pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); } } return (pending); } static uint16_t nvme_write_read_ram(struct pci_nvme_softc *sc, struct pci_nvme_blockstore *nvstore, uint64_t prp1, uint64_t prp2, size_t offset, uint64_t bytes, bool is_write) { uint8_t *buf = nvstore->ctx; enum nvme_copy_dir dir; uint16_t status; /* a guest write pulls data from the PRPs; a read pushes data to them */ if (is_write) dir = NVME_COPY_FROM_PRP; else dir = NVME_COPY_TO_PRP; if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, buf + offset, bytes, dir)) pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); else pci_nvme_status_genc(&status, NVME_SC_SUCCESS); return (status); } static uint16_t nvme_write_read_blockif(struct pci_nvme_softc *sc, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint64_t prp1, uint64_t prp2, size_t offset, uint64_t bytes, bool is_write) { uint64_t size; int err; uint16_t status = NVME_NO_STATUS; size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); if (pci_nvme_append_iov_req(sc, req, prp1, size, is_write, offset)) { pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); goto out; } offset += size; bytes -= size; if (bytes == 0) { ; } else if (bytes <= PAGE_SIZE) { size = bytes; if (pci_nvme_append_iov_req(sc, req, prp2, size, is_write, offset)) { pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); goto out; } } else { void *vmctx = sc->nsc_pi->pi_vmctx; uint64_t *prp_list = &prp2; uint64_t *last = prp_list; /* PRP2 is pointer to a physical region page list */ while (bytes) { /* Last entry in list points to the next list
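* of PRP entries: when the cursor reaches it, the code below maps the guest page that entry names and continues the walk there (NVME_PRP2_ITEMS entries per page).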
*/ if (prp_list == last) { uint64_t prp = *prp_list; prp_list = paddr_guest2host(vmctx, prp, PAGE_SIZE - (prp % PAGE_SIZE)); last = prp_list + (NVME_PRP2_ITEMS - 1); } size = MIN(bytes, PAGE_SIZE); if (pci_nvme_append_iov_req(sc, req, *prp_list, size, is_write, offset)) { pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); goto out; } offset += size; bytes -= size; prp_list++; } } req->io_req.br_callback = pci_nvme_io_done; if (is_write) err = blockif_write(nvstore->ctx, &req->io_req); else err = blockif_read(nvstore->ctx, &req->io_req); if (err) pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); out: return (status); } static bool nvme_opc_write_read(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { uint64_t lba, nblocks, bytes; size_t offset; bool is_write = cmd->opc == NVME_OPC_WRITE; bool pending = false; lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; nblocks = (cmd->cdw12 & 0xFFFF) + 1; if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { WPRINTF("%s command would exceed LBA range", __func__); pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } bytes = nblocks << nvstore->sectsz_bits; if (bytes > NVME_MAX_DATA_SIZE) { WPRINTF("%s command would exceed MDTS", __func__); pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); goto out; } offset = lba << nvstore->sectsz_bits; req->bytes = bytes; req->io_req.br_offset = lba; /* PRP bits 1:0 must be zero */ cmd->prp1 &= ~0x3UL; cmd->prp2 &= ~0x3UL; if (nvstore->type == NVME_STOR_RAM) { *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, cmd->prp2, offset, bytes, is_write); } else { *status = nvme_write_read_blockif(sc, nvstore, req, cmd->prp1, cmd->prp2, offset, bytes, is_write); if (*status == NVME_NO_STATUS) pending = true; } out: if (!pending) pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); return (pending); } static void pci_nvme_dealloc_sm(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; struct pci_nvme_softc *sc = req->sc; bool done = true; uint16_t status; if (err) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { pci_nvme_status_genc(&status, NVME_SC_SUCCESS); } else { struct iovec *iov = req->io_req.br_iov; req->prev_gpaddr++; iov += req->prev_gpaddr; /* The iov_* values already include the sector size */ req->io_req.br_offset = (off_t)iov->iov_base; req->io_req.br_resid = iov->iov_len; if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); } else done = false; } if (done) { pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 0, status); pci_nvme_release_ioreq(sc, req); } } static bool nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { struct nvme_dsm_range *range; uint32_t nr, r, non_zero, dr; int err; bool pending = false; if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); goto out; } nr = cmd->cdw10 & 0xff; /* copy locally because a range entry could straddle PRPs */ range = calloc(1, NVME_MAX_DSM_TRIM); if (range == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); /* Check for invalid ranges and the number of non-zero 
lengths */ non_zero = 0; for (r = 0; r <= nr; r++) { if (pci_nvme_out_of_range(nvstore, range[r].starting_lba, range[r].length)) { pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } if (range[r].length != 0) non_zero++; } if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { size_t offset, bytes; int sectsz_bits = sc->nvstore.sectsz_bits; /* * DSM calls are advisory only, and compliant controllers * may choose to take no actions (i.e. return Success). */ if (!nvstore->deallocate) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } /* If all ranges have a zero length, return Success */ if (non_zero == 0) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } if (req == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } offset = range[0].starting_lba << sectsz_bits; bytes = range[0].length << sectsz_bits; /* * If the request is for more than a single range, store * the ranges in the br_iov. Optimize for the common case * of a single range. * * Note that NVMe Number of Ranges is a zero based value */ req->io_req.br_iovcnt = 0; req->io_req.br_offset = offset; req->io_req.br_resid = bytes; if (nr == 0) { req->io_req.br_callback = pci_nvme_io_done; } else { struct iovec *iov = req->io_req.br_iov; for (r = 0, dr = 0; r <= nr; r++) { offset = range[r].starting_lba << sectsz_bits; bytes = range[r].length << sectsz_bits; if (bytes == 0) continue; if ((nvstore->size - offset) < bytes) { pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } iov[dr].iov_base = (void *)offset; iov[dr].iov_len = bytes; dr++; } req->io_req.br_callback = pci_nvme_dealloc_sm; /* * Use prev_gpaddr to track the current entry and * prev_size to track the number of entries */ req->prev_gpaddr = 0; req->prev_size = dr; } err = blockif_delete(nvstore->ctx, &req->io_req); if (err) pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); else pending = true; } out: free(range); return (pending); } static void pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) { struct nvme_submission_queue *sq; uint16_t status; uint16_t sqhead; /* handle all submissions up to sq->tail index */ sq = &sc->submit_queues[idx]; pthread_mutex_lock(&sq->mtx); sqhead = sq->head; DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", idx, sqhead, sq->tail, sq->qbase); while (sqhead != atomic_load_acq_short(&sq->tail)) { struct nvme_command *cmd; struct pci_nvme_ioreq *req; uint32_t nsid; bool pending; pending = false; req = NULL; status = 0; cmd = &sq->qbase[sqhead]; sqhead = (sqhead + 1) % sq->size; nsid = le32toh(cmd->nsid); if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); status |= NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; goto complete; } req = pci_nvme_get_ioreq(sc); if (req == NULL) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); WPRINTF("%s: unable to allocate IO req", __func__); goto complete; } req->nvme_sq = sq; req->sqid = idx; req->opc = cmd->opc; req->cid = cmd->cid; req->nsid = cmd->nsid; switch (cmd->opc) { case NVME_OPC_FLUSH: pending = nvme_opc_flush(sc, cmd, &sc->nvstore, req, &status); break; case NVME_OPC_WRITE: case NVME_OPC_READ: pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, req, &status); break; case NVME_OPC_WRITE_ZEROES: /* TODO: write zeroes WPRINTF("%s write zeroes lba 0x%lx blocks %u", __func__, lba, cmd->cdw12 & 0xFFFF); */ pci_nvme_status_genc(&status, NVME_SC_SUCCESS); break; case NVME_OPC_DATASET_MANAGEMENT: pending = nvme_opc_dataset_mgmt(sc, 
cmd, &sc->nvstore, req, &status); break; default: WPRINTF("%s unhandled io command 0x%x", __func__, cmd->opc); pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); } complete: if (!pending) { pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, status); if (req != NULL) pci_nvme_release_ioreq(sc, req); } } sq->head = sqhead; pthread_mutex_unlock(&sq->mtx); } static void pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, uint64_t idx, int is_sq, uint64_t value) { DPRINTF("nvme doorbell %lu, %s, val 0x%lx", idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); if (is_sq) { if (idx > sc->num_squeues) { WPRINTF("%s queue index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_squeues); return; } atomic_store_short(&sc->submit_queues[idx].tail, (uint16_t)value); if (idx == 0) { pci_nvme_handle_admin_cmd(sc, value); } else { /* submission queue; handle new entries in SQ */ pci_nvme_handle_io_cmd(sc, (uint16_t)idx); } } else { if (idx > sc->num_cqueues) { WPRINTF("%s queue index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_cqueues); return; } atomic_store_short(&sc->compl_queues[idx].head, (uint16_t)value); } } static void pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) { const char *s = iswrite ? "WRITE" : "READ"; switch (offset) { case NVME_CR_CAP_LOW: DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); break; case NVME_CR_CAP_HI: DPRINTF("%s %s NVME_CR_CAP_HI", func, s); break; case NVME_CR_VS: DPRINTF("%s %s NVME_CR_VS", func, s); break; case NVME_CR_INTMS: DPRINTF("%s %s NVME_CR_INTMS", func, s); break; case NVME_CR_INTMC: DPRINTF("%s %s NVME_CR_INTMC", func, s); break; case NVME_CR_CC: DPRINTF("%s %s NVME_CR_CC", func, s); break; case NVME_CR_CSTS: DPRINTF("%s %s NVME_CR_CSTS", func, s); break; case NVME_CR_NSSR: DPRINTF("%s %s NVME_CR_NSSR", func, s); break; case NVME_CR_AQA: DPRINTF("%s %s NVME_CR_AQA", func, s); break; case NVME_CR_ASQ_LOW: DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); break; case NVME_CR_ASQ_HI: DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); break; case NVME_CR_ACQ_LOW: DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); break; case NVME_CR_ACQ_HI: DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); break; default: DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); } } static void pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, uint64_t offset, int size, uint64_t value) { uint32_t ccreg; if (offset >= NVME_DOORBELL_OFFSET) { uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; uint64_t idx = belloffset / 8; /* doorbell pair size = 2 * sizeof(uint32_t) */ int is_sq = (belloffset % 8) < 4; if (belloffset > ((sc->max_queues+1) * 8 - 4)) { WPRINTF("guest attempted an overflow write offset " "0x%lx, val 0x%lx in %s", offset, value, __func__); return; } pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); return; } DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", offset, size, value); if (size != 4) { WPRINTF("guest wrote invalid size %d (offset 0x%lx, " "val 0x%lx) to bar0 in %s", size, offset, value, __func__); /* TODO: shutdown device */ return; } pci_nvme_bar0_reg_dumps(__func__, offset, 1); pthread_mutex_lock(&sc->mtx); switch (offset) { case NVME_CR_CAP_LOW: case NVME_CR_CAP_HI: /* readonly */ break; case NVME_CR_VS: /* readonly */ break; case NVME_CR_INTMS: /* MSI-X, so ignore */ break; case NVME_CR_INTMC: /* MSI-X, so ignore */ break; case NVME_CR_CC: ccreg = (uint32_t)value; DPRINTF("%s NVME_CR_CC en %x
css %x shn %x iosqes %u " "iocqes %u", __func__, NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), NVME_CC_GET_IOCQES(ccreg)); if (NVME_CC_GET_SHN(ccreg)) { /* perform shutdown - flush out data to backend */ sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << NVME_CSTS_REG_SHST_SHIFT); sc->regs.csts |= NVME_SHST_COMPLETE << NVME_CSTS_REG_SHST_SHIFT; } if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { if (NVME_CC_GET_EN(ccreg) == 0) /* transition 1->0 causes controller reset */ pci_nvme_reset_locked(sc); else pci_nvme_init_controller(ctx, sc); } /* Insert the iocqes, iosqes and en bits from the write */ sc->regs.cc &= ~NVME_CC_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; if (NVME_CC_GET_EN(ccreg) == 0) { /* Insert the ams, mps and css bit fields */ sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; sc->regs.csts &= ~NVME_CSTS_RDY; } else if (sc->pending_ios == 0) { sc->regs.csts |= NVME_CSTS_RDY; } break; case NVME_CR_CSTS: break; case NVME_CR_NSSR: /* ignore writes; don't support subsystem reset */ break; case NVME_CR_AQA: sc->regs.aqa = (uint32_t)value; break; case NVME_CR_ASQ_LOW: sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ASQ_HI: sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | (value << 32); break; case NVME_CR_ACQ_LOW: sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ACQ_HI: sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | (value << 32); break; default: DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", __func__, offset, value, size); } pthread_mutex_unlock(&sc->mtx); } static void pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " " value 0x%lx", baridx, offset, size, value); pci_emul_msix_twrite(pi, offset, size, value); return; } switch (baridx) { case 0: pci_nvme_write_bar_0(ctx, sc, offset, size, value); break; default: DPRINTF("%s unknown baridx %d, val 0x%lx", __func__, baridx, value); } } static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size) { uint64_t value; pci_nvme_bar0_reg_dumps(__func__, offset, 0); if (offset < NVME_DOORBELL_OFFSET) { void *p = &(sc->regs); pthread_mutex_lock(&sc->mtx); memcpy(&value, (void *)((uintptr_t)p + offset), size); pthread_mutex_unlock(&sc->mtx); } else { value = 0; WPRINTF("pci_nvme: read invalid offset %ld", offset); } switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", offset, size, (uint32_t)value); return (value); } static uint64_t pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", baridx, offset, size); return pci_emul_msix_tread(pi, offset, size); } switch (baridx) { case 0: return pci_nvme_read_bar_0(sc, offset, size); default: DPRINTF("unknown bar %d, 0x%lx", baridx, offset); } return (0); } - static int -pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) +pci_nvme_parse_config(struct
pci_nvme_softc *sc, nvlist_t *nvl) { char bident[sizeof("XX:X:X")]; - char *uopt, *xopts, *config; + const char *value; uint32_t sectsz; - int optidx; sc->max_queues = NVME_QUEUES; sc->max_qentries = NVME_MAX_QENTRIES; sc->ioslots = NVME_IOSLOTS; sc->num_squeues = sc->max_queues; sc->num_cqueues = sc->max_queues; sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; sectsz = 0; - - uopt = strdup(opts); - optidx = 0; snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); - for (xopts = strtok(uopt, ","); - xopts != NULL; - xopts = strtok(NULL, ",")) { - - if ((config = strchr(xopts, '=')) != NULL) - *config++ = '\0'; - - if (!strcmp("maxq", xopts)) { - sc->max_queues = atoi(config); - } else if (!strcmp("qsz", xopts)) { - sc->max_qentries = atoi(config); - } else if (!strcmp("ioslots", xopts)) { - sc->ioslots = atoi(config); - } else if (!strcmp("sectsz", xopts)) { - sectsz = atoi(config); - } else if (!strcmp("ser", xopts)) { - /* - * This field indicates the Product Serial Number in - * 7-bit ASCII, unused bytes should be space characters. - * Ref: NVMe v1.3c. - */ - cpywithpad((char *)sc->ctrldata.sn, - sizeof(sc->ctrldata.sn), config, ' '); - } else if (!strcmp("ram", xopts)) { - uint64_t sz = strtoull(&xopts[4], NULL, 10); - - sc->nvstore.type = NVME_STOR_RAM; - sc->nvstore.size = sz * 1024 * 1024; - sc->nvstore.ctx = calloc(1, sc->nvstore.size); - sc->nvstore.sectsz = 4096; - sc->nvstore.sectsz_bits = 12; - if (sc->nvstore.ctx == NULL) { - perror("Unable to allocate RAM"); - free(uopt); - return (-1); - } - } else if (!strcmp("eui64", xopts)) { - sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0)); - } else if (!strcmp("dsm", xopts)) { - if (!strcmp("auto", config)) - sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; - else if (!strcmp("enable", config)) - sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; - else if (!strcmp("disable", config)) - sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; - } else if (optidx == 0) { - snprintf(bident, sizeof(bident), "%d:%d", - sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); - sc->nvstore.ctx = blockif_open(xopts, bident); - if (sc->nvstore.ctx == NULL) { - perror("Could not open backing file"); - free(uopt); - return (-1); - } - sc->nvstore.type = NVME_STOR_BLOCKIF; - sc->nvstore.size = blockif_size(sc->nvstore.ctx); - } else { - EPRINTLN("Invalid option %s", xopts); - free(uopt); + + value = get_config_value_node(nvl, "maxq"); + if (value != NULL) + sc->max_queues = atoi(value); + value = get_config_value_node(nvl, "qsz"); + if (value != NULL) { + sc->max_qentries = atoi(value); + if (sc->max_qentries <= 0) { + EPRINTLN("nvme: Invalid qsz option %d", + sc->max_qentries); return (-1); } - - optidx++; } - free(uopt); - - if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { - EPRINTLN("backing store not specified"); - return (-1); + value = get_config_value_node(nvl, "ioslots"); + if (value != NULL) { + sc->ioslots = atoi(value); + if (sc->ioslots <= 0) { + EPRINTLN("Invalid ioslots option %d", sc->ioslots); + return (-1); + } + } + value = get_config_value_node(nvl, "sectsz"); + if (value != NULL) + sectsz = atoi(value); + value = get_config_value_node(nvl, "ser"); + if (value != NULL) { + /* + * This field indicates the Product Serial Number in + * 7-bit ASCII, unused bytes should be space characters. + * Ref: NVMe v1.3c. 
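+ * e.g. ser=ABC123 is stored as "ABC123" followed by 14 space characters in the 20-byte sn field.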
+ */ + cpywithpad((char *)sc->ctrldata.sn, + sizeof(sc->ctrldata.sn), value, ' '); + } + value = get_config_value_node(nvl, "eui64"); + if (value != NULL) + sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); + value = get_config_value_node(nvl, "dsm"); + if (value != NULL) { + if (strcmp(value, "auto") == 0) + sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; + else if (strcmp(value, "enable") == 0) + sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; + else if (strcmp(value, "disable") == 0) + sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; + } + + value = get_config_value_node(nvl, "ram"); + if (value != NULL) { + uint64_t sz = strtoull(value, NULL, 10); + + sc->nvstore.type = NVME_STOR_RAM; + sc->nvstore.size = sz * 1024 * 1024; + sc->nvstore.ctx = calloc(1, sc->nvstore.size); + sc->nvstore.sectsz = 4096; + sc->nvstore.sectsz_bits = 12; + if (sc->nvstore.ctx == NULL) { + EPRINTLN("nvme: Unable to allocate RAM"); + return (-1); + } + } else { + snprintf(bident, sizeof(bident), "%d:%d", + sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + sc->nvstore.ctx = blockif_open(nvl, bident); + if (sc->nvstore.ctx == NULL) { + EPRINTLN("nvme: Could not open backing file: %s", + strerror(errno)); + return (-1); + } + sc->nvstore.type = NVME_STOR_BLOCKIF; + sc->nvstore.size = blockif_size(sc->nvstore.ctx); } + if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) sc->nvstore.sectsz = sectsz; else if (sc->nvstore.type != NVME_STOR_RAM) sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); for (sc->nvstore.sectsz_bits = 9; (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; sc->nvstore.sectsz_bits++); if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) sc->max_queues = NVME_QUEUES; - if (sc->max_qentries <= 0) { - EPRINTLN("Invalid qsz option"); - return (-1); - } - if (sc->ioslots <= 0) { - EPRINTLN("Invalid ioslots option"); - return (-1); - } - return (0); } static int -pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_nvme_softc *sc; uint32_t pci_membar_sz; int error; error = 0; sc = calloc(1, sizeof(struct pci_nvme_softc)); pi->pi_arg = sc; sc->nsc_pi = pi; - error = pci_nvme_parse_opts(sc, opts); + error = pci_nvme_parse_config(sc, nvl); if (error < 0) goto done; else error = 0; STAILQ_INIT(&sc->ioreqs_free); sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); for (int i = 0; i < sc->ioslots; i++) { STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); } pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); /* * Allocate size of NVMe registers + doorbell space for all queues. * * The specification requires a minimum memory I/O window size of 16K. * The Windows driver will refuse to start a device with a smaller * window. 
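* For example, with max_queues = 16 the doorbells add only 2 * 4 * 17 = 136 bytes to the register block, so the NVME_MMIO_SPACE_MIN floor (per the 16K minimum above) is what normally sets the BAR size.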
*/ pci_membar_sz = sizeof(struct nvme_registers) + 2 * sizeof(uint32_t) * (sc->max_queues + 1); pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); DPRINTF("nvme membar size: %u", pci_membar_sz); error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); if (error) { WPRINTF("%s pci alloc mem bar failed", __func__); goto done; } error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); if (error) { WPRINTF("%s pci add msixcap failed", __func__); goto done; } error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); if (error) { WPRINTF("%s pci add Express capability failed", __func__); goto done; } pthread_mutex_init(&sc->mtx, NULL); sem_init(&sc->iosemlock, 0, sc->ioslots); pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); /* * Controller data depends on Namespace data so initialize Namespace * data first. */ pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); pci_nvme_init_ctrldata(sc); pci_nvme_init_logpages(sc); pci_nvme_init_features(sc); pci_nvme_aer_init(sc); pci_nvme_reset(sc); pci_lintr_request(pi); done: return (error); } struct pci_devemu pci_de_nvme = { .pe_emu = "nvme", .pe_init = pci_nvme_init, + .pe_legacy_config = blockif_legacy_config, .pe_barwrite = pci_nvme_write, .pe_barread = pci_nvme_read }; PCI_EMUL_SET(pci_de_nvme); diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c index 3305a4854812..f1d306b03959 100644 --- a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -1,965 +1,1000 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include + +#include "config.h" +#include "debug.h" #include "pci_emul.h" #include "mem.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" #endif #ifndef _PATH_DEVIO #define _PATH_DEVIO "/dev/io" #endif #ifndef _PATH_MEM #define _PATH_MEM "/dev/mem" #endif #define LEGACY_SUPPORT 1 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 static int pcifd = -1; static int iofd = -1; static int memfd = -1; struct passthru_softc { struct pci_devinst *psc_pi; struct pcibar psc_bar[PCI_BARMAX + 1]; struct { int capoff; int msgctrl; int emulated; } psc_msi; struct { int capoff; } psc_msix; struct pcisel psc_sel; }; static int msi_caplen(int msgctrl) { int len; len = 10; /* minimum length of msi capability */ if (msgctrl & PCIM_MSICTRL_64BIT) len += 4; #if 0 /* * Ignore the 'mask' and 'pending' bits in the MSI capability. * We'll let the guest manipulate them directly. */ if (msgctrl & PCIM_MSICTRL_VECTOR) len += 10; #endif return (len); } static uint32_t read_config(const struct pcisel *sel, long reg, int width) { struct pci_io pi; bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; if (ioctl(pcifd, PCIOCREAD, &pi) < 0) return (0); /* XXX */ else return (pi.pi_data); } static void write_config(const struct pcisel *sel, long reg, int width, uint32_t data) { struct pci_io pi; bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; pi.pi_data = data; (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ } #ifdef LEGACY_SUPPORT static int passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) { int capoff, i; struct msicap msicap; u_char *capdata; pci_populate_msicap(&msicap, msgnum, nextptr); /* * XXX * Copy the msi capability structure in the last 16 bytes of the * config space. This is wrong because it could shadow something * useful to the device. */ capoff = 256 - roundup(sizeof(msicap), 4); capdata = (u_char *)&msicap; for (i = 0; i < sizeof(msicap); i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); return (capoff); } #endif /* LEGACY_SUPPORT */ static int cfginitmsi(struct passthru_softc *sc) { int i, ptr, capptr, cap, sts, caplen, table_size; uint32_t u32; struct pcisel sel; struct pci_devinst *pi; struct msixcap msixcap; uint32_t *msixcap_ptr; pi = sc->psc_pi; sel = sc->psc_sel; /* * Parse the capabilities and cache the location of the MSI * and MSI-X capabilities. 
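* The walk below follows PCIR_CAP_PTR: each capability's ID byte selects how its bytes are copied into the emulated config space, and its PCICAP_NEXTPTR byte links to the next capability, with 0x00 or 0xff terminating the list.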
*/ sts = read_config(&sel, PCIR_STATUS, 2); if (sts & PCIM_STATUS_CAPPRESENT) { ptr = read_config(&sel, PCIR_CAP_PTR, 1); while (ptr != 0 && ptr != 0xff) { cap = read_config(&sel, ptr + PCICAP_ID, 1); if (cap == PCIY_MSI) { /* * Copy the MSI capability into the config * space of the emulated pci device */ sc->psc_msi.capoff = ptr; sc->psc_msi.msgctrl = read_config(&sel, ptr + 2, 2); sc->psc_msi.emulated = 0; caplen = msi_caplen(sc->psc_msi.msgctrl); capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; } } else if (cap == PCIY_MSIX) { /* * Copy the MSI-X capability */ sc->psc_msix.capoff = ptr; caplen = 12; msixcap_ptr = (uint32_t*) &msixcap; capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); *msixcap_ptr = u32; pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; msixcap_ptr++; } } ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); } } if (sc->psc_msix.capoff != 0) { pi->pi_msix.pba_bar = msixcap.pba_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.pba_offset = msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_bar = msixcap.table_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.table_offset = msixcap.table_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); /* Allocate the emulated MSI-X table array */ table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* Mask all table entries */ for (i = 0; i < pi->pi_msix.table_count; i++) { pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } } #ifdef LEGACY_SUPPORT /* * If the passthrough device does not support MSI then craft a * MSI capability for it. We link the new MSI capability at the * head of the list of capabilities. 
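* Concretely, passthru_add_msicap() above writes the crafted capability into the last 16 bytes of config space with its next pointer set to the old head (origptr), and PCIR_CAP_PTR is then repointed at it.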
*/ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { int origptr, msiptr; origptr = read_config(&sel, PCIR_CAP_PTR, 1); msiptr = passthru_add_msicap(pi, 1, origptr); sc->psc_msi.capoff = msiptr; sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); sc->psc_msi.emulated = 1; pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); } #endif /* Make sure one of the capabilities is present */ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) return (-1); else return (0); } static uint64_t msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *src8; uint16_t *src16; uint32_t *src32; uint64_t *src64; uint64_t data; size_t entry_offset; int index; pi = sc->psc_pi; if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset && offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { switch(size) { case 1: src8 = (uint8_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src8; break; case 2: src16 = (uint16_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src16; break; case 4: src32 = (uint32_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src32; break; case 8: src64 = (uint64_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src64; break; default: return (-1); } return (data); } if (offset < pi->pi_msix.table_offset) return (-1); offset -= pi->pi_msix.table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; if (index >= pi->pi_msix.table_count) return (-1); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; switch(size) { case 1: src8 = (uint8_t *)((void *)entry + entry_offset); data = *src8; break; case 2: src16 = (uint16_t *)((void *)entry + entry_offset); data = *src16; break; case 4: src32 = (uint32_t *)((void *)entry + entry_offset); data = *src32; break; case 8: src64 = (uint64_t *)((void *)entry + entry_offset); data = *src64; break; default: return (-1); } return (data); } static void msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, uint64_t offset, int size, uint64_t data) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *dest8; uint16_t *dest16; uint32_t *dest32; uint64_t *dest64; size_t entry_offset; uint32_t vector_control; int index; pi = sc->psc_pi; if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset && offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { switch(size) { case 1: dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest8 = data; break; case 2: dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest16 = data; break; case 4: dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest32 = data; break; case 8: dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest64 = data; break; default: break; } return; } if (offset < pi->pi_msix.table_offset) return; offset -= pi->pi_msix.table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; if (index >= pi->pi_msix.table_count) return; entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* Only 4 byte naturally-aligned writes are supported */ assert(size == 4); assert(entry_offset % 4 == 0); vector_control = entry->vector_control; dest32 = (uint32_t *)((void *)entry + entry_offset); *dest32 = data; /* If MSI-X hasn't been enabled, do nothing */ if (pi->pi_msix.enabled) { /* If the 
entry is masked, don't set it up */ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { (void)vm_setup_pptdev_msix(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, index, entry->addr, entry->msg_data, entry->vector_control); } } } static int init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) { int b, s, f; int error, idx; size_t len, remaining; uint32_t table_size, table_offset; uint32_t pba_size, pba_offset; vm_paddr_t start; struct pci_devinst *pi = sc->psc_pi; assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); b = sc->psc_sel.pc_bus; s = sc->psc_sel.pc_dev; f = sc->psc_sel.pc_func; /* * If the MSI-X table BAR maps memory intended for * other uses, it is at least assured that the table * either resides in its own page within the region, * or it resides in a page shared with only the PBA. */ table_offset = rounddown2(pi->pi_msix.table_offset, 4096); table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); idx = pi->pi_msix.table_bar; start = pi->pi_bar[idx].addr; remaining = pi->pi_bar[idx].size; if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { pba_offset = pi->pi_msix.pba_offset; pba_size = pi->pi_msix.pba_size; if (pba_offset >= table_offset + table_size || table_offset >= pba_offset + pba_size) { /* * If the PBA does not share a page with the MSI-x * tables, no PBA emulation is required. */ pi->pi_msix.pba_page = NULL; pi->pi_msix.pba_page_offset = 0; } else { /* * The PBA overlaps with either the first or last * page of the MSI-X table region. Map the * appropriate page. */ if (pba_offset <= table_offset) pi->pi_msix.pba_page_offset = table_offset; else pi->pi_msix.pba_page_offset = table_offset + table_size - 4096; pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, start + pi->pi_msix.pba_page_offset); if (pi->pi_msix.pba_page == MAP_FAILED) { warn( "Failed to map PBA page for MSI-X on %d/%d/%d", b, s, f); return (-1); } } } /* Map everything before the MSI-X table */ if (table_offset > 0) { len = table_offset; error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); if (error) return (error); base += len; start += len; remaining -= len; } /* Skip the MSI-X table */ base += table_size; start += table_size; remaining -= table_size; /* Map everything beyond the end of the MSI-X table */ if (remaining > 0) { len = remaining; error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); if (error) return (error); } return (0); } static int cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) { int i, error; struct pci_devinst *pi; struct pci_bar_io bar; enum pcibar_type bartype; uint64_t base, size; pi = sc->psc_pi; /* * Initialize BAR registers */ for (i = 0; i <= PCI_BARMAX; i++) { bzero(&bar, sizeof(bar)); bar.pbi_sel = sc->psc_sel; bar.pbi_reg = PCIR_BAR(i); if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) continue; if (PCI_BAR_IO(bar.pbi_base)) { bartype = PCIBAR_IO; base = bar.pbi_base & PCIM_BAR_IO_BASE; } else { switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { case PCIM_BAR_MEM_64: bartype = PCIBAR_MEM64; break; default: bartype = PCIBAR_MEM32; break; } base = bar.pbi_base & PCIM_BAR_MEM_BASE; } size = bar.pbi_length; if (bartype != PCIBAR_IO) { if (((base | size) & PAGE_MASK) != 0) { warnx("passthru device %d/%d/%d BAR %d: " "base %#lx or size %#lx not page aligned\n", sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, 
base, size); return (-1); } } /* Cache information about the "real" BAR */ sc->psc_bar[i].type = bartype; sc->psc_bar[i].size = size; sc->psc_bar[i].addr = base; /* Allocate the BAR in the guest I/O or MMIO space */ error = pci_emul_alloc_bar(pi, i, bartype, size); if (error) return (-1); /* The MSI-X table needs special handling */ if (i == pci_msix_table_bar(pi)) { error = init_msix_table(ctx, sc, base); if (error) return (-1); } else if (bartype != PCIBAR_IO) { /* Map the physical BAR in the guest MMIO space */ error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_bar[i].addr, pi->pi_bar[i].size, base); if (error) return (-1); } /* * 64-bit BAR takes up two slots so skip the next one. */ if (bartype == PCIBAR_MEM64) { i++; assert(i <= PCI_BARMAX); sc->psc_bar[i].type = PCIBAR_MEMHI64; } } return (0); } static int cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) { int error; struct passthru_softc *sc; error = 1; sc = pi->pi_arg; bzero(&sc->psc_sel, sizeof(struct pcisel)); sc->psc_sel.pc_bus = bus; sc->psc_sel.pc_dev = slot; sc->psc_sel.pc_func = func; if (cfginitmsi(sc) != 0) { warnx("failed to initialize MSI for PCI %d/%d/%d", bus, slot, func); goto done; } if (cfginitbar(ctx, sc) != 0) { warnx("failed to initialize BARs for PCI %d/%d/%d", bus, slot, func); goto done; } pci_set_cfgdata16(pi, PCIR_COMMAND, read_config(&sc->psc_sel, PCIR_COMMAND, 2)); error = 0; /* success */ done: return (error); } static int -passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +passthru_legacy_config(nvlist_t *nvl, const char *opts) +{ + char value[16]; + int bus, slot, func; + + if (opts == NULL) + return (0); + + if (sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) { + EPRINTLN("passthru: invalid options \"%s\"", opts); + return (-1); + } + + snprintf(value, sizeof(value), "%d", bus); + set_config_value_node(nvl, "bus", value); + snprintf(value, sizeof(value), "%d", slot); + set_config_value_node(nvl, "slot", value); + snprintf(value, sizeof(value), "%d", func); + set_config_value_node(nvl, "func", value); + return (0); +} + +static int +passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { int bus, slot, func, error, memflags; struct passthru_softc *sc; + const char *value; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR }; cap_ioctl_t io_ioctls[] = { IODEV_PIO }; #endif sc = NULL; error = 1; #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE); #endif memflags = vm_get_memflags(ctx); if (!(memflags & VM_MEM_F_WIRED)) { warnx("passthru requires guest memory to be wired"); return (error); } if (pcifd < 0) { pcifd = open(_PATH_DEVPCI, O_RDWR, 0); if (pcifd < 0) { warn("failed to open %s", _PATH_DEVPCI); return (error); } } #ifndef WITHOUT_CAPSICUM if (caph_rights_limit(pcifd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (iofd < 0) { iofd = open(_PATH_DEVIO, O_RDWR, 0); if (iofd < 0) { warn("failed to open %s", _PATH_DEVIO); return (error); } } #ifndef WITHOUT_CAPSICUM if (caph_rights_limit(iofd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (memfd < 0) { memfd = open(_PATH_MEM, O_RDWR, 0); if 
(memfd < 0) { warn("failed to open %s", _PATH_MEM); return (error); } } #ifndef WITHOUT_CAPSICUM cap_rights_clear(&rights, CAP_IOCTL); cap_rights_set(&rights, CAP_MMAP_RW); if (caph_rights_limit(memfd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif - if (opts == NULL || - sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) { - warnx("invalid passthru options"); - return (error); - } +#define GET_INT_CONFIG(var, name) do { \ + value = get_config_value_node(nvl, name); \ + if (value == NULL) { \ + EPRINTLN("passthru: missing required %s setting", name); \ + return (error); \ + } \ + var = atoi(value); \ +} while (0) + + GET_INT_CONFIG(bus, "bus"); + GET_INT_CONFIG(slot, "slot"); + GET_INT_CONFIG(func, "func"); if (vm_assign_pptdev(ctx, bus, slot, func) != 0) { warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", bus, slot, func); goto done; } sc = calloc(1, sizeof(struct passthru_softc)); pi->pi_arg = sc; sc->psc_pi = pi; /* initialize config space */ error = cfginit(ctx, pi, bus, slot, func); done: if (error) { free(sc); vm_unassign_pptdev(ctx, bus, slot, func); } return (error); } static int bar_access(int coff) { if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) return (1); else return (0); } static int msicap_access(struct passthru_softc *sc, int coff) { int caplen; if (sc->psc_msi.capoff == 0) return (0); caplen = msi_caplen(sc->psc_msi.msgctrl); if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) return (1); else return (0); } static int msixcap_access(struct passthru_softc *sc, int coff) { if (sc->psc_msix.capoff == 0) return (0); return (coff >= sc->psc_msix.capoff && coff < sc->psc_msix.capoff + MSIX_CAPLEN); } static int passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t *rv) { struct passthru_softc *sc; sc = pi->pi_arg; /* * PCI BARs and MSI capability is emulated. */ if (bar_access(coff) || msicap_access(sc, coff)) return (-1); #ifdef LEGACY_SUPPORT /* * Emulate PCIR_CAP_PTR if this device does not support MSI capability * natively. */ if (sc->psc_msi.emulated) { if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) return (-1); } #endif /* * Emulate the command register. If a single read reads both the * command and status registers, read the status register from the * device's config space. 
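 * A 4-byte read at PCIR_COMMAND thus returns the device's live status * register in the upper 16 bits and the emulated command register in * the lower 16 bits, which is what the code below composes.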
*/ if (coff == PCIR_COMMAND) { if (bytes <= 2) return (-1); *rv = read_config(&sc->psc_sel, PCIR_STATUS, 2) << 16 | pci_get_cfgdata16(pi, PCIR_COMMAND); return (0); } /* Everything else just read from the device's config space */ *rv = read_config(&sc->psc_sel, coff, bytes); return (0); } static int passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int error, msix_table_entries, i; struct passthru_softc *sc; uint16_t cmd_old; sc = pi->pi_arg; /* * PCI BARs are emulated */ if (bar_access(coff)) return (-1); /* * MSI capability is emulated */ if (msicap_access(sc, coff)) { pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff, PCIY_MSI); error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum); if (error != 0) err(1, "vm_setup_pptdev_msi"); return (0); } if (msixcap_access(sc, coff)) { pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff, PCIY_MSIX); if (pi->pi_msix.enabled) { msix_table_entries = pi->pi_msix.table_count; for (i = 0; i < msix_table_entries; i++) { error = vm_setup_pptdev_msix(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, pi->pi_msix.table[i].addr, pi->pi_msix.table[i].msg_data, pi->pi_msix.table[i].vector_control); if (error) err(1, "vm_setup_pptdev_msix"); } } else { error = vm_disable_pptdev_msix(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func); if (error) err(1, "vm_disable_pptdev_msix"); } return (0); } #ifdef LEGACY_SUPPORT /* * If this device does not support MSI natively then we cannot let * the guest disable legacy interrupts from the device. It is the * legacy interrupt that is triggering the virtual MSI to the guest. 
*/ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { if (coff == PCIR_COMMAND && bytes == 2) val &= ~PCIM_CMD_INTxDIS; } #endif write_config(&sc->psc_sel, coff, bytes, val); if (coff == PCIR_COMMAND) { cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND); if (bytes == 1) pci_set_cfgdata8(pi, PCIR_COMMAND, val); else if (bytes == 2) pci_set_cfgdata16(pi, PCIR_COMMAND, val); pci_emul_cmd_changed(pi, cmd_old); } return (0); } static void passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct passthru_softc *sc; struct iodev_pio_req pio; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { msix_table_write(ctx, vcpu, sc, offset, size, value); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); bzero(&pio, sizeof(struct iodev_pio_req)); pio.access = IODEV_PIO_WRITE; pio.port = sc->psc_bar[baridx].addr + offset; pio.width = size; pio.val = value; (void)ioctl(iofd, IODEV_PIO, &pio); } } static uint64_t passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct passthru_softc *sc; struct iodev_pio_req pio; uint64_t val; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { val = msix_table_read(sc, offset, size); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); bzero(&pio, sizeof(struct iodev_pio_req)); pio.access = IODEV_PIO_READ; pio.port = sc->psc_bar[baridx].addr + offset; pio.width = size; pio.val = 0; (void)ioctl(iofd, IODEV_PIO, &pio); val = pio.val; } return (val); } struct pci_devemu passthru = { .pe_emu = "passthru", .pe_init = passthru_init, + .pe_legacy_config = passthru_legacy_config, .pe_cfgwrite = passthru_cfgwrite, .pe_cfgread = passthru_cfgread, .pe_barwrite = passthru_write, .pe_barread = passthru_read, }; PCI_EMUL_SET(passthru); diff --git a/usr.sbin/bhyve/pci_uart.c b/usr.sbin/bhyve/pci_uart.c index 2e8177bafb2c..25ef1ed66217 100644 --- a/usr.sbin/bhyve/pci_uart.c +++ b/usr.sbin/bhyve/pci_uart.c @@ -1,122 +1,135 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "pci_emul.h" #include "uart_emul.h" /* * Pick a PCI vid/did of a chip with a single uart at * BAR0, that most versions of FreeBSD can understand: * Siig CyberSerial 1-port. */ #define COM_VENDOR 0x131f #define COM_DEV 0x2000 static void pci_uart_intr_assert(void *arg) { struct pci_devinst *pi = arg; pci_lintr_assert(pi); } static void pci_uart_intr_deassert(void *arg) { struct pci_devinst *pi = arg; pci_lintr_deassert(pi); } static void pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { assert(baridx == 0); assert(size == 1); uart_write(pi->pi_arg, offset, value); } uint64_t pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { uint8_t val; assert(baridx == 0); assert(size == 1); val = uart_read(pi->pi_arg, offset); return (val); } static int -pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_uart_legacy_config(nvlist_t *nvl, const char *opts) +{ + + if (opts != NULL) + set_config_value_node(nvl, "path", opts); + return (0); +} + +static int +pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct uart_softc *sc; + const char *device; pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE); pci_lintr_request(pi); /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV); pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); sc = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi); pi->pi_arg = sc; - if (uart_set_backend(sc, opts) != 0) { + device = get_config_value_node(nvl, "path"); + if (uart_set_backend(sc, device) != 0) { EPRINTLN("Unable to initialize backend '%s' for " - "pci uart at %d:%d", opts, pi->pi_slot, pi->pi_func); + "pci uart at %d:%d", device, pi->pi_slot, pi->pi_func); return (-1); } return (0); } struct pci_devemu pci_de_com = { .pe_emu = "uart", .pe_init = pci_uart_init, + .pe_legacy_config = pci_uart_legacy_config, .pe_barwrite = pci_uart_write, .pe_barread = pci_uart_read }; PCI_EMUL_SET(pci_de_com); diff --git a/usr.sbin/bhyve/pci_virtio_9p.c b/usr.sbin/bhyve/pci_virtio_9p.c index 4296ccd11c7f..e27159eb22cb 100644 --- a/usr.sbin/bhyve/pci_virtio_9p.c +++ b/usr.sbin/bhyve/pci_virtio_9p.c @@ -1,344 +1,360 @@ /*- * Copyright (c) 2015 iXsystems Inc. * Copyright (c) 2017-2018 Jakub Klama * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * VirtIO filesystem passthrough using 9p protocol. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" +#include "debug.h" #include "pci_emul.h" #include "virtio.h" #define VT9P_MAX_IOV 128 #define VT9P_RINGSZ 256 #define VT9P_MAXTAGSZ 256 #define VT9P_CONFIGSPACESZ (VT9P_MAXTAGSZ + sizeof(uint16_t)) static int pci_vt9p_debug; #define DPRINTF(params) if (pci_vt9p_debug) printf params #define WPRINTF(params) printf params /* * Per-device softc */ struct pci_vt9p_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_vq; pthread_mutex_t vsc_mtx; uint64_t vsc_cfg; uint64_t vsc_features; char * vsc_rootpath; struct pci_vt9p_config * vsc_config; struct l9p_backend * vsc_fs_backend; struct l9p_server * vsc_server; struct l9p_connection * vsc_conn; }; struct pci_vt9p_request { struct pci_vt9p_softc * vsr_sc; struct iovec * vsr_iov; size_t vsr_niov; size_t vsr_respidx; size_t vsr_iolen; uint16_t vsr_idx; }; struct pci_vt9p_config { uint16_t tag_len; char tag[0]; } __attribute__((packed)); static int pci_vt9p_send(struct l9p_request *, const struct iovec *, const size_t, const size_t, void *); static void pci_vt9p_drop(struct l9p_request *, const struct iovec *, size_t, void *); static void pci_vt9p_reset(void *); static void pci_vt9p_notify(void *, struct vqueue_info *); static int pci_vt9p_cfgread(void *, int, int, uint32_t *); static void pci_vt9p_neg_features(void *, uint64_t); static struct virtio_consts vt9p_vi_consts = { "vt9p", /* our name */ 1, /* we support 1 virtqueue */ VT9P_CONFIGSPACESZ, /* config reg size */ pci_vt9p_reset, /* reset */ pci_vt9p_notify, /* device-wide qnotify */ pci_vt9p_cfgread, /* read virtio config */ NULL, /* write virtio config */ pci_vt9p_neg_features, /* apply negotiated features */ (1 << 0), /* our capabilities */ }; static void pci_vt9p_reset(void *vsc) { struct pci_vt9p_softc *sc; sc = vsc; DPRINTF(("vt9p: device reset requested !\n")); vi_reset_dev(&sc->vsc_vs); } static void pci_vt9p_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vt9p_softc *sc = vsc; sc->vsc_features = negotiated_features; } static int pci_vt9p_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vt9p_softc *sc = vsc; void *ptr; ptr = (uint8_t *)sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vt9p_get_buffer(struct l9p_request *req, struct iovec *iov, size_t *niov, void *arg) { struct pci_vt9p_request *preq = req->lr_aux; size_t n = preq->vsr_niov - preq->vsr_respidx; memcpy(iov, preq->vsr_iov + preq->vsr_respidx, n * sizeof(struct iovec)); *niov = n; return (0); } static int pci_vt9p_send(struct l9p_request *req, const struct iovec *iov, const size_t niov, const size_t iolen, void *arg) { struct pci_vt9p_request *preq = req->lr_aux; struct pci_vt9p_softc *sc = preq->vsr_sc; preq->vsr_iolen = iolen; pthread_mutex_lock(&sc->vsc_mtx); 
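	/* * Complete the chain with the number of response bytes written and * let vq_endchains() interrupt the guest if appropriate. */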
vq_relchain(&sc->vsc_vq, preq->vsr_idx, preq->vsr_iolen); vq_endchains(&sc->vsc_vq, 1); pthread_mutex_unlock(&sc->vsc_mtx); free(preq); return (0); } static void pci_vt9p_drop(struct l9p_request *req, const struct iovec *iov, size_t niov, void *arg) { struct pci_vt9p_request *preq = req->lr_aux; struct pci_vt9p_softc *sc = preq->vsr_sc; pthread_mutex_lock(&sc->vsc_mtx); vq_relchain(&sc->vsc_vq, preq->vsr_idx, 0); vq_endchains(&sc->vsc_vq, 1); pthread_mutex_unlock(&sc->vsc_mtx); free(preq); } static void pci_vt9p_notify(void *vsc, struct vqueue_info *vq) { struct iovec iov[VT9P_MAX_IOV]; struct pci_vt9p_softc *sc; struct pci_vt9p_request *preq; uint16_t idx, n, i; uint16_t flags[VT9P_MAX_IOV]; sc = vsc; while (vq_has_descs(vq)) { n = vq_getchain(vq, &idx, iov, VT9P_MAX_IOV, flags); preq = calloc(1, sizeof(struct pci_vt9p_request)); preq->vsr_sc = sc; preq->vsr_idx = idx; preq->vsr_iov = iov; preq->vsr_niov = n; preq->vsr_respidx = 0; /* Count readable descriptors */ for (i = 0; i < n; i++) { if (flags[i] & VRING_DESC_F_WRITE) break; preq->vsr_respidx++; } for (int i = 0; i < n; i++) { DPRINTF(("vt9p: vt9p_notify(): desc%d base=%p, " "len=%zu, flags=0x%04x\r\n", i, iov[i].iov_base, iov[i].iov_len, flags[i])); } l9p_connection_recv(sc->vsc_conn, iov, preq->vsr_respidx, preq); } } - static int -pci_vt9p_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_vt9p_legacy_config(nvlist_t *nvl, const char *opts) { - struct pci_vt9p_softc *sc; - char *opt; - char *sharename = NULL; - char *rootpath = NULL; - int rootfd; - bool ro = false; - cap_rights_t rootcap; + char *sharename = NULL, *tofree, *token, *tokens; - if (opts == NULL) { - printf("virtio-9p: share name and path required\n"); - return (1); - } + if (opts == NULL) + return (0); - while ((opt = strsep(&opts, ",")) != NULL) { - if (strchr(opt, '=') != NULL) { + tokens = tofree = strdup(opts); + while ((token = strsep(&tokens, ",")) != NULL) { + if (strchr(token, '=') != NULL) { if (sharename != NULL) { - printf("virtio-9p: more than one share name given\n"); - return (1); + EPRINTLN( + "virtio-9p: more than one share name given"); + free(tofree); + return (-1); } - sharename = strsep(&opt, "="); - rootpath = opt; - continue; - } + sharename = strsep(&token, "="); + set_config_value_node(nvl, "sharename", sharename); + set_config_value_node(nvl, "path", token); + } else + set_config_bool_node(nvl, token, true); + } + free(tofree); + return (0); +} - if (strcmp(opt, "ro") == 0) { - DPRINTF(("read-only mount requested\r\n")); - ro = true; - continue; - } +static int +pci_vt9p_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) +{ + struct pci_vt9p_softc *sc; + const char *value; + const char *sharename; + int rootfd; + bool ro; + cap_rights_t rootcap; + + ro = get_config_bool_node_default(nvl, "ro", false); - printf("virtio-9p: invalid option '%s'\n", opt); + value = get_config_value_node(nvl, "path"); + if (value == NULL) { + EPRINTLN("virtio-9p: path required"); return (1); } + rootfd = open(value, O_DIRECTORY); + if (rootfd < 0) { + EPRINTLN("virtio-9p: failed to open '%s': %s", value, + strerror(errno)); + return (-1); + } + sharename = get_config_value_node(nvl, "sharename"); + if (sharename == NULL) { + EPRINTLN("virtio-9p: share name required"); + return (1); + } if (strlen(sharename) > VT9P_MAXTAGSZ) { - printf("virtio-9p: share name too long\n"); + EPRINTLN("virtio-9p: share name too long"); return (1); } - rootfd = open(rootpath, O_DIRECTORY); - if (rootfd < 0) - return (-1); - sc = calloc(1, sizeof(struct pci_vt9p_softc));
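	/* * The device config space is the 16-bit tag length followed by the * tag itself, so reserve room for the largest allowed share name. */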
sc->vsc_config = calloc(1, sizeof(struct pci_vt9p_config) + VT9P_MAXTAGSZ); pthread_mutex_init(&sc->vsc_mtx, NULL); cap_rights_init(&rootcap, CAP_LOOKUP, CAP_ACL_CHECK, CAP_ACL_DELETE, CAP_ACL_GET, CAP_ACL_SET, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSTAT, CAP_CREATE, CAP_FCHMODAT, CAP_FCHOWNAT, CAP_FTRUNCATE, CAP_LINKAT_SOURCE, CAP_LINKAT_TARGET, CAP_MKDIRAT, CAP_MKNODAT, CAP_PREAD, CAP_PWRITE, CAP_RENAMEAT_SOURCE, CAP_RENAMEAT_TARGET, CAP_SEEK, CAP_SYMLINKAT, CAP_UNLINKAT, CAP_EXTATTR_DELETE, CAP_EXTATTR_GET, CAP_EXTATTR_LIST, CAP_EXTATTR_SET, CAP_FUTIMES, CAP_FSTATFS, CAP_FSYNC, CAP_FPATHCONF); if (cap_rights_limit(rootfd, &rootcap) != 0) return (1); sc->vsc_config->tag_len = (uint16_t)strlen(sharename); memcpy(sc->vsc_config->tag, sharename, sc->vsc_config->tag_len); if (l9p_backend_fs_init(&sc->vsc_fs_backend, rootfd, ro) != 0) { errno = ENXIO; return (1); } if (l9p_server_init(&sc->vsc_server, sc->vsc_fs_backend) != 0) { errno = ENXIO; return (1); } if (l9p_connection_init(sc->vsc_server, &sc->vsc_conn) != 0) { errno = EIO; return (1); } sc->vsc_conn->lc_msize = L9P_MAX_IOV * PAGE_SIZE; sc->vsc_conn->lc_lt.lt_get_response_buffer = pci_vt9p_get_buffer; sc->vsc_conn->lc_lt.lt_send_response = pci_vt9p_send; sc->vsc_conn->lc_lt.lt_drop_response = pci_vt9p_drop; vi_softc_linkup(&sc->vsc_vs, &vt9p_vi_consts, sc, pi, &sc->vsc_vq); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; sc->vsc_vq.vq_qsize = VT9P_RINGSZ; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_9P); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_9P); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vsc_vs, 0); return (0); } struct pci_devemu pci_de_v9p = { .pe_emu = "virtio-9p", + .pe_legacy_config = pci_vt9p_legacy_config, .pe_init = pci_vt9p_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_v9p); diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c index 00258d1993b0..0dc58e49594b 100644 --- a/usr.sbin/bhyve/pci_virtio_block.c +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -1,577 +1,576 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "block_if.h" #define VTBLK_BSIZE 512 #define VTBLK_RINGSZ 128 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ #define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ #define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ #define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ #define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ #define VTBLK_F_RO (1 << 5) /* Disk is read-only */ #define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ #define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ #define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ #define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ #define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ #define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ #define VTBLK_F_MQ (1 << 12) /* Multi-Queue */ #define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ #define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ /* * Host capabilities */ #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* * The current blockif_delete() interface only allows a single delete * request at a time. */ #define VTBLK_MAX_DISCARD_SEG 1 /* * An arbitrary limit to prevent excessive latency due to large * delete requests. 
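 * At the 512-byte virtio sector size this works out to * (16 << 20) / 512 = 32768 sectors, i.e. at most 16 MiB per delete.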
*/ #define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ /* * Config space "registers" */ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; struct { uint16_t cylinders; uint8_t heads; uint8_t sectors; } vbc_geometry; uint32_t vbc_blk_size; struct { uint8_t physical_block_exp; uint8_t alignment_offset; uint16_t min_io_size; uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; uint8_t unused0[1]; uint16_t num_queues; uint32_t max_discard_sectors; uint32_t max_discard_seg; uint32_t discard_sector_alignment; uint32_t max_write_zeroes_sectors; uint32_t max_write_zeroes_seg; uint8_t write_zeroes_may_unmap; uint8_t unused1[3]; } __packed; /* * Fixed-size block header */ struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 #define VBH_OP_SCSI_CMD 2 #define VBH_OP_SCSI_CMD_OUT 3 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 #define VBH_OP_DISCARD 11 #define VBH_OP_WRITE_ZEROES 13 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; /* * Debug printf */ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtblk_ioreq { struct blockif_req io_req; struct pci_vtblk_softc *io_sc; uint8_t *io_status; uint16_t io_idx; }; struct virtio_blk_discard_write_zeroes { uint64_t sector; uint32_t num_sectors; struct { uint32_t unmap:1; uint32_t reserved:31; } flags; }; /* * Per-device softc */ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; struct virtio_consts vbsc_consts; struct blockif_ctxt *bc; char vbsc_ident[VTBLK_BLK_ID_BYTES]; struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); #ifdef BHYVE_SNAPSHOT static void pci_vtblk_pause(void *); static void pci_vtblk_resume(void *); static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *); #endif static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ 1, /* we support 1 virtqueue */ sizeof(struct vtblk_config), /* config reg size */ pci_vtblk_reset, /* reset */ pci_vtblk_notify, /* device-wide qnotify */ pci_vtblk_cfgread, /* read PCI config */ pci_vtblk_cfgwrite, /* write PCI config */ NULL, /* apply negotiated features */ VTBLK_S_HOSTCAPS, /* our capabilities */ #ifdef BHYVE_SNAPSHOT pci_vtblk_pause, /* pause blockif threads */ pci_vtblk_resume, /* resume blockif threads */ pci_vtblk_snapshot, /* save / restore device state */ #endif }; static void pci_vtblk_reset(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device reset requested !")); vi_reset_dev(&sc->vbsc_vs); } static void pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) { struct pci_vtblk_softc *sc = io->io_sc; /* convert errno into a virtio block error return */ if (err == EOPNOTSUPP || err == ENOSYS) *io->io_status = VTBLK_S_UNSUPP; else if (err != 0) *io->io_status = VTBLK_S_IOERR; else *io->io_status = VTBLK_S_OK; /* * Return the descriptor back to the host. * We wrote 1 byte (our status) to host. 
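 * vq_relchain() therefore reports a used length of 1, and * vq_endchains() interrupts the guest only if appropriate. */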
vq_relchain(&sc->vbsc_vq, io->io_idx, 1); vq_endchains(&sc->vbsc_vq, 0); } #ifdef BHYVE_SNAPSHOT static void pci_vtblk_pause(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device pause requested !\n")); blockif_pause(sc->bc); } static void pci_vtblk_resume(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device resume requested !\n")); blockif_resume(sc->bc); } static int pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta) { int ret; struct pci_vtblk_softc *sc = vsc; SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident), meta, ret, done); done: return (ret); } #endif static void pci_vtblk_done(struct blockif_req *br, int err) { struct pci_vtblk_ioreq *io = br->br_param; struct pci_vtblk_softc *sc = io->io_sc; pthread_mutex_lock(&sc->vsc_mtx); pci_vtblk_done_locked(io, err); pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; struct pci_vtblk_ioreq *io; int i, n; int err; ssize_t iolen; int writeop, type; struct iovec iov[BLOCKIF_IOV_MAX + 2]; uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; struct virtio_blk_discard_write_zeroes *discard; n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); /* * The first descriptor will be the read-only fixed header, * and the last is for status (hence +2 above and below). * The remaining iov's are the actual data I/O vectors. * * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); io = &sc->vbsc_ios[idx]; assert((flags[0] & VRING_DESC_F_WRITE) == 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); vbh = (struct virtio_blk_hdr *)iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; io->io_status = (uint8_t *)iov[--n].iov_base; assert(iov[n].iov_len == 1); assert(flags[n] & VRING_DESC_F_WRITE); /* * XXX * The guest should not be setting the BARRIER flag because * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); iolen = 0; for (i = 1; i < n; i++) { /* * - write op implies read-only descriptor, * - read/ident op implies write-only descriptor, * therefore test the inverse of the descriptor bit * to the op. */ assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); iolen += iov[i].iov_len; } io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld", writeop ? "write/discard" : "read/ident", iolen, i - 1, io->io_req.br_offset)); switch (type) { case VBH_OP_READ: err = blockif_read(sc->bc, &io->io_req); break; case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; case VBH_OP_DISCARD: /* * We currently only support a single request; if the guest * has submitted a request that doesn't conform to the * requirements, we return an error. */ if (iov[1].iov_len != sizeof (*discard)) { pci_vtblk_done_locked(io, EINVAL); return; } /* The segments to discard are provided rather than data */ discard = (struct virtio_blk_discard_write_zeroes *) iov[1].iov_base; /* * virtio v1.1 5.2.6.2: * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP * for discard and write zeroes commands if any unknown flag is * set.
Furthermore, the device MUST set the status byte to * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag * is set. * * Currently there are no known flags for a DISCARD request. */ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { pci_vtblk_done_locked(io, ENOTSUP); return; } /* Make sure the request doesn't exceed our size limit */ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { pci_vtblk_done_locked(io, EINVAL); return; } io->io_req.br_offset = discard->sector * VTBLK_BSIZE; io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; err = blockif_delete(sc->bc, &io->io_req); break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ /* S/n equal to buffer is not zero-terminated. */ memset(iov[1].iov_base, 0, iov[1].iov_len); strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); pci_vtblk_done_locked(io, 0); return; default: pci_vtblk_done_locked(io, EOPNOTSUPP); return; } assert(err == 0); } static void pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); } static int -pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { char bident[sizeof("XX:X:X")]; struct blockif_ctxt *bctxt; + const char *path; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; off_t size; int i, sectsz, sts, sto; - if (opts == NULL) { - WPRINTF(("virtio-block: backing device required")); - return (1); - } - /* * The supplied backing file has to exist */ snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); - bctxt = blockif_open(opts, bident); + bctxt = blockif_open(nvl, bident); if (bctxt == NULL) { perror("Could not open backing file"); return (1); } size = blockif_size(bctxt); sectsz = blockif_sectsz(bctxt); blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); sc->bc = bctxt; for (i = 0; i < VTBLK_RINGSZ; i++) { struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; io->io_req.br_callback = pci_vtblk_done; io->io_req.br_param = io; io->io_sc = sc; io->io_idx = i; } bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); if (blockif_candelete(sc->bc)) sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ /* * Create an identifier for the backing file. Use parts of the * md5 sum of the filename */ + path = get_config_value_node(nvl, "path"); MD5Init(&mdctx); - MD5Update(&mdctx, opts, strlen(opts)); + MD5Update(&mdctx, path, strlen(path)); MD5Final(digest, &mdctx); snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ /* * If Linux is presented with a seg_max greater than the virtio queue * size, it can stumble into situations where it violates its own * invariants and panics. For safety, we keep seg_max clamped, paying * heed to the two extra descriptors needed for the header and status * of a request. 
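 * With VTBLK_RINGSZ at 128 this advertises a seg_max of at most * 126 data segments per request.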
*/ sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ sc->vbsc_cfg.vbc_geometry.heads = 0; sc->vbsc_cfg.vbc_geometry.sectors = 0; sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_topology.physical_block_exp = (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; sc->vbsc_cfg.vbc_topology.alignment_offset = (sto != 0) ? ((sts - sto) / sectsz) : 0; sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; sc->vbsc_cfg.discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could * have the device, class, and subdev_0 as fields in * the virtio constants structure. */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_BLOCK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { blockif_close(sc->bc); free(sc); return (1); } vi_set_io_bar(&sc->vbsc_vs, 0); return (0); } static int pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) { DPRINTF(("vtblk: write to readonly reg %d", offset)); return (1); } static int pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtblk_softc *sc = vsc; void *ptr; /* our caller has already verified offset and size */ ptr = (uint8_t *)&sc->vbsc_cfg + offset; memcpy(retval, ptr, size); return (0); } struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, + .pe_legacy_config = blockif_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, #endif }; PCI_EMUL_SET(pci_de_vblk); diff --git a/usr.sbin/bhyve/pci_virtio_console.c b/usr.sbin/bhyve/pci_virtio_console.c index 1911210fa046..88d6c37f582e 100644 --- a/usr.sbin/bhyve/pci_virtio_console.c +++ b/usr.sbin/bhyve/pci_virtio_console.c @@ -1,682 +1,759 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 iXsystems Inc. * All rights reserved. * * This software was developed by Jakub Klama * under sponsorship from iXsystems Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "mevent.h" #include "sockstream.h" #define VTCON_RINGSZ 64 #define VTCON_MAXPORTS 16 #define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) #define VTCON_DEVICE_READY 0 #define VTCON_DEVICE_ADD 1 #define VTCON_DEVICE_REMOVE 2 #define VTCON_PORT_READY 3 #define VTCON_CONSOLE_PORT 4 #define VTCON_CONSOLE_RESIZE 5 #define VTCON_PORT_OPEN 6 #define VTCON_PORT_NAME 7 #define VTCON_F_SIZE 0 #define VTCON_F_MULTIPORT 1 #define VTCON_F_EMERG_WRITE 2 #define VTCON_S_HOSTCAPS \ (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) static int pci_vtcon_debug; #define DPRINTF(params) if (pci_vtcon_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtcon_softc; struct pci_vtcon_port; struct pci_vtcon_config; typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *, int); struct pci_vtcon_port { struct pci_vtcon_softc * vsp_sc; int vsp_id; const char * vsp_name; bool vsp_enabled; bool vsp_console; bool vsp_rx_ready; bool vsp_open; int vsp_rxq; int vsp_txq; void * vsp_arg; pci_vtcon_cb_t * vsp_cb; }; struct pci_vtcon_sock { struct pci_vtcon_port * vss_port; const char * vss_path; struct mevent * vss_server_evp; struct mevent * vss_conn_evp; int vss_server_fd; int vss_conn_fd; bool vss_open; }; struct pci_vtcon_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTCON_MAXQ]; pthread_mutex_t vsc_mtx; uint64_t vsc_cfg; uint64_t vsc_features; char * vsc_rootdir; int vsc_kq; - int vsc_nports; bool vsc_ready; struct pci_vtcon_port vsc_control_port; struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS]; struct pci_vtcon_config *vsc_config; }; struct pci_vtcon_config { uint16_t cols; uint16_t rows; uint32_t max_nr_ports; uint32_t emerg_wr; } __attribute__((packed)); struct pci_vtcon_control { uint32_t id; uint16_t event; uint16_t value; } __attribute__((packed)); struct pci_vtcon_console_resize { uint16_t cols; uint16_t rows; } __attribute__((packed)); static void pci_vtcon_reset(void *); static void pci_vtcon_notify_rx(void *, struct vqueue_info *); static void pci_vtcon_notify_tx(void *, struct vqueue_info *); static int pci_vtcon_cfgread(void *, int, int, uint32_t *); static int pci_vtcon_cfgwrite(void *, int, int, uint32_t); static void pci_vtcon_neg_features(void *, uint64_t); static void pci_vtcon_sock_accept(int, enum ev_type, void *); static void pci_vtcon_sock_rx(int, enum ev_type, void *); static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *, int); static void pci_vtcon_control_send(struct pci_vtcon_softc *, struct pci_vtcon_control *, const void *, size_t); static void pci_vtcon_announce_port(struct pci_vtcon_port *); static void 
pci_vtcon_open_port(struct pci_vtcon_port *, bool); static struct virtio_consts vtcon_vi_consts = { "vtcon", /* our name */ VTCON_MAXQ, /* we support VTCON_MAXQ virtqueues */ sizeof(struct pci_vtcon_config), /* config reg size */ pci_vtcon_reset, /* reset */ NULL, /* device-wide qnotify */ pci_vtcon_cfgread, /* read virtio config */ pci_vtcon_cfgwrite, /* write virtio config */ pci_vtcon_neg_features, /* apply negotiated features */ VTCON_S_HOSTCAPS, /* our capabilities */ }; static void pci_vtcon_reset(void *vsc) { struct pci_vtcon_softc *sc; sc = vsc; DPRINTF(("vtcon: device reset requested!")); vi_reset_dev(&sc->vsc_vs); } static void pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtcon_softc *sc = vsc; sc->vsc_features = negotiated_features; } static int pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtcon_softc *sc = vsc; void *ptr; ptr = (uint8_t *)sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val) { return (0); } static inline struct pci_vtcon_port * pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq) { uint16_t num = vq->vq_num; if (num == 0 || num == 1) return (&sc->vsc_ports[0]); if (num == 2 || num == 3) return (&sc->vsc_control_port); return (&sc->vsc_ports[(num / 2) - 1]); } static inline struct vqueue_info * pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue) { int qnum; qnum = tx_queue ? port->vsp_txq : port->vsp_rxq; return (&port->vsp_sc->vsc_queues[qnum]); } static struct pci_vtcon_port * -pci_vtcon_port_add(struct pci_vtcon_softc *sc, const char *name, +pci_vtcon_port_add(struct pci_vtcon_softc *sc, int port_id, const char *name, pci_vtcon_cb_t *cb, void *arg) { struct pci_vtcon_port *port; - if (sc->vsc_nports == VTCON_MAXPORTS) { + port = &sc->vsc_ports[port_id]; + if (port->vsp_enabled) { errno = EBUSY; return (NULL); } - - port = &sc->vsc_ports[sc->vsc_nports++]; - port->vsp_id = sc->vsc_nports - 1; + port->vsp_id = port_id; port->vsp_sc = sc; port->vsp_name = name; port->vsp_cb = cb; port->vsp_arg = arg; if (port->vsp_id == 0) { /* port0 */ port->vsp_txq = 0; port->vsp_rxq = 1; } else { - port->vsp_txq = sc->vsc_nports * 2; + port->vsp_txq = (port_id + 1) * 2; port->vsp_rxq = port->vsp_txq + 1; } port->vsp_enabled = true; return (port); } static int -pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name, - const char *path) +pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *port_name, + const nvlist_t *nvl) { struct pci_vtcon_sock *sock; struct sockaddr_un sun; - char *pathcopy; + const char *name, *path; + char *cp, *pathcopy; + long port; int s = -1, fd = -1, error = 0; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif + port = strtol(port_name, &cp, 0); + if (*cp != '\0' || port < 0 || port >= VTCON_MAXPORTS) { + EPRINTLN("vtcon: Invalid port %s", port_name); + error = -1; + goto out; + } + + path = get_config_value_node(nvl, "path"); + if (path == NULL) { + EPRINTLN("vtcon: required path missing for port %ld", port); + error = -1; + goto out; + } + sock = calloc(1, sizeof(struct pci_vtcon_sock)); if (sock == NULL) { error = -1; goto out; } s = socket(AF_UNIX, SOCK_STREAM, 0); if (s < 0) { error = -1; goto out; } pathcopy = strdup(path); if (pathcopy == NULL) { error = -1; goto out; } fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY); if (fd < 0) { free(pathcopy); error = -1; goto out; } sun.sun_family = AF_UNIX; sun.sun_len = sizeof(struct 
sockaddr_un); strcpy(pathcopy, path); strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); free(pathcopy); if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { error = -1; goto out; } if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) { error = -1; goto out; } if (listen(s, 1) < 0) { error = -1; goto out; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(s, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif - sock->vss_port = pci_vtcon_port_add(sc, name, pci_vtcon_sock_tx, sock); + name = get_config_value_node(nvl, "name"); + if (name == NULL) { + EPRINTLN("vtcon: required name missing for port %ld", port); + error = -1; + goto out; + } + sock->vss_port = pci_vtcon_port_add(sc, port, name, pci_vtcon_sock_tx, sock); if (sock->vss_port == NULL) { error = -1; goto out; } sock->vss_open = false; sock->vss_conn_fd = -1; sock->vss_server_fd = s; sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept, sock); if (sock->vss_server_evp == NULL) { error = -1; goto out; } out: if (fd != -1) close(fd); if (error != 0) { if (s != -1) close(s); free(sock); } return (error); } static void pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg) { struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; int s; s = accept(sock->vss_server_fd, NULL, NULL); if (s < 0) return; if (sock->vss_open) { close(s); return; } sock->vss_open = true; sock->vss_conn_fd = s; sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock); pci_vtcon_open_port(sock->vss_port, true); } static void pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg) { struct pci_vtcon_port *port; struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; struct vqueue_info *vq; struct iovec iov; static char dummybuf[2048]; int len, n; uint16_t idx; port = sock->vss_port; vq = pci_vtcon_port_to_vq(port, true); if (!sock->vss_open || !port->vsp_rx_ready) { len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); if (len == 0) goto close; return; } if (!vq_has_descs(vq)) { len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); vq_endchains(vq, 1); if (len == 0) goto close; return; } do { n = vq_getchain(vq, &idx, &iov, 1, NULL); len = readv(sock->vss_conn_fd, &iov, n); if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) { vq_retchains(vq, 1); vq_endchains(vq, 0); if (len == 0) goto close; return; } vq_relchain(vq, idx, len); } while (vq_has_descs(vq)); vq_endchains(vq, 1); close: mevent_delete_close(sock->vss_conn_evp); sock->vss_conn_fd = -1; sock->vss_open = false; } static void pci_vtcon_sock_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, int niov) { struct pci_vtcon_sock *sock; int i, ret; sock = (struct pci_vtcon_sock *)arg; if (sock->vss_conn_fd == -1) return; for (i = 0; i < niov; i++) { ret = stream_write(sock->vss_conn_fd, iov[i].iov_base, iov[i].iov_len); if (ret <= 0) break; } if (ret <= 0) { mevent_delete_close(sock->vss_conn_evp); sock->vss_conn_fd = -1; sock->vss_open = false; } } static void pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, int niov) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *tmp; struct pci_vtcon_control resp, *ctrl; int i; assert(niov == 1); sc = port->vsp_sc; ctrl = (struct pci_vtcon_control *)iov->iov_base; switch (ctrl->event) { case VTCON_DEVICE_READY: sc->vsc_ready = true; /* set port ready events for registered ports */ for (i = 0; i < VTCON_MAXPORTS; i++) { tmp = &sc->vsc_ports[i]; 
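		/* * Replay the add/name announcements, and any pending opens, * for ports registered before the guest became ready. */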
if (tmp->vsp_enabled) pci_vtcon_announce_port(tmp); if (tmp->vsp_open) pci_vtcon_open_port(tmp, true); } break; case VTCON_PORT_READY: - if (ctrl->id >= sc->vsc_nports) { + tmp = &sc->vsc_ports[ctrl->id]; + if (ctrl->id >= VTCON_MAXPORTS || !tmp->vsp_enabled) { WPRINTF(("VTCON_PORT_READY event for unknown port %d", ctrl->id)); return; } - tmp = &sc->vsc_ports[ctrl->id]; if (tmp->vsp_console) { resp.event = VTCON_CONSOLE_PORT; resp.id = ctrl->id; resp.value = 1; pci_vtcon_control_send(sc, &resp, NULL, 0); } break; } } static void pci_vtcon_announce_port(struct pci_vtcon_port *port) { struct pci_vtcon_control event; event.id = port->vsp_id; event.event = VTCON_DEVICE_ADD; event.value = 1; pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); event.event = VTCON_PORT_NAME; pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name, strlen(port->vsp_name)); } static void pci_vtcon_open_port(struct pci_vtcon_port *port, bool open) { struct pci_vtcon_control event; if (!port->vsp_sc->vsc_ready) { port->vsp_open = true; return; } event.id = port->vsp_id; event.event = VTCON_PORT_OPEN; event.value = (int)open; pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); } static void pci_vtcon_control_send(struct pci_vtcon_softc *sc, struct pci_vtcon_control *ctrl, const void *payload, size_t len) { struct vqueue_info *vq; struct iovec iov; uint16_t idx; int n; vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true); if (!vq_has_descs(vq)) return; n = vq_getchain(vq, &idx, &iov, 1, NULL); assert(n == 1); memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control)); if (payload != NULL && len > 0) memcpy(iov.iov_base + sizeof(struct pci_vtcon_control), payload, len); vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len); vq_endchains(vq, 1); } static void pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *port; struct iovec iov[1]; uint16_t idx, n; uint16_t flags[8]; sc = vsc; port = pci_vtcon_vq_to_port(sc, vq); while (vq_has_descs(vq)) { n = vq_getchain(vq, &idx, iov, 1, flags); assert(n >= 1); if (port != NULL) port->vsp_cb(port, port->vsp_arg, iov, 1); /* * Release this chain and handle more */ vq_relchain(vq, idx, 0); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ } static void pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *port; sc = vsc; port = pci_vtcon_vq_to_port(sc, vq); if (!port->vsp_rx_ready) { port->vsp_rx_ready = 1; vq_kick_disable(vq); } } +/* + * Each console device has a "port" node which contains nodes for + * each port. Ports are numbered starting at 0. 
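+ * + * For example, a hypothetical legacy option string of + * "org.port1=/tmp/port1.sock" is translated into the nodes + * "port.0.name" and "port.0.path" under the device's config node.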
+ */ static int -pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_vtcon_legacy_config_port(nvlist_t *nvl, int port, char *opt) +{ + char *name, *path; + char node_name[sizeof("XX")]; + nvlist_t *port_nvl; + + name = strsep(&opt, "="); + path = opt; + if (path == NULL) { + EPRINTLN("vtcon: port %s requires a path", name); + return (-1); + } + if (port >= VTCON_MAXPORTS) { + EPRINTLN("vtcon: too many ports"); + return (-1); + } + snprintf(node_name, sizeof(node_name), "%d", port); + port_nvl = create_relative_config_node(nvl, node_name); + set_config_value_node(port_nvl, "name", name); + set_config_value_node(port_nvl, "path", path); + return (0); +} + +static int +pci_vtcon_legacy_config(nvlist_t *nvl, const char *opts) +{ + char *opt, *str, *tofree; + nvlist_t *ports_nvl; + int error, port; + + if (opts == NULL) + return (0); + + ports_nvl = create_relative_config_node(nvl, "port"); + tofree = str = strdup(opts); + error = 0; + port = 0; + while ((opt = strsep(&str, ",")) != NULL) { + error = pci_vtcon_legacy_config_port(ports_nvl, port, opt); + if (error) + break; + port++; + } + free(tofree); + return (error); +} + +static int +pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtcon_softc *sc; - char *portname = NULL; - char *portpath = NULL; - char *opt; - int i; + nvlist_t *ports_nvl; + int i; sc = calloc(1, sizeof(struct pci_vtcon_softc)); sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config)); sc->vsc_config->max_nr_ports = VTCON_MAXPORTS; sc->vsc_config->cols = 80; sc->vsc_config->rows = 25; vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; for (i = 0; i < VTCON_MAXQ; i++) { sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ; sc->vsc_queues[i].vq_notify = i % 2 == 0 ?
pci_vtcon_notify_rx : pci_vtcon_notify_tx; } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_CONSOLE); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_CONSOLE); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vsc_vs, 0); /* create control port */ sc->vsc_control_port.vsp_sc = sc; sc->vsc_control_port.vsp_txq = 2; sc->vsc_control_port.vsp_rxq = 3; sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx; sc->vsc_control_port.vsp_enabled = true; - while ((opt = strsep(&opts, ",")) != NULL) { - portname = strsep(&opt, "="); - portpath = opt; - - /* create port */ - if (pci_vtcon_sock_add(sc, portname, portpath) < 0) { - EPRINTLN("cannot create port %s: %s", - portname, strerror(errno)); - return (1); + ports_nvl = find_relative_config_node(nvl, "port"); + if (ports_nvl != NULL) { + const char *name; + void *cookie; + int type; + + cookie = NULL; + while ((name = nvlist_next(ports_nvl, &type, &cookie)) != + NULL) { + if (type != NV_TYPE_NVLIST) + continue; + + if (pci_vtcon_sock_add(sc, name, + nvlist_get_nvlist(ports_nvl, name)) < 0) { + EPRINTLN("cannot create port %s: %s", + name, strerror(errno)); + return (1); + } } } return (0); } struct pci_devemu pci_de_vcon = { .pe_emu = "virtio-console", .pe_init = pci_vtcon_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vcon); diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c index 1c8af094b6dc..0ea470a71b56 100644 --- a/usr.sbin/bhyve/pci_virtio_net.c +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -1,850 +1,817 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "pci_emul.h" #include "mevent.h" #include "virtio.h" #include "net_utils.h" #include "net_backends.h" #include "iov.h" #define VTNET_RINGSZ 1024 #define VTNET_MAXSEGS 256 #define VTNET_MAX_PKT_LEN (65536 + 64) #define VTNET_MIN_MTU ETHERMIN #define VTNET_MAX_MTU 65535 #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \ VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) /* * PCI config-space "registers" */ struct virtio_net_config { uint8_t mac[6]; uint16_t status; uint16_t max_virtqueue_pairs; uint16_t mtu; } __packed; /* * Queue definitions. */ #define VTNET_RXQ 0 #define VTNET_TXQ 1 #define VTNET_CTLQ 2 /* NB: not yet supported */ #define VTNET_MAXQ 3 /* * Debug printf */ static int pci_vtnet_debug; #define DPRINTF(params) if (pci_vtnet_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* * Per-device softc */ struct pci_vtnet_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; pthread_mutex_t vsc_mtx; net_backend_t *vsc_be; bool features_negotiated; /* protected by rx_mtx */ int resetting; /* protected by tx_mtx */ uint64_t vsc_features; /* negotiated features */ pthread_mutex_t rx_mtx; int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; size_t vhdrlen; size_t be_vhdrlen; struct virtio_net_config vsc_config; struct virtio_consts vsc_consts; }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); static void pci_vtnet_neg_features(void *, uint64_t); #ifdef BHYVE_SNAPSHOT static void pci_vtnet_pause(void *); static void pci_vtnet_resume(void *); static int pci_vtnet_snapshot(void *, struct vm_snapshot_meta *); #endif static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ sizeof(struct virtio_net_config), /* config reg size */ pci_vtnet_reset, /* reset */ NULL, /* device-wide qnotify -- not used */ pci_vtnet_cfgread, /* read PCI config */ pci_vtnet_cfgwrite, /* write PCI config */ pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ #ifdef BHYVE_SNAPSHOT pci_vtnet_pause, /* pause rx/tx threads */ pci_vtnet_resume, /* resume rx/tx threads */ pci_vtnet_snapshot, /* save / restore device state */ #endif }; static void pci_vtnet_reset(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device reset requested !")); /* Acquire the RX lock to block RX processing. */ pthread_mutex_lock(&sc->rx_mtx); /* * Make sure receive operation is disabled at least until we * re-negotiate the features, since receive operation depends * on the value of sc->rx_merge and the header length, which * are both set in pci_vtnet_neg_features(). * Receive operation will be enabled again once the guest adds * the first receive buffers and kicks us. */ sc->features_negotiated = false; netbe_rx_disable(sc->vsc_be); /* Set sc->resetting and give a chance to the TX thread to stop. 
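 * Dropping the lock and sleeping below gives an in-flight transmit * a chance to finish before the rings are reset.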
*/ pthread_mutex_lock(&sc->tx_mtx); sc->resetting = 1; while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } /* * Now reset rings, MSI-X vectors, and negotiated capabilities. * Do that with the TX lock held, since we need to reset * sc->resetting. */ vi_reset_dev(&sc->vsc_vs); sc->resetting = 0; pthread_mutex_unlock(&sc->tx_mtx); pthread_mutex_unlock(&sc->rx_mtx); } static __inline struct iovec * iov_trim_hdr(struct iovec *iov, int *iovcnt, unsigned int hlen) { struct iovec *riov; if (iov[0].iov_len < hlen) { /* * Not enough header space in the first fragment. * That's not ok for us. */ return NULL; } iov[0].iov_len -= hlen; if (iov[0].iov_len == 0) { *iovcnt -= 1; if (*iovcnt == 0) { /* * Only space for the header. That's not * enough for us. */ return NULL; } riov = &iov[1]; } else { iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + hlen); riov = &iov[0]; } return (riov); } struct virtio_mrg_rxbuf_info { uint16_t idx; uint16_t pad; uint32_t len; }; static void pci_vtnet_rx(struct pci_vtnet_softc *sc) { int prepend_hdr_len = sc->vhdrlen - sc->be_vhdrlen; struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS]; struct iovec iov[VTNET_MAXSEGS + 1]; struct vqueue_info *vq; vq = &sc->vsc_queues[VTNET_RXQ]; /* Features must be negotiated */ if (!sc->features_negotiated) { return; } for (;;) { struct virtio_net_rxhdr *hdr; uint32_t riov_bytes; struct iovec *riov; uint32_t ulen; int riov_len; int n_chains; ssize_t rlen; ssize_t plen; plen = netbe_peek_recvlen(sc->vsc_be); if (plen <= 0) { /* * No more packets (plen == 0), or backend errored * (plen < 0). Interrupt if needed and stop. */ vq_endchains(vq, /*used_all_avail=*/0); return; } plen += prepend_hdr_len; /* * Get a descriptor chain to store the next ingress * packet. In case of mergeable rx buffers, get as * many chains as necessary in order to make room * for plen bytes. */ riov_bytes = 0; riov_len = 0; riov = iov; n_chains = 0; do { int n = vq_getchain(vq, &info[n_chains].idx, riov, VTNET_MAXSEGS - riov_len, NULL); if (n == 0) { /* * No rx buffers. Enable RX kicks and double * check. */ vq_kick_enable(vq); if (!vq_has_descs(vq)) { /* * Still no buffers. Return the unused * chains (if any), interrupt if needed * (including for NOTIFY_ON_EMPTY), and * disable the backend until the next * kick. */ vq_retchains(vq, n_chains); vq_endchains(vq, /*used_all_avail=*/1); netbe_rx_disable(sc->vsc_be); return; } /* More rx buffers found, so keep going. */ vq_kick_disable(vq); continue; } assert(n >= 1 && riov_len + n <= VTNET_MAXSEGS); riov_len += n; if (!sc->rx_merge) { n_chains = 1; break; } info[n_chains].len = (uint32_t)count_iov(riov, n); riov_bytes += info[n_chains].len; riov += n; n_chains++; } while (riov_bytes < plen && riov_len < VTNET_MAXSEGS); riov = iov; hdr = riov[0].iov_base; if (prepend_hdr_len > 0) { /* * The frontend uses a virtio-net header, but the * backend does not. We need to prepend a zeroed * header. */ riov = iov_trim_hdr(riov, &riov_len, prepend_hdr_len); if (riov == NULL) { /* * The first collected chain is nonsensical, * as it is not even enough to store the * virtio-net header. Just drop it. */ vq_relchain(vq, info[0].idx, 0); vq_retchains(vq, n_chains - 1); continue; } memset(hdr, 0, prepend_hdr_len); } rlen = netbe_recv(sc->vsc_be, riov, riov_len); if (rlen != plen - prepend_hdr_len) { /* * If this happens it means there is something * wrong with the backend (e.g., some other * process is stealing our packets). 
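A quick usage sketch for iov_trim_hdr() above (illustrative only, and assumes it is compiled in the same unit): trimming a header that exactly consumes the first segment drops that segment from the returned chain.

#include <assert.h>
#include <sys/uio.h>

static void
trim_example(void)
{
	char hdr[10], payload[90];
	struct iovec iov[2] = {
		{ .iov_base = hdr, .iov_len = sizeof(hdr) },
		{ .iov_base = payload, .iov_len = sizeof(payload) },
	};
	int iovcnt = 2;
	struct iovec *riov;

	/* a 10-byte header exactly covers iov[0], so it is dropped */
	riov = iov_trim_hdr(iov, &iovcnt, 10);
	assert(riov == &iov[1] && iovcnt == 1);
}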
*/ WPRINTF(("netbe_recv: expected %zd bytes, " "got %zd", plen - prepend_hdr_len, rlen)); vq_retchains(vq, n_chains); continue; } ulen = (uint32_t)plen; /* * Publish the used buffers to the guest, reporting the * number of bytes that we wrote. */ if (!sc->rx_merge) { vq_relchain(vq, info[0].idx, ulen); } else { uint32_t iolen; int i = 0; do { iolen = info[i].len; if (iolen > ulen) { iolen = ulen; } vq_relchain_prepare(vq, info[i].idx, iolen); ulen -= iolen; i++; } while (ulen > 0); hdr->vrh_bufs = i; vq_relchain_publish(vq); assert(i == n_chains); } } } /* * Called when there is read activity on the backend file descriptor. * Each buffer posted by the guest is assumed to be able to contain * an entire ethernet frame + rx header. */ static void pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); pci_vtnet_rx(sc); pthread_mutex_unlock(&sc->rx_mtx); } /* Called on RX kick. */ static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * A qnotify means that the rx process can now begin. * Enable RX only if features are negotiated. */ pthread_mutex_lock(&sc->rx_mtx); if (!sc->features_negotiated) { pthread_mutex_unlock(&sc->rx_mtx); return; } vq_kick_disable(vq); netbe_rx_enable(sc->vsc_be); pthread_mutex_unlock(&sc->rx_mtx); } /* TX virtqueue processing, called by the TX thread. */ static void pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) { struct iovec iov[VTNET_MAXSEGS + 1]; struct iovec *siov = iov; uint16_t idx; ssize_t len; int n; /* * Obtain chain of descriptors. The first descriptor also * contains the virtio-net header. */ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); if (sc->vhdrlen != sc->be_vhdrlen) { /* * The frontend uses a virtio-net header, but the backend * does not. We simply strip the header and ignore it, as * it should be zero-filled. */ siov = iov_trim_hdr(siov, &n, sc->vhdrlen); } if (siov == NULL) { /* The chain is nonsensical. Just drop it. */ len = 0; } else { len = netbe_send(sc->vsc_be, siov, n); if (len < 0) { /* * If send failed, report that 0 bytes * were read. */ len = 0; } } /* * Return the processed chain to the guest, reporting * the number of bytes that we read. */ vq_relchain(vq, idx, len); } /* Called on TX kick. */ static void pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * Any ring entries to process? 
*/ if (!vq_has_descs(vq)) return; /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); vq_kick_disable(vq); if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); } /* * Thread which will handle processing of TX desc */ static void * pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; int error; vq = &sc->vsc_queues[VTNET_TXQ]; /* * Let us wait till the tx queue pointers get initialised & * first tx signaled */ pthread_mutex_lock(&sc->tx_mtx); error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); for (;;) { /* note - tx mutex is locked here */ while (sc->resetting || !vq_has_descs(vq)) { vq_kick_enable(vq); if (!sc->resetting && vq_has_descs(vq)) break; sc->tx_in_progress = 0; error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); } vq_kick_disable(vq); sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); do { /* * Run through entries, placing them into * iovecs and sending when an end-of-packet * is found */ pci_vtnet_proctx(sc, vq); } while (vq_has_descs(vq)); /* * Generate an interrupt if needed. */ vq_endchains(vq, /*used_all_avail=*/1); pthread_mutex_lock(&sc->tx_mtx); } } #ifdef notyet static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!")); } #endif static int -pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtnet_softc *sc; + const char *value; char tname[MAXCOMLEN + 1]; - int mac_provided; - int mtu_provided; unsigned long mtu = ETHERMTU; + int err; /* * Allocate data structures for further virtio initializations. * sc also contains a copy of vtnet_vi_consts, since capabilities * change depending on the backend. */ sc = calloc(1, sizeof(struct pci_vtnet_softc)); sc->vsc_consts = vtnet_vi_consts; pthread_mutex_init(&sc->vsc_mtx, NULL); sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; #ifdef notyet sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif - - /* - * Attempt to open the backend device and read the MAC address - * if specified. - */ - mac_provided = 0; - mtu_provided = 0; - if (opts != NULL) { - char *optscopy; - char *vtopts; - int err = 0; - /* Get the device name. */ - optscopy = vtopts = strdup(opts); - (void) strsep(&vtopts, ","); - - /* - * Parse the list of options in the form - * key1=value1,...,keyN=valueN. 
- */ - while (vtopts != NULL) { - char *value = vtopts; - char *key; - - key = strsep(&value, "="); - if (value == NULL) - break; - vtopts = value; - (void) strsep(&vtopts, ","); - - if (strcmp(key, "mac") == 0) { - err = net_parsemac(value, sc->vsc_config.mac); - if (err) - break; - mac_provided = 1; - } else if (strcmp(key, "mtu") == 0) { - err = net_parsemtu(value, &mtu); - if (err) - break; - - if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) { - err = EINVAL; - errno = EINVAL; - break; - } - mtu_provided = 1; - } + value = get_config_value_node(nvl, "mac"); + if (value != NULL) { + err = net_parsemac(value, sc->vsc_config.mac); + if (err) { + free(sc); + return (err); } + } else + net_genmac(pi, sc->vsc_config.mac); - free(optscopy); - + value = get_config_value_node(nvl, "mtu"); + if (value != NULL) { + err = net_parsemtu(value, &mtu); if (err) { free(sc); return (err); } - err = netbe_init(&sc->vsc_be, opts, pci_vtnet_rx_callback, - sc); - - if (err) { + if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) { + err = EINVAL; + errno = EINVAL; free(sc); return (err); } - sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF | - netbe_get_cap(sc->vsc_be); + sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU; } + sc->vsc_config.mtu = mtu; - if (!mac_provided) { - net_genmac(pi, sc->vsc_config.mac); + /* Permit interfaces without a configured backend. */ + if (get_config_value_node(nvl, "backend") != NULL) { + err = netbe_init(&sc->vsc_be, nvl, pci_vtnet_rx_callback, sc); + if (err) { + free(sc); + return (err); + } } - sc->vsc_config.mtu = mtu; - if (mtu_provided) { - sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU; - } + sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF | + netbe_get_cap(sc->vsc_be); /* * Since we do not actually support multiqueue, * set the maximum virtqueue pairs to 1. */ sc->vsc_config.max_virtqueue_pairs = 1; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); - /* Link is up if we managed to open backend device. */ - sc->vsc_config.status = (opts == NULL || sc->vsc_be); + /* Link is always up. */ + sc->vsc_config.status = 1; vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) { free(sc); return (1); } /* use BAR 0 to map config regs in IO space */ vi_set_io_bar(&sc->vsc_vs, 0); sc->resetting = 0; sc->rx_merge = 0; sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; pthread_mutex_init(&sc->rx_mtx, NULL); /* * Initialize tx semaphore & spawn TX processing thread. * As of now, only one thread for TX desc processing is * spawned. 
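For reference, a hedged sketch of the device node consumed above; the key names ("mac", "mtu", "backend") come from the lookups in pci_vtnet_init(), while "tap0" and the values are only examples:

#include <sys/nv.h>

/* Roughly what parsing "-s 2,virtio-net,tap0,mac=58:9c:fc:00:00:01,mtu=9000"
 * would hand to pci_vtnet_init(); illustrative, not a bhyve API. */
static nvlist_t *
example_vtnet_node(void)
{
	nvlist_t *nvl = nvlist_create(0);

	nvlist_add_string(nvl, "backend", "tap0");
	nvlist_add_string(nvl, "mac", "58:9c:fc:00:00:01");
	nvlist_add_string(nvl, "mtu", "9000");
	return (nvl);
}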
*/ sc->tx_in_progress = 0; pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->tx_tid, tname); return (0); } static int pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) { struct pci_vtnet_softc *sc = vsc; void *ptr; if (offset < (int)sizeof(sc->vsc_config.mac)) { assert(offset + size <= (int)sizeof(sc->vsc_config.mac)); /* * The driver is allowed to change the MAC address */ ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d", offset)); } return (0); } static int pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtnet_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static void pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtnet_softc *sc = vsc; sc->vsc_features = negotiated_features; if (negotiated_features & VIRTIO_NET_F_MRG_RXBUF) { sc->vhdrlen = sizeof(struct virtio_net_rxhdr); sc->rx_merge = 1; } else { /* * Without mergeable rx buffers, virtio-net header is 2 * bytes shorter than sizeof(struct virtio_net_rxhdr). */ sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; sc->rx_merge = 0; } /* Tell the backend to enable some capabilities it has advertised. */ netbe_set_cap(sc->vsc_be, negotiated_features, sc->vhdrlen); sc->be_vhdrlen = netbe_get_vnet_hdr_len(sc->vsc_be); assert(sc->be_vhdrlen == 0 || sc->be_vhdrlen == sc->vhdrlen); pthread_mutex_lock(&sc->rx_mtx); sc->features_negotiated = true; pthread_mutex_unlock(&sc->rx_mtx); } #ifdef BHYVE_SNAPSHOT static void pci_vtnet_pause(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device pause requested !\n")); /* Acquire the RX lock to block RX processing. */ pthread_mutex_lock(&sc->rx_mtx); /* Wait for the transmit thread to finish its processing. */ pthread_mutex_lock(&sc->tx_mtx); while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } } static void pci_vtnet_resume(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device resume requested !\n")); pthread_mutex_unlock(&sc->tx_mtx); /* The RX lock should have been acquired in vtnet_pause. */ pthread_mutex_unlock(&sc->rx_mtx); } static int pci_vtnet_snapshot(void *vsc, struct vm_snapshot_meta *meta) { int ret; struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device snapshot requested !\n")); /* * Queues and consts should have been saved by the more generic * vi_pci_snapshot function. We need to save only our features and * config. 
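The two-byte difference handled by pci_vtnet_neg_features() above comes from the num_buffers field, which exists only when mergeable rx buffers are negotiated. A self-contained illustration; field names follow the virtio spec, not bhyve's struct virtio_net_rxhdr:

#include <stdint.h>

struct vnet_hdr_legacy {		/* without VIRTIO_NET_F_MRG_RXBUF */
	uint8_t  flags;
	uint8_t  gso_type;
	uint16_t hdr_len;
	uint16_t gso_size;
	uint16_t csum_start;
	uint16_t csum_offset;
} __attribute__((packed));

struct vnet_hdr_mrg {			/* with VIRTIO_NET_F_MRG_RXBUF */
	struct vnet_hdr_legacy common;
	uint16_t num_buffers;
} __attribute__((packed));

_Static_assert(sizeof(struct vnet_hdr_mrg) ==
    sizeof(struct vnet_hdr_legacy) + 2, "mergeable header adds 2 bytes");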
*/ SNAPSHOT_VAR_OR_LEAVE(sc->vsc_features, meta, ret, done); /* Force reapply of negotiated features at restore time */ if (meta->op == VM_SNAPSHOT_RESTORE) { pci_vtnet_neg_features(sc, sc->vsc_features); netbe_rx_enable(sc->vsc_be); } SNAPSHOT_VAR_OR_LEAVE(sc->vsc_config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rx_merge, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->vhdrlen, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->be_vhdrlen, meta, ret, done); done: return (ret); } #endif static struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, + .pe_legacy_config = netbe_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, .pe_pause = vi_pci_pause, .pe_resume = vi_pci_resume, #endif }; PCI_EMUL_SET(pci_de_vnet); diff --git a/usr.sbin/bhyve/pci_virtio_rnd.c b/usr.sbin/bhyve/pci_virtio_rnd.c index 42feb7a2c4e0..d51301b32534 100644 --- a/usr.sbin/bhyve/pci_virtio_rnd.c +++ b/usr.sbin/bhyve/pci_virtio_rnd.c @@ -1,213 +1,213 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Nahanni Systems Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * virtio entropy device emulation. * Randomness is sourced from /dev/random which does not block * once it has been seeded at bootup.
*/ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #define VTRND_RINGSZ 64 static int pci_vtrnd_debug; #define DPRINTF(params) if (pci_vtrnd_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* * Per-device softc */ struct pci_vtrnd_softc { struct virtio_softc vrsc_vs; struct vqueue_info vrsc_vq; pthread_mutex_t vrsc_mtx; uint64_t vrsc_cfg; int vrsc_fd; }; static void pci_vtrnd_reset(void *); static void pci_vtrnd_notify(void *, struct vqueue_info *); static struct virtio_consts vtrnd_vi_consts = { "vtrnd", /* our name */ 1, /* we support 1 virtqueue */ 0, /* config reg size */ pci_vtrnd_reset, /* reset */ pci_vtrnd_notify, /* device-wide qnotify */ NULL, /* read virtio config */ NULL, /* write virtio config */ NULL, /* apply negotiated features */ 0, /* our capabilities */ }; static void pci_vtrnd_reset(void *vsc) { struct pci_vtrnd_softc *sc; sc = vsc; DPRINTF(("vtrnd: device reset requested !")); vi_reset_dev(&sc->vrsc_vs); } static void pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) { struct iovec iov; struct pci_vtrnd_softc *sc; int len; uint16_t idx; sc = vsc; if (sc->vrsc_fd < 0) { vq_endchains(vq, 0); return; } while (vq_has_descs(vq)) { vq_getchain(vq, &idx, &iov, 1, NULL); len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); DPRINTF(("vtrnd: vtrnd_notify(): %d", len)); /* Catastrophe if unable to read from /dev/random */ assert(len > 0); /* * Release this chain and handle more */ vq_relchain(vq, idx, len); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ } static int -pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtrnd_softc *sc; int fd; int len; uint8_t v; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif /* * Should always be able to open /dev/random. */ fd = open("/dev/random", O_RDONLY | O_NONBLOCK); assert(fd >= 0); #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_READ); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Check that device is seeded and non-blocking. 
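A standalone sketch of the probe performed below, under the assumption (stated in the header comment above) that /dev/random stops blocking once the kernel RNG is seeded:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	uint8_t v;
	int fd;

	fd = open("/dev/random", O_RDONLY | O_NONBLOCK);
	if (fd < 0 || read(fd, &v, sizeof(v)) != (ssize_t)sizeof(v)) {
		fprintf(stderr, "entropy device not ready\n");
		return (1);
	}
	close(fd);
	return (0);
}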
*/ len = read(fd, &v, sizeof(v)); if (len <= 0) { WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); close(fd); return (1); } sc = calloc(1, sizeof(struct pci_vtrnd_softc)); vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; /* keep /dev/random opened while emulating */ sc->vrsc_fd = fd; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_ENTROPY); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vrsc_vs, 0); return (0); } struct pci_devemu pci_de_vrnd = { .pe_emu = "virtio-rnd", .pe_init = pci_vtrnd_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, #endif }; PCI_EMUL_SET(pci_de_vrnd); diff --git a/usr.sbin/bhyve/pci_virtio_scsi.c b/usr.sbin/bhyve/pci_virtio_scsi.c index 1dd7b6ebcf0b..8edf64a11361 100644 --- a/usr.sbin/bhyve/pci_virtio_scsi.c +++ b/usr.sbin/bhyve/pci_virtio_scsi.c @@ -1,740 +1,745 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Jakub Klama . * Copyright (c) 2018 Marcelo Araujo . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "iov.h" #define VTSCSI_RINGSZ 64 #define VTSCSI_REQUESTQ 1 #define VTSCSI_THR_PER_Q 16 #define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) #define VTSCSI_MAXSEG 64 #define VTSCSI_IN_HEADER_LEN(_sc) \ (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) #define VTSCSI_OUT_HEADER_LEN(_sc) \ (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) #define VIRTIO_SCSI_MAX_CHANNEL 0 #define VIRTIO_SCSI_MAX_TARGET 0 #define VIRTIO_SCSI_MAX_LUN 16383 #define VIRTIO_SCSI_F_INOUT (1 << 0) #define VIRTIO_SCSI_F_HOTPLUG (1 << 1) #define VIRTIO_SCSI_F_CHANGE (1 << 2) static int pci_vtscsi_debug = 0; #define DPRINTF(params) if (pci_vtscsi_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtscsi_config { uint32_t num_queues; uint32_t seg_max; uint32_t max_sectors; uint32_t cmd_per_lun; uint32_t event_info_size; uint32_t sense_size; uint32_t cdb_size; uint16_t max_channel; uint16_t max_target; uint32_t max_lun; } __attribute__((packed)); struct pci_vtscsi_queue { struct pci_vtscsi_softc * vsq_sc; struct vqueue_info * vsq_vq; pthread_mutex_t vsq_mtx; pthread_mutex_t vsq_qmtx; pthread_cond_t vsq_cv; STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; LIST_HEAD(, pci_vtscsi_worker) vsq_workers; }; struct pci_vtscsi_worker { struct pci_vtscsi_queue * vsw_queue; pthread_t vsw_thread; bool vsw_exiting; LIST_ENTRY(pci_vtscsi_worker) vsw_link; }; struct pci_vtscsi_request { struct pci_vtscsi_queue * vsr_queue; struct iovec vsr_iov_in[VTSCSI_MAXSEG]; int vsr_niov_in; struct iovec vsr_iov_out[VTSCSI_MAXSEG]; int vsr_niov_out; uint32_t vsr_idx; STAILQ_ENTRY(pci_vtscsi_request) vsr_link; }; /* * Per-device softc */ struct pci_vtscsi_softc { struct virtio_softc vss_vs; struct vqueue_info vss_vq[VTSCSI_MAXQ]; struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; pthread_mutex_t vss_mtx; int vss_iid; int vss_ctl_fd; uint32_t vss_features; struct pci_vtscsi_config vss_config; }; #define VIRTIO_SCSI_T_TMF 0 #define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 #define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 #define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 #define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 #define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 #define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 #define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 #define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 /* command-specific response values */ #define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 #define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 #define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 struct pci_vtscsi_ctrl_tmf { uint32_t type; uint32_t subtype; uint8_t lun[8]; uint64_t id; uint8_t response; } __attribute__((packed)); #define VIRTIO_SCSI_T_AN_QUERY 1 #define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 #define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 #define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 #define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 #define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 #define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 struct pci_vtscsi_ctrl_an { uint32_t type; uint8_t lun[8]; uint32_t event_requested; uint32_t event_actual; uint8_t response; } __attribute__((packed)); /* command-specific response values */ #define VIRTIO_SCSI_S_OK 0 #define VIRTIO_SCSI_S_OVERRUN 1 #define VIRTIO_SCSI_S_ABORTED 
2 #define VIRTIO_SCSI_S_BAD_TARGET 3 #define VIRTIO_SCSI_S_RESET 4 #define VIRTIO_SCSI_S_BUSY 5 #define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 #define VIRTIO_SCSI_S_TARGET_FAILURE 7 #define VIRTIO_SCSI_S_NEXUS_FAILURE 8 #define VIRTIO_SCSI_S_FAILURE 9 #define VIRTIO_SCSI_S_INCORRECT_LUN 12 /* task_attr */ #define VIRTIO_SCSI_S_SIMPLE 0 #define VIRTIO_SCSI_S_ORDERED 1 #define VIRTIO_SCSI_S_HEAD 2 #define VIRTIO_SCSI_S_ACA 3 struct pci_vtscsi_event { uint32_t event; uint8_t lun[8]; uint32_t reason; } __attribute__((packed)); struct pci_vtscsi_req_cmd_rd { uint8_t lun[8]; uint64_t id; uint8_t task_attr; uint8_t prio; uint8_t crn; uint8_t cdb[]; } __attribute__((packed)); struct pci_vtscsi_req_cmd_wr { uint32_t sense_len; uint32_t residual; uint16_t status_qualifier; uint8_t status; uint8_t response; uint8_t sense[]; } __attribute__((packed)); static void *pci_vtscsi_proc(void *); static void pci_vtscsi_reset(void *); static void pci_vtscsi_neg_features(void *, uint64_t); static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); static inline int pci_vtscsi_get_lun(uint8_t *); static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, struct pci_vtscsi_ctrl_tmf *); static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, struct pci_vtscsi_ctrl_an *); static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, int, struct iovec *, int); static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, struct pci_vtscsi_queue *, int); -static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, char *); +static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, nvlist_t *); static struct virtio_consts vtscsi_vi_consts = { "vtscsi", /* our name */ VTSCSI_MAXQ, /* we support 2+n virtqueues */ sizeof(struct pci_vtscsi_config), /* config reg size */ pci_vtscsi_reset, /* reset */ NULL, /* device-wide qnotify */ pci_vtscsi_cfgread, /* read virtio config */ pci_vtscsi_cfgwrite, /* write virtio config */ pci_vtscsi_neg_features, /* apply negotiated features */ 0, /* our capabilities */ }; static void * pci_vtscsi_proc(void *arg) { struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; struct pci_vtscsi_queue *q = worker->vsw_queue; struct pci_vtscsi_request *req; int iolen; for (;;) { pthread_mutex_lock(&q->vsq_mtx); while (STAILQ_EMPTY(&q->vsq_requests) && !worker->vsw_exiting) pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); if (worker->vsw_exiting) break; req = STAILQ_FIRST(&q->vsq_requests); STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); pthread_mutex_unlock(&q->vsq_mtx); iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); pthread_mutex_lock(&q->vsq_qmtx); vq_relchain(q->vsq_vq, req->vsr_idx, iolen); vq_endchains(q->vsq_vq, 0); pthread_mutex_unlock(&q->vsq_qmtx); DPRINTF(("virtio-scsi: request completed", req->vsr_idx)); free(req); } pthread_mutex_unlock(&q->vsq_mtx); return (NULL); } static void pci_vtscsi_reset(void *vsc) { struct pci_vtscsi_softc *sc; sc = vsc; DPRINTF(("vtscsi: device reset requested")); vi_reset_dev(&sc->vss_vs); /* initialize config structure */ sc->vss_config = (struct pci_vtscsi_config){ .num_queues = VTSCSI_REQUESTQ, /* Leave room for the 
request and the response. */ .seg_max = VTSCSI_MAXSEG - 2, .max_sectors = 2, .cmd_per_lun = 1, .event_info_size = sizeof(struct pci_vtscsi_event), .sense_size = 96, .cdb_size = 32, .max_channel = VIRTIO_SCSI_MAX_CHANNEL, .max_target = VIRTIO_SCSI_MAX_TARGET, .max_lun = VIRTIO_SCSI_MAX_LUN }; } static void pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtscsi_softc *sc = vsc; sc->vss_features = negotiated_features; } static int pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtscsi_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vss_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val) { return (0); } static inline int pci_vtscsi_get_lun(uint8_t *lun) { return (((lun[2] << 8) | lun[3]) & 0x3fff); } static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, size_t bufsize) { struct pci_vtscsi_ctrl_tmf *tmf; struct pci_vtscsi_ctrl_an *an; uint32_t type; type = *(uint32_t *)buf; if (type == VIRTIO_SCSI_T_TMF) { tmf = (struct pci_vtscsi_ctrl_tmf *)buf; return (pci_vtscsi_tmf_handle(sc, tmf)); } if (type == VIRTIO_SCSI_T_AN_QUERY) { an = (struct pci_vtscsi_ctrl_an *)buf; return (pci_vtscsi_an_handle(sc, an)); } return (0); } static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, struct pci_vtscsi_ctrl_tmf *tmf) { union ctl_io *io; int err; io = ctl_scsi_alloc_io(sc->vss_iid); ctl_scsi_zero_io(io); io->io_hdr.io_type = CTL_IO_TASK; io->io_hdr.nexus.initid = sc->vss_iid; io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); io->taskio.tag_type = CTL_TAG_SIMPLE; io->taskio.tag_num = (uint32_t)tmf->id; switch (tmf->subtype) { case VIRTIO_SCSI_T_TMF_ABORT_TASK: io->taskio.task_action = CTL_TASK_ABORT_TASK; break; case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; break; case VIRTIO_SCSI_T_TMF_CLEAR_ACA: io->taskio.task_action = CTL_TASK_CLEAR_ACA; break; case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; break; case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; break; case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: io->taskio.task_action = CTL_TASK_LUN_RESET; break; case VIRTIO_SCSI_T_TMF_QUERY_TASK: io->taskio.task_action = CTL_TASK_QUERY_TASK; break; case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; break; } if (pci_vtscsi_debug) { struct sbuf *sb = sbuf_new_auto(); ctl_io_sbuf(io, sb); sbuf_finish(sb); DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); sbuf_delete(sb); } err = ioctl(sc->vss_ctl_fd, CTL_IO, io); if (err != 0) WPRINTF(("CTL_IO: err=%d (%s)", errno, strerror(errno))); tmf->response = io->taskio.task_status; ctl_scsi_free_io(io); return (1); } static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc, struct pci_vtscsi_ctrl_an *an) { return (0); } static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, int niov_in, struct iovec *iov_out, int niov_out) { struct pci_vtscsi_softc *sc = q->vsq_sc; struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL; struct pci_vtscsi_req_cmd_wr *cmd_wr; struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG]; union ctl_io *io; int data_niov_in, data_niov_out; void *ext_data_ptr = NULL; uint32_t ext_data_len = 0, ext_sg_entries = 0; int err, nxferred; seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in, VTSCSI_IN_HEADER_LEN(sc)); seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out, 
VTSCSI_OUT_HEADER_LEN(sc)); truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc)); truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc)); iov_to_buf(iov_in, niov_in, (void **)&cmd_rd); cmd_wr = malloc(VTSCSI_OUT_HEADER_LEN(sc)); io = ctl_scsi_alloc_io(sc->vss_iid); ctl_scsi_zero_io(io); io->io_hdr.nexus.initid = sc->vss_iid; io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun); io->io_hdr.io_type = CTL_IO_SCSI; if (data_niov_in > 0) { ext_data_ptr = (void *)data_iov_in; ext_sg_entries = data_niov_in; ext_data_len = count_iov(data_iov_in, data_niov_in); io->io_hdr.flags |= CTL_FLAG_DATA_OUT; } else if (data_niov_out > 0) { ext_data_ptr = (void *)data_iov_out; ext_sg_entries = data_niov_out; ext_data_len = count_iov(data_iov_out, data_niov_out); io->io_hdr.flags |= CTL_FLAG_DATA_IN; } io->scsiio.sense_len = sc->vss_config.sense_size; io->scsiio.tag_num = (uint32_t)cmd_rd->id; switch (cmd_rd->task_attr) { case VIRTIO_SCSI_S_ORDERED: io->scsiio.tag_type = CTL_TAG_ORDERED; break; case VIRTIO_SCSI_S_HEAD: io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE; break; case VIRTIO_SCSI_S_ACA: io->scsiio.tag_type = CTL_TAG_ACA; break; case VIRTIO_SCSI_S_SIMPLE: default: io->scsiio.tag_type = CTL_TAG_SIMPLE; break; } io->scsiio.ext_sg_entries = ext_sg_entries; io->scsiio.ext_data_ptr = ext_data_ptr; io->scsiio.ext_data_len = ext_data_len; io->scsiio.ext_data_filled = 0; io->scsiio.cdb_len = sc->vss_config.cdb_size; memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size); if (pci_vtscsi_debug) { struct sbuf *sb = sbuf_new_auto(); ctl_io_sbuf(io, sb); sbuf_finish(sb); DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); sbuf_delete(sb); } err = ioctl(sc->vss_ctl_fd, CTL_IO, io); if (err != 0) { WPRINTF(("CTL_IO: err=%d (%s)", errno, strerror(errno))); cmd_wr->response = VIRTIO_SCSI_S_FAILURE; } else { cmd_wr->sense_len = MIN(io->scsiio.sense_len, sc->vss_config.sense_size); cmd_wr->residual = io->scsiio.residual; cmd_wr->status = io->scsiio.scsi_status; cmd_wr->response = VIRTIO_SCSI_S_OK; memcpy(&cmd_wr->sense, &io->scsiio.sense_data, cmd_wr->sense_len); } buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0); nxferred = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled; free(cmd_rd); free(cmd_wr); ctl_scsi_free_io(io); return (nxferred); } static void pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtscsi_softc *sc; struct iovec iov[VTSCSI_MAXSEG]; uint16_t idx, n; void *buf = NULL; size_t bufsize; int iolen; sc = vsc; while (vq_has_descs(vq)) { n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, NULL); bufsize = iov_to_buf(iov, n, &buf); iolen = pci_vtscsi_control_handle(sc, buf, bufsize); buf_to_iov(buf + bufsize - iolen, iolen, iov, n, bufsize - iolen); /* * Release this chain and handle more */ vq_relchain(vq, idx, iolen); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. 
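Illustration of the header/data split performed by pci_vtscsi_request_handle() above; the contract of seek_iov()/truncate_iov() is assumed from their use here, shown on a single-segment chain. With the default cdb_size of 32, VTSCSI_IN_HEADER_LEN() works out to 8 (lun) + 8 (id) + 3 (task_attr/prio/crn) + 32 (cdb) = 51 bytes:

#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

static void
split_example(void)
{
	char buf[100];
	struct iovec in = { .iov_base = buf, .iov_len = sizeof(buf) };
	size_t hdrlen = 51;

	/* what seek_iov() is expected to yield: a view past the header */
	struct iovec data = {
		.iov_base = (char *)in.iov_base + hdrlen,
		.iov_len = in.iov_len - hdrlen,
	};

	/* what truncate_iov() is expected to leave: just the header */
	in.iov_len = hdrlen;
	assert(in.iov_len + data.iov_len == sizeof(buf));
}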
*/ free(buf); } static void pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq) { vq_kick_disable(vq); } static void pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtscsi_softc *sc; struct pci_vtscsi_queue *q; struct pci_vtscsi_request *req; struct iovec iov[VTSCSI_MAXSEG]; uint16_t flags[VTSCSI_MAXSEG]; uint16_t idx, n, i; int readable; sc = vsc; q = &sc->vss_queues[vq->vq_num - 2]; while (vq_has_descs(vq)) { readable = 0; n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, flags); /* Count readable descriptors */ for (i = 0; i < n; i++) { if (flags[i] & VRING_DESC_F_WRITE) break; readable++; } req = calloc(1, sizeof(struct pci_vtscsi_request)); req->vsr_idx = idx; req->vsr_queue = q; req->vsr_niov_in = readable; req->vsr_niov_out = n - readable; memcpy(req->vsr_iov_in, iov, req->vsr_niov_in * sizeof(struct iovec)); memcpy(req->vsr_iov_out, iov + readable, req->vsr_niov_out * sizeof(struct iovec)); pthread_mutex_lock(&q->vsq_mtx); STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link); pthread_cond_signal(&q->vsq_cv); pthread_mutex_unlock(&q->vsq_mtx); DPRINTF(("virtio-scsi: request enqueued", idx)); } } static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, struct pci_vtscsi_queue *queue, int num) { struct pci_vtscsi_worker *worker; char tname[MAXCOMLEN + 1]; int i; queue->vsq_sc = sc; queue->vsq_vq = &sc->vss_vq[num + 2]; pthread_mutex_init(&queue->vsq_mtx, NULL); pthread_mutex_init(&queue->vsq_qmtx, NULL); pthread_cond_init(&queue->vsq_cv, NULL); STAILQ_INIT(&queue->vsq_requests); LIST_INIT(&queue->vsq_workers); for (i = 0; i < VTSCSI_THR_PER_Q; i++) { worker = calloc(1, sizeof(struct pci_vtscsi_worker)); worker->vsw_queue = queue; pthread_create(&worker->vsw_thread, NULL, &pci_vtscsi_proc, (void *)worker); snprintf(tname, sizeof(tname), "vtscsi:%d-%d", num, i); pthread_set_name_np(worker->vsw_thread, tname); LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link); } return (0); } static int -pci_vtscsi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_vtscsi_legacy_config(nvlist_t *nvl, const char *opts) +{ + char *cp, *devname; + + cp = strchr(opts, ','); + if (cp == NULL) { + set_config_value_node(nvl, "dev", opts); + return (0); + } + devname = strndup(opts, cp - opts); + set_config_value_node(nvl, "dev", devname); + free(devname); + return (pci_parse_legacy_config(nvl, cp + 1)); +} + +static int +pci_vtscsi_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtscsi_softc *sc; - char *opt, *optname; - const char *devname; - int i, optidx = 0; + const char *devname, *value; + int i; sc = calloc(1, sizeof(struct pci_vtscsi_softc)); - devname = "/dev/cam/ctl"; - while ((opt = strsep(&opts, ",")) != NULL) { - optname = strsep(&opt, "="); - if (opt == NULL && optidx == 0) { - if (optname[0] != 0) - devname = optname; - } else if (strcmp(optname, "dev") == 0 && opt != NULL) { - devname = opt; - } else if (strcmp(optname, "iid") == 0 && opt != NULL) { - sc->vss_iid = strtoul(opt, NULL, 10); - } else { - EPRINTLN("Invalid option %s", optname); - free(sc); - return (1); - } - optidx++; - } + value = get_config_value_node(nvl, "iid"); + if (value != NULL) + sc->vss_iid = strtoul(value, NULL, 10); + devname = get_config_value_node(nvl, "dev"); + if (devname == NULL) + devname = "/dev/cam/ctl"; sc->vss_ctl_fd = open(devname, O_RDWR); if (sc->vss_ctl_fd < 0) { WPRINTF(("cannot open %s: %s", devname, strerror(errno))); free(sc); return (1); } vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq);
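To make the legacy mapping concrete: a hypothetical sketch of the node that pci_vtscsi_legacy_config() above produces for the old-style string "/dev/cam/ctl,iid=3", assuming pci_parse_legacy_config() splits comma-separated key=value pairs:

#include <sys/nv.h>

static nvlist_t *
example_vtscsi_node(void)
{
	nvlist_t *nvl = nvlist_create(0);

	nvlist_add_string(nvl, "dev", "/dev/cam/ctl");	/* text before the first ',' */
	nvlist_add_string(nvl, "iid", "3");	/* from the key=value tail */
	return (nvl);
}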
sc->vss_vs.vs_mtx = &sc->vss_mtx; /* controlq */ sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; /* eventq */ sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; /* request queues */ for (i = 2; i < VTSCSI_MAXQ; i++) { sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify; pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_SCSI); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vss_vs, 0); return (0); } struct pci_devemu pci_de_vscsi = { .pe_emu = "virtio-scsi", .pe_init = pci_vtscsi_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vscsi); diff --git a/usr.sbin/bhyve/pci_xhci.c b/usr.sbin/bhyve/pci_xhci.c index 9227b934f046..dd6dfc39d014 100644 --- a/usr.sbin/bhyve/pci_xhci.c +++ b/usr.sbin/bhyve/pci_xhci.c @@ -1,3130 +1,3192 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* XHCI options: -s ,xhci,{devices} devices: tablet USB tablet mouse */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "pci_emul.h" #include "pci_xhci.h" #include "usb_emul.h" static int xhci_debug = 0; #define DPRINTF(params) if (xhci_debug) PRINTLN params #define WPRINTF(params) PRINTLN params #define XHCI_NAME "xhci" #define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */ #define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */ /* * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping * to 4k to avoid going over the guest physical memory barrier. 
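A worked sketch of the 4 KiB clamp described above (the macro itself, XHCI_GADDR_SIZE(), is defined a little further down): the arithmetic yields the bytes available before the next 4 KiB boundary.

#include <stdint.h>

#define PADDR_SZ 4096	/* mirrors XHCI_PADDR_SZ */

/* bytes mappable from guest address 'a' without crossing a 4 KiB page;
 * e.g. gaddr_size(0x1000) == 4096 and gaddr_size(0x1f80) == 128 */
static inline uint64_t
gaddr_size(uint64_t a)
{
	return (PADDR_SZ - (a & (PADDR_SZ - 1)));
}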
*/ #define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */ #define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */ #define XHCI_CAPLEN (4*8) /* offset of op register space */ #define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */ #define XHCI_PORTREGS_START 0x400 #define XHCI_DOORBELL_MAX 256 #define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */ /* caplength and hci-version registers */ #define XHCI_SET_CAPLEN(x) ((x) & 0xFF) #define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16) #define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF) /* hcsparams1 register */ #define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF) #define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8) #define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24) /* hcsparams2 register */ #define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F) #define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4) #define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21) #define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27) /* hcsparams3 register */ #define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF) #define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16) /* hccparams1 register */ #define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01) #define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5) #define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6) #define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7) #define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8) #define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9) #define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10) #define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11) #define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12) #define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16) /* hccparams2 register */ #define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01) #define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP2_CIC(x) (((x) & 0x01) << 5) /* other registers */ #define XHCI_SET_DOORBELL(x) ((x) & ~0x03) #define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F) /* register masks */ #define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */ #define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */ #define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */ /* port register set */ #define XHCI_PORTREGS_BASE 0x400 /* base offset */ #define XHCI_PORTREGS_PORT0 0x3F0 #define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */ #define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL) #define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL) #define FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & (m)) << (s))) #define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & ((m) << (s))))) #define SNAP_DEV_NAME_LEN 128 struct pci_xhci_trb_ring { uint64_t ringaddr; /* current dequeue guest address */ uint32_t ccs; /* consumer cycle state */ }; /* device endpoint transfer/stream rings */ struct pci_xhci_dev_ep { union { struct xhci_trb *_epu_tr; struct xhci_stream_ctx *_epu_sctx; } _ep_trbsctx; #define ep_tr _ep_trbsctx._epu_tr #define ep_sctx _ep_trbsctx._epu_sctx union { struct pci_xhci_trb_ring _epu_trb; struct pci_xhci_trb_ring *_epu_sctx_trbs; } _ep_trb_rings; #define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr #define ep_ccs _ep_trb_rings._epu_trb.ccs #define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs struct usb_data_xfer *ep_xfer; /* 
transfer chain */ }; /* device context base address array: maps slot->device context */ struct xhci_dcbaa { uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */ }; /* port status registers */ struct pci_xhci_portregs { uint32_t portsc; /* port status and control */ uint32_t portpmsc; /* port pwr mgmt status & control */ uint32_t portli; /* port link info */ uint32_t porthlpmc; /* port hardware LPM control */ } __packed; #define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10) /* xHC operational registers */ struct pci_xhci_opregs { uint32_t usbcmd; /* usb command */ uint32_t usbsts; /* usb status */ uint32_t pgsz; /* page size */ uint32_t dnctrl; /* device notification control */ uint64_t crcr; /* command ring control */ uint64_t dcbaap; /* device ctx base addr array ptr */ uint32_t config; /* configure */ /* guest mapped addresses: */ struct xhci_trb *cr_p; /* crcr dequeue */ struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */ }; /* xHC runtime registers */ struct pci_xhci_rtsregs { uint32_t mfindex; /* microframe index */ struct { /* interrupter register set */ uint32_t iman; /* interrupter management */ uint32_t imod; /* interrupter moderation */ uint32_t erstsz; /* event ring segment table size */ uint32_t rsvd; uint64_t erstba; /* event ring seg-tbl base addr */ uint64_t erdp; /* event ring dequeue ptr */ } intrreg __packed; /* guest mapped addresses */ struct xhci_event_ring_seg *erstba_p; struct xhci_trb *erst_p; /* event ring segment tbl */ int er_deq_seg; /* event ring dequeue segment */ int er_enq_idx; /* event ring enqueue index - xHCI */ int er_enq_seg; /* event ring enqueue segment */ uint32_t er_events_cnt; /* number of events in ER */ uint32_t event_pcs; /* producer cycle state flag */ }; struct pci_xhci_softc; /* * USB device emulation container. * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each * emulated device instance. 
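For orientation, a hypothetical helper mirroring how pci_xhci_insert_event() later consumes the interrupter state declared above: the guest's dequeue index is the ERDP offset (low flag bits masked) from the event-ring segment base, in 16-byte TRB units.

#include <stdint.h>

static inline int
erdp_index(uint64_t erdp, uint64_t seg_base)
{
	/* 16 == sizeof(struct xhci_trb): one 64-bit word plus two 32-bit words */
	return ((int)(((erdp & ~0xFULL) - seg_base) / 16));
}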
*/ struct pci_xhci_dev_emu { struct pci_xhci_softc *xsc; /* XHCI contexts */ struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS]; int dev_slotstate; struct usb_devemu *dev_ue; /* USB emulated dev */ void *dev_sc; /* device's softc */ struct usb_hci hci; }; struct pci_xhci_softc { struct pci_devinst *xsc_pi; pthread_mutex_t mtx; uint32_t caplength; /* caplen & hciversion */ uint32_t hcsparams1; /* structural parameters 1 */ uint32_t hcsparams2; /* structural parameters 2 */ uint32_t hcsparams3; /* structural parameters 3 */ uint32_t hccparams1; /* capability parameters 1 */ uint32_t dboff; /* doorbell offset */ uint32_t rtsoff; /* runtime register space offset */ uint32_t hccparams2; /* capability parameters 2 */ uint32_t regsend; /* end of configuration registers */ struct pci_xhci_opregs opregs; struct pci_xhci_rtsregs rtsregs; struct pci_xhci_portregs *portregs; struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */ struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */ - int ndevices; int usb2_port_start; int usb3_port_start; }; /* portregs and devices arrays are set up to start from idx=1 */ #define XHCI_PORTREG_PTR(x,n) &(x)->portregs[(n)] #define XHCI_DEVINST_PTR(x,n) (x)->devices[(n)] #define XHCI_SLOTDEV_PTR(x,n) (x)->slots[(n)] #define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) #define XHCI_GADDR_SIZE(a) (XHCI_PADDR_SZ - \ (((uint64_t) (a)) & (XHCI_PADDR_SZ - 1))) #define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ (a), XHCI_GADDR_SIZE(a)) static int xhci_in_use; /* map USB errors to XHCI */ static const int xhci_usb_errors[USB_ERR_MAX] = { [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS, [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID, [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED, [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL, [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB, [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT, [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL, [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF, [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB, [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED, }; #define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? 
xhci_usb_errors[(e)] : \ XHCI_TRB_ERROR_INVALID) static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr); static void pci_xhci_dump_trb(struct xhci_trb *trb); static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc); static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot); static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm); static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs); static void pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode, uint32_t evtype) { evtrb->qwTrb0 = port << 24; evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode); evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype); } /* controller reset */ static void pci_xhci_reset(struct pci_xhci_softc *sc) { int i; sc->rtsregs.er_enq_idx = 0; sc->rtsregs.er_events_cnt = 0; sc->rtsregs.event_pcs = 1; for (i = 1; i <= XHCI_MAX_SLOTS; i++) { pci_xhci_reset_slot(sc, i); } } static uint32_t pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd) { int do_intr = 0; int i; if (cmd & XHCI_CMD_RS) { do_intr = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0; sc->opregs.usbcmd |= XHCI_CMD_RS; sc->opregs.usbsts &= ~XHCI_STS_HCH; sc->opregs.usbsts |= XHCI_STS_PCD; /* Queue port change event on controller run from stop */ if (do_intr) for (i = 1; i <= XHCI_MAX_DEVS; i++) { struct pci_xhci_dev_emu *dev; struct pci_xhci_portregs *port; struct xhci_trb evtrb; if ((dev = XHCI_DEVINST_PTR(sc, i)) == NULL) continue; port = XHCI_PORTREG_PTR(sc, i); port->portsc |= XHCI_PS_CSC | XHCI_PS_CCS; port->portsc &= ~XHCI_PS_PLS_MASK; /* * XHCI 4.19.3 USB2 RxDetect->Polling, * USB3 Polling->U0 */ if (dev->dev_ue->ue_usbver == 2) port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL); else port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0); pci_xhci_set_evtrb(&evtrb, i, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); if (pci_xhci_insert_event(sc, &evtrb, 0) != XHCI_TRB_ERROR_SUCCESS) break; } } else { sc->opregs.usbcmd &= ~XHCI_CMD_RS; sc->opregs.usbsts |= XHCI_STS_HCH; sc->opregs.usbsts &= ~XHCI_STS_PCD; } /* start execution of schedule; stop when set to 0 */ cmd |= sc->opregs.usbcmd & XHCI_CMD_RS; if (cmd & XHCI_CMD_HCRST) { /* reset controller */ pci_xhci_reset(sc); cmd &= ~XHCI_CMD_HCRST; } cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS); if (do_intr) pci_xhci_assert_interrupt(sc); return (cmd); } static void pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct xhci_trb evtrb; struct pci_xhci_portregs *p; int port; uint32_t oldpls, newpls; if (sc->portregs == NULL) return; port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx", offset, port, value)); assert(port >= 0); if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_write port %d > ndevices", port)); return; } if (XHCI_DEVINST_PTR(sc, port) == NULL) { DPRINTF(("pci_xhci: portregs_write to unattached port %d", port)); } p = XHCI_PORTREG_PTR(sc, port); switch (offset) { case 0: /* port reset or warm reset */ if (value & (XHCI_PS_PR | XHCI_PS_WPR)) { pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR); break; } if ((p->portsc & XHCI_PS_PP) == 0) { WPRINTF(("pci_xhci: portregs_write to unpowered " "port %d", port)); break; } /* Port status and control register */ oldpls = XHCI_PS_PLS_GET(p->portsc); 
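The PORTSC write handling just below is a read-modify-write in which the change bits are write-1-to-clear; a condensed, hypothetical illustration (the mask collapses XHCI_PS_CSC through XHCI_PS_CAS into one example constant):

#include <stdint.h>

#define PS_CHANGE_BITS 0x01fe0000u	/* hypothetical: CSC..CAS, bits 17-24 */

static inline uint32_t
portsc_w1c(uint32_t portsc, uint32_t value)
{
	/* writing 1 to a change bit clears it; writing 0 leaves it alone */
	return (portsc & ~(value & PS_CHANGE_BITS));
}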
newpls = XHCI_PS_PLS_GET(value); p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK | XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK; if (XHCI_DEVINST_PTR(sc, port)) p->portsc |= XHCI_PS_CCS; p->portsc |= (value & ~(XHCI_PS_OCA | XHCI_PS_PR | XHCI_PS_PED | XHCI_PS_PLS_MASK | /* link state */ XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK | /* port indicator */ XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR)); /* clear control bits */ p->portsc &= ~(value & (XHCI_PS_CSC | XHCI_PS_PEC | XHCI_PS_WRC | XHCI_PS_OCC | XHCI_PS_PRC | XHCI_PS_PLC | XHCI_PS_CEC | XHCI_PS_CAS)); /* port disable request; for USB3, don't care */ if (value & XHCI_PS_PED) DPRINTF(("Disable port %d request", port)); if (!(value & XHCI_PS_LWS)) break; DPRINTF(("Port new PLS: %d", newpls)); switch (newpls) { case 0: /* U0 */ case 3: /* U3 */ if (oldpls != newpls) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(newpls) | XHCI_PS_PLC; if (oldpls != 0 && newpls == 0) { pci_xhci_set_evtrb(&evtrb, port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); pci_xhci_insert_event(sc, &evtrb, 1); } } break; default: DPRINTF(("Unhandled change port %d PLS %u", port, newpls)); break; } break; case 4: /* Port power management status and control register */ p->portpmsc = value; break; case 8: /* Port link information register */ DPRINTF(("pci_xhci attempted write to PORTLI, port %d", port)); break; case 12: /* * Port hardware LPM control register. * For USB3, this register is reserved. */ p->porthlpmc = value; break; } } struct xhci_dev_ctx * pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot) { uint64_t devctx_addr; struct xhci_dev_ctx *devctx; - assert(slot > 0 && slot <= sc->ndevices); + assert(slot > 0 && slot <= XHCI_MAX_DEVS); + assert(XHCI_SLOTDEV_PTR(sc, slot) != NULL); assert(sc->opregs.dcbaa_p != NULL); devctx_addr = sc->opregs.dcbaa_p->dcba[slot]; if (devctx_addr == 0) { DPRINTF(("get_dev_ctx devctx_addr == 0")); return (NULL); } DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx", slot, devctx_addr)); devctx = XHCI_GADDR(sc, devctx_addr & ~0x3FUL); return (devctx); } struct xhci_trb * pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb, uint64_t *guestaddr) { struct xhci_trb *next; assert(curtrb != NULL); if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) == XHCI_TRB_TYPE_LINK) { if (guestaddr) *guestaddr = curtrb->qwTrb0 & ~0xFUL; next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL); } else { if (guestaddr) *guestaddr += sizeof(struct xhci_trb) & ~0xFUL; next = curtrb + 1; } return (next); } static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc) { sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY; sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND; sc->opregs.usbsts |= XHCI_STS_EINT; /* only trigger interrupt if permitted */ if ((sc->opregs.usbcmd & XHCI_CMD_INTE) && (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA)) { if (pci_msi_enabled(sc->xsc_pi)) pci_generate_msi(sc->xsc_pi, 0); else pci_lintr_assert(sc->xsc_pi); } } static void pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc) { if (!pci_msi_enabled(sc->xsc_pi)) pci_lintr_deassert(sc->xsc_pi); } static void pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; uint32_t pstreams; int i; dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0); if (pstreams > 0) { DPRINTF(("init_ep %d with pstreams %d", epid, pstreams)); assert(devep->ep_sctx_trbs == NULL); devep->ep_sctx =
XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK); devep->ep_sctx_trbs = calloc(pstreams, sizeof(struct pci_xhci_trb_ring)); for (i = 0; i < pstreams; i++) { devep->ep_sctx_trbs[i].ringaddr = devep->ep_sctx[i].qwSctx0 & XHCI_SCTX_0_TR_DQ_PTR_MASK; devep->ep_sctx_trbs[i].ccs = XHCI_SCTX_0_DCS_GET(devep->ep_sctx[i].qwSctx0); } } else { DPRINTF(("init_ep %d with no pstreams", epid)); devep->ep_ringaddr = ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK; devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2); devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr); DPRINTF(("init_ep tr DCS %x", devep->ep_ccs)); } if (devep->ep_xfer == NULL) { devep->ep_xfer = malloc(sizeof(struct usb_data_xfer)); USB_DATA_XFER_INIT(devep->ep_xfer); } } static void pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; DPRINTF(("pci_xhci disable_ep %d", epid)); dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED; devep = &dev->eps[epid]; if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0 && devep->ep_sctx_trbs != NULL) free(devep->ep_sctx_trbs); if (devep->ep_xfer != NULL) { free(devep->ep_xfer); devep->ep_xfer = NULL; } memset(devep, 0, sizeof(struct pci_xhci_dev_ep)); } /* reset device at slot and data structures related to it */ static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot) { struct pci_xhci_dev_emu *dev; dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev) { DPRINTF(("xhci reset unassigned slot (%d)?", slot)); } else { dev->dev_slotstate = XHCI_ST_DISABLED; } /* TODO: reset ring buffer pointers */ } static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr) { struct pci_xhci_rtsregs *rts; uint64_t erdp; int erdp_idx; int err; struct xhci_trb *evtrbptr; err = XHCI_TRB_ERROR_SUCCESS; rts = &sc->rtsregs; erdp = rts->intrreg.erdp & ~0xF; erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) / sizeof(struct xhci_trb); DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]", evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3)); DPRINTF(("\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u", erdp_idx, rts->er_deq_seg, rts->er_enq_idx, rts->er_enq_seg, rts->event_pcs)); DPRINTF(("\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)", erdp, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize, do_intr)); evtrbptr = &rts->erst_p[rts->er_enq_idx]; /* TODO: multi-segment table */ if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) { DPRINTF(("pci_xhci[%d] cannot insert event; ring full", __LINE__)); err = XHCI_TRB_ERROR_EV_RING_FULL; goto done; } if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) { struct xhci_trb errev; if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) { DPRINTF(("pci_xhci[%d] insert evt err: ring full", __LINE__)); errev.qwTrb0 = 0; errev.dwTrb2 = XHCI_TRB_2_ERROR_SET( XHCI_TRB_ERROR_EV_RING_FULL); errev.dwTrb3 = XHCI_TRB_3_TYPE_SET( XHCI_TRB_EVENT_HOST_CTRL) | rts->event_pcs; rts->er_events_cnt++; memcpy(&rts->erst_p[rts->er_enq_idx], &errev, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; err = XHCI_TRB_ERROR_EV_RING_FULL; do_intr = 1; goto done; } } else { rts->er_events_cnt++; } evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT; evtrb->dwTrb3 |= rts->event_pcs; memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % 
rts->erstba_p->dwEvrsTableSize; if (rts->er_enq_idx == 0) rts->event_pcs ^= 1; done: if (do_intr) pci_xhci_assert_interrupt(sc); return (err); } static uint32_t pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs != NULL) for (i = 1; i <= XHCI_MAX_SLOTS; i++) { dev = XHCI_SLOTDEV_PTR(sc, i); if (dev && dev->dev_slotstate == XHCI_ST_DISABLED) { *slot = i; dev->dev_slotstate = XHCI_ST_ENABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; dev->hci.hci_address = i; break; } } DPRINTF(("pci_xhci enable slot (error=%d) slot %u", cmderr != XHCI_TRB_ERROR_SUCCESS, *slot)); return (cmderr); } static uint32_t pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; DPRINTF(("pci_xhci disable slot %u", slot)); cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; - if (slot > sc->ndevices) { + if (slot > XHCI_MAX_SLOTS) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } dev = XHCI_SLOTDEV_PTR(sc, slot); if (dev) { if (dev->dev_slotstate == XHCI_ST_DISABLED) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; } else { dev->dev_slotstate = XHCI_ST_DISABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; /* TODO: reset events and endpoints */ } - } + } else + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; DPRINTF(("pci_xhci reset device slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev || dev->dev_slotstate == XHCI_ST_DISABLED) cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; else { dev->dev_slotstate = XHCI_ST_DEFAULT; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT, 0x1F, 27); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* reset all eps other than ep-0 */ for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_DISABLED, 0x7, 0); } cmderr = XHCI_TRB_ERROR_SUCCESS; } pci_xhci_reset_slot(sc, slot); done: return (cmderr); } static uint32_t pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) { DPRINTF(("pci_xhci: address device, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot */ dev_ctx = 
pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: address device, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); dev->hci.hci_address = slot; dev->dev_ctx = dev_ctx; if (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx)); dev_ctx->ctx_slot.dwSctx3 = XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) | XHCI_SCTX_3_DEV_ADDR_SET(slot); memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx)); ep0_ctx = &dev_ctx->ctx_ep[1]; ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) | XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING); pci_xhci_init_ep(dev, 1); dev->dev_slotstate = XHCI_ST_ADDRESSED; DPRINTF(("pci_xhci: address device, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static uint32_t pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx, *iep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci config_ep slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) { DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u", slot)); if (dev->dev_ue->ue_stop != NULL) dev->dev_ue->ue_stop(dev->dev_sc); dev->dev_slotstate = XHCI_ST_ADDRESSED; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED, 0x1F, 27); /* disable endpoints */ for (i = 2; i < 32; i++) pci_xhci_disable_ep(dev, i); cmderr = XHCI_TRB_ERROR_SUCCESS; goto done; } if (dev->dev_slotstate < XHCI_ST_ADDRESSED) { DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed", dev->dev_slotstate)); cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } /* In addressed/configured state; * for each drop endpoint ctx flag: * ep->state = DISABLED * for each add endpoint ctx flag: * cp(ep-in, ep-out) * ep->state = RUNNING * for each drop+add endpoint flag: * reset ep resources * cp(ep-in, ep-out) * ep->state = RUNNING * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled) * slot->state = configured */ input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); dev_ctx = dev->dev_ctx; DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, input_ctx->ctx_input.dwInCtx7)); for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; if (input_ctx->ctx_input.dwInCtx0 & XHCI_INCTX_0_DROP_MASK(i)) { DPRINTF((" config ep - dropping ep %d", i)); pci_xhci_disable_ep(dev, i); } if (input_ctx->ctx_input.dwInCtx1 & XHCI_INCTX_1_ADD_MASK(i)) { iep_ctx = &input_ctx->ctx_ep[i]; DPRINTF((" enable ep[%d] %08x %08x %016lx %08x", i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1, iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4)); memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx)); 
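/* With the input endpoint context copied into the output context, pci_xhci_init_ep() below re-derives the transfer-ring dequeue pointer (or the stream context array) from it before the endpoint is marked running. */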
pci_xhci_init_ep(dev, i); /* ep state */ ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); } } /* slot state to configured */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27); dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27); dev->dev_slotstate = XHCI_ST_CONFIGURED; DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x " "[3]=0x%08x", slot, dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t type; epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); DPRINTF(("pci_xhci: reset ep %u: slot %u", epid, slot)); cmderr = XHCI_TRB_ERROR_SUCCESS; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if (type == XHCI_TRB_TYPE_STOP_EP && (trb->dwTrb3 & XHCI_TRB_3_SUSP_EP_BIT) != 0) { /* XXX suspend endpoint for 10ms */ } if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: reset ep: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } devep = &dev->eps[epid]; if (devep->ep_xfer != NULL) USB_DATA_XFER_RESET(devep->ep_xfer); dev_ctx = dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) == 0) ep_ctx->qwEpCtx2 = devep->ep_ringaddr | devep->ep_ccs; DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (type == XHCI_TRB_TYPE_RESET_EP && (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0)) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } done: return (cmderr); } static uint32_t pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep, uint32_t streamid, struct xhci_stream_ctx **osctx) { struct xhci_stream_ctx *sctx; uint32_t maxpstreams; maxpstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep->dwEpCtx0); if (maxpstreams == 0) return (XHCI_TRB_ERROR_TRB); if (maxpstreams > XHCI_STREAMS_MAX) return (XHCI_TRB_ERROR_INVALID_SID); if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) { DPRINTF(("pci_xhci: find_stream; LSA bit not set")); return (XHCI_TRB_ERROR_INVALID_SID); } /* only support primary stream */ if (streamid > maxpstreams) return (XHCI_TRB_ERROR_STREAM_TYPE); sctx = XHCI_GADDR(sc, ep->qwEpCtx2 & ~0xFUL) + streamid; if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0)) return (XHCI_TRB_ERROR_STREAM_TYPE); *osctx = sctx; return (XHCI_TRB_ERROR_SUCCESS); } static uint32_t pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t streamid; cmderr = XHCI_TRB_ERROR_SUCCESS; dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u", (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7), (uint32_t)(trb->qwTrb0 & 0x1))); DPRINTF((" stream-id %u, slot %u, epid %u, C %u", (trb->dwTrb2 >> 16) & 0xFFFF, XHCI_TRB_3_SLOT_GET(trb->dwTrb3), XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1)); epid = 
XHCI_TRB_3_EP_GET(trb->dwTrb3); if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } dev_ctx = dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) { case XHCI_ST_EPCTX_STOPPED: case XHCI_ST_EPCTX_ERROR: break; default: DPRINTF(("pci_xhci cmd set_tr invalid state %x", XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0))); cmderr = XHCI_TRB_ERROR_CONTEXT_STATE; goto done; } streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2); if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0) { struct xhci_stream_ctx *sctx; sctx = NULL; cmderr = pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); if (sctx != NULL) { assert(devep->ep_sctx != NULL); devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0; devep->ep_sctx_trbs[streamid].ringaddr = trb->qwTrb0 & ~0xF; devep->ep_sctx_trbs[streamid].ccs = XHCI_EPCTX_2_DCS_GET(trb->qwTrb0); } } else { if (streamid != 0) { DPRINTF(("pci_xhci cmd set_tr streamid %x != 0", streamid)); } ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL; devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL; devep->ep_ccs = trb->qwTrb0 & 0x1; devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr); DPRINTF(("pci_xhci set_tr first TRB:")); pci_xhci_dump_trb(devep->ep_tr); } ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; done: return (cmderr); } static uint32_t pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* this command expects drop-ctx=0 & add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) { DPRINTF(("pci_xhci: eval ctx, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot; in this emulation, slot_id = address */ dev_ctx = pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: eval ctx, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */ /* set max exit latency */ dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1, 0xFFFF, 0); /* set interrupter target */ dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2, 0x3FF, 22); } if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */ /* set max packet size */ dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY( dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1, 0xFFFF, 16); ep0_ctx = &dev_ctx->ctx_ep[1]; } DPRINTF(("pci_xhci: eval ctx, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, 
ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static int pci_xhci_complete_commands(struct pci_xhci_softc *sc) { struct xhci_trb evtrb; struct xhci_trb *trb; uint64_t crcr; uint32_t ccs; /* cycle state (XHCI 4.9.2) */ uint32_t type; uint32_t slot; uint32_t cmderr; int error; error = 0; sc->opregs.crcr |= XHCI_CRCR_LO_CRR; trb = sc->opregs.cr_p; ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS; crcr = sc->opregs.crcr & ~0xF; while (1) { sc->opregs.cr_p = trb; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) break; DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x" " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u", type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs)); cmderr = XHCI_TRB_ERROR_SUCCESS; evtrb.dwTrb2 = 0; evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) | XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE); slot = 0; switch (type) { case XHCI_TRB_TYPE_LINK: /* 0x06 */ if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= XHCI_CRCR_LO_RCS; break; case XHCI_TRB_TYPE_ENABLE_SLOT: /* 0x09 */ cmderr = pci_xhci_cmd_enable_slot(sc, &slot); break; case XHCI_TRB_TYPE_DISABLE_SLOT: /* 0x0A */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_disable_slot(sc, slot); break; case XHCI_TRB_TYPE_ADDRESS_DEVICE: /* 0x0B */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_address_device(sc, slot, trb); break; case XHCI_TRB_TYPE_CONFIGURE_EP: /* 0x0C */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_config_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_EVALUATE_CTX: /* 0x0D */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_EP: /* 0x0E */ DPRINTF(("Reset Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_STOP_EP: /* 0x0F */ DPRINTF(("Stop Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_SET_TR_DEQUEUE: /* 0x10 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_set_tr(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_DEVICE: /* 0x11 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_device(sc, slot); break; case XHCI_TRB_TYPE_FORCE_EVENT: /* 0x12 */ /* TODO: */ break; case XHCI_TRB_TYPE_NEGOTIATE_BW: /* 0x13 */ break; case XHCI_TRB_TYPE_SET_LATENCY_TOL: /* 0x14 */ break; case XHCI_TRB_TYPE_GET_PORT_BW: /* 0x15 */ break; case XHCI_TRB_TYPE_FORCE_HEADER: /* 0x16 */ break; case XHCI_TRB_TYPE_NOOP_CMD: /* 0x17 */ break; default: DPRINTF(("pci_xhci: unsupported cmd %x", type)); break; } if (type != XHCI_TRB_TYPE_LINK) { /* * insert command completion event and assert intr */ evtrb.qwTrb0 = crcr; evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr); evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot); DPRINTF(("pci_xhci: command 0x%x result: 0x%x", type, cmderr)); pci_xhci_insert_event(sc, &evtrb, 1); } trb = pci_xhci_trb_next(sc, trb, &crcr); } sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs; sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR; return (error); } static void pci_xhci_dump_trb(struct xhci_trb *trb) { static const char *trbtypes[] = { "RESERVED", "NORMAL", "SETUP_STAGE", "DATA_STAGE", "STATUS_STAGE", "ISOCH", "LINK", "EVENT_DATA", "NOOP", "ENABLE_SLOT", "DISABLE_SLOT", "ADDRESS_DEVICE", "CONFIGURE_EP", "EVALUATE_CTX", "RESET_EP", "STOP_EP", "SET_TR_DEQUEUE", "RESET_DEVICE", 
"FORCE_EVENT", "NEGOTIATE_BW", "SET_LATENCY_TOL", "GET_PORT_BW", "FORCE_HEADER", "NOOP_CMD" }; uint32_t type; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); DPRINTF(("pci_xhci: trb[@%p] type x%02x %s 0:x%016lx 2:x%08x 3:x%08x", trb, type, type <= XHCI_TRB_TYPE_NOOP_CMD ? trbtypes[type] : "INVALID", trb->qwTrb0, trb->dwTrb2, trb->dwTrb3)); } static int pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer, uint32_t slot, uint32_t epid, int *do_intr) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct xhci_trb *trb; struct xhci_trb evtrb; uint32_t trbflags; uint32_t edtla; int i, err; dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; err = XHCI_TRB_ERROR_SUCCESS; *do_intr = 0; edtla = 0; /* go through list of TRBs and insert event(s) */ for (i = xfer->head; xfer->ndata > 0; ) { evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data; trb = XHCI_GADDR(sc, evtrb.qwTrb0); trbflags = trb->dwTrb3; DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x " "(err %d) IOC?%d", i, xfer->data[i].processed, xfer->data[i].blen, XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0, trbflags, err, trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0)); if (!xfer->data[i].processed) { xfer->head = i; break; } xfer->ndata--; edtla += xfer->data[i].bdone; trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs); pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx, xfer->data[i].streamid, xfer->data[i].trbnext, xfer->data[i].ccs); /* Only interrupt if IOC or short packet */ if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) && !((err == XHCI_TRB_ERROR_SHORT_PKT) && (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) { i = (i + 1) % USB_MAX_XFER_BLOCKS; continue; } evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) | XHCI_TRB_2_REM_SET(xfer->data[i].blen); evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) | XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid); if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) { DPRINTF(("pci_xhci EVENT_DATA edtla %u", edtla)); evtrb.qwTrb0 = trb->qwTrb0; evtrb.dwTrb2 = (edtla & 0xFFFFF) | XHCI_TRB_2_ERROR_SET(err); evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT; edtla = 0; } *do_intr = 1; err = pci_xhci_insert_event(sc, &evtrb, 0); if (err != XHCI_TRB_ERROR_SUCCESS) { break; } i = (i + 1) % USB_MAX_XFER_BLOCKS; } return (err); } static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs) { if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { devep->ep_sctx[streamid].qwSctx0 = (ringaddr & ~0xFUL) | (ccs & 0x1); devep->ep_sctx_trbs[streamid].ringaddr = ringaddr & ~0xFUL; devep->ep_sctx_trbs[streamid].ccs = ccs & 0x1; ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | (ccs & 0x1); DPRINTF(("xhci update ep-ring stream %d, addr %lx", streamid, devep->ep_sctx[streamid].qwSctx0)); } else { devep->ep_ringaddr = ringaddr & ~0xFUL; devep->ep_ccs = ccs & 0x1; devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL); ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1); DPRINTF(("xhci update ep-ring, addr %lx", (devep->ep_ringaddr | devep->ep_ccs))); } } /* * Outstanding transfer still in progress (device NAK'd earlier) so retry * the transfer again to see if it succeeds. 
*/ static int pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid) { struct usb_data_xfer *xfer; int err; int do_intr; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); err = 0; do_intr = 0; xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); /* outstanding requests queued up */ if (dev->dev_ue->ue_data != NULL) { err = dev->dev_ue->ue_data(dev->dev_sc, xfer, epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2); if (err == USB_ERR_CANCELLED) { if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) == USB_NAK) err = XHCI_TRB_ERROR_SUCCESS; } else { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) { pci_xhci_assert_interrupt(sc); } /* XXX should not do it if error? */ USB_DATA_XFER_RESET(xfer); } } USB_DATA_XFER_UNLOCK(xfer); return (err); } static int pci_xhci_handle_transfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot, uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid) { struct xhci_trb *setup_trb; struct usb_data_xfer *xfer; struct usb_data_xfer_block *xfer_block; uint64_t val; uint32_t trbflags; int do_intr, err; int do_retry; ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); DPRINTF(("pci_xhci handle_transfer slot %u", slot)); retry: err = 0; do_retry = 0; do_intr = 0; setup_trb = NULL; while (1) { pci_xhci_dump_trb(trb); trbflags = trb->dwTrb3; if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK && (trbflags & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) { DPRINTF(("Cycle-bit changed trbflags %x, ccs %x", trbflags & XHCI_TRB_3_CYCLE_BIT, ccs)); break; } xfer_block = NULL; switch (XHCI_TRB_3_TYPE_GET(trbflags)) { case XHCI_TRB_TYPE_LINK: if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= 0x1; xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_SETUP_STAGE: if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 || XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) { DPRINTF(("pci_xhci: invalid setup trb")); err = XHCI_TRB_ERROR_TRB; goto errout; } setup_trb = trb; val = trb->qwTrb0; if (!xfer->ureq) xfer->ureq = malloc( sizeof(struct usb_device_request)); memcpy(xfer->ureq, &val, sizeof(struct usb_device_request)); xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_NORMAL: case XHCI_TRB_TYPE_ISOCH: if (setup_trb != NULL) { DPRINTF(("pci_xhci: trb not supposed to be in " "ctl scope")); err = XHCI_TRB_ERROR_TRB; goto errout; } /* fall through */ case XHCI_TRB_TYPE_DATA_STAGE: xfer_block = usb_data_xfer_append(xfer, (void *)(trbflags & XHCI_TRB_3_IDT_BIT ? 
&trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)), trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs); break; case XHCI_TRB_TYPE_STATUS_STAGE: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); break; case XHCI_TRB_TYPE_NOOP: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_EVENT_DATA: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) { xfer_block->processed = 1; } break; default: DPRINTF(("pci_xhci: handle xfer unexpected trb type " "0x%x", XHCI_TRB_3_TYPE_GET(trbflags))); err = XHCI_TRB_ERROR_TRB; goto errout; } trb = pci_xhci_trb_next(sc, trb, &addr); DPRINTF(("pci_xhci: next trb: 0x%lx", (uint64_t)trb)); if (xfer_block) { xfer_block->trbnext = addr; xfer_block->streamid = streamid; } if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) && XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) { break; } /* handle current batch that requires interrupt on complete */ if (trbflags & XHCI_TRB_3_IOC_BIT) { DPRINTF(("pci_xhci: trb IOC bit set")); if (epid == 1) do_retry = 1; break; } } DPRINTF(("pci_xhci[%d]: xfer->ndata %u", __LINE__, xfer->ndata)); if (xfer->ndata <= 0) goto errout; if (epid == 1) { err = USB_ERR_NOT_STARTED; if (dev->dev_ue->ue_request != NULL) err = dev->dev_ue->ue_request(dev->dev_sc, xfer); setup_trb = NULL; } else { /* handle data transfer */ pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); err = XHCI_TRB_ERROR_SUCCESS; goto errout; } err = USB_TO_XHCI_ERR(err); if ((err == XHCI_TRB_ERROR_SUCCESS) || (err == XHCI_TRB_ERROR_STALL) || (err == XHCI_TRB_ERROR_SHORT_PKT)) { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err != XHCI_TRB_ERROR_SUCCESS) do_retry = 0; } errout: if (err == XHCI_TRB_ERROR_EV_RING_FULL) DPRINTF(("pci_xhci[%d]: event ring full", __LINE__)); if (!do_retry) USB_DATA_XFER_UNLOCK(xfer); if (do_intr) pci_xhci_assert_interrupt(sc); if (do_retry) { USB_DATA_XFER_RESET(xfer); DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs", __LINE__)); goto retry; } if (epid == 1) USB_DATA_XFER_RESET(xfer); return (err); } static void pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, uint32_t epid, uint32_t streamid) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct pci_xhci_trb_ring *sctx_tr; struct xhci_trb *trb; uint64_t ringaddr; uint32_t ccs; DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u", slot, epid, streamid)); - if (slot == 0 || slot > sc->ndevices) { + if (slot == 0 || slot > XHCI_MAX_SLOTS) { DPRINTF(("pci_xhci: invalid doorbell slot %u", slot)); return; } if (epid == 0 || epid >= XHCI_MAX_ENDPOINTS) { DPRINTF(("pci_xhci: invalid endpoint %u", epid)); return; } dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); if (!dev_ctx) { return; } ep_ctx = &dev_ctx->ctx_ep[epid]; sctx_tr = NULL; DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (ep_ctx->qwEpCtx2 == 0) return; /* handle pending transfers */ if (devep->ep_xfer->ndata > 0) { pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); return; } /* get next trb work item */ if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { struct xhci_stream_ctx *sctx; /* * Stream IDs of 0, 65535 (any stream), and 65534 * (prime) are invalid. 
*/ if (streamid == 0 || streamid == 65534 || streamid == 65535) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } sctx = NULL; pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); if (sctx == NULL) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } sctx_tr = &devep->ep_sctx_trbs[streamid]; ringaddr = sctx_tr->ringaddr; ccs = sctx_tr->ccs; trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL); DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x", streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } else { if (streamid != 0) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } ringaddr = devep->ep_ringaddr; ccs = devep->ep_ccs; trb = devep->ep_tr; DPRINTF(("doorbell, ccs %lx, trb ccs %x", ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) { DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?", ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid)); return; } pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid, ringaddr, ccs, streamid); } static void pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset = (offset - sc->dboff) / sizeof(uint32_t); DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx", offset, value)); if (XHCI_HALTED(sc)) { DPRINTF(("pci_xhci: controller halted")); return; } if (offset == 0) pci_xhci_complete_commands(sc); else if (sc->portregs != NULL) pci_xhci_device_doorbell(sc, offset, XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value)); } static void pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct pci_xhci_rtsregs *rts; offset -= sc->rtsoff; if (offset == 0) { DPRINTF(("pci_xhci attempted write to MFINDEX")); return; } DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx", offset, value)); offset -= 0x20; /* start of intrreg */ rts = &sc->rtsregs; switch (offset) { case 0x00: if (value & XHCI_IMAN_INTR_PEND) rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) | (rts->intrreg.iman & XHCI_IMAN_INTR_PEND); if (!(value & XHCI_IMAN_INTR_ENA)) pci_xhci_deassert_interrupt(sc); break; case 0x04: rts->intrreg.imod = value; break; case 0x08: rts->intrreg.erstsz = value & 0xFFFF; break; case 0x10: /* ERSTBA low bits */ rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) | (value & ~0x3F); break; case 0x14: /* ERSTBA high bits */ rts->intrreg.erstba = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erstba); rts->erstba_p = XHCI_GADDR(sc, sc->rtsregs.intrreg.erstba & ~0x3FUL); rts->erst_p = XHCI_GADDR(sc, sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL); rts->er_enq_idx = 0; rts->er_events_cnt = 0; DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u", rts->erstba_p, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize)); break; case 0x18: /* ERDP low bits */ rts->intrreg.erdp = MASK_64_HI(sc->rtsregs.intrreg.erdp) | (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) | (value & ~0xF); if (value & XHCI_ERDP_LO_BUSY) { rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY; rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; } rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value); break; case 0x1C: /* ERDP high bits */ rts->intrreg.erdp = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erdp); if (rts->er_events_cnt > 0) { uint64_t erdp; uint32_t erdp_i; erdp = rts->intrreg.erdp & ~0xF; erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) / sizeof(struct xhci_trb); if (erdp_i <= rts->er_enq_idx) rts->er_events_cnt = rts->er_enq_idx - erdp_i; else 
rts->er_events_cnt = rts->erstba_p->dwEvrsTableSize - (erdp_i - rts->er_enq_idx); DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u", erdp, rts->er_events_cnt)); } break; default: DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx", offset)); break; } } static uint64_t pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset) { int port; uint32_t *p; if (sc->portregs == NULL) return (0); port = (offset - 0x3F0) / 0x10; if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_read port %d >= XHCI_MAX_DEVS", port)); /* return default value for unused port */ return (XHCI_PS_SPEED_SET(3)); } offset = (offset - 0x3F0) % 0x10; p = &sc->portregs[port].portsc; p += offset / sizeof(uint32_t); DPRINTF(("pci_xhci: portregs read offset 0x%lx port %u -> 0x%x", offset, port, *p)); return (*p); } static void pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset -= XHCI_CAPLEN; if (offset < 0x400) DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx", offset, value)); switch (offset) { case XHCI_USBCMD: sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F); break; case XHCI_USBSTS: /* clear bits on write */ sc->opregs.usbsts &= ~(value & (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS| XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR)); break; case XHCI_PAGESIZE: /* read only */ break; case XHCI_DNCTRL: sc->opregs.dnctrl = value & 0xFFFF; break; case XHCI_CRCR_LO: if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) { sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); sc->opregs.crcr |= value & (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); } else { sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) | (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS)); } break; case XHCI_CRCR_HI: if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) { sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) | (value << 32); sc->opregs.cr_p = XHCI_GADDR(sc, sc->opregs.crcr & ~0xF); } if (sc->opregs.crcr & XHCI_CRCR_LO_CS) { /* Stop operation of Command Ring */ } if (sc->opregs.crcr & XHCI_CRCR_LO_CA) { /* Abort command */ } break; case XHCI_DCBAAP_LO: sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) | (value & 0xFFFFFFC0); break; case XHCI_DCBAAP_HI: sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) | (value << 32); sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL); DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)", sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p)); break; case XHCI_CONFIG: sc->opregs.config = value & 0x03FF; break; default: if (offset >= 0x400) pci_xhci_portregs_write(sc, offset, value); break; } } static void pci_xhci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_xhci_softc *sc; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) /* read only registers */ WPRINTF(("pci_xhci: write RO-CAPs offset %ld", offset)); else if (offset < sc->dboff) pci_xhci_hostop_write(sc, offset, value); else if (offset < sc->rtsoff) pci_xhci_dbregs_write(sc, offset, value); else if (offset < sc->regsend) pci_xhci_rtsregs_write(sc, offset, value); else WPRINTF(("pci_xhci: write invalid offset %ld", offset)); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; switch (offset) { case XHCI_CAPLENGTH: /* 0x00 */ value = sc->caplength; break; case XHCI_HCSPARAMS1: /* 0x04 */ value = sc->hcsparams1; break; case XHCI_HCSPARAMS2: /* 0x08 */ value = sc->hcsparams2; break; case XHCI_HCSPARAMS3: /* 0x0C */ value = 
sc->hcsparams3; break; case XHCI_HCSPARAMS0: /* 0x10 */ value = sc->hccparams1; break; case XHCI_DBOFF: /* 0x14 */ value = sc->dboff; break; case XHCI_RTSOFF: /* 0x18 */ value = sc->rtsoff; break; case XHCI_HCCPRAMS2: /* 0x1C */ value = sc->hccparams2; break; default: value = 0; break; } DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; offset = (offset - XHCI_CAPLEN); switch (offset) { case XHCI_USBCMD: /* 0x00 */ value = sc->opregs.usbcmd; break; case XHCI_USBSTS: /* 0x04 */ value = sc->opregs.usbsts; break; case XHCI_PAGESIZE: /* 0x08 */ value = sc->opregs.pgsz; break; case XHCI_DNCTRL: /* 0x14 */ value = sc->opregs.dnctrl; break; case XHCI_CRCR_LO: /* 0x18 */ value = sc->opregs.crcr & XHCI_CRCR_LO_CRR; break; case XHCI_CRCR_HI: /* 0x1C */ value = 0; break; case XHCI_DCBAAP_LO: /* 0x30 */ value = sc->opregs.dcbaap & 0xFFFFFFFF; break; case XHCI_DCBAAP_HI: /* 0x34 */ value = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF; break; case XHCI_CONFIG: /* 0x38 */ value = sc->opregs.config; break; default: if (offset >= 0x400) value = pci_xhci_portregs_read(sc, offset); else value = 0; break; } if (offset < 0x400) DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_dbregs_read(struct pci_xhci_softc *sc, uint64_t offset) { /* read doorbell always returns 0 */ return (0); } static uint64_t pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->rtsoff; value = 0; if (offset == XHCI_MFINDEX) { value = sc->rtsregs.mfindex; } else if (offset >= 0x20) { int item; uint32_t *p; offset -= 0x20; item = offset % 32; assert(offset < sizeof(sc->rtsregs.intrreg)); p = &sc->rtsregs.intrreg.iman; p += item / sizeof(uint32_t); value = *p; } DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->regsend; value = 0; switch (offset) { case 0: /* rev major | rev minor | next-cap | cap-id */ value = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS; break; case 4: /* name string = "USB" */ value = 0x20425355; break; case 8: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start; break; case 12: break; case 16: /* rev major | rev minor | next-cap | cap-id */ value = (0x03 << 24) | XHCI_ID_PROTOCOLS; break; case 20: /* name string = "USB" */ value = 0x20425355; break; case 24: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start; break; case 28: break; default: DPRINTF(("pci_xhci: xecp invalid offset 0x%lx", offset)); break; } DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_xhci_softc *sc; uint32_t value; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) value = pci_xhci_hostcap_read(sc, offset); else if (offset < sc->dboff) value = pci_xhci_hostop_read(sc, offset); else if (offset < sc->rtsoff) value = pci_xhci_dbregs_read(sc, offset); else if (offset < sc->regsend) value = pci_xhci_rtsregs_read(sc, offset); else if (offset < (sc->regsend + 4*32)) value = pci_xhci_xecp_read(sc, offset); else { value = 0; 
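/* Reads reaching this branch fall outside the register file; the dispatch above mirrors the BAR layout: capability, operational (including port registers), doorbell, runtime, then the extended-capability (xECP) words. */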
WPRINTF(("pci_xhci: read invalid offset %ld", offset)); } pthread_mutex_unlock(&sc->mtx); switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } return (value); } static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; struct xhci_trb evtrb; int error; assert(portn <= XHCI_MAX_DEVS); DPRINTF(("xhci reset port %d", portn)); port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC); port->portsc |= XHCI_PS_PED | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); if (warm && dev->dev_ue->ue_usbver == 3) { port->portsc |= XHCI_PS_WRC; } if ((port->portsc & XHCI_PS_PRC) == 0) { port->portsc |= XHCI_PS_PRC; pci_xhci_set_evtrb(&evtrb, portn, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 1); if (error != XHCI_TRB_ERROR_SUCCESS) DPRINTF(("xhci reset port insert event " "failed")); } } } static void pci_xhci_init_port(struct pci_xhci_softc *sc, int portn) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc = XHCI_PS_CCS | /* connected */ XHCI_PS_PP; /* port power */ if (dev->dev_ue->ue_usbver == 2) { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL) | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } else { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) | XHCI_PS_PED | /* enabled */ XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } DPRINTF(("Init port %d 0x%x", portn, port->portsc)); } else { port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP; DPRINTF(("Init empty port %d 0x%x", portn, port->portsc)); } } static int pci_xhci_dev_intr(struct usb_hci *hci, int epctx) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_trb evtrb; struct pci_xhci_softc *sc; struct pci_xhci_portregs *p; struct xhci_endp_ctx *ep_ctx; int error = 0; int dir_in; int epid; dir_in = epctx & 0x80; epid = epctx & ~0x80; /* HW endpoint contexts are 0-15; convert to epid based on dir */ epid = (epid * 2) + (dir_in ? 1 : 0); assert(epid >= 1 && epid <= 31); dev = hci->hci_sc; sc = dev->xsc; /* check if device is ready; OS has to initialise it */ if (sc->rtsregs.erstba_p == NULL || (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 || dev->dev_ctx == NULL) return (0); p = XHCI_PORTREG_PTR(sc, hci->hci_port); /* raise event if link U3 (suspended) state */ if (XHCI_PS_PLS_GET(p->portsc) == 3) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME); if ((p->portsc & XHCI_PS_PLC) != 0) return (0); p->portsc |= XHCI_PS_PLC; pci_xhci_set_evtrb(&evtrb, hci->hci_port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 0); if (error != XHCI_TRB_ERROR_SUCCESS) goto done; } dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) { DPRINTF(("xhci device interrupt on disabled endpoint %d", epid)); return (0); } DPRINTF(("xhci device interrupt on endpoint %d", epid)); pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0); done: return (error); } static int pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid, void *param) { DPRINTF(("xhci device event port %d", hci->hci_port)); return (0); } +/* + * Each controller contains a "slot" node which contains a list of + * child nodes each of which is a device. 
Each slot node's name + * corresponds to a specific controller slot. These nodes + * contain a "device" variable identifying the device model of the + * USB device. For example: + * + * pci.0.1.0 + * .device="xhci" + * .slot + * .1 + * .device="tablet" + */ +static int +pci_xhci_legacy_config(nvlist_t *nvl, const char *opts) +{ + char node_name[16]; + nvlist_t *slots_nvl, *slot_nvl; + char *cp, *opt, *str, *tofree; + int slot; + if (opts == NULL) + return (0); -static void -pci_xhci_device_usage(char *opt) -{ + slots_nvl = create_relative_config_node(nvl, "slot"); + slot = 1; + tofree = str = strdup(opts); + while ((opt = strsep(&str, ",")) != NULL) { + /* device[=] */ + cp = strchr(opt, '='); + if (cp != NULL) { + *cp = '\0'; + cp++; + } + + snprintf(node_name, sizeof(node_name), "%d", slot); + slot++; + slot_nvl = create_relative_config_node(slots_nvl, node_name); + set_config_value_node(slot_nvl, "device", opt); - EPRINTLN("Invalid USB emulation \"%s\"", opt); + /* + * NB: Given that we split on commas above, the legacy + * format only supports a single option. + */ + if (cp != NULL && *cp != '\0') + pci_parse_legacy_config(slot_nvl, cp); + } + free(tofree); + return (0); } static int -pci_xhci_parse_opts(struct pci_xhci_softc *sc, char *opts) +pci_xhci_parse_devices(struct pci_xhci_softc *sc, nvlist_t *nvl) { - struct pci_xhci_dev_emu **devices; struct pci_xhci_dev_emu *dev; struct usb_devemu *ue; - void *devsc; - char *uopt, *xopts, *config; - int usb3_port, usb2_port, i; + const nvlist_t *slots_nvl, *slot_nvl; + const char *name, *device; + char *cp; + void *devsc, *cookie; + long slot; + int type, usb3_port, usb2_port, i, ndevices; - uopt = NULL; - usb3_port = sc->usb3_port_start - 1; - usb2_port = sc->usb2_port_start - 1; - devices = NULL; + usb3_port = sc->usb3_port_start; + usb2_port = sc->usb2_port_start; - if (opts == NULL) - goto portsfinal; + sc->devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); + sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); - devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); + /* port and slot numbering start from 1 */ + sc->devices--; + sc->slots--; - sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); - sc->devices = devices; - sc->ndevices = 0; - - uopt = strdup(opts); - for (xopts = strtok(uopt, ","); - xopts != NULL; - xopts = strtok(NULL, ",")) { - if (usb2_port == ((sc->usb2_port_start-1) + XHCI_MAX_DEVS/2) || - usb3_port == ((sc->usb3_port_start-1) + XHCI_MAX_DEVS/2)) { + ndevices = 0; + + slots_nvl = find_relative_config_node(nvl, "slots"); + if (slots_nvl == NULL) + goto portsfinal; + + cookie = NULL; + while ((name = nvlist_next(slots_nvl, &type, &cookie)) != NULL) { + if (usb2_port == ((sc->usb2_port_start) + XHCI_MAX_DEVS/2) || + usb3_port == ((sc->usb3_port_start) + XHCI_MAX_DEVS/2)) { WPRINTF(("pci_xhci max number of USB 2 or 3 " "devices reached, max %d", XHCI_MAX_DEVS/2)); - usb2_port = usb3_port = -1; - goto done; + goto bad; } - /* device[=] */ - if ((config = strchr(xopts, '=')) == NULL) - config = ""; /* no config */ - else - *config++ = '\0'; + if (type != NV_TYPE_NVLIST) { + EPRINTLN( + "pci_xhci: config variable '%s' under slot node", + name); + goto bad; + } + + slot = strtol(name, &cp, 0); + if (*cp != '\0' || slot <= 0 || slot > XHCI_MAX_SLOTS) { + EPRINTLN("pci_xhci: invalid slot '%s'", name); + goto bad; + } + + if (XHCI_SLOTDEV_PTR(sc, slot) != NULL) { + EPRINTLN("pci_xhci: duplicate slot '%s'", name); + goto bad; + } - ue = usb_emu_finddev(xopts); 
+ slot_nvl = nvlist_get_nvlist(slots_nvl, name); + device = get_config_value_node(slot_nvl, "device"); + if (device == NULL) { + EPRINTLN( + "pci_xhci: missing \"device\" value for slot '%s'", + name); + goto bad; + } + + ue = usb_emu_finddev(device); if (ue == NULL) { - pci_xhci_device_usage(xopts); - DPRINTF(("pci_xhci device not found %s", xopts)); - usb2_port = usb3_port = -1; - goto done; + EPRINTLN("pci_xhci: unknown device model \"%s\"", + device); + goto bad; } - DPRINTF(("pci_xhci adding device %s, opts \"%s\"", - xopts, config)); + DPRINTF(("pci_xhci adding device %s", device)); dev = calloc(1, sizeof(struct pci_xhci_dev_emu)); dev->xsc = sc; dev->hci.hci_sc = dev; dev->hci.hci_intr = pci_xhci_dev_intr; dev->hci.hci_event = pci_xhci_dev_event; if (ue->ue_usbver == 2) { - dev->hci.hci_port = usb2_port + 1; - devices[usb2_port] = dev; + if (usb2_port == sc->usb2_port_start + + XHCI_MAX_DEVS / 2) { + WPRINTF(("pci_xhci max number of USB 2 devices " + "reached, max %d", XHCI_MAX_DEVS / 2)); + goto bad; + } + dev->hci.hci_port = usb2_port; usb2_port++; } else { - dev->hci.hci_port = usb3_port + 1; - devices[usb3_port] = dev; + if (usb3_port == sc->usb3_port_start + + XHCI_MAX_DEVS / 2) { + WPRINTF(("pci_xhci max number of USB 3 devices " + "reached, max %d", XHCI_MAX_DEVS / 2)); + goto bad; + } + dev->hci.hci_port = usb3_port; usb3_port++; } + XHCI_DEVINST_PTR(sc, dev->hci.hci_port) = dev; dev->hci.hci_address = 0; - devsc = ue->ue_init(&dev->hci, config); + devsc = ue->ue_init(&dev->hci, nvl); if (devsc == NULL) { - pci_xhci_device_usage(xopts); - usb2_port = usb3_port = -1; - goto done; + goto bad; } dev->dev_ue = ue; dev->dev_sc = devsc; - /* assign slot number to device */ - sc->slots[sc->ndevices] = dev; - - sc->ndevices++; + XHCI_SLOTDEV_PTR(sc, slot) = dev; + ndevices++; } portsfinal: sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs)); + sc->portregs--; - if (sc->ndevices > 0) { - /* port and slot numbering start from 1 */ - sc->devices--; - sc->portregs--; - sc->slots--; - + if (ndevices > 0) { for (i = 1; i <= XHCI_MAX_DEVS; i++) { pci_xhci_init_port(sc, i); } } else { WPRINTF(("pci_xhci no USB devices configured")); - sc->ndevices = 1; } + return (0); -done: - if (devices != NULL) { - if (usb2_port <= 0 && usb3_port <= 0) { - sc->devices = NULL; - for (i = 0; devices[i] != NULL; i++) - free(devices[i]); - sc->ndevices = -1; - - free(devices); - } +bad: + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + free(XHCI_DEVINST_PTR(sc, i)); } - free(uopt); - return (sc->ndevices); + + free(sc->devices + 1); + free(sc->slots + 1); + + return (-1); } static int -pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { struct pci_xhci_softc *sc; int error; if (xhci_in_use) { WPRINTF(("pci_xhci controller already defined")); return (-1); } xhci_in_use = 1; sc = calloc(1, sizeof(struct pci_xhci_softc)); pi->pi_arg = sc; sc->xsc_pi = pi; sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1; sc->usb3_port_start = 1; /* discover devices */ - error = pci_xhci_parse_opts(sc, opts); + error = pci_xhci_parse_devices(sc, nvl); if (error < 0) goto done; else error = 0; sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) | XHCI_SET_HCIVERSION(0x0100); sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) | XHCI_SET_HCSP1_MAXINTR(1) | /* interrupters */ XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS); sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | XHCI_SET_HCSP2_IST(0x04); sc->hcsparams3 = 0; /* no latency */ sc->hccparams1 =
XHCI_SET_HCCP1_AC64(1) | /* 64-bit addrs */ XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ XHCI_SET_HCCP1_SPC(1) | /* short packet */ XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | XHCI_SET_HCCP2_U3C(1); sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START + XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs)); /* dboff must be 32-bit aligned */ if (sc->dboff & 0x3) sc->dboff = (sc->dboff + 0x3) & ~0x3; /* rtsoff must be 32-bytes aligned */ sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32); if (sc->rtsoff & 0x1F) sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F; DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x", sc->dboff, sc->rtsoff)); sc->opregs.usbsts = XHCI_STS_HCH; sc->opregs.pgsz = XHCI_PAGESIZE_4K; pci_xhci_reset(sc); sc->regsend = sc->rtsoff + 0x20 + 32; /* only 1 intrpter */ /* * Set extended capabilities pointer to be after regsend; * value of xecp field is 32-bit offset. */ sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB); pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI); pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0); pci_emul_add_msicap(pi, 1); /* regsend + xecp registers */ pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32); DPRINTF(("pci_xhci pci_emu_alloc: %d", sc->regsend + 4*32)); pci_lintr_request(pi); pthread_mutex_init(&sc->mtx, NULL); done: if (error) { free(sc); } return (error); } #ifdef BHYVE_SNAPSHOT static void pci_xhci_map_devs_slots(struct pci_xhci_softc *sc, int maps[]) { int i, j; struct pci_xhci_dev_emu *dev, *slot; memset(maps, 0, sizeof(maps[0]) * XHCI_MAX_SLOTS); for (i = 1; i <= XHCI_MAX_SLOTS; i++) { for (j = 1; j <= XHCI_MAX_DEVS; j++) { slot = XHCI_SLOTDEV_PTR(sc, i); dev = XHCI_DEVINST_PTR(sc, j); if (slot == dev) maps[i] = j; } } } static int pci_xhci_snapshot_ep(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, int idx, struct vm_snapshot_meta *meta) { int k; int ret; struct usb_data_xfer *xfer; struct usb_data_xfer_block *xfer_block; /* some sanity checks */ if (meta->op == VM_SNAPSHOT_SAVE) xfer = dev->eps[idx].ep_xfer; SNAPSHOT_VAR_OR_LEAVE(xfer, meta, ret, done); if (xfer == NULL) { ret = 0; goto done; } if (meta->op == VM_SNAPSHOT_RESTORE) { pci_xhci_init_ep(dev, idx); xfer = dev->eps[idx].ep_xfer; } /* save / restore proper */ for (k = 0; k < USB_MAX_XFER_BLOCKS; k++) { xfer_block = &xfer->data[k]; SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(xfer_block->buf, XHCI_GADDR_SIZE(xfer_block->buf), true, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->blen, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->bdone, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->processed, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->hci_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->ccs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->streamid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->trbnext, meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(xfer->ureq, meta, ret, done); if (xfer->ureq) { /* xfer->ureq is not allocated at restore time */ if (meta->op == VM_SNAPSHOT_RESTORE) xfer->ureq = malloc(sizeof(struct usb_device_request)); SNAPSHOT_BUF_OR_LEAVE(xfer->ureq, sizeof(struct usb_device_request), meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(xfer->ndata, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer->head, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer->tail, meta, ret, done); 
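/* Note: xfer->ureq is host-heap memory (unlike the guest-backed data buffers), which is why it was re-allocated above on restore before its contents were copied back in. */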
done: return (ret); } static int pci_xhci_snapshot(struct vm_snapshot_meta *meta) { int i, j; int ret; int restore_idx; struct pci_devinst *pi; struct pci_xhci_softc *sc; struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; char dname[SNAP_DEV_NAME_LEN]; int maps[XHCI_MAX_SLOTS + 1]; pi = meta->dev_data; sc = pi->pi_arg; SNAPSHOT_VAR_OR_LEAVE(sc->caplength, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams1, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams3, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hccparams1, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->dboff, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsoff, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hccparams2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->regsend, meta, ret, done); /* opregs */ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbcmd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbsts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.pgsz, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dnctrl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.crcr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dcbaap, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.config, meta, ret, done); /* opregs.cr_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.cr_p, XHCI_GADDR_SIZE(sc->opregs.cr_p), true, meta, ret, done); /* opregs.dcbaa_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.dcbaa_p, XHCI_GADDR_SIZE(sc->opregs.dcbaa_p), true, meta, ret, done); /* rtsregs */ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.mfindex, meta, ret, done); /* rtsregs.intrreg */ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.iman, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.imod, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstsz, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.rsvd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstba, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erdp, meta, ret, done); /* rtsregs.erstba_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erstba_p, XHCI_GADDR_SIZE(sc->rtsregs.erstba_p), true, meta, ret, done); /* rtsregs.erst_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erst_p, XHCI_GADDR_SIZE(sc->rtsregs.erst_p), true, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_deq_seg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_idx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_seg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_events_cnt, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.event_pcs, meta, ret, done); /* sanity checking */ for (i = 1; i <= XHCI_MAX_DEVS; i++) { dev = XHCI_DEVINST_PTR(sc, i); if (dev == NULL) continue; if (meta->op == VM_SNAPSHOT_SAVE) restore_idx = i; SNAPSHOT_VAR_OR_LEAVE(restore_idx, meta, ret, done); /* check if the restored device (when restoring) is sane */ if (restore_idx != i) { fprintf(stderr, "%s: idx not matching: actual: %d, " "expected: %d\r\n", __func__, restore_idx, i); ret = EINVAL; goto done; } if (meta->op == VM_SNAPSHOT_SAVE) { memset(dname, 0, sizeof(dname)); strncpy(dname, dev->dev_ue->ue_emu, sizeof(dname) - 1); } SNAPSHOT_BUF_OR_LEAVE(dname, sizeof(dname), meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) { dname[sizeof(dname) - 1] = '\0'; if (strcmp(dev->dev_ue->ue_emu, dname)) { fprintf(stderr, "%s: device names mismatch: " "actual: %s, expected: %s\r\n", __func__, dname, dev->dev_ue->ue_emu); ret = EINVAL; goto done; } } } /* portregs */ for (i = 1; i <= XHCI_MAX_DEVS; i++) { port = 
XHCI_PORTREG_PTR(sc, i); dev = XHCI_DEVINST_PTR(sc, i); if (dev == NULL) continue; SNAPSHOT_VAR_OR_LEAVE(port->portsc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->portpmsc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->portli, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->porthlpmc, meta, ret, done); } /* slots */ if (meta->op == VM_SNAPSHOT_SAVE) pci_xhci_map_devs_slots(sc, maps); for (i = 1; i <= XHCI_MAX_SLOTS; i++) { SNAPSHOT_VAR_OR_LEAVE(maps[i], meta, ret, done); if (meta->op == VM_SNAPSHOT_SAVE) { dev = XHCI_SLOTDEV_PTR(sc, i); } else if (meta->op == VM_SNAPSHOT_RESTORE) { if (maps[i] != 0) dev = XHCI_DEVINST_PTR(sc, maps[i]); else dev = NULL; XHCI_SLOTDEV_PTR(sc, i) = dev; } else { /* error */ ret = EINVAL; goto done; } if (dev == NULL) continue; SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(dev->dev_ctx, XHCI_GADDR_SIZE(dev->dev_ctx), true, meta, ret, done); if (dev->dev_ctx != NULL) { for (j = 1; j < XHCI_MAX_ENDPOINTS; j++) { ret = pci_xhci_snapshot_ep(sc, dev, j, meta); if (ret != 0) goto done; } } SNAPSHOT_VAR_OR_LEAVE(dev->dev_slotstate, meta, ret, done); /* devices[i]->dev_sc */ dev->dev_ue->ue_snapshot(dev->dev_sc, meta); /* devices[i]->hci */ SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_address, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_port, meta, ret, done); } - SNAPSHOT_VAR_OR_LEAVE(sc->ndevices, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->usb2_port_start, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->usb3_port_start, meta, ret, done); done: return (ret); } #endif struct pci_devemu pci_de_xhci = { .pe_emu = "xhci", .pe_init = pci_xhci_init, + .pe_legacy_config = pci_xhci_legacy_config, .pe_barwrite = pci_xhci_write, .pe_barread = pci_xhci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_xhci_snapshot, #endif }; PCI_EMUL_SET(pci_de_xhci); diff --git a/usr.sbin/bhyve/pctestdev.c b/usr.sbin/bhyve/pctestdev.c index be445e5c75ae..49e03b2a4343 100644 --- a/usr.sbin/bhyve/pctestdev.c +++ b/usr.sbin/bhyve/pctestdev.c @@ -1,270 +1,261 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Adam Fenn * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
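The restore_idx and dname checks in pci_xhci_snapshot follow a general save/verify pattern: on save, emit a tag identifying what comes next; on restore, read the tag back and fail with EINVAL on mismatch rather than deserializing state into the wrong device. A toy sketch of the pattern against an in-memory stream (the stream helpers are stand-ins, not the real SNAPSHOT_* machinery):

#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Toy in-memory stream standing in for the snapshot buffer. */
static unsigned char stream[256];
static size_t wpos, rpos;

static void put(const void *b, size_t l) { memcpy(stream + wpos, b, l); wpos += l; }
static void get(void *b, size_t l) { memcpy(b, stream + rpos, l); rpos += l; }

#define TAGLEN 16

static void
save_tag(const char *name)
{
	char tag[TAGLEN] = { 0 };

	strncpy(tag, name, TAGLEN - 1);
	put(tag, TAGLEN);
}

/* Returns 0 if the next tag matches, EINVAL otherwise. */
static int
check_tag(const char *name)
{
	char tag[TAGLEN];

	get(tag, TAGLEN);
	tag[TAGLEN - 1] = '\0';
	if (strcmp(tag, name) != 0) {
		fprintf(stderr, "mismatch: got %s, expected %s\n", tag, name);
		return (EINVAL);
	}
	return (0);
}

int
main(void)
{
	save_tag("umouse");
	assert(check_tag("umouse") == 0);
	return (0);
}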
*/ /* * Emulation of selected legacy test/debug interfaces expected by KVM-unit-tests */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "debug.h" #include "inout.h" #include "mem.h" #include "pctestdev.h" #define DEBUGEXIT_BASE 0xf4 #define DEBUGEXIT_LEN 4 #define DEBUGEXIT_NAME "isa-debug-exit" #define IOMEM_BASE 0xff000000 #define IOMEM_LEN 0x10000 #define IOMEM_NAME "pc-testdev-iomem" #define IOPORT_BASE 0xe0 #define IOPORT_LEN 4 #define IOPORT_NAME "pc-testdev-ioport" #define IRQ_BASE 0x2000 #define IRQ_IOAPIC_PINCOUNT_MIN 24 #define IRQ_IOAPIC_PINCOUNT_MAX 32 #define IRQ_NAME "pc-testdev-irq-line" #define PCTESTDEV_NAME "pc-testdev" static bool pctestdev_inited; static uint8_t pctestdev_iomem_buf[IOMEM_LEN]; static uint32_t pctestdev_ioport_data; static int pctestdev_debugexit_io(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg); static int pctestdev_iomem_io(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2); static int pctestdev_ioport_io(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg); static int pctestdev_irq_io(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg); const char * pctestdev_getname(void) { return (PCTESTDEV_NAME); } -int -pctestdev_parse(const char *opts) -{ - if (opts != NULL && *opts != '\0') - return (-1); - - return (0); -} - int pctestdev_init(struct vmctx *ctx) { struct mem_range iomem; struct inout_port debugexit, ioport, irq; int err, pincount; if (pctestdev_inited) { EPRINTLN("Only one pc-testdev device is allowed."); return (-1); } err = vm_ioapic_pincount(ctx, &pincount); if (err != 0) { EPRINTLN("pc-testdev: Failed to obtain IOAPIC pin count."); return (-1); } if (pincount < IRQ_IOAPIC_PINCOUNT_MIN || pincount > IRQ_IOAPIC_PINCOUNT_MAX) { EPRINTLN("pc-testdev: Unsupported IOAPIC pin count: %d.", pincount); return (-1); } debugexit.name = DEBUGEXIT_NAME; debugexit.port = DEBUGEXIT_BASE; debugexit.size = DEBUGEXIT_LEN; debugexit.flags = IOPORT_F_INOUT; debugexit.handler = pctestdev_debugexit_io; debugexit.arg = NULL; iomem.name = IOMEM_NAME; iomem.flags = MEM_F_RW | MEM_F_IMMUTABLE; iomem.handler = pctestdev_iomem_io; iomem.arg1 = NULL; iomem.arg2 = 0; iomem.base = IOMEM_BASE; iomem.size = IOMEM_LEN; ioport.name = IOPORT_NAME; ioport.port = IOPORT_BASE; ioport.size = IOPORT_LEN; ioport.flags = IOPORT_F_INOUT; ioport.handler = pctestdev_ioport_io; ioport.arg = NULL; irq.name = IRQ_NAME; irq.port = IRQ_BASE; irq.size = pincount; irq.flags = IOPORT_F_INOUT; irq.handler = pctestdev_irq_io; irq.arg = NULL; err = register_inout(&debugexit); if (err != 0) goto fail; err = register_inout(&ioport); if (err != 0) goto fail_after_debugexit_reg; err = register_inout(&irq); if (err != 0) goto fail_after_ioport_reg; err = register_mem(&iomem); if (err != 0) goto fail_after_irq_reg; pctestdev_inited = true; return (0); fail_after_irq_reg: (void)unregister_inout(&irq); fail_after_ioport_reg: (void)unregister_inout(&ioport); fail_after_debugexit_reg: (void)unregister_inout(&debugexit); fail: return (err); } static int pctestdev_debugexit_io(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { if (in) *eax = 0; else exit((*eax << 1) | 1); return (0); } static int pctestdev_iomem_io(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { uint64_t offset; if (addr + size > 
IOMEM_BASE + IOMEM_LEN) return (-1); offset = addr - IOMEM_BASE; if (dir == MEM_F_READ) { (void)memcpy(val, pctestdev_iomem_buf + offset, size); } else { assert(dir == MEM_F_WRITE); (void)memcpy(pctestdev_iomem_buf + offset, val, size); } return (0); } static int pctestdev_ioport_io(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { uint32_t mask; int lsb; if (port + bytes > IOPORT_BASE + IOPORT_LEN) return (-1); lsb = (port & 0x3) * 8; mask = (-1UL >> (32 - (bytes * 8))) << lsb; if (in) *eax = (pctestdev_ioport_data & mask) >> lsb; else { pctestdev_ioport_data &= ~mask; pctestdev_ioport_data |= *eax << lsb; } return (0); } static int pctestdev_irq_io(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { int irq; if (bytes != 1) return (-1); if (in) { *eax = 0; return (0); } else { irq = port - IRQ_BASE; if (irq < 16) { if (*eax) return (vm_isa_assert_irq(ctx, irq, irq)); else return (vm_isa_deassert_irq(ctx, irq, irq)); } else { if (*eax) return (vm_ioapic_assert_irq(ctx, irq)); else return (vm_ioapic_deassert_irq(ctx, irq)); } } } diff --git a/usr.sbin/bhyve/pctestdev.h b/usr.sbin/bhyve/pctestdev.h index c1c940146e05..5808abe6e356 100644 --- a/usr.sbin/bhyve/pctestdev.h +++ b/usr.sbin/bhyve/pctestdev.h @@ -1,43 +1,42 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Adam Fenn * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Emulation of selected legacy test/debug interfaces expected by KVM-unit-tests */ #ifndef _PCTESTDEV_H_ #define _PCTESTDEV_H_ struct vmctx; const char *pctestdev_getname(void); int pctestdev_init(struct vmctx *ctx); -int pctestdev_parse(const char *opts); #endif diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c index 09ca3f61ae79..0f63156adb9b 100644 --- a/usr.sbin/bhyve/rtc.c +++ b/usr.sbin/bhyve/rtc.c @@ -1,131 +1,132 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
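pctestdev_ioport_io above exposes a single 32-bit backing word through a 4-byte I/O window, so a 1- or 2-byte access at a sub-offset has to be shifted and masked into the right byte lanes. A freestanding sketch of the same masking arithmetic:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t reg;	/* the 32-bit backing word */

/* offset is 0-3, bytes is 1, 2, or 4, offset + bytes <= 4 */
static uint32_t
lane_read(int offset, int bytes)
{
	int lsb = offset * 8;
	uint32_t mask = (uint32_t)(0xffffffffu >> (32 - bytes * 8)) << lsb;

	return ((reg & mask) >> lsb);
}

static void
lane_write(int offset, int bytes, uint32_t val)
{
	int lsb = offset * 8;
	uint32_t mask = (uint32_t)(0xffffffffu >> (32 - bytes * 8)) << lsb;

	reg = (reg & ~mask) | ((val << lsb) & mask);
}

int
main(void)
{
	lane_write(0, 4, 0x11223344);
	lane_write(1, 1, 0xaa);			/* patch byte 1 only */
	assert(lane_read(0, 4) == 0x1122aa44);
	printf("%#x\n", lane_read(2, 2));	/* 0x1122 */
	return (0);
}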
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "acpi.h" +#include "config.h" #include "pci_lpc.h" #include "rtc.h" #define IO_RTC 0x70 #define RTC_LMEM_LSB 0x34 #define RTC_LMEM_MSB 0x35 #define RTC_HMEM_LSB 0x5b #define RTC_HMEM_SB 0x5c #define RTC_HMEM_MSB 0x5d #define m_64KB (64*1024) #define m_16MB (16*1024*1024) #define m_4GB (4ULL*1024*1024*1024) /* * Returns the current RTC time as number of seconds since 00:00:00 Jan 1, 1970 */ static time_t -rtc_time(struct vmctx *ctx, int use_localtime) +rtc_time(struct vmctx *ctx) { struct tm tm; time_t t; time(&t); - if (use_localtime) { + if (get_config_bool_default("rtc.use_localtime", true)) { localtime_r(&t, &tm); t = timegm(&tm); } return (t); } void -rtc_init(struct vmctx *ctx, int use_localtime) +rtc_init(struct vmctx *ctx) { size_t himem; size_t lomem; int err; /* XXX init diag/reset code/equipment/checksum ? */ /* * Report guest memory size in nvram cells as required by UEFI. * Little-endian encoding. * 0x34/0x35 - 64KB chunks above 16MB, below 4GB * 0x5b/0x5c/0x5d - 64KB chunks above 4GB */ lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB; err = vm_rtc_write(ctx, RTC_LMEM_LSB, lomem); assert(err == 0); err = vm_rtc_write(ctx, RTC_LMEM_MSB, lomem >> 8); assert(err == 0); himem = vm_get_highmem_size(ctx) / m_64KB; err = vm_rtc_write(ctx, RTC_HMEM_LSB, himem); assert(err == 0); err = vm_rtc_write(ctx, RTC_HMEM_SB, himem >> 8); assert(err == 0); err = vm_rtc_write(ctx, RTC_HMEM_MSB, himem >> 16); assert(err == 0); - err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime)); + err = vm_rtc_settime(ctx, rtc_time(ctx)); assert(err == 0); } static void rtc_dsdt(void) { dsdt_line(""); dsdt_line("Device (RTC)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0B00\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_RTC, 2); dsdt_fixed_irq(8); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } LPC_DSDT(rtc_dsdt); /* * Reserve the extended RTC I/O ports although they are not emulated at this * time. */ SYSRES_IO(0x72, 6); diff --git a/usr.sbin/bhyve/rtc.h b/usr.sbin/bhyve/rtc.h index 1c108eed9969..c8b3572e93c6 100644 --- a/usr.sbin/bhyve/rtc.h +++ b/usr.sbin/bhyve/rtc.h @@ -1,36 +1,36 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
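The rtc.use_localtime path in rtc_time is a deliberate reinterpretation: localtime_r breaks the UTC timestamp into local wall-clock fields, and timegm converts those fields back as if they were UTC, shifting the result by the host's UTC offset so the guest RTC reads local time. A standalone sketch (timegm is a BSD/glibc extension, as used in the source):

#include <stdio.h>
#include <time.h>

int
main(void)
{
	struct tm tm;
	time_t t, shifted;

	time(&t);
	localtime_r(&t, &tm);	/* local wall-clock fields */
	shifted = timegm(&tm);	/* ...re-read as if they were UTC */

	/* shifted - t equals the host's offset from UTC (incl. DST). */
	printf("offset: %ld seconds\n", (long)(shifted - t));
	return (0);
}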
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _RTC_H_ #define _RTC_H_ -void rtc_init(struct vmctx *ctx, int use_localtime); +void rtc_init(struct vmctx *ctx); #endif /* _RTC_H_ */ diff --git a/usr.sbin/bhyve/smbiostbl.c b/usr.sbin/bhyve/smbiostbl.c index d9f8a649c710..716fb48178c1 100644 --- a/usr.sbin/bhyve/smbiostbl.c +++ b/usr.sbin/bhyve/smbiostbl.c @@ -1,863 +1,868 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" +#include "config.h" #include "debug.h" #include "smbiostbl.h" #define MB (1024*1024) #define GB (1024ULL*1024*1024) #define SMBIOS_BASE 0xF1000 #define FIRMWARE_VERSION "13.0" /* The SMBIOS specification defines the date format to be mm/dd/yyyy */ #define FIRMWARE_RELEASE_DATE "11/10/2020" /* BHYVE_ACPI_BASE - SMBIOS_BASE) */ #define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000) #define SMBIOS_TYPE_BIOS 0 #define SMBIOS_TYPE_SYSTEM 1 #define SMBIOS_TYPE_CHASSIS 3 #define SMBIOS_TYPE_PROCESSOR 4 #define SMBIOS_TYPE_MEMARRAY 16 #define SMBIOS_TYPE_MEMDEVICE 17 #define SMBIOS_TYPE_MEMARRAYMAP 19 #define SMBIOS_TYPE_BOOT 32 #define SMBIOS_TYPE_EOT 127 struct smbios_structure { uint8_t type; uint8_t length; uint16_t handle; } __packed; typedef int (*initializer_func_t)(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size); struct smbios_template_entry { struct smbios_structure *entry; const char **strings; initializer_func_t initializer; }; /* * SMBIOS Structure Table Entry Point */ #define SMBIOS_ENTRY_EANCHOR "_SM_" #define SMBIOS_ENTRY_EANCHORLEN 4 #define SMBIOS_ENTRY_IANCHOR "_DMI_" #define SMBIOS_ENTRY_IANCHORLEN 5 struct smbios_entry_point { char eanchor[4]; /* anchor tag */ uint8_t echecksum; /* checksum of entry point structure */ uint8_t eplen; /* length in bytes of entry point */ uint8_t major; /* major version of the SMBIOS spec */ uint8_t minor; /* minor version of the SMBIOS spec */ uint16_t maxssize; /* maximum size in bytes of a struct */ uint8_t revision; /* entry point structure revision */ uint8_t format[5]; /* entry point rev-specific data */ char ianchor[5]; /* intermediate anchor tag */ uint8_t ichecksum; /* intermediate checksum */ uint16_t stlen; /* len in bytes of structure table */ uint32_t staddr; /* physical addr of structure table */ uint16_t stnum; /* number of structure table entries */ uint8_t bcdrev; /* BCD value representing DMI ver */ } __packed; /* * BIOS Information */ #define SMBIOS_FL_ISA 0x00000010 /* ISA is supported */ #define SMBIOS_FL_PCI 0x00000080 /* PCI is supported */ #define SMBIOS_FL_SHADOW 0x00001000 /* BIOS shadowing is allowed */ #define SMBIOS_FL_CDBOOT 0x00008000 /* Boot from CD is supported */ #define SMBIOS_FL_SELBOOT 0x00010000 /* Selectable Boot supported */ #define SMBIOS_FL_EDD 0x00080000 /* EDD Spec is supported */ #define SMBIOS_XB1_FL_ACPI 0x00000001 /* ACPI is supported */ #define SMBIOS_XB2_FL_BBS 0x00000001 /* BIOS Boot Specification */ #define SMBIOS_XB2_FL_VM 0x00000010 /* Virtual Machine */ struct smbios_table_type0 { struct smbios_structure header; uint8_t vendor; /* vendor string */ uint8_t version; /* version string */ uint16_t segment; /* address segment location */ uint8_t rel_date; /* release date */ uint8_t size; /* rom size */ uint64_t cflags; /* characteristics */ uint8_t xc_bytes[2]; /* characteristics ext bytes */ uint8_t sb_major_rel; /* system bios version */ uint8_t sb_minor_rele; uint8_t ecfw_major_rel; /* embedded ctrl fw version */ uint8_t ecfw_minor_rel; } __packed; /* * System Information */ #define SMBIOS_WAKEUP_SWITCH 0x06 /* power switch */ struct smbios_table_type1 { struct smbios_structure header; uint8_t manufacturer; /* manufacturer string */ uint8_t product; /* product name string */ uint8_t version; /* version string */ uint8_t serial; /* serial number string */ uint8_t uuid[16]; /* 
uuid byte array */ uint8_t wakeup; /* wake-up event */ uint8_t sku; /* sku number string */ uint8_t family; /* family name string */ } __packed; /* * System Enclosure or Chassis */ #define SMBIOS_CHT_UNKNOWN 0x02 /* unknown */ #define SMBIOS_CHST_SAFE 0x03 /* safe */ #define SMBIOS_CHSC_NONE 0x03 /* none */ struct smbios_table_type3 { struct smbios_structure header; uint8_t manufacturer; /* manufacturer string */ uint8_t type; /* type */ uint8_t version; /* version string */ uint8_t serial; /* serial number string */ uint8_t asset; /* asset tag string */ uint8_t bustate; /* boot-up state */ uint8_t psstate; /* power supply state */ uint8_t tstate; /* thermal state */ uint8_t security; /* security status */ uint8_t uheight; /* height in 'u's */ uint8_t cords; /* number of power cords */ uint8_t elems; /* number of element records */ uint8_t elemlen; /* length of records */ uint8_t sku; /* sku number string */ } __packed; /* * Processor Information */ #define SMBIOS_PRT_CENTRAL 0x03 /* central processor */ #define SMBIOS_PRF_OTHER 0x01 /* other */ #define SMBIOS_PRS_PRESENT 0x40 /* socket is populated */ #define SMBIOS_PRS_ENABLED 0x1 /* enabled */ #define SMBIOS_PRU_NONE 0x06 /* none */ #define SMBIOS_PFL_64B 0x04 /* 64-bit capable */ struct smbios_table_type4 { struct smbios_structure header; uint8_t socket; /* socket designation string */ uint8_t type; /* processor type */ uint8_t family; /* processor family */ uint8_t manufacturer; /* manufacturer string */ uint64_t cpuid; /* processor cpuid */ uint8_t version; /* version string */ uint8_t voltage; /* voltage */ uint16_t clkspeed; /* ext clock speed in mhz */ uint16_t maxspeed; /* maximum speed in mhz */ uint16_t curspeed; /* current speed in mhz */ uint8_t status; /* status */ uint8_t upgrade; /* upgrade */ uint16_t l1handle; /* l1 cache handle */ uint16_t l2handle; /* l2 cache handle */ uint16_t l3handle; /* l3 cache handle */ uint8_t serial; /* serial number string */ uint8_t asset; /* asset tag string */ uint8_t part; /* part number string */ uint8_t cores; /* cores per socket */ uint8_t ecores; /* enabled cores */ uint8_t threads; /* threads per socket */ uint16_t cflags; /* processor characteristics */ uint16_t family2; /* processor family 2 */ } __packed; /* * Physical Memory Array */ #define SMBIOS_MAL_SYSMB 0x03 /* system board or motherboard */ #define SMBIOS_MAU_SYSTEM 0x03 /* system memory */ #define SMBIOS_MAE_NONE 0x03 /* none */ struct smbios_table_type16 { struct smbios_structure header; uint8_t location; /* physical device location */ uint8_t use; /* device functional purpose */ uint8_t ecc; /* err detect/correct method */ uint32_t size; /* max mem capacity in kb */ uint16_t errhand; /* handle of error (if any) */ uint16_t ndevs; /* num of slots or sockets */ uint64_t xsize; /* max mem capacity in bytes */ } __packed; /* * Memory Device */ #define SMBIOS_MDFF_UNKNOWN 0x02 /* unknown */ #define SMBIOS_MDT_UNKNOWN 0x02 /* unknown */ #define SMBIOS_MDF_UNKNOWN 0x0004 /* unknown */ struct smbios_table_type17 { struct smbios_structure header; uint16_t arrayhand; /* handle of physl mem array */ uint16_t errhand; /* handle of mem error data */ uint16_t twidth; /* total width in bits */ uint16_t dwidth; /* data width in bits */ uint16_t size; /* size in kb or mb */ uint8_t form; /* form factor */ uint8_t set; /* set */ uint8_t dloc; /* device locator string */ uint8_t bloc; /* phys bank locator string */ uint8_t type; /* memory type */ uint16_t flags; /* memory characteristics */ uint16_t maxspeed; /* maximum speed in mhz */ 
uint8_t manufacturer; /* manufacturer string */ uint8_t serial; /* serial number string */ uint8_t asset; /* asset tag string */ uint8_t part; /* part number string */ uint8_t attributes; /* attributes */ uint32_t xsize; /* extended size in mb */ uint16_t curspeed; /* current speed in mhz */ uint16_t minvoltage; /* minimum voltage */ uint16_t maxvoltage; /* maximum voltage */ uint16_t curvoltage; /* configured voltage */ } __packed; /* * Memory Array Mapped Address */ struct smbios_table_type19 { struct smbios_structure header; uint32_t saddr; /* start phys addr in kb */ uint32_t eaddr; /* end phys addr in kb */ uint16_t arrayhand; /* physical mem array handle */ uint8_t width; /* num of dev in row */ uint64_t xsaddr; /* start phys addr in bytes */ uint64_t xeaddr; /* end phys addr in bytes */ } __packed; /* * System Boot Information */ #define SMBIOS_BOOT_NORMAL 0 /* no errors detected */ struct smbios_table_type32 { struct smbios_structure header; uint8_t reserved[6]; uint8_t status; /* boot status */ } __packed; /* * End-of-Table */ struct smbios_table_type127 { struct smbios_structure header; } __packed; struct smbios_table_type0 smbios_type0_template = { { SMBIOS_TYPE_BIOS, sizeof (struct smbios_table_type0), 0 }, 1, /* bios vendor string */ 2, /* bios version string */ 0xF000, /* bios address segment location */ 3, /* bios release date */ 0x0, /* bios size (64k * (n + 1) is the size in bytes) */ SMBIOS_FL_ISA | SMBIOS_FL_PCI | SMBIOS_FL_SHADOW | SMBIOS_FL_CDBOOT | SMBIOS_FL_EDD, { SMBIOS_XB1_FL_ACPI, SMBIOS_XB2_FL_BBS | SMBIOS_XB2_FL_VM }, 0x0, /* bios major release */ 0x0, /* bios minor release */ 0xff, /* embedded controller firmware major release */ 0xff /* embedded controller firmware minor release */ }; const char *smbios_type0_strings[] = { "BHYVE", /* vendor string */ FIRMWARE_VERSION, /* bios version string */ FIRMWARE_RELEASE_DATE, /* bios release date string */ NULL }; struct smbios_table_type1 smbios_type1_template = { { SMBIOS_TYPE_SYSTEM, sizeof (struct smbios_table_type1), 0 }, 1, /* manufacturer string */ 2, /* product string */ 3, /* version string */ 4, /* serial number string */ { 0 }, SMBIOS_WAKEUP_SWITCH, 5, /* sku string */ 6 /* family string */ }; static int smbios_type1_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size); const char *smbios_type1_strings[] = { "FreeBSD", /* manufacturer string */ "BHYVE", /* product name string */ "1.0", /* version string */ "None", /* serial number string */ "None", /* sku string */ "Virtual Machine", /* family name string */ NULL }; struct smbios_table_type3 smbios_type3_template = { { SMBIOS_TYPE_CHASSIS, sizeof (struct smbios_table_type3), 0 }, 1, /* manufacturer string */ SMBIOS_CHT_UNKNOWN, 2, /* version string */ 3, /* serial number string */ 4, /* asset tag string */ SMBIOS_CHST_SAFE, SMBIOS_CHST_SAFE, SMBIOS_CHST_SAFE, SMBIOS_CHSC_NONE, 0, /* height in 'u's (0=enclosure height unspecified) */ 0, /* number of power cords (0=number unspecified) */ 0, /* number of contained element records */ 0, /* length of records */ 5 /* sku number string */ }; const char *smbios_type3_strings[] = { "FreeBSD", /* manufacturer string */ "1.0", /* version string */ "None", /* serial number string */ "None", /* asset tag string */ "None", /* sku number string */ NULL }; struct smbios_table_type4 smbios_type4_template = { { SMBIOS_TYPE_PROCESSOR, sizeof (struct smbios_table_type4), 0 }, 1, /* socket designation string */ SMBIOS_PRT_CENTRAL, 
SMBIOS_PRF_OTHER, 2, /* manufacturer string */ 0, /* cpuid */ 3, /* version string */ 0, /* voltage */ 0, /* external clock frequency in mhz (0=unknown) */ 0, /* maximum frequency in mhz (0=unknown) */ 0, /* current frequency in mhz (0=unknown) */ SMBIOS_PRS_PRESENT | SMBIOS_PRS_ENABLED, SMBIOS_PRU_NONE, -1, /* l1 cache handle */ -1, /* l2 cache handle */ -1, /* l3 cache handle */ 4, /* serial number string */ 5, /* asset tag string */ 6, /* part number string */ 0, /* cores per socket (0=unknown) */ 0, /* enabled cores per socket (0=unknown) */ 0, /* threads per socket (0=unknown) */ SMBIOS_PFL_64B, SMBIOS_PRF_OTHER }; const char *smbios_type4_strings[] = { " ", /* socket designation string */ " ", /* manufacturer string */ " ", /* version string */ "None", /* serial number string */ "None", /* asset tag string */ "None", /* part number string */ NULL }; static int smbios_type4_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size); struct smbios_table_type16 smbios_type16_template = { { SMBIOS_TYPE_MEMARRAY, sizeof (struct smbios_table_type16), 0 }, SMBIOS_MAL_SYSMB, SMBIOS_MAU_SYSTEM, SMBIOS_MAE_NONE, 0x80000000, /* max mem capacity in kb (0x80000000=use extended) */ -1, /* handle of error (if any) */ 0, /* number of slots or sockets (TBD) */ 0 /* extended maximum memory capacity in bytes (TBD) */ }; static int smbios_type16_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size); struct smbios_table_type17 smbios_type17_template = { { SMBIOS_TYPE_MEMDEVICE, sizeof (struct smbios_table_type17), 0 }, -1, /* handle of physical memory array */ -1, /* handle of memory error data */ 64, /* total width in bits including ecc */ 64, /* data width in bits */ 0, /* size in kb or mb (0x7fff=use extended) */ SMBIOS_MDFF_UNKNOWN, 0, /* set (0x00=none, 0xff=unknown) */ 1, /* device locator string */ 2, /* physical bank locator string */ SMBIOS_MDT_UNKNOWN, SMBIOS_MDF_UNKNOWN, 0, /* maximum memory speed in mhz (0=unknown) */ 3, /* manufacturer string */ 4, /* serial number string */ 5, /* asset tag string */ 6, /* part number string */ 0, /* attributes (0=unknown rank information) */ 0, /* extended size in mb (TBD) */ 0, /* current speed in mhz (0=unknown) */ 0, /* minimum voltage in mv (0=unknown) */ 0, /* maximum voltage in mv (0=unknown) */ 0 /* configured voltage in mv (0=unknown) */ }; const char *smbios_type17_strings[] = { " ", /* device locator string */ " ", /* physical bank locator string */ " ", /* manufacturer string */ "None", /* serial number string */ "None", /* asset tag string */ "None", /* part number string */ NULL }; static int smbios_type17_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size); struct smbios_table_type19 smbios_type19_template = { { SMBIOS_TYPE_MEMARRAYMAP, sizeof (struct smbios_table_type19), 0 }, 0xffffffff, /* starting phys addr in kb (0xffffffff=use ext) */ 0xffffffff, /* ending phys addr in kb (0xffffffff=use ext) */ -1, /* physical memory array handle */ 1, /* number of devices that form a row */ 0, /* extended starting phys addr in bytes (TBD) */ 0 /* extended ending phys addr in bytes (TBD) */ }; static int smbios_type19_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size); struct smbios_table_type32 
smbios_type32_template = { { SMBIOS_TYPE_BOOT, sizeof (struct smbios_table_type32), 0 }, { 0, 0, 0, 0, 0, 0 }, SMBIOS_BOOT_NORMAL }; struct smbios_table_type127 smbios_type127_template = { { SMBIOS_TYPE_EOT, sizeof (struct smbios_table_type127), 0 } }; static int smbios_generic_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size); static struct smbios_template_entry smbios_template[] = { { (struct smbios_structure *)&smbios_type0_template, smbios_type0_strings, smbios_generic_initializer }, { (struct smbios_structure *)&smbios_type1_template, smbios_type1_strings, smbios_type1_initializer }, { (struct smbios_structure *)&smbios_type3_template, smbios_type3_strings, smbios_generic_initializer }, { (struct smbios_structure *)&smbios_type4_template, smbios_type4_strings, smbios_type4_initializer }, { (struct smbios_structure *)&smbios_type16_template, NULL, smbios_type16_initializer }, { (struct smbios_structure *)&smbios_type17_template, smbios_type17_strings, smbios_type17_initializer }, { (struct smbios_structure *)&smbios_type19_template, NULL, smbios_type19_initializer }, { (struct smbios_structure *)&smbios_type32_template, NULL, smbios_generic_initializer }, { (struct smbios_structure *)&smbios_type127_template, NULL, smbios_generic_initializer }, { NULL,NULL, NULL } }; static uint64_t guest_lomem, guest_himem; static uint16_t type16_handle; static int smbios_generic_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size) { struct smbios_structure *entry; memcpy(curaddr, template_entry, template_entry->length); entry = (struct smbios_structure *)curaddr; entry->handle = *n + 1; curaddr += entry->length; if (template_strings != NULL) { int i; for (i = 0; template_strings[i] != NULL; i++) { const char *string; int len; string = template_strings[i]; len = strlen(string) + 1; memcpy(curaddr, string, len); curaddr += len; } *curaddr = '\0'; curaddr++; } else { /* Minimum string section is double nul */ *curaddr = '\0'; curaddr++; *curaddr = '\0'; curaddr++; } (*n)++; *endaddr = curaddr; return (0); } static int smbios_type1_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size) { struct smbios_table_type1 *type1; + const char *guest_uuid_str; smbios_generic_initializer(template_entry, template_strings, curaddr, endaddr, n, size); type1 = (struct smbios_table_type1 *)curaddr; + guest_uuid_str = get_config_value("uuid"); if (guest_uuid_str != NULL) { uuid_t uuid; uint32_t status; uuid_from_string(guest_uuid_str, &uuid, &status); if (status != uuid_s_ok) return (-1); uuid_enc_le(&type1->uuid, &uuid); } else { MD5_CTX mdctx; u_char digest[16]; char hostname[MAXHOSTNAMELEN]; + const char *vmname; /* * Universally unique and yet reproducible are an * oxymoron, however reproducible is desirable in * this case. */ if (gethostname(hostname, sizeof(hostname))) return (-1); MD5Init(&mdctx); + vmname = get_config_value("name"); MD5Update(&mdctx, vmname, strlen(vmname)); MD5Update(&mdctx, hostname, sizeof(hostname)); MD5Final(digest, &mdctx); /* * Set the variant and version number. 
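The bit operations that follow implement the standard RFC 4122 layout: the high nibble of digest byte 6 holds the UUID version (3, name-based MD5) and the top two bits of byte 8 hold the variant (binary 10). A minimal sketch of the same stamping on a raw 16-byte digest:

#include <stdint.h>
#include <stdio.h>

/* Stamp RFC 4122 version/variant bits onto a raw 16-byte MD5 digest. */
static void
uuid_v3_stamp(uint8_t uuid[16])
{
	uuid[6] = (uuid[6] & 0x0F) | 0x30;	/* version 3 (MD5 name-based) */
	uuid[8] = (uuid[8] & 0x3F) | 0x80;	/* variant: RFC 4122 (10xxxxxx) */
}

int
main(void)
{
	uint8_t d[16] = { 0 };

	uuid_v3_stamp(d);
	printf("version nibble: %x, variant bits: %x\n",
	    d[6] >> 4, d[8] >> 6);	/* prints 3 and 2 */
	return (0);
}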
*/ digest[6] &= 0x0F; digest[6] |= 0x30; /* version 3 */ digest[8] &= 0x3F; digest[8] |= 0x80; memcpy(&type1->uuid, digest, sizeof (digest)); } return (0); } static int smbios_type4_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size) { int i; for (i = 0; i < sockets; i++) { struct smbios_table_type4 *type4; char *p; int nstrings, len; smbios_generic_initializer(template_entry, template_strings, curaddr, endaddr, n, size); type4 = (struct smbios_table_type4 *)curaddr; p = curaddr + sizeof (struct smbios_table_type4); nstrings = 0; while (p < *endaddr - 1) { if (*p++ == '\0') nstrings++; } len = sprintf(*endaddr - 1, "CPU #%d", i) + 1; *endaddr += len - 1; *(*endaddr) = '\0'; (*endaddr)++; type4->socket = nstrings + 1; /* Revise cores and threads after update to smbios 3.0 */ if (cores > 254) type4->cores = 0; else type4->cores = cores; /* This is the total number of threads in a socket */ if ((cores * threads) > 254) type4->threads = 0; else type4->threads = (cores * threads); curaddr = *endaddr; } return (0); } static int smbios_type16_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size) { struct smbios_table_type16 *type16; type16_handle = *n; smbios_generic_initializer(template_entry, template_strings, curaddr, endaddr, n, size); type16 = (struct smbios_table_type16 *)curaddr; type16->xsize = guest_lomem + guest_himem; type16->ndevs = guest_himem > 0 ? 2 : 1; return (0); } static int smbios_type17_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size) { struct smbios_table_type17 *type17; uint64_t memsize, size_KB, size_MB; smbios_generic_initializer(template_entry, template_strings, curaddr, endaddr, n, size); type17 = (struct smbios_table_type17 *)curaddr; type17->arrayhand = type16_handle; memsize = guest_lomem + guest_himem; size_KB = memsize / 1024; size_MB = memsize / MB; /* A single Type 17 entry can't represent more than ~2PB RAM */ if (size_MB > 0x7FFFFFFF) { printf("Warning: guest memory too big for SMBIOS Type 17 table: " "%luMB greater than max supported 2147483647MB\n", size_MB); size_MB = 0x7FFFFFFF; } /* See SMBIOS 2.7.0 section 7.18 - Memory Device (Type 17) */ if (size_KB <= 0x7FFF) { /* Can represent up to 32767KB with the top bit set */ type17->size = size_KB | (1 << 15); } else if (size_MB < 0x7FFF) { /* Can represent up to 32766MB with the top bit unset */ type17->size = size_MB & 0x7FFF; } else { type17->size = 0x7FFF; /* * Can represent up to 2147483647MB (~2PB) * The top bit is reserved */ type17->xsize = size_MB & 0x7FFFFFFF; } return (0); } static int smbios_type19_initializer(struct smbios_structure *template_entry, const char **template_strings, char *curaddr, char **endaddr, uint16_t *n, uint16_t *size) { struct smbios_table_type19 *type19; smbios_generic_initializer(template_entry, template_strings, curaddr, endaddr, n, size); type19 = (struct smbios_table_type19 *)curaddr; type19->arrayhand = type16_handle; type19->xsaddr = 0; type19->xeaddr = guest_lomem; if (guest_himem > 0) { curaddr = *endaddr; smbios_generic_initializer(template_entry, template_strings, curaddr, endaddr, n, size); type19 = (struct smbios_table_type19 *)curaddr; type19->arrayhand = type16_handle; type19->xsaddr = 4*GB; type19->xeaddr = type19->xsaddr + guest_himem; } return (0); } static void smbios_ep_initializer(struct 
smbios_entry_point *smbios_ep, uint32_t staddr) { memset(smbios_ep, 0, sizeof(*smbios_ep)); memcpy(smbios_ep->eanchor, SMBIOS_ENTRY_EANCHOR, SMBIOS_ENTRY_EANCHORLEN); smbios_ep->eplen = 0x1F; assert(sizeof (struct smbios_entry_point) == smbios_ep->eplen); smbios_ep->major = 2; smbios_ep->minor = 6; smbios_ep->revision = 0; memcpy(smbios_ep->ianchor, SMBIOS_ENTRY_IANCHOR, SMBIOS_ENTRY_IANCHORLEN); smbios_ep->staddr = staddr; smbios_ep->bcdrev = (smbios_ep->major & 0xf) << 4 | (smbios_ep->minor & 0xf); } static void smbios_ep_finalizer(struct smbios_entry_point *smbios_ep, uint16_t len, uint16_t num, uint16_t maxssize) { uint8_t checksum; int i; smbios_ep->maxssize = maxssize; smbios_ep->stlen = len; smbios_ep->stnum = num; checksum = 0; for (i = 0x10; i < 0x1f; i++) { checksum -= ((uint8_t *)smbios_ep)[i]; } smbios_ep->ichecksum = checksum; checksum = 0; for (i = 0; i < 0x1f; i++) { checksum -= ((uint8_t *)smbios_ep)[i]; } smbios_ep->echecksum = checksum; } int smbios_build(struct vmctx *ctx) { struct smbios_entry_point *smbios_ep; uint16_t n; uint16_t maxssize; char *curaddr, *startaddr, *ststartaddr; int i; int err; guest_lomem = vm_get_lowmem_size(ctx); guest_himem = vm_get_highmem_size(ctx); startaddr = paddr_guest2host(ctx, SMBIOS_BASE, SMBIOS_MAX_LENGTH); if (startaddr == NULL) { EPRINTLN("smbios table requires mapped mem"); return (ENOMEM); } curaddr = startaddr; smbios_ep = (struct smbios_entry_point *)curaddr; smbios_ep_initializer(smbios_ep, SMBIOS_BASE + sizeof(struct smbios_entry_point)); curaddr += sizeof(struct smbios_entry_point); ststartaddr = curaddr; n = 0; maxssize = 0; for (i = 0; smbios_template[i].entry != NULL; i++) { struct smbios_structure *entry; const char **strings; initializer_func_t initializer; char *endaddr; uint16_t size; entry = smbios_template[i].entry; strings = smbios_template[i].strings; initializer = smbios_template[i].initializer; err = (*initializer)(entry, strings, curaddr, &endaddr, &n, &size); if (err != 0) return (err); if (size > maxssize) maxssize = size; curaddr = endaddr; } assert(curaddr - startaddr < SMBIOS_MAX_LENGTH); smbios_ep_finalizer(smbios_ep, curaddr - ststartaddr, n, maxssize); return (0); } diff --git a/usr.sbin/bhyve/uart_emul.c b/usr.sbin/bhyve/uart_emul.c index 1b55231b2bf5..fd0ad2c846f4 100644 --- a/usr.sbin/bhyve/uart_emul.c +++ b/usr.sbin/bhyve/uart_emul.c @@ -1,761 +1,761 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
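Two encoding details of the table build above, pulled out into a standalone sketch: the Type 17 size field (KB granularity with bit 15 set up to 32767 KB, MB granularity below 32767 MB, and the extended dword beyond that, per SMBIOS 2.7 section 7.18), and the entry-point checksums, which smbios_ep_finalizer picks so that the covered bytes sum to zero modulo 256:

#include <stdint.h>
#include <stdio.h>

/* Encode a memory size per SMBIOS 2.7 Type 17: returns the 16-bit
 * size field; *xsize gets the extended MB count when size == 0x7FFF. */
static uint16_t
type17_size(uint64_t bytes, uint32_t *xsize)
{
	uint64_t kb = bytes / 1024, mb = bytes / (1024 * 1024);

	*xsize = 0;
	if (kb <= 0x7FFF)
		return (kb | (1 << 15));	/* granularity: KB */
	if (mb < 0x7FFF)
		return (mb);			/* granularity: MB */
	*xsize = mb & 0x7FFFFFFF;		/* spill into extended size */
	return (0x7FFF);
}

/* SMBIOS checksums make the covered bytes sum to zero (mod 256). */
static uint8_t
smbios_cksum(const uint8_t *p, int len)
{
	uint8_t sum = 0;

	for (int i = 0; i < len; i++)
		sum -= p[i];
	return (sum);	/* store this byte; then the total sum is 0 */
}

int
main(void)
{
	uint32_t xsize;
	uint16_t s = type17_size(8ULL * 1024 * 1024 * 1024, &xsize);

	printf("8GB -> size=%#x xsize=%u\n", s, xsize);	/* 0x2000, 0 */
	return (0);
}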
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #ifndef WITHOUT_CAPSICUM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mevent.h" #include "uart_emul.h" #include "debug.h" #define COM1_BASE 0x3F8 #define COM1_IRQ 4 #define COM2_BASE 0x2F8 #define COM2_IRQ 3 #define COM3_BASE 0x3E8 #define COM3_IRQ 4 #define COM4_BASE 0x2E8 #define COM4_IRQ 3 #define DEFAULT_RCLK 1843200 #define DEFAULT_BAUD 9600 #define FCR_RX_MASK 0xC0 #define MCR_OUT1 0x04 #define MCR_OUT2 0x08 #define MSR_DELTA_MASK 0x0f #ifndef REG_SCR #define REG_SCR com_scr #endif #define FIFOSZ 16 static bool uart_stdio; /* stdio in use for i/o */ static struct termios tio_stdio_orig; static struct { int baseaddr; int irq; bool inuse; } uart_lres[] = { { COM1_BASE, COM1_IRQ, false}, { COM2_BASE, COM2_IRQ, false}, { COM3_BASE, COM3_IRQ, false}, { COM4_BASE, COM4_IRQ, false}, }; #define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0])) struct fifo { uint8_t buf[FIFOSZ]; int rindex; /* index to read from */ int windex; /* index to write to */ int num; /* number of characters in the fifo */ int size; /* size of the fifo */ }; struct ttyfd { bool opened; int rfd; /* fd for reading */ int wfd; /* fd for writing, may be == rfd */ }; struct uart_softc { pthread_mutex_t mtx; /* protects all softc elements */ uint8_t data; /* Data register (R/W) */ uint8_t ier; /* Interrupt enable register (R/W) */ uint8_t lcr; /* Line control register (R/W) */ uint8_t mcr; /* Modem control register (R/W) */ uint8_t lsr; /* Line status register (R/W) */ uint8_t msr; /* Modem status register (R/W) */ uint8_t fcr; /* FIFO control register (W) */ uint8_t scr; /* Scratch register (R/W) */ uint8_t dll; /* Baudrate divisor latch LSB */ uint8_t dlh; /* Baudrate divisor latch MSB */ struct fifo rxfifo; struct mevent *mev; struct ttyfd tty; bool thre_int_pending; /* THRE interrupt pending */ void *arg; uart_intr_func_t intr_assert; uart_intr_func_t intr_deassert; }; static void uart_drain(int fd, enum ev_type ev, void *arg); static void ttyclose(void) { tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); } static void ttyopen(struct ttyfd *tf) { struct termios orig, new; tcgetattr(tf->rfd, &orig); new = orig; cfmakeraw(&new); new.c_cflag |= CLOCAL; tcsetattr(tf->rfd, TCSANOW, &new); if (uart_stdio) { tio_stdio_orig = orig; atexit(ttyclose); } raw_stdio = 1; } static int ttyread(struct ttyfd *tf) { unsigned char rb; if (read(tf->rfd, &rb, 1) == 1) return (rb); else return (-1); } static void ttywrite(struct ttyfd *tf, unsigned char wb) { (void)write(tf->wfd, &wb, 1); } static void rxfifo_reset(struct uart_softc *sc, int size) { char flushbuf[32]; struct fifo *fifo; ssize_t nread; int error; fifo = &sc->rxfifo; bzero(fifo, sizeof(struct fifo)); fifo->size = size; if (sc->tty.opened) { /* * Flush any unread input from the tty buffer. 
*/ while (1) { nread = read(sc->tty.rfd, flushbuf, sizeof(flushbuf)); if (nread != sizeof(flushbuf)) break; } /* * Enable mevent to trigger when new characters are available * on the tty fd. */ error = mevent_enable(sc->mev); assert(error == 0); } } static int rxfifo_available(struct uart_softc *sc) { struct fifo *fifo; fifo = &sc->rxfifo; return (fifo->num < fifo->size); } static int rxfifo_putchar(struct uart_softc *sc, uint8_t ch) { struct fifo *fifo; int error; fifo = &sc->rxfifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = ch; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; if (!rxfifo_available(sc)) { if (sc->tty.opened) { /* * Disable mevent callback if the FIFO is full. */ error = mevent_disable(sc->mev); assert(error == 0); } } return (0); } else return (-1); } static int rxfifo_getchar(struct uart_softc *sc) { struct fifo *fifo; int c, error, wasfull; wasfull = 0; fifo = &sc->rxfifo; if (fifo->num > 0) { if (!rxfifo_available(sc)) wasfull = 1; c = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; if (wasfull) { if (sc->tty.opened) { error = mevent_enable(sc->mev); assert(error == 0); } } return (c); } else return (-1); } static int rxfifo_numchars(struct uart_softc *sc) { struct fifo *fifo = &sc->rxfifo; return (fifo->num); } static void uart_opentty(struct uart_softc *sc) { ttyopen(&sc->tty); sc->mev = mevent_add(sc->tty.rfd, EVF_READ, uart_drain, sc); assert(sc->mev != NULL); } static uint8_t modem_status(uint8_t mcr) { uint8_t msr; if (mcr & MCR_LOOPBACK) { /* * In the loopback mode certain bits from the MCR are * reflected back into MSR. */ msr = 0; if (mcr & MCR_RTS) msr |= MSR_CTS; if (mcr & MCR_DTR) msr |= MSR_DSR; if (mcr & MCR_OUT1) msr |= MSR_RI; if (mcr & MCR_OUT2) msr |= MSR_DCD; } else { /* * Always assert DCD and DSR so tty open doesn't block * even if CLOCAL is turned off. */ msr = MSR_DCD | MSR_DSR; } assert((msr & MSR_DELTA_MASK) == 0); return (msr); } /* * The IIR returns a prioritized interrupt reason: * - receive data available * - transmit holding register empty * - modem status change * * Return an interrupt reason if one is available. */ static int uart_intr_reason(struct uart_softc *sc) { if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) return (IIR_RLS); else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0) return (IIR_RXTOUT); else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) return (IIR_TXRDY); else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0) return (IIR_MLSC); else return (IIR_NOPEND); } static void uart_reset(struct uart_softc *sc) { uint16_t divisor; divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; sc->dll = divisor; sc->dlh = divisor >> 8; /* MSB of the 16-bit divisor */ sc->msr = modem_status(sc->mcr); rxfifo_reset(sc, 1); /* no fifo until enabled by software */ }
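uart_reset derives its power-on divisor from the 16550 relation baud = rclk / (16 * divisor) and splits the 16-bit result across the two 8-bit latch registers. A small sketch of that arithmetic with the default clock and baud rate:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define RCLK	1843200		/* standard 16550 input clock, Hz */

int
main(void)
{
	uint16_t divisor = RCLK / 9600 / 16;	/* baud = rclk/(16*divisor) */
	uint8_t dll = divisor & 0xff;		/* latch LSB */
	uint8_t dlh = divisor >> 8;		/* latch MSB */

	assert(divisor == 12);
	printf("dll=%u dlh=%u -> %u baud\n", dll, dlh,
	    RCLK / 16 / ((dlh << 8) | dll));	/* 9600 */
	return (0);
}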
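modem_status and the REG_MCR write path below together maintain the four MSR delta bits: each delta bit is set when the corresponding status bit differs between the old and freshly computed MSR, except TERI, which fires only on a falling edge of RI. A condensed sketch of that comparison (bit values as defined in sys/dev/ic/ns16550.h):

#include <stdint.h>
#include <stdio.h>

#define MSR_DCTS 0x01	/* delta CTS */
#define MSR_DDSR 0x02	/* delta DSR */
#define MSR_TERI 0x04	/* trailing edge of RI */
#define MSR_DDCD 0x08	/* delta DCD */
#define MSR_CTS	 0x10
#define MSR_DSR	 0x20
#define MSR_RI	 0x40
#define MSR_DCD	 0x80

/* Return delta bits to OR into MSR, given old and new status bits. */
static uint8_t
msr_delta(uint8_t old, uint8_t new)
{
	uint8_t delta = 0;

	if ((old ^ new) & MSR_CTS)
		delta |= MSR_DCTS;
	if ((old ^ new) & MSR_DSR)
		delta |= MSR_DDSR;
	if ((old ^ new) & MSR_DCD)
		delta |= MSR_DDCD;
	if ((old & MSR_RI) != 0 && (new & MSR_RI) == 0)
		delta |= MSR_TERI;	/* falling edge only */
	return (delta);
}

int
main(void)
{
	/* CTS rises, RI falls: expect DCTS|TERI = 0x5. */
	printf("%#x\n", msr_delta(MSR_RI, MSR_CTS));
	return (0);
}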
/* * Toggle the COM port's intr pin depending on whether or not we have an * interrupt condition to report to the processor. */ static void uart_toggle_intr(struct uart_softc *sc) { uint8_t intr_reason; intr_reason = uart_intr_reason(sc); if (intr_reason == IIR_NOPEND) (*sc->intr_deassert)(sc->arg); else (*sc->intr_assert)(sc->arg); } static void uart_drain(int fd, enum ev_type ev, void *arg) { struct uart_softc *sc; int ch; sc = arg; assert(fd == sc->tty.rfd); assert(ev == EVF_READ); /* * This routine is called in the context of the mevent thread, * so take the softc lock to protect against concurrent * access from a vCPU i/o exit. */ pthread_mutex_lock(&sc->mtx); if ((sc->mcr & MCR_LOOPBACK) != 0) { (void) ttyread(&sc->tty); } else { while (rxfifo_available(sc) && ((ch = ttyread(&sc->tty)) != -1)) { rxfifo_putchar(sc, ch); } uart_toggle_intr(sc); } pthread_mutex_unlock(&sc->mtx); } void uart_write(struct uart_softc *sc, int offset, uint8_t value) { int fifosz; uint8_t msr; pthread_mutex_lock(&sc->mtx); /* * Take care of the special case DLAB accesses first */ if ((sc->lcr & LCR_DLAB) != 0) { if (offset == REG_DLL) { sc->dll = value; goto done; } if (offset == REG_DLH) { sc->dlh = value; goto done; } } switch (offset) { case REG_DATA: if (sc->mcr & MCR_LOOPBACK) { if (rxfifo_putchar(sc, value) != 0) sc->lsr |= LSR_OE; } else if (sc->tty.opened) { ttywrite(&sc->tty, value); } /* else drop on floor */ sc->thre_int_pending = true; break; case REG_IER: /* Set pending when IER_ETXRDY is raised (edge-triggered). */ if ((sc->ier & IER_ETXRDY) == 0 && (value & IER_ETXRDY) != 0) sc->thre_int_pending = true; /* * Apply mask so that bits 4-7 are 0. * Also enable bits 0-3 only if they're 1. */ sc->ier = value & 0x0F; break; case REG_FCR: /* * When switching between FIFO and 16450 modes, * the FIFO contents are reset. */ if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; rxfifo_reset(sc, fifosz); } /* * The FCR_ENABLE bit must be '1' for the programming * of other FCR bits to be effective. */ if ((value & FCR_ENABLE) == 0) { sc->fcr = 0; } else { if ((value & FCR_RCV_RST) != 0) rxfifo_reset(sc, FIFOSZ); sc->fcr = value & (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); } break; case REG_LCR: sc->lcr = value; break; case REG_MCR: /* Apply mask so that bits 5-7 are 0 */ sc->mcr = value & 0x1F; msr = modem_status(sc->mcr); /* * Detect if there has been any change between the * previous and the new value of MSR. If there is * then assert the appropriate MSR delta bit. */ if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) sc->msr |= MSR_DCTS; if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) sc->msr |= MSR_DDSR; if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) sc->msr |= MSR_DDCD; if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) sc->msr |= MSR_TERI; /* * Update the value of MSR while retaining the delta * bits. */ sc->msr &= MSR_DELTA_MASK; sc->msr |= msr; break; case REG_LSR: /* * Line status register is not meant to be written to * during normal operation. */ break; case REG_MSR: /* * As far as I can tell MSR is a read-only register. 
*/ break; case REG_SCR: sc->scr = value; break; default: break; } done: uart_toggle_intr(sc); pthread_mutex_unlock(&sc->mtx); } uint8_t uart_read(struct uart_softc *sc, int offset) { uint8_t iir, intr_reason, reg; pthread_mutex_lock(&sc->mtx); /* * Take care of the special case DLAB accesses first */ if ((sc->lcr & LCR_DLAB) != 0) { if (offset == REG_DLL) { reg = sc->dll; goto done; } if (offset == REG_DLH) { reg = sc->dlh; goto done; } } switch (offset) { case REG_DATA: reg = rxfifo_getchar(sc); break; case REG_IER: reg = sc->ier; break; case REG_IIR: iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0; intr_reason = uart_intr_reason(sc); /* * Deal with side effects of reading the IIR register */ if (intr_reason == IIR_TXRDY) sc->thre_int_pending = false; iir |= intr_reason; reg = iir; break; case REG_LCR: reg = sc->lcr; break; case REG_MCR: reg = sc->mcr; break; case REG_LSR: /* Transmitter is always ready for more data */ sc->lsr |= LSR_TEMT | LSR_THRE; /* Check for new receive data */ if (rxfifo_numchars(sc) > 0) sc->lsr |= LSR_RXRDY; else sc->lsr &= ~LSR_RXRDY; reg = sc->lsr; /* The LSR_OE bit is cleared on LSR read */ sc->lsr &= ~LSR_OE; break; case REG_MSR: /* * MSR delta bits are cleared on read */ reg = sc->msr; sc->msr &= ~MSR_DELTA_MASK; break; case REG_SCR: reg = sc->scr; break; default: reg = 0xFF; break; } done: uart_toggle_intr(sc); pthread_mutex_unlock(&sc->mtx); return (reg); } int uart_legacy_alloc(int which, int *baseaddr, int *irq) { if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse) return (-1); uart_lres[which].inuse = true; *baseaddr = uart_lres[which].baseaddr; *irq = uart_lres[which].irq; return (0); } struct uart_softc * uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, void *arg) { struct uart_softc *sc; sc = calloc(1, sizeof(struct uart_softc)); sc->arg = arg; sc->intr_assert = intr_assert; sc->intr_deassert = intr_deassert; pthread_mutex_init(&sc->mtx, NULL); uart_reset(sc); return (sc); } static int uart_stdio_backend(struct uart_softc *sc) { #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif if (uart_stdio) return (-1); sc->tty.rfd = STDIN_FILENO; sc->tty.wfd = STDOUT_FILENO; sc->tty.opened = true; if (fcntl(sc->tty.rfd, F_SETFL, O_NONBLOCK) != 0) return (-1); if (fcntl(sc->tty.wfd, F_SETFL, O_NONBLOCK) != 0) return (-1); #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ); if (caph_rights_limit(sc->tty.rfd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(sc->tty.rfd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif uart_stdio = true; return (0); } static int -uart_tty_backend(struct uart_softc *sc, const char *opts) +uart_tty_backend(struct uart_softc *sc, const char *path) { #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif int fd; - fd = open(opts, O_RDWR | O_NONBLOCK); + fd = open(path, O_RDWR | O_NONBLOCK); if (fd < 0) return (-1); if (!isatty(fd)) { close(fd); return (-1); } sc->tty.rfd = sc->tty.wfd = fd; sc->tty.opened = true; #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif return (0); } int -uart_set_backend(struct uart_softc *sc, const 
char *opts) +uart_set_backend(struct uart_softc *sc, const char *device) { int retval; - if (opts == NULL) + if (device == NULL) return (0); - if (strcmp("stdio", opts) == 0) + if (strcmp("stdio", device) == 0) retval = uart_stdio_backend(sc); else - retval = uart_tty_backend(sc, opts); + retval = uart_tty_backend(sc, device); if (retval == 0) uart_opentty(sc); return (retval); } #ifdef BHYVE_SNAPSHOT int uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_VAR_OR_LEAVE(sc->data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ier, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->lcr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->mcr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->lsr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->msr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->fcr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->scr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->dll, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->dlh, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.rindex, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.windex, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.num, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.size, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->rxfifo.buf, sizeof(sc->rxfifo.buf), meta, ret, done); sc->thre_int_pending = 1; done: return (ret); } #endif diff --git a/usr.sbin/bhyve/uart_emul.h b/usr.sbin/bhyve/uart_emul.h index 5a53294da89e..a66701c18580 100644 --- a/usr.sbin/bhyve/uart_emul.h +++ b/usr.sbin/bhyve/uart_emul.h @@ -1,50 +1,50 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _UART_EMUL_H_ #define _UART_EMUL_H_ #define UART_IO_BAR_SIZE 8 struct uart_softc; struct vm_snapshot_meta; typedef void (*uart_intr_func_t)(void *arg); struct uart_softc *uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, void *arg); int uart_legacy_alloc(int unit, int *ioaddr, int *irq); uint8_t uart_read(struct uart_softc *sc, int offset); void uart_write(struct uart_softc *sc, int offset, uint8_t value); -int uart_set_backend(struct uart_softc *sc, const char *opt); +int uart_set_backend(struct uart_softc *sc, const char *device); #ifdef BHYVE_SNAPSHOT int uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta); #endif #endif diff --git a/usr.sbin/bhyve/usb_emul.c b/usr.sbin/bhyve/usb_emul.c index 6ecdd9530eb1..d97d7b38d2f8 100644 --- a/usr.sbin/bhyve/usb_emul.c +++ b/usr.sbin/bhyve/usb_emul.c @@ -1,78 +1,78 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Nahanni Systems Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "usb_emul.h" SET_DECLARE(usb_emu_set, struct usb_devemu); struct usb_devemu * -usb_emu_finddev(char *name) +usb_emu_finddev(const char *name) { struct usb_devemu **udpp, *udp; SET_FOREACH(udpp, usb_emu_set) { udp = *udpp; if (!strcmp(udp->ue_emu, name)) return (udp); } return (NULL); } struct usb_data_xfer_block * usb_data_xfer_append(struct usb_data_xfer *xfer, void *buf, int blen, void *hci_data, int ccs) { struct usb_data_xfer_block *xb; if (xfer->ndata >= USB_MAX_XFER_BLOCKS) return (NULL); xb = &xfer->data[xfer->tail]; xb->buf = buf; xb->blen = blen; xb->hci_data = hci_data; xb->ccs = ccs; xb->processed = 0; xb->bdone = 0; xfer->ndata++; xfer->tail = (xfer->tail + 1) % USB_MAX_XFER_BLOCKS; return (xb); } diff --git a/usr.sbin/bhyve/usb_emul.h b/usr.sbin/bhyve/usb_emul.h index d6e1e616cd82..67f4f3ff08f1 100644 --- a/usr.sbin/bhyve/usb_emul.h +++ b/usr.sbin/bhyve/usb_emul.h @@ -1,158 +1,159 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _USB_EMUL_H_ #define _USB_EMUL_H_ +#include <sys/nv.h> #include #include #include #define USB_MAX_XFER_BLOCKS 8 #define USB_XFER_OUT 0 #define USB_XFER_IN 1 struct usb_hci; struct usb_device_request; struct usb_data_xfer; struct vm_snapshot_meta; /* Device emulation handlers */ struct usb_devemu { char *ue_emu; /* name of device emulation */ int ue_usbver; /* usb version: 2 or 3 */ int ue_usbspeed; /* usb device speed */ /* instance creation */ - void *(*ue_init)(struct usb_hci *hci, char *opt); + void *(*ue_init)(struct usb_hci *hci, nvlist_t *nvl); /* handlers */ int (*ue_request)(void *sc, struct usb_data_xfer *xfer); int (*ue_data)(void *sc, struct usb_data_xfer *xfer, int dir, int epctx); int (*ue_reset)(void *sc); int (*ue_remove)(void *sc); int (*ue_stop)(void *sc); int (*ue_snapshot)(void *scarg, struct vm_snapshot_meta *meta); }; #define USB_EMUL_SET(x) DATA_SET(usb_emu_set, x); /* * USB device events to notify HCI when state changes */ enum hci_usbev { USBDEV_ATTACH, USBDEV_RESET, USBDEV_STOP, USBDEV_REMOVE, }; /* usb controller, i.e. xhci, ehci */ struct usb_hci { int (*hci_intr)(struct usb_hci *hci, int epctx); int (*hci_event)(struct usb_hci *hci, enum hci_usbev evid, void *param); void *hci_sc; /* private softc for hci */ /* controller managed fields */ int hci_address; int hci_port; }; /* * Each xfer block is mapped to the hci transfer block. * On input into the device handler, blen is set to the length of buf. * The device handler is to update blen to reflect the residual size * of the buffer, i.e. len(buf) - len(consumed).
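* For example, umouse_data_handler copies the 6 byte mouse report into * the buffer, advances bdone by 6 and sets blen = len - 6; any non-zero * residual blen is then reported as USB_ERR_SHORT_XFER.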
*/ struct usb_data_xfer_block { void *buf; /* IN or OUT pointer */ int blen; /* in:len(buf), out:len(remaining) */ int bdone; /* bytes transferred */ uint32_t processed; /* device processed this + errcode */ void *hci_data; /* HCI private reference */ int ccs; uint32_t streamid; uint64_t trbnext; /* next TRB guest address */ }; struct usb_data_xfer { struct usb_data_xfer_block data[USB_MAX_XFER_BLOCKS]; struct usb_device_request *ureq; /* setup ctl request */ int ndata; /* # of data items */ int head; int tail; pthread_mutex_t mtx; }; enum USB_ERRCODE { USB_ACK, USB_NAK, USB_STALL, USB_NYET, USB_ERR, USB_SHORT }; #define USB_DATA_GET_ERRCODE(x) (x)->processed >> 8 #define USB_DATA_SET_ERRCODE(x,e) do { \ (x)->processed = ((x)->processed & 0xFF) | (e << 8); \ } while (0) #define USB_DATA_OK(x,i) ((x)->data[(i)].buf != NULL) #define USB_DATA_XFER_INIT(x) do { \ memset((x), 0, sizeof(*(x))); \ pthread_mutex_init(&((x)->mtx), NULL); \ } while (0) #define USB_DATA_XFER_RESET(x) do { \ memset((x)->data, 0, sizeof((x)->data)); \ (x)->ndata = 0; \ (x)->head = (x)->tail = 0; \ } while (0) #define USB_DATA_XFER_LOCK(x) do { \ pthread_mutex_lock(&((x)->mtx)); \ } while (0) #define USB_DATA_XFER_UNLOCK(x) do { \ pthread_mutex_unlock(&((x)->mtx)); \ } while (0) -struct usb_devemu *usb_emu_finddev(char *name); +struct usb_devemu *usb_emu_finddev(const char *name); struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer, void *buf, int blen, void *hci_data, int ccs); #endif /* _USB_EMUL_H_ */ diff --git a/usr.sbin/bhyve/usb_mouse.c b/usr.sbin/bhyve/usb_mouse.c index 3b7fc29ff249..eda3878ff5fb 100644 --- a/usr.sbin/bhyve/usb_mouse.c +++ b/usr.sbin/bhyve/usb_mouse.c @@ -1,831 +1,828 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "usb_emul.h" #include "console.h" #include "bhyvegc.h" #include "debug.h" static int umouse_debug = 0; #define DPRINTF(params) if (umouse_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* USB endpoint context (1-15) for reporting mouse data events*/ #define UMOUSE_INTR_ENDPT 1 #define UMOUSE_REPORT_DESC_TYPE 0x22 #define UMOUSE_GET_REPORT 0x01 #define UMOUSE_GET_IDLE 0x02 #define UMOUSE_GET_PROTOCOL 0x03 #define UMOUSE_SET_REPORT 0x09 #define UMOUSE_SET_IDLE 0x0A #define UMOUSE_SET_PROTOCOL 0x0B #define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } enum { UMSTR_LANG, UMSTR_MANUFACTURER, UMSTR_PRODUCT, UMSTR_SERIAL, UMSTR_CONFIG, UMSTR_MAX }; static const char *umouse_desc_strings[] = { "\x09\x04", "BHYVE", "HID Tablet", "01", "HID Tablet Device", }; struct umouse_hid_descriptor { uint8_t bLength; uint8_t bDescriptorType; uint8_t bcdHID[2]; uint8_t bCountryCode; uint8_t bNumDescriptors; uint8_t bReportDescriptorType; uint8_t wItemLength[2]; } __packed; struct umouse_config_desc { struct usb_config_descriptor confd; struct usb_interface_descriptor ifcd; struct umouse_hid_descriptor hidd; struct usb_endpoint_descriptor endpd; struct usb_endpoint_ss_comp_descriptor sscompd; } __packed; #define MOUSE_MAX_X 0x8000 #define MOUSE_MAX_Y 0x8000 static const uint8_t umouse_report_desc[] = { 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ 0x09, 0x02, /* USAGE (Mouse) */ 0xa1, 0x01, /* COLLECTION (Application) */ 0x09, 0x01, /* USAGE (Pointer) */ 0xa1, 0x00, /* COLLECTION (Physical) */ 0x05, 0x09, /* USAGE_PAGE (Button) */ 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */ 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ 0x75, 0x01, /* REPORT_SIZE (1) */ 0x95, 0x03, /* REPORT_COUNT (3) */ 0x81, 0x02, /* INPUT (Data,Var,Abs); 3 buttons */ 0x75, 0x05, /* REPORT_SIZE (5) */ 0x95, 0x01, /* REPORT_COUNT (1) */ 0x81, 0x03, /* INPUT (Cnst,Var,Abs); padding */ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ 0x09, 0x30, /* USAGE (X) */ 0x09, 0x31, /* USAGE (Y) */ 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ 0x46, 0xff, 0x7f, /* PHYSICAL_MAXIMUM (0x7fff) */ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ 0x26, 0xff, 0x7f, /* LOGICAL_MAXIMUM (0x7fff) */ 0x75, 0x10, /* REPORT_SIZE (16) */ 0x95, 0x02, /* REPORT_COUNT (2) */ 0x81, 0x02, /* INPUT (Data,Var,Abs) */ 0x05, 0x01, /* USAGE Page (Generic Desktop) */ 0x09, 0x38, /* USAGE (Wheel) */ 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ 0x45, 0x00, /* PHYSICAL_MAXIMUM (0) */ 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ 0x75, 0x08, /* REPORT_SIZE (8) */ 0x95, 0x01, /* REPORT_COUNT (1) */ 0x81, 0x06, /* INPUT (Data,Var,Rel) */ 0xc0, /* END_COLLECTION */ 0xc0 /* END_COLLECTION */ }; struct umouse_report { uint8_t buttons; /* bits: 0 left, 1 right, 2 middle */ int16_t x; /* x position */ int16_t y; /* y position */ int8_t z; /* z wheel position */ } __packed; #define MSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } static struct usb_device_descriptor umouse_dev_desc = { .bLength = sizeof(umouse_dev_desc), .bDescriptorType = UDESC_DEVICE, MSETW(.bcdUSB, UD_USB_3_0), .bMaxPacketSize = 8, /* max packet size */ MSETW(.idVendor, 0xFB5D), /* vendor */ MSETW(.idProduct, 0x0001), /* product */ MSETW(.bcdDevice, 0), /* device version */ .iManufacturer = UMSTR_MANUFACTURER, .iProduct = UMSTR_PRODUCT, .iSerialNumber = UMSTR_SERIAL, 
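/* MSETW and HSETW store a 16-bit value as two bytes, low byte first, matching USB's little-endian wire format. */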
.bNumConfigurations = 1, }; static struct umouse_config_desc umouse_confd = { .confd = { .bLength = sizeof(umouse_confd.confd), .bDescriptorType = UDESC_CONFIG, .wTotalLength[0] = sizeof(umouse_confd), .bNumInterface = 1, .bConfigurationValue = 1, .iConfiguration = UMSTR_CONFIG, .bmAttributes = UC_BUS_POWERED | UC_REMOTE_WAKEUP, .bMaxPower = 0, }, .ifcd = { .bLength = sizeof(umouse_confd.ifcd), .bDescriptorType = UDESC_INTERFACE, .bNumEndpoints = 1, .bInterfaceClass = UICLASS_HID, .bInterfaceSubClass = UISUBCLASS_BOOT, .bInterfaceProtocol = UIPROTO_MOUSE, }, .hidd = { .bLength = sizeof(umouse_confd.hidd), .bDescriptorType = 0x21, .bcdHID = { 0x01, 0x10 }, .bCountryCode = 0, .bNumDescriptors = 1, .bReportDescriptorType = UMOUSE_REPORT_DESC_TYPE, .wItemLength = { sizeof(umouse_report_desc), 0 }, }, .endpd = { .bLength = sizeof(umouse_confd.endpd), .bDescriptorType = UDESC_ENDPOINT, .bEndpointAddress = UE_DIR_IN | UMOUSE_INTR_ENDPT, .bmAttributes = UE_INTERRUPT, .wMaxPacketSize[0] = 8, .bInterval = 0xA, }, .sscompd = { .bLength = sizeof(umouse_confd.sscompd), .bDescriptorType = UDESC_ENDPOINT_SS_COMP, .bMaxBurst = 0, .bmAttributes = 0, MSETW(.wBytesPerInterval, 0), }, }; struct umouse_bos_desc { struct usb_bos_descriptor bosd; struct usb_devcap_ss_descriptor usbssd; } __packed; struct umouse_bos_desc umouse_bosd = { .bosd = { .bLength = sizeof(umouse_bosd.bosd), .bDescriptorType = UDESC_BOS, HSETW(.wTotalLength, sizeof(umouse_bosd)), .bNumDeviceCaps = 1, }, .usbssd = { .bLength = sizeof(umouse_bosd.usbssd), .bDescriptorType = UDESC_DEVICE_CAPABILITY, .bDevCapabilityType = 3, .bmAttributes = 0, HSETW(.wSpeedsSupported, 0x08), .bFunctionalitySupport = 3, .bU1DevExitLat = 0xa, /* dummy - not used */ .wU2DevExitLat = { 0x20, 0x00 }, } }; struct umouse_softc { struct usb_hci *hci; - char *opt; - struct umouse_report um_report; int newdata; struct { uint8_t idle; uint8_t protocol; uint8_t feature; } hid; pthread_mutex_t mtx; pthread_mutex_t ev_mtx; int polling; struct timeval prev_evt; }; static void umouse_event(uint8_t button, int x, int y, void *arg) { struct umouse_softc *sc; struct bhyvegc_image *gc; gc = console_get_image(); if (gc == NULL) { /* not ready */ return; } sc = arg; pthread_mutex_lock(&sc->mtx); sc->um_report.buttons = 0; sc->um_report.z = 0; if (button & 0x01) sc->um_report.buttons |= 0x01; /* left */ if (button & 0x02) sc->um_report.buttons |= 0x04; /* middle */ if (button & 0x04) sc->um_report.buttons |= 0x02; /* right */ if (button & 0x8) sc->um_report.z = 1; if (button & 0x10) sc->um_report.z = -1; /* scale coords to mouse resolution */ sc->um_report.x = MOUSE_MAX_X * x / gc->width; sc->um_report.y = MOUSE_MAX_Y * y / gc->height; sc->newdata = 1; pthread_mutex_unlock(&sc->mtx); pthread_mutex_lock(&sc->ev_mtx); sc->hci->hci_intr(sc->hci, UE_DIR_IN | UMOUSE_INTR_ENDPT); pthread_mutex_unlock(&sc->ev_mtx); } static void * -umouse_init(struct usb_hci *hci, char *opt) +umouse_init(struct usb_hci *hci, nvlist_t *nvl) { struct umouse_softc *sc; sc = calloc(1, sizeof(struct umouse_softc)); sc->hci = hci; sc->hid.protocol = 1; /* REPORT protocol */ - sc->opt = strdup(opt); pthread_mutex_init(&sc->mtx, NULL); pthread_mutex_init(&sc->ev_mtx, NULL); console_ptr_register(umouse_event, sc, 10); return (sc); } #define UREQ(x,y) ((x) | ((y) << 8)) static int umouse_request(void *scarg, struct usb_data_xfer *xfer) { struct umouse_softc *sc; struct usb_data_xfer_block *data; const char *str; uint16_t value; uint16_t index; uint16_t len; uint16_t slen; uint8_t *udata; int err; int i, idx; int 
eshort; sc = scarg; data = NULL; udata = NULL; idx = xfer->head; for (i = 0; i < xfer->ndata; i++) { xfer->data[idx].bdone = 0; if (data == NULL && USB_DATA_OK(xfer,i)) { data = &xfer->data[idx]; udata = data->buf; } xfer->data[idx].processed = 1; idx = (idx + 1) % USB_MAX_XFER_BLOCKS; } err = USB_ERR_NORMAL_COMPLETION; eshort = 0; if (!xfer->ureq) { DPRINTF(("umouse_request: port %d", sc->hci->hci_port)); goto done; } value = UGETW(xfer->ureq->wValue); index = UGETW(xfer->ureq->wIndex); len = UGETW(xfer->ureq->wLength); DPRINTF(("umouse_request: port %d, type 0x%x, req 0x%x, val 0x%x, " "idx 0x%x, len %u", sc->hci->hci_port, xfer->ureq->bmRequestType, xfer->ureq->bRequest, value, index, len)); switch (UREQ(xfer->ureq->bRequest, xfer->ureq->bmRequestType)) { case UREQ(UR_GET_CONFIG, UT_READ_DEVICE): DPRINTF(("umouse: (UR_GET_CONFIG, UT_READ_DEVICE)")); if (!data) break; *udata = umouse_confd.confd.bConfigurationValue; data->blen = len > 0 ? len - 1 : 0; eshort = data->blen > 0; data->bdone += 1; break; case UREQ(UR_GET_DESCRIPTOR, UT_READ_DEVICE): DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_DEVICE) val %x", value >> 8)); if (!data) break; switch (value >> 8) { case UDESC_DEVICE: DPRINTF(("umouse: (->UDESC_DEVICE) len %u ?= " "sizeof(umouse_dev_desc) %lu", len, sizeof(umouse_dev_desc))); if ((value & 0xFF) != 0) { err = USB_ERR_STALLED; goto done; } if (len > sizeof(umouse_dev_desc)) { data->blen = len - sizeof(umouse_dev_desc); len = sizeof(umouse_dev_desc); } else data->blen = 0; memcpy(data->buf, &umouse_dev_desc, len); data->bdone += len; break; case UDESC_CONFIG: DPRINTF(("umouse: (->UDESC_CONFIG)")); if ((value & 0xFF) != 0) { err = USB_ERR_STALLED; goto done; } if (len > sizeof(umouse_confd)) { data->blen = len - sizeof(umouse_confd); len = sizeof(umouse_confd); } else data->blen = 0; memcpy(data->buf, &umouse_confd, len); data->bdone += len; break; case UDESC_STRING: DPRINTF(("umouse: (->UDESC_STRING)")); str = NULL; if ((value & 0xFF) < UMSTR_MAX) str = umouse_desc_strings[value & 0xFF]; else goto done; if ((value & 0xFF) == UMSTR_LANG) { udata[0] = 4; udata[1] = UDESC_STRING; data->blen = len - 2; len -= 2; data->bdone += 2; if (len >= 2) { udata[2] = str[0]; udata[3] = str[1]; data->blen -= 2; data->bdone += 2; } else data->blen = 0; goto done; } slen = 2 + strlen(str) * 2; udata[0] = slen; udata[1] = UDESC_STRING; if (len > slen) { data->blen = len - slen; len = slen; } else data->blen = 0; for (i = 2; i < len; i += 2) { udata[i] = *str++; udata[i+1] = '\0'; } data->bdone += slen; break; case UDESC_BOS: DPRINTF(("umouse: USB3 BOS")); if (len > sizeof(umouse_bosd)) { data->blen = len - sizeof(umouse_bosd); len = sizeof(umouse_bosd); } else data->blen = 0; memcpy(udata, &umouse_bosd, len); data->bdone += len; break; default: DPRINTF(("umouse: unknown(%d)->ERROR", value >> 8)); err = USB_ERR_STALLED; goto done; } eshort = data->blen > 0; break; case UREQ(UR_GET_DESCRIPTOR, UT_READ_INTERFACE): DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_INTERFACE) " "0x%x", (value >> 8))); if (!data) break; switch (value >> 8) { case UMOUSE_REPORT_DESC_TYPE: if (len > sizeof(umouse_report_desc)) { data->blen = len - sizeof(umouse_report_desc); len = sizeof(umouse_report_desc); } else data->blen = 0; memcpy(data->buf, umouse_report_desc, len); data->bdone += len; break; default: DPRINTF(("umouse: IO ERROR")); err = USB_ERR_STALLED; goto done; } eshort = data->blen > 0; break; case UREQ(UR_GET_INTERFACE, UT_READ_INTERFACE): DPRINTF(("umouse: (UR_GET_INTERFACE, UT_READ_INTERFACE)")); if (index != 
0) { DPRINTF(("umouse get_interface, invalid index %d", index)); err = USB_ERR_STALLED; goto done; } if (!data) break; if (len > 0) { *udata = 0; data->blen = len - 1; } eshort = data->blen > 0; data->bdone += 1; break; case UREQ(UR_GET_STATUS, UT_READ_DEVICE): DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_DEVICE)")); if (data != NULL && len > 1) { if (sc->hid.feature == UF_DEVICE_REMOTE_WAKEUP) USETW(udata, UDS_REMOTE_WAKEUP); else USETW(udata, 0); data->blen = len - 2; data->bdone += 2; } eshort = data->blen > 0; break; case UREQ(UR_GET_STATUS, UT_READ_INTERFACE): case UREQ(UR_GET_STATUS, UT_READ_ENDPOINT): DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_INTERFACE)")); if (data != NULL && len > 1) { USETW(udata, 0); data->blen = len - 2; data->bdone += 2; } eshort = data->blen > 0; break; case UREQ(UR_SET_ADDRESS, UT_WRITE_DEVICE): /* XXX Controller should've handled this */ DPRINTF(("umouse set address %u", value)); break; case UREQ(UR_SET_CONFIG, UT_WRITE_DEVICE): DPRINTF(("umouse set config %u", value)); break; case UREQ(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE): DPRINTF(("umouse set descriptor %u", value)); break; case UREQ(UR_CLEAR_FEATURE, UT_WRITE_DEVICE): DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_DEVICE) %x", value)); if (value == UF_DEVICE_REMOTE_WAKEUP) sc->hid.feature = 0; break; case UREQ(UR_SET_FEATURE, UT_WRITE_DEVICE): DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x", value)); if (value == UF_DEVICE_REMOTE_WAKEUP) sc->hid.feature = UF_DEVICE_REMOTE_WAKEUP; break; case UREQ(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE): case UREQ(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT): case UREQ(UR_SET_FEATURE, UT_WRITE_INTERFACE): case UREQ(UR_SET_FEATURE, UT_WRITE_ENDPOINT): DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_INTERFACE)")); err = USB_ERR_STALLED; goto done; case UREQ(UR_SET_INTERFACE, UT_WRITE_INTERFACE): DPRINTF(("umouse set interface %u", value)); break; case UREQ(UR_ISOCH_DELAY, UT_WRITE_DEVICE): DPRINTF(("umouse set isoch delay %u", value)); break; case UREQ(UR_SET_SEL, 0): DPRINTF(("umouse set sel")); break; case UREQ(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT): DPRINTF(("umouse synch frame")); break; /* HID device requests */ case UREQ(UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE): DPRINTF(("umouse: (UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE) " "0x%x", (value >> 8))); if (!data) break; if ((value >> 8) == 0x01 && len >= sizeof(sc->um_report)) { /* TODO read from backend */ if (len > sizeof(sc->um_report)) { data->blen = len - sizeof(sc->um_report); len = sizeof(sc->um_report); } else data->blen = 0; memcpy(data->buf, &sc->um_report, len); data->bdone += len; } else { err = USB_ERR_STALLED; goto done; } eshort = data->blen > 0; break; case UREQ(UMOUSE_GET_IDLE, UT_READ_CLASS_INTERFACE): if (data != NULL && len > 0) { *udata = sc->hid.idle; data->blen = len - 1; data->bdone += 1; } eshort = data->blen > 0; break; case UREQ(UMOUSE_GET_PROTOCOL, UT_READ_CLASS_INTERFACE): if (data != NULL && len > 0) { *udata = sc->hid.protocol; data->blen = len - 1; data->bdone += 1; } eshort = data->blen > 0; break; case UREQ(UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE): DPRINTF(("umouse: (UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE) ignored")); break; case UREQ(UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE): sc->hid.idle = UGETW(xfer->ureq->wValue) >> 8; DPRINTF(("umouse: (UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE) %x", sc->hid.idle)); break; case UREQ(UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE): sc->hid.protocol = UGETW(xfer->ureq->wValue) >> 8; DPRINTF(("umouse: (UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE) %x",
sc->hid.protocol)); break; default: DPRINTF(("**** umouse request unhandled")); err = USB_ERR_STALLED; break; } done: if (xfer->ureq && (xfer->ureq->bmRequestType & UT_WRITE) && (err == USB_ERR_NORMAL_COMPLETION) && (data != NULL)) data->blen = 0; else if (eshort) err = USB_ERR_SHORT_XFER; DPRINTF(("umouse request error code %d (0=ok), blen %u txlen %u", err, (data ? data->blen : 0), (data ? data->bdone : 0))); return (err); } static int umouse_data_handler(void *scarg, struct usb_data_xfer *xfer, int dir, int epctx) { struct umouse_softc *sc; struct usb_data_xfer_block *data; uint8_t *udata; int len, i, idx; int err; DPRINTF(("umouse handle data - DIR=%s|EP=%d, blen %d", dir ? "IN" : "OUT", epctx, xfer->data[0].blen)); /* find buffer to add data */ udata = NULL; err = USB_ERR_NORMAL_COMPLETION; /* handle xfer at first unprocessed item with buffer */ data = NULL; idx = xfer->head; for (i = 0; i < xfer->ndata; i++) { data = &xfer->data[idx]; if (data->buf != NULL && data->blen != 0) { break; } else { data->processed = 1; data = NULL; } idx = (idx + 1) % USB_MAX_XFER_BLOCKS; } if (!data) goto done; udata = data->buf; len = data->blen; if (udata == NULL) { DPRINTF(("umouse no buffer provided for input")); err = USB_ERR_NOMEM; goto done; } sc = scarg; if (dir) { pthread_mutex_lock(&sc->mtx); if (!sc->newdata) { err = USB_ERR_CANCELLED; USB_DATA_SET_ERRCODE(&xfer->data[xfer->head], USB_NAK); pthread_mutex_unlock(&sc->mtx); goto done; } if (sc->polling) { err = USB_ERR_STALLED; USB_DATA_SET_ERRCODE(data, USB_STALL); pthread_mutex_unlock(&sc->mtx); goto done; } sc->polling = 1; if (len > 0) { sc->newdata = 0; data->processed = 1; data->bdone += 6; memcpy(udata, &sc->um_report, 6); data->blen = len - 6; if (data->blen > 0) err = USB_ERR_SHORT_XFER; } sc->polling = 0; pthread_mutex_unlock(&sc->mtx); } else { USB_DATA_SET_ERRCODE(data, USB_STALL); err = USB_ERR_STALLED; } done: return (err); } static int umouse_reset(void *scarg) { struct umouse_softc *sc; sc = scarg; sc->newdata = 0; return (0); } static int umouse_remove(void *scarg) { return (0); } static int umouse_stop(void *scarg) { return (0); } #ifdef BHYVE_SNAPSHOT static int umouse_snapshot(void *scarg, struct vm_snapshot_meta *meta) { int ret; struct umouse_softc *sc; sc = scarg; SNAPSHOT_VAR_OR_LEAVE(sc->um_report, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->newdata, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hid.idle, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hid.protocol, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hid.feature, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->polling, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_sec, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_usec, meta, ret, done); done: return (ret); } #endif struct usb_devemu ue_mouse = { .ue_emu = "tablet", .ue_usbver = 3, .ue_usbspeed = USB_SPEED_HIGH, .ue_init = umouse_init, .ue_request = umouse_request, .ue_data = umouse_data_handler, .ue_reset = umouse_reset, .ue_remove = umouse_remove, .ue_stop = umouse_stop, #ifdef BHYVE_SNAPSHOT .ue_snapshot = umouse_snapshot, #endif }; USB_EMUL_SET(ue_mouse);
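The net effect of the usb_emul.h change above is that a USB device emulation now receives its configuration as an nvlist rather than a raw option string. Below is a minimal sketch of an emulation written against the new ue_init contract; it is illustrative only and not part of this diff. The "dummy" emulation name, the udummy_* identifiers, and the "device" key are invented for the example; struct usb_devemu, USB_EMUL_SET() and usb_emu_finddev() come from usb_emul.h above, and dnvlist_get_string() is the stock libnv accessor declared in <sys/dnv.h>.

#include <sys/nv.h>
#include <sys/dnv.h>
#include <stdlib.h>

#include "usb_emul.h"

struct udummy_softc {
	struct usb_hci *hci;
	const char *devpath;	/* hypothetical backend device path */
};

/* ue_init now receives the device's configuration as an nvlist. */
static void *
udummy_init(struct usb_hci *hci, nvlist_t *nvl)
{
	struct udummy_softc *sc;

	sc = calloc(1, sizeof(*sc));
	if (sc == NULL)
		return (NULL);
	sc->hci = hci;
	/* Optional key with a default, replacing strsep()-style parsing. */
	sc->devpath = dnvlist_get_string(nvl, "device", "/dev/null");
	return (sc);
}

static struct usb_devemu ue_dummy = {
	.ue_emu = "dummy",	/* name matched by usb_emu_finddev() */
	.ue_usbver = 3,
	.ue_init = udummy_init,
	/* a real emulation must also provide ue_request, ue_data, etc. */
};
USB_EMUL_SET(ue_dummy);

An HCI model can then resolve the emulation by name: usb_emu_finddev("dummy") walks the usb_emu_set linker set and returns &ue_dummy, after which the caller invokes the returned ue_init with the device's configuration node.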