diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index e35d528ab605..772c0988090e 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -1,135 +1,136 @@ # # $FreeBSD$ # .include CFLAGS+=-I${.CURDIR}/../../contrib/lib9p CFLAGS+=-I${SRCTOP}/sys .PATH: ${SRCTOP}/sys/cam/ctl PROG= bhyve PACKAGE= bhyve MAN= bhyve.8 bhyve_config.5 BHYVE_SYSDIR?=${SRCTOP} SRCS= \ atkbdc.c \ acpi.c \ audio.c \ bhyvegc.c \ bhyverun.c \ block_if.c \ bootrom.c \ config.c \ console.c \ ctl_util.c \ ctl_scsi_all.c \ fwctl.c \ gdb.c \ hda_codec.c \ inout.c \ ioapic.c \ kernemu_dev.c \ mem.c \ mevent.c \ mptbl.c \ net_backends.c \ net_utils.c \ pci_ahci.c \ pci_e82545.c \ pci_emul.c \ pci_hda.c \ pci_fbuf.c \ pci_hostbridge.c \ pci_irq.c \ pci_lpc.c \ pci_nvme.c \ pci_passthru.c \ pci_virtio_9p.c \ pci_virtio_block.c \ pci_virtio_console.c \ + pci_virtio_input.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ pci_virtio_scsi.c \ pci_uart.c \ pci_xhci.c \ pctestdev.c \ pm.c \ post.c \ ps2kbd.c \ ps2mouse.c \ rfb.c \ rtc.c \ smbiostbl.c \ sockstream.c \ task_switch.c \ uart_emul.c \ usb_emul.c \ usb_mouse.c \ virtio.c \ vga.c \ vmgenc.c \ xmsr.c \ spinup_ap.c \ iov.c .if ${MK_BHYVE_SNAPSHOT} != "no" SRCS+= snapshot.c .endif CFLAGS.kernemu_dev.c+= -I${SRCTOP}/sys/amd64 .PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm SRCS+= vmm_instruction_emul.c LIBADD= vmmapi md nv pthread z util sbuf cam 9p .if ${MK_CASPER} != "no" LIBADD+= casper LIBADD+= cap_pwd LIBADD+= cap_grp # Temporary disable capsicum, until we integrate checkpoint code with it. #CFLAGS+=-DWITH_CASPER .endif .if ${MK_BHYVE_SNAPSHOT} != "no" LIBADD+= ucl xo .endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET .endif .if ${MK_INET6_SUPPORT} != "no" CFLAGS+=-DINET6 .endif .if ${MK_NETGRAPH_SUPPORT} != "no" CFLAGS+=-DNETGRAPH LIBADD+= netgraph .endif .if ${MK_OPENSSL} == "no" CFLAGS+=-DNO_OPENSSL .else LIBADD+= crypto .endif CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller .if ${MK_BHYVE_SNAPSHOT} != "no" CFLAGS+= -I${SRCTOP}/contrib/libucl/include # Temporary disable capsicum, until we integrate checkpoint code with it. CFLAGS+= -DWITHOUT_CAPSICUM CFLAGS+= -DBHYVE_SNAPSHOT .endif .ifdef GDB_LOG CFLAGS+=-DGDB_LOG .endif WARNS?= 2 .include diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 79b6d9bfb240..fcf47ec65513 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -1,915 +1,925 @@ .\" Copyright (c) 2013 Peter Grehan .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd April 18, 2021 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl AaCDeHhPSuWwxY .Oo .Sm off .Fl c\~ .Oo .Op Cm cpus= .Ar numcpus .Oc .Op Cm ,sockets= Ar n .Op Cm ,cores= Ar n .Op Cm ,threads= Ar n .Oc .Sm on .Op Fl G Ar port .Op Fl k Ar file .Oo Fl l .Sm off .Ar lpcdev Op Cm \&, Ar conf .Sm on .Oc .Oo Fl m .Sm off .Ar memsize .Oo .Cm K | Cm k | Cm M | Cm m | Cm G | Cm g | Cm T | Cm t .Oc .Sm on .Oc .Op Fl o Ar var Ns Cm = Ns Ar value .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu .Op Fl r Ar file .Sm off .Oo Fl s\~ .Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf .Sm on .Oc .Op Fl U Ar uuid .Ar vmname .Nm .Fl l Cm help .Nm .Fl s Cm help .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise, it is enough to run .Nm with a boot ROM of choice. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl A Generate ACPI tables. Required for .Fx Ns /amd64 guests. .It Fl a The guest's local APIC is configured in xAPIC mode. The xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl C Include guest memory in core file. .It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. The current maximum number of guest virtual CPUs is 16. If .Ar numcpus is not specified then it will be calculated from the other arguments. The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. .It Fl D Destroy the VM on guest initiated power-off. .It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes. .It Fl G Ar port Start a debug server that uses the GDB protocol to export guest state to a debugger. An IPv4 TCP socket will be bound to the supplied .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If .Ar port begins with .Sq w , .Nm will pause execution at the first instruction waiting for a debugger to attach. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. .It Fl h Print help message and exit. .It Fl k Ar file Set configuration variables from a simple, key-value config file. Each line of the config file is expected to consist of a config variable name, an equals sign .Pq Sq = , and a value. No spaces are permitted between the variable name, equals sign, or value. Blank lines and lines starting with .Sq # are ignored. .It Fl l Cm help Print a list of supported LPC devices. .It Fl l Ar lpcdev Ns Op Cm \&, Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Cm com1 , com2 , com3 , and .Cm com4 , the boot ROM device .Cm bootrom , and the debug/test device .Cm pc-testdev . .Pp The possible values for the .Ar conf argument are listed in the .Fl s flag description. .It Xo .Fl m Ar memsize Ns Oo .Sm off .Cm K | k | M | m | G | g | T | t .Sm on .Oc .Xc Set the guest physical memory size This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of .Cm K , M , G or .Cm T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. .Pp The default is 256M. .It Fl o Ar var Ns Cm = Ns Ar value Set the configuration variable .Ar var to .Ar value . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. .It Fl p Ar vcpu Ns Cm \& : Ns Ar hostcpu Pin guest's virtual CPU .Em vcpu to .Em hostcpu . .It Fl r Ar file Resume a guest from a snapshot. The guest memory contents are restored from .Ar file , and the guest device and vCPU state are restored from the file .Dq Ar file Ns .kern . .Pp Note that the current snapshot file format requires that the configuration of devices in the new VM match the VM from which the snapshot was taken by specifying the same .Fl s and .Fl l options. The count of vCPUs and memory configuration are read from the snapshot. .It Fl S Wire guest memory. .It Fl s Cm help Print a list of supported PCI devices. .It Fl s Ar slot Ns Cm \&, Ns Ar emulation Ns Op Cm \&, Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. .Pp The .Ar slot can be specified in one of the following formats: .Pp .Bl -bullet -compact .It .Ar pcislot .It .Sm off .Ar pcislot Cm \&: Ar function .Sm on .It .Sm off .Ar bus Cm \&: Ar pcislot Cm \&: Ar function .Sm on .El .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. .Pp The .Ar emulation argument can be one of the following: .Bl -tag -width "amd_hostbridge" .It Cm hostbridge A simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. .It Cm amd_hostbridge Emulation identical to .Cm hostbridge using a PCI vendor ID of AMD. .It Cm passthru PCI pass-through device. .It Cm virtio-net Virtio network interface. .It Cm virtio-blk Virtio block storage interface. .It Cm virtio-scsi Virtio SCSI interface. .It Cm virtio-9p Virtio 9p (VirtFS) interface. .It Cm virtio-rnd Virtio RNG interface. .It Cm virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. +.It Cm virtio-input +Virtio input interface. .It Cm ahci AHCI controller attached to arbitrary devices. .It Cm ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Cm ahci-hd AHCI controller attached to a SATA hard drive. .It Cm e1000 Intel e82545 network interface. .It Cm uart PCI 16550 serial device. .It Cm lpc LPC PCI-ISA bridge with COM1, COM2, COM3, and COM4 16550 serial ports, a boot ROM, and, optionally, the debug/test device. The LPC bridge emulation can only be configured on bus 0. .It Cm fbuf Raw framebuffer device attached to VNC server. .It Cm xhci eXtensible Host Controller Interface (xHCI) USB controller. .It Cm nvme NVM Express (NVMe) controller. .It Cm hda High Definition Audio Controller. .El .Pp The optional parameter .Ar conf describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Pp Network device backends: .Sm off .Bl -bullet .It .Xo .Cm tap Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm vmnet Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm netgraph,path= Ar ADDRESS Cm \&,peerhook= Ar HOOK .Op Cm \&,socket= Ar NAME .Op Cm \&,hook= Ar HOOK .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .El .Sm on If .Cm mac is not specified, the MAC address is derived from a fixed OUI and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .Pp With .Cm virtio-net devices, the .Cm mtu parameter can be specified to inform the guest about the largest MTU that should be allowed, expressed in bytes. .Pp With .Cm netgraph backend, the .Cm path and .Cm peerhook parameters must be specified to set the destination node and corresponding hook. The optional parameters .Cm socket and .Cm hook may be used to set the .Xr ng_socket 4 node name and source hook. The .Ar ADDRESS , .Ar HOOK , and .Ar NAME must comply with .Xr netgraph 4 addressing rules. .Pp Block storage device backends: .Sm off .Bl -bullet .It .Ar /filename Op Cm \&, Ar block-device-options .It .Ar /dev/xxx Op Cm \&, Ar block-device-options .El .Sm on .Pp The .Ar block-device-options are: .Bl -tag -width 10n .It Cm nocache Open the file with .Dv O_DIRECT . .It Cm direct Open the file using .Dv O_SYNC . .It Cm ro Force the file to be opened read-only. .It Cm sectorsize= Ns Ar logical Ns Oo Cm \&/ Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. .It Cm nodelete Disable emulation of guest trim requests via .Dv DIOCGDELETE requests. .El .Pp SCSI device backends: .Sm off .Bl -bullet .It .Pa /dev/cam/ctl Oo Ar pp Cm \&. Ar vp Oc Oo Cm \&, Ar scsi-device-options Oc .El .Sm on .Pp The .Ar scsi-device-options are: .Bl -tag -width 10n .It Cm iid= Ns Ar IID Initiator ID to use when sending requests to specified CTL port. The default value is 0. .El .Pp 9P device backends: .Sm off .Bl -bullet .It .Ar sharename Cm = Ar /path/to/share Op Cm \&, Ar 9p-device-options .El .Sm on .Pp The .Ar 9p-device-options are: .Bl -tag -width 10n .It Cm ro Expose the share in read-only mode. .El .Pp TTY device backends: .Bl -tag -width 10n .It Cm stdio Connect the serial port to the standard input and output of the .Nm process. .It Ar /dev/xxx Use the host TTY device for serial port I/O. .El .Pp Boot ROM device backends: .Bl -tag -width 10n .It Ar romfile Map .Ar romfile in the guest address space reserved for boot firmware. .El .Pp Pass-through device backends: .Bl -tag -width 10n .It Ns Ar slot Ns Cm \&/ Ns Ar bus Ns Cm \&/ Ns Ar function Connect to a PCI device on the host at the selector described by .Ar slot , .Ar bus , and .Ar function numbers. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdevs loader variable as described in .Xr vmm 4 . .Pp Virtio console device backends: .Bl -bullet .Sm off .It .Cm port1= Ns Ar /path/to/port1.sock Ns Op Cm ,port Ns Ar N Cm \&= Ns Ar /path/to/port2.sock No \~ Ar ... .Sm on .El .Pp A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet .It Due to lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It There is no way to use the .Dq console port feature, nor the console port resize at present. .It Emergency write is advertised, but no-op at present. .El .Pp +Virtio input device backends: +.Bl -tag -width 10n +.It Ar /dev/input/eventX +Send input events of +.Ar /dev/input/eventX +to guest by VirtIO Input Interface. +.El +.Pp Framebuffer devices backends: .Bl -bullet .Sm off .It .Op Cm rfb= Ar ip-and-port .Op Cm ,w= Ar width .Op Cm ,h= Ar height .Op Cm ,vga= Ar vgaconf .Op Cm ,wait .Op Cm ,password= Ar password .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm rfb= Ns Ar ip-and-port Pq or Cm tcp= Ns Ar ip-and-port An IP address and a port VNC should listen on. There are two formats: .Pp .Bl -bullet -compact .It .Sm off .Op Ar IPv4 Cm \&: .Ar port .Sm on .It .Sm off .Cm \&[ Ar IPv6%zone Cm \&] Cm \&: Ar port .Sm on .El .Pp The default is to listen on localhost IPv4 address and default VNC port 5900. An IPv6 address must be enclosed in square brackets and may contain an optional zone identifier. .It Cm w= Ns Ar width No and Cm h= Ns Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. Minimal supported resolution is 640x480 pixels, and maximum is 1920x1200 pixels. .It Cm vga= Ns Ar vgaconf Possible values for this option are .Cm io (default), .Cm on , and .Cm off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 . The default .Cm io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Cm on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Cm off option should be used for the UEFI guests that assume that VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. .It Cm wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It Cm password= Ns Ar password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .Pp xHCI USB device backends: .Bl -tag -width 10n .It Cm tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Pp NVMe device backends: .Bl -bullet .Sm off .It .Ar devpath .Op Cm ,maxq= Ar # .Op Cm ,qsz= Ar # .Op Cm ,ioslots= Ar # .Op Cm ,sectsz= Ar # .Op Cm ,ser= Ar # .Op Cm ,eui64= Ar # .Op Cm ,dsm= Ar opt .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Ar devpath Accepted device paths are: .Ar /dev/blockdev or .Ar /path/to/image or .Cm ram= Ns Ar size_in_MiB . .It Cm maxq Max number of queues. .It Cm qsz Max elements in each queue. .It Cm ioslots Max number of concurrent I/O requests. .It Cm sectsz Sector size (defaults to blockif sector size). .It Cm ser Serial number with maximum 20 characters. .It Cm eui64 IEEE Extended Unique Identifier (8 byte value). .It Cm dsm DataSet Management support. Supported values are: .Cm auto , enable , and .Cm disable . .El .Pp AHCI device backends: .Bl -bullet .It .Sm off .Op Oo Cm hd\&: | cd\&: Oc Ar path .Op Cm ,nmrr= Ar nmrr .Op Cm ,ser= Ar # .Op Cm ,rev= Ar # .Op Cm ,model= Ar # .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm nmrr Nominal Media Rotation Rate, known as RPM. Value 1 will indicate device as Solid State Disk. Default value is 0, not report. .It Cm ser Serial Number with maximum 20 characters. .It Cm rev Revision Number with maximum 8 characters. .It Cm model Model Number with maximum 40 characters. .El .Pp HD Audio device backends: .Bl -bullet .It .Sm off .Op Cm play= Ar playback .Op Cm ,rec= Ar recording .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm play Playback device, typically .Ar /dev/dsp0 . .It Cm rec Recording device, typically .Ar /dev/dsp0 . .El .It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl u RTC keeps UTC time. .It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl x The guest's local APIC is configured in x2APIC mode. .It Fl Y Disable MPtable generation. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El .Sh CONFIGURATION VARIABLES .Nm uses an internal tree of configuration variables to describe global and per-device settings. When .Nm starts, it parses command line options (including config files) in the order given on the command line. Each command line option sets one or more configuration variables. For example, the .Fl s option creates a new tree node for a PCI device and sets one or more variables under that node including the device model and device model-specific variables. Variables may be set multiple times during this parsing stage with the final value overriding previous values. .Pp Once all of the command line options have been processed, the configuration values are frozen. .Nm then uses the value of configuration values to initialize device models and global settings. .Pp More details on configuration variables can be found in .Xr bhyve_config 5 . .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. .Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read and written by the debugger. Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp Breakpoints are supported on Intel CPUs that support single stepping. Note that continuing from a breakpoint while interrupts are enabled in the guest may not work as expected due to timer interrupts firing while single stepping over the breakpoint. .Sh SIGNAL HANDLING .Nm deals with the following signals: .Pp .Bl -tag -width SIGTERM -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 triple fault .It 4 exited due to an error .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run. Otherwise, the boot loader is not needed. .Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -A -H -P -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -A -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device. .Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -A -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at: 0.0.0.0:5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VNC display that is bound to all IPv6 addresses on port 5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr netgraph 4 , .Xr ng_socket 4 , .Xr nmdm 4 , .Xr vmm 4 , .Xr bhyve_config 5 , .Xr ethers 5 , .Xr bhyvectl 8 , .Xr bhyveload 8 .Pp .Rs .%A Intel .%B 64 and IA-32 Architectures Software Developer’s Manual .%V Volume 3 .Re .Sh HISTORY .Nm first appeared in .Fx 10.0 . .Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org diff --git a/usr.sbin/bhyve/bhyve_config.5 b/usr.sbin/bhyve/bhyve_config.5 index 1a77b1bbacb4..a2221d5bc4bf 100644 --- a/usr.sbin/bhyve/bhyve_config.5 +++ b/usr.sbin/bhyve/bhyve_config.5 @@ -1,562 +1,573 @@ .\" SPDX-License-Identifier: BSD-2-Clause .\" .\" Copyright (c) 2021 John H. Baldwin .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .Dd April 20, 2021 .Dt BHYVE_CONFIG 5 .Os .Sh NAME .Nm bhyve_config .Nd "bhyve configuration variables" .Sh DESCRIPTION .Xr bhyve 8 uses a hierarchical tree of configuration variables to describe global and per-device settings. Internal nodes in this tree do not have a value, only leaf nodes have values. This manual describes the configuration variables understood by .Xr bhyve 8 . If additional variables are defined, .Xr bhyve 8 will ignore them and will not emit errors for unknown variables. However, these additional variables can be referenced by other variables as described below. .Sh VARIABLE VALUES Configuration variable values are stored as strings. A configuration variable value may refer to one or more other configuration values by name. Instances of the pattern .Sq % Ns Pq Ar var are replaced by the value of the configuration variable .Va var . To avoid unwanted expansion, .Sq % characters can be escaped by a leading .Sq % . For example, if a configuration variable .Va disk uses the value .Pa /dev/zvol/bhyve/%(name) , then the final value of the .Va disk variable will be set to the path of a ZFS volume whose name matches the name of the virtual machine on the pool .Pa bhyve . .Pp Some configuration variables may be interpreted as a boolean value. For those variables the following case-insensitive values may be used to indicate true: .Pp .Bl -bullet -offset indent -compact .It true .It on .It yes .It 1 .El .Pp The following values may be used to indicate false: .Pp .Bl -bullet -offset indent -compact .It false .It off .It no .It 0 .El .Pp Some configuration variables may be interperted as an integer. For those variables, any syntax supported by .Xr strtol 3 may be used. .Sh GLOBAL SETTINGS .Ss Architecture Neutral Settings .Bl -column "memory.guest_in_core" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va name Ta string Ta Ta The name of the VM. .It Va cpus Ta integer Ta 1 Ta The total number of virtual CPUs. .It Va cores Ta integer Ta 1 Ta The number of virtual cores in each virtual socket. .It Va threads Ta integer Ta 1 Ta The number of virtual CPUs in each virtual core. .It Va sockets Ta integer Ta 1 Ta The number of virtual sockets. .It Va memory.guest_in_core Ta bool Ta false Ta Include guest memory in core file. .It Va memory.size Ta string Ta 256M Ta Guest physical memory size in bytes. The value must be formatted as described in .Xr expand_number 3 . .It Va memory.wired Ta bool Ta false Ta Wire guest memory. .It Va acpi_tables Ta bool Ta false Ta Generate ACPI tables. .It Va destroy_on_poweroff Ta bool Ta false Ta Destroy the VM on guest-initiated power-off. .It Va gdb.port Ta integer Ta 0 Ta TCP port number for the debug server. If this is set to a non-zero value, a debug server will listen for connections on this port. .It Va gdb.wait Ta bool Ta false Ta If the debug server is enabled, wait for a debugger to connect before starting the guest. .It Va rtc.use_localtime Ta bool Ta true Ta The real time clock uses the local time of the host. If this is set to false, the real time clock uses UTC. .It Va uuid Ta string Ta Ta The universally unique identifier (UUID) to use in the guest's System Management BIOS System Information structure. If an explicit value is not set, a valid UUID is generated from the host's hostname and the VM name. .It Va virtio_msix Ta bool Ta true Ta Use MSI-X interrupts for PCI VirtIO devices. If set to false, MSI interrupts are used instead. .It Va config.dump Ta bool Ta false Ta If this value is set to true after .Xr bhyve 8 has finished parsing command line options, then .Xr bhyve 8 will write all of its configuration variables to stdout and exit. No VM will be started. .El .Ss x86-Specific Settings .Bl -column "x86.vmexit_on_pause" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va x86.mptable Ta bool Ta true Ta Generate an MPTable. .It Va x86.x2apic Ta bool Ta false Ta Configure guest's local APICs in x2APIC mode. .It Va x86.strictio Ta bool Ta false Ta Exit if a guest accesses an I/O port that is not emulated. By default, writes are ignored and reads return all bits set. .It Va x86.strictmsr Ta bool Ta true Ta Inject a general protection fault if a guest accesses a Model Specific Register (MSR) that is not emulated. If this is false, writes are ignored and reads return zero. .It Va x86.vmexit_on_hlt Ta bool Ta false Ta Force a VM exit when a guest CPU executes the .Dv HLT instruction. This allows idle guest CPUs to yield the host CPU. .It Va x86.vmexit_on_pause Ta bool Ta false Ta Force a VM exit when a guest CPU executes the .Dv PAUSE instruction. .El .Sh DEVICE SETTINGS Device settings are stored under a device node. The device node's name is set by the parent bus of the device. .Ss PCI Device Settings PCI devices are described by a device node named .Dq pci Ns Ar bus . Ns Ar slot . Ns Ar function where each of .Ar bus , .Ar slot , and .Ar function are formatted as decimal values with no padding. All PCI device nodes must contain a configuration variable named .Dq device which specifies the device model to use. The following PCI device models are supported: .Bl -tag -indent .It Li hostbridge Provide a simple PCI-Host bridge device. This is usually configured at pci0:0:0 and is required by most guest operating systems. .It Li ahci AHCI storage controller. .It Li e1000 Intel e82545 network interface. .It Li fbuf VGA framebuffer device attached to VNC server. .It Li lpc LPC PCI-ISA bridge with COM1-COM4 16550 serial ports, a boot ROM, and an optional debug/test device. This device must be configured on bus 0. .It Li hda High Definition audio controller. .It Li nvme NVM Express (NVMe) controller. .It Li passthru PCI pass-through device. .It Li uart PCI 16550 serial device. .It Li virtio-9p VirtIO 9p (VirtFS) interface. .It Li virtio-blk VirtIO block storage interface. .It Li virtio-console VirtIO console interface. +.It Li virtio-input +VirtIO input interface. .It Li virtio-net VirtIO network interface. .It Li virtio-rnd VirtIO RNG interface. .It Li virtio-scsi VirtIO SCSI interface. .It Li xhci Extensible Host Controller Interface (XHCI) USB controller. .El .Ss USB Device Settings USB controller devices contain zero or more child USB devices attached to slots. Each USB device stores its settings in a node named .Dq slot. Ns Va N under the controller's device node. .Va N is the number of the slot to which the USB device is attached. Note that USB slot numbers begin at 1. All USB device nodes must contain a configuration variable named .Dq device which specifies the device model to use. The following USB device models are supported: .Bl -tag -indent .It Li tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Ss Block Device Settings Block devices use the following settings to configure their backing store. These settings are stored in the configuration node of the respective device. .Bl -column "sectorsize" "logical[/physical]" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It path Ta string Ta Ta The path of the file or disk device to use as the backing store. .It nocache Ta bool Ta false Ta Disable caching on the backing file by opening the backing file with .Dv O_DIRECT . .It nodelete Ta bool Ta false Ta Disable emulation of guest trim requests via .Dv DIOCGDELETE requests. .It sync Ta bool Ta false Ta Write changes to the backing file with synchronous writes. .It direct Ta bool Ta false Ta An alias for .Va sync . .It ro Ta bool Ta false Ta Disable writes to the backing file. .It sectorsize Ta Va logical Ns Op / Ns Va physical Ta Ta Specify the logical and physical sector size of the emulated disk. If the physical size is not specified, it is equal to the logical size. .El .Ss Network Backend Settings Network devices use the following settings to configure their backend. The backend is responsible for passing packets between the device model and a desired destination. Configuring a backend requires setting the .Va backend variable to one of the following values: .Bl -tag .It tap Ns Va N Use the named .Xr tap 4 interface as the backend. .It vmnet Ns Va N Use the named .Xr vmnet 4 interface as the backend. .It netgraph Use a .Xr netgraph 4 socket hook as the backend. This backend uses the following additional variables: .Bl -column "peerhook" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va path Ta string Ta Ta The name of the .Xr netgraph 4 destination node. .It Va peerhook Ta string Ta Ta The name of the destination hook. .It Va socket Ta string Ta Ta The name of the created .Xr ng_socket 4 node. .It Va hook Ta string Ta vmlink Ta The name of the source hook on the created .Xr ng_socket 4 node. .El .It netmap: Ns Va interface Use .Xr netmap 4 on a network interface as the backend. .It vale Ns Va bridge : Ns Va port Use a port on a .Xr vale 4 bridge as the backend. .El .Ss UART Device Settings .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va path Ta path Ta Ta Backend device for the serial port. Either the pathname of a character device or .Dq stdio to use standard input and output of the .Xr bhyve 8 process. .El .Ss Host Bridge Settings .Bl -column "vendor" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va vendor Ta integer Ta 0x1275 Ta PCI vendor ID. .It Va devid Ta integer Ta 0x1275 Ta PCI device ID. .El .Ss AHCI Controller Settings AHCI controller devices contain zero or more ports each of which provides a storage device. Each port stores its settings in a node named .Dq port. Ns Va N under the controller's device node. The .Va N values are formatted as successive decimal values starting with 0. In addition to the block device settings described above, each port supports the following settings: .Bl -column "model" "integer" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va type Ta string Ta Ta The type of storage device to emulate. Must be set to either .Dq cd or .Dq hd . .It Va nmrr Ta integer Ta 0 Ta Nominal Media Rotation Rate, also known as RPM. A value 1 of indicates a device with no rate such as a Solid State Disk. .It Va ser Ta string Ta generated Ta Serial number of up to twenty characters. A default serial number is generated using a hash of the backing store's pathname. .It Va rev Ta string Ta 001 Ta Revision number of up to eight characters. .It Va model Ta string Ta Ta Model number of up to forty characters. Separate default model strings are used for .Dq cd and .Dq hd device types. .El .Ss e1000 Settings In addition to the network backend settings, Intel e82545 network interfaces support the following variables: .Bl -column "Name" "MAC address" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va mac Ta MAC address Ta generated Ta MAC address. If an explicit address is not provided, a MAC address is generated from a hash of the device's PCI address. .El .Ss Frame Buffer Settings .Bl -column "password" "[IP:]port" "127.0.0.1:5900" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va wait Ta bool Ta false Ta Wait for a remote connection before starting the VM. .It Va rfb Ta Oo Ar IP Ns : Oc Ns Ar port Ta 127.0.0.1:5900 Ta TCP address to listen on for remote connections. The IP address must be given as a numeric address. IPv6 addresses must be enclosed in square brackets and support scoped identifiers as described in .Xr getaddrinfo 3 . A bare port number may be given in which case the IPv4 localhost address is used. .It Va vga Ta string Ta io Ta VGA configuration. More details are provided in .Xr bhyve 8 . .It Va w Ta integer Ta 1024 Ta Frame buffer width in pixels. .It Va h Ta integer Ta 768 Ta Frame buffer height in pixels. .It Va password Ta string Ta Ta Password to use for VNC authentication. This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. .El .Ss High Definition Audio Settings .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va play Ta path Ta Ta Host playback device, typically .Pa /dev/dsp0 . .It Va rec Ta path Ta Ta Host recording device, typically .Pa /dev/dsp0 . .El .Ss LPC Device Settings The LPC bridge stores its configuration under a top-level .Va lpc node rather than under the PCI LPC device's node. The following nodes are available under .Va lpc : .Bl -column "pc-testdev" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va bootrom Ta path Ta Ta Path to a boot ROM. The contents of this file are copied into the guest's memory ending just before the 4GB physical address. If a boot ROM is present, a firmware interface device is also enabled for use by the boot ROM. .It Va com1 Ta node Ta Ta Settings for the COM1 serial port device. .It Va com2 Ta node Ta Ta Settings for the COM2 serial port device. .It Va com3 Ta node Ta Ta Settings for the COM3 serial port device. .It Va com4 Ta node Ta Ta Settings for the COM4 serial port device. .It Va pc-testdev Ta bool Ta false Ta Enable the PC debug/test device. .El .Ss NVMe Controller Settings Each NVMe controller supports a single storage device. The device can be backed either by a memory disk described by the .Va ram variable, or a block device using the the block device settings described above. In addition, each controller supports the following settings: .Bl -column "ioslots" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va maxq Ta integer Ta 16 Ta Maximum number of I/O submission and completion queue pairs. .It Va qsz Ta integer Ta 2058 Ta Number of elements in each I/O queue. .It Va ioslots Ta integer Ta 8 Ta Maximum number of concurrent I/O requests. .It Va sectsz Ta integer Ta Ta Sector size. Can be one of 512, 4096, or 8192. Devices backed by a memory disk use 4096 as the default. Devices backed by a block device use the block device's sector size as the default. .It Va ser Ta string Ta Ta Serial number of up to twenty characters. A default serial number is generated using a hash of the device's PCI address. .It Va eui64 Ta integer Ta Ta IEEE Extended Unique Identifier. If an EUI is not provided, a default is generated using a checksum of the device's PCI address. .It Va dsm Ta string Ta auto Ta Whether or not to advertise DataSet Management support. One of .Dq auto , .Dq enable , or .Dq disable . The .Dq auto setting only advertises support if the backing store supports resource freeing, for example via TRIM. .It Va ram Ta integer Ta Ta If set, allocate a memory disk as the backing store. The value of this variable is the size of the memory disk in megabytes. .El .Ss PCI Passthrough Settings .Bl -column "Name" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va bus Ta integer Ta Ta Host PCI bus address of device to pass through. .It Va slot Ta integer Ta Ta Host PCI slot address of device to pass through. .It Va func Ta integer Ta Ta Host PCI function address of device to pass through. .El .Ss VirtIO 9p Settings Each VirtIO 9p device exposes a single filesystem from a host path. .Bl -column "sharename" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va sharename Ta string Ta Ta The share name exposed to the guest. .It Va path Ta path Ta Ta The path of a directory on the host to export to the guest. .It Va ro Ta bool Ta false Ta If true, the guest filesystem is read-only. .El .Ss VirtIO Console Device Settings Each VirtIO Console device contains one or more console ports. Each port stores its settings in a node named .Dq port. Ns Va N under the controller's device node. The .Va N values are formatted as successive decimal values starting with 0. Each port supports the following settings: .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va name Ta string Ta Ta The name of the port exposed to the guest. .It Va path Ta path Ta Ta The path of a UNIX domain socket providing the host connection for the port. .El +.Ss VirtIO Input Interface Settings +Each VirtIO Input device contains one input event device. +All input events of the input event device are send to the guest by VirtIO Input interface. +VirtIO Input Interfaces support the following variables: +.Bl -column "Name" "Format" "Default" +.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description +.It Va path Ta path Ta Ta +The path of the input event device exposed to the guest +.El .Ss VirtIO Network Interface Settings In addition to the network backend settings, VirtIO network interfaces support the following variables: .Bl -column "Name" "MAC address" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va mac Ta MAC address Ta generated Ta MAC address. If an explicit address is not provided, a MAC address is generated from a hash of the device's PCI address. .It Va mtu Ta integer Ta 1500 Ta The largest supported MTU advertised to the guest. .El .Ss VirtIO SCSI Settings .Bl -column "Name" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va dev Ta path Ta Ta The path of a CAM target layer (CTL) device to export: .Pa /dev/cam/ctl Ns Oo Ar pp . Ns Ar vp Oc . .It Va iid Ta integer Ta 0 Ta Initiator ID to use when sending requests to the CTL port. .El .Sh SEE ALSO .Xr expand_number 3 , .Xr getaddrinfo 3 , .Xr strtol 3 , .Xr netgraph 4 , .Xr netmap 4 , .Xr ng_socket 4 , .Xr tap 4 , .Xr vale 4 , .Xr vmnet 4 , .Xr bhyve 8 diff --git a/usr.sbin/bhyve/pci_virtio_input.c b/usr.sbin/bhyve/pci_virtio_input.c new file mode 100644 index 000000000000..4517333b1603 --- /dev/null +++ b/usr.sbin/bhyve/pci_virtio_input.c @@ -0,0 +1,782 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * virtio input device emulation. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include + +#include +#endif +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "config.h" +#include "debug.h" +#include "mevent.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VTINPUT_RINGSZ 64 + +#define VTINPUT_MAX_PKT_LEN 10 + +/* + * Queue definitions. + */ +#define VTINPUT_EVENTQ 0 +#define VTINPUT_STATUSQ 1 + +#define VTINPUT_MAXQ 2 + +static int pci_vtinput_debug; +#define DPRINTF(params) \ + if (pci_vtinput_debug) \ + PRINTLN params +#define WPRINTF(params) PRINTLN params + +enum vtinput_config_select { + VTINPUT_CFG_UNSET = 0x00, + VTINPUT_CFG_ID_NAME = 0x01, + VTINPUT_CFG_ID_SERIAL = 0x02, + VTINPUT_CFG_ID_DEVIDS = 0x03, + VTINPUT_CFG_PROP_BITS = 0x10, + VTINPUT_CFG_EV_BITS = 0x11, + VTINPUT_CFG_ABS_INFO = 0x12 +}; + +struct vtinput_absinfo { + uint32_t min; + uint32_t max; + uint32_t fuzz; + uint32_t flat; + uint32_t res; +} __packed; + +struct vtinput_devids { + uint16_t bustype; + uint16_t vendor; + uint16_t product; + uint16_t version; +} __packed; + +struct vtinput_config { + uint8_t select; + uint8_t subsel; + uint8_t size; + uint8_t reserved[5]; + union { + char string[128]; + uint8_t bitmap[128]; + struct vtinput_absinfo abs; + struct vtinput_devids ids; + } u; +} __packed; + +struct vtinput_event { + uint16_t type; + uint16_t code; + uint32_t value; +} __packed; + +struct vtinput_event_elem { + struct vtinput_event event; + struct iovec iov; + uint16_t idx; +}; + +struct vtinput_eventqueue { + struct vtinput_event_elem *events; + uint32_t size; + uint32_t idx; +}; + +/* + * Per-device softc + */ +struct pci_vtinput_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTINPUT_MAXQ]; + pthread_mutex_t vsc_mtx; + const char *vsc_evdev; + int vsc_fd; + struct vtinput_config vsc_config; + int vsc_config_valid; + struct mevent *vsc_evp; + struct vtinput_eventqueue vsc_eventqueue; +}; + +static void pci_vtinput_reset(void *); +static int pci_vtinput_cfgread(void *, int, int, uint32_t *); +static int pci_vtinput_cfgwrite(void *, int, int, uint32_t); + +static struct virtio_consts vtinput_vi_consts = { + "vtinput", /* our name */ + VTINPUT_MAXQ, /* we support 1 virtqueue */ + sizeof(struct vtinput_config), /* config reg size */ + pci_vtinput_reset, /* reset */ + NULL, /* device-wide qnotify -- not used */ + pci_vtinput_cfgread, /* read virtio config */ + pci_vtinput_cfgwrite, /* write virtio config */ + NULL, /* apply negotiated features */ + 0, /* our capabilities */ +}; + +static void +pci_vtinput_reset(void *vsc) +{ + struct pci_vtinput_softc *sc = vsc; + + DPRINTF(("%s: device reset requested", __func__)); + vi_reset_dev(&sc->vsc_vs); +} + +static void +pci_vtinput_notify_eventq(void *vsc, struct vqueue_info *vq) +{ + DPRINTF(("%s", __func__)); +} + +static void +pci_vtinput_notify_statusq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtinput_softc *sc = vsc; + + while (vq_has_descs(vq)) { + /* get descriptor chain */ + struct iovec iov; + struct vi_req req; + const int n = vq_getchain(vq, &iov, 1, &req); + if (n <= 0) { + WPRINTF(("%s: invalid descriptor: %d", __func__, n)); + return; + } + + /* get event */ + struct vtinput_event event; + memcpy(&event, iov.iov_base, sizeof(event)); + + /* + * on multi touch devices: + * - host send EV_MSC to guest + * - guest sends EV_MSC back to host + * - host writes EV_MSC to evdev + * - evdev saves EV_MSC in it's event buffer + * - host receives an extra EV_MSC by reading the evdev event + * buffer + * - frames become larger and larger + * avoid endless loops by ignoring EV_MSC + */ + if (event.type == EV_MSC) { + vq_relchain(vq, req.idx, sizeof(event)); + continue; + } + + /* send event to evdev */ + struct input_event host_event; + host_event.type = event.type; + host_event.code = event.code; + host_event.value = event.value; + if (gettimeofday(&host_event.time, NULL) != 0) { + WPRINTF(("%s: failed gettimeofday", __func__)); + } + if (write(sc->vsc_fd, &host_event, sizeof(host_event)) == -1) { + WPRINTF(("%s: failed to write host_event", __func__)); + } + + vq_relchain(vq, req.idx, sizeof(event)); + } + vq_endchains(vq, 1); +} + +static int +pci_vtinput_get_bitmap(struct pci_vtinput_softc *sc, int cmd, int count) +{ + if (count <= 0 || !sc) { + return (-1); + } + + /* query bitmap */ + memset(sc->vsc_config.u.bitmap, 0, sizeof(sc->vsc_config.u.bitmap)); + if (ioctl(sc->vsc_fd, cmd, sc->vsc_config.u.bitmap) < 0) { + return (-1); + } + + /* get number of set bytes in bitmap */ + for (int i = count - 1; i >= 0; i--) { + if (sc->vsc_config.u.bitmap[i]) { + return i + 1; + } + } + + return (-1); +} + +static int +pci_vtinput_read_config_id_name(struct pci_vtinput_softc *sc) +{ + char name[128]; + if (ioctl(sc->vsc_fd, EVIOCGNAME(sizeof(name) - 1), name) < 0) { + return (1); + } + + memcpy(sc->vsc_config.u.string, name, sizeof(name)); + sc->vsc_config.size = strnlen(name, sizeof(name)); + + return (0); +} + +static int +pci_vtinput_read_config_id_serial(struct pci_vtinput_softc *sc) +{ + /* serial isn't supported */ + sc->vsc_config.size = 0; + + return (0); +} + +static int +pci_vtinput_read_config_id_devids(struct pci_vtinput_softc *sc) +{ + struct input_id devids; + if (ioctl(sc->vsc_fd, EVIOCGID, &devids)) { + return (1); + } + + sc->vsc_config.u.ids.bustype = devids.bustype; + sc->vsc_config.u.ids.vendor = devids.vendor; + sc->vsc_config.u.ids.product = devids.product; + sc->vsc_config.u.ids.version = devids.version; + sc->vsc_config.size = sizeof(struct vtinput_devids); + + return (0); +} + +static int +pci_vtinput_read_config_prop_bits(struct pci_vtinput_softc *sc) +{ + /* + * Evdev bitmap countains 1 bit per count. Additionally evdev bitmaps + * are arrays of longs instead of chars. Calculate how many longs are + * required for evdev bitmap. Multiply that with sizeof(long) to get the + * number of elements. + */ + const int count = howmany(INPUT_PROP_CNT, sizeof(long) * 8) * + sizeof(long); + const unsigned int cmd = EVIOCGPROP(count); + const int size = pci_vtinput_get_bitmap(sc, cmd, count); + if (size <= 0) { + return (1); + } + + sc->vsc_config.size = size; + + return (0); +} + +static int +pci_vtinput_read_config_ev_bits(struct pci_vtinput_softc *sc, uint8_t type) +{ + int count; + + switch (type) { + case EV_KEY: + count = KEY_CNT; + break; + case EV_REL: + count = REL_CNT; + break; + case EV_ABS: + count = ABS_CNT; + break; + case EV_MSC: + count = MSC_CNT; + break; + case EV_SW: + count = SW_CNT; + break; + case EV_LED: + count = LED_CNT; + break; + default: + return (1); + } + + /* + * Evdev bitmap countains 1 bit per count. Additionally evdev bitmaps + * are arrays of longs instead of chars. Calculate how many longs are + * required for evdev bitmap. Multiply that with sizeof(long) to get the + * number of elements. + */ + count = howmany(count, sizeof(long) * 8) * sizeof(long); + const unsigned int cmd = EVIOCGBIT(sc->vsc_config.subsel, count); + const int size = pci_vtinput_get_bitmap(sc, cmd, count); + if (size <= 0) { + return (1); + } + + sc->vsc_config.size = size; + + return (0); +} + +static int +pci_vtinput_read_config_abs_info(struct pci_vtinput_softc *sc) +{ + /* check if evdev has EV_ABS */ + if (!pci_vtinput_read_config_ev_bits(sc, EV_ABS)) { + return (1); + } + + /* get abs information */ + struct input_absinfo abs; + if (ioctl(sc->vsc_fd, EVIOCGABS(sc->vsc_config.subsel), &abs) < 0) { + return (1); + } + + /* save abs information */ + sc->vsc_config.u.abs.min = abs.minimum; + sc->vsc_config.u.abs.max = abs.maximum; + sc->vsc_config.u.abs.fuzz = abs.fuzz; + sc->vsc_config.u.abs.flat = abs.flat; + sc->vsc_config.u.abs.res = abs.resolution; + sc->vsc_config.size = sizeof(struct vtinput_absinfo); + + return (0); +} + +static int +pci_vtinput_read_config(struct pci_vtinput_softc *sc) +{ + switch (sc->vsc_config.select) { + case VTINPUT_CFG_UNSET: + return (0); + case VTINPUT_CFG_ID_NAME: + return pci_vtinput_read_config_id_name(sc); + case VTINPUT_CFG_ID_SERIAL: + return pci_vtinput_read_config_id_serial(sc); + case VTINPUT_CFG_ID_DEVIDS: + return pci_vtinput_read_config_id_devids(sc); + case VTINPUT_CFG_PROP_BITS: + return pci_vtinput_read_config_prop_bits(sc); + case VTINPUT_CFG_EV_BITS: + return pci_vtinput_read_config_ev_bits( + sc, sc->vsc_config.subsel); + case VTINPUT_CFG_ABS_INFO: + return pci_vtinput_read_config_abs_info(sc); + default: + return (1); + } +} + +static int +pci_vtinput_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtinput_softc *sc = vsc; + + /* check for valid offset and size */ + if (offset + size > sizeof(struct vtinput_config)) { + WPRINTF(("%s: read to invalid offset/size %d/%d", __func__, + offset, size)); + memset(retval, 0, size); + return (0); + } + + /* read new config values, if select and subsel changed. */ + if (!sc->vsc_config_valid) { + if (pci_vtinput_read_config(sc) != 0) { + DPRINTF(("%s: could not read config %d/%d", __func__, + sc->vsc_config.select, sc->vsc_config.subsel)); + memset(retval, 0, size); + return (0); + } + sc->vsc_config_valid = 1; + } + + uint8_t *ptr = (uint8_t *)&sc->vsc_config; + memcpy(retval, ptr + offset, size); + + return (0); +} + +static int +pci_vtinput_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + struct pci_vtinput_softc *sc = vsc; + + /* guest can only write to select and subsel fields */ + if (offset + size > 2) { + WPRINTF(("%s: write to readonly reg %d", __func__, offset)); + return (1); + } + + /* copy value into config */ + uint8_t *ptr = (uint8_t *)&sc->vsc_config; + memcpy(ptr + offset, &value, size); + + /* select/subsel changed, query new config on next cfgread */ + sc->vsc_config_valid = 0; + + return (0); +} + +static int +vtinput_eventqueue_add_event( + struct vtinput_eventqueue *queue, struct input_event *e) +{ + /* check if queue is full */ + if (queue->idx >= queue->size) { + /* alloc new elements for queue */ + const uint32_t newSize = queue->idx; + const void *newPtr = realloc(queue->events, + queue->size * sizeof(struct vtinput_event_elem)); + if (newPtr == NULL) { + WPRINTF(("%s: realloc memory for eventqueue failed!", + __func__)); + return (1); + } + /* save new size and eventqueue */ + queue->events = (struct vtinput_event_elem *)newPtr; + queue->size = newSize; + } + + /* save event */ + struct vtinput_event *event = &queue->events[queue->idx].event; + event->type = e->type; + event->code = e->code; + event->value = e->value; + queue->idx++; + + return (0); +} + +static void +vtinput_eventqueue_clear(struct vtinput_eventqueue *queue) +{ + /* just reset index to clear queue */ + queue->idx = 0; +} + +static void +vtinput_eventqueue_send_events( + struct vtinput_eventqueue *queue, struct vqueue_info *vq) +{ + /* + * First iteration through eventqueue: + * Get descriptor chains. + */ + for (uint32_t i = 0; i < queue->idx; ++i) { + /* get descriptor */ + if (!vq_has_descs(vq)) { + /* + * We don't have enough descriptors for all events. + * Return chains back to guest. + */ + vq_retchains(vq, i); + WPRINTF(( + "%s: not enough available descriptors, dropping %d events", + __func__, queue->idx)); + goto done; + } + + /* get descriptor chain */ + struct iovec iov; + struct vi_req req; + const int n = vq_getchain(vq, &iov, 1, &req); + if (n <= 0) { + WPRINTF(("%s: invalid descriptor: %d", __func__, n)); + return; + } + if (n != 1) { + WPRINTF( + ("%s: invalid number of descriptors in chain: %d", + __func__, n)); + /* release invalid chain */ + vq_relchain(vq, req.idx, 0); + return; + } + if (iov.iov_len < sizeof(struct vtinput_event)) { + WPRINTF(("%s: invalid descriptor length: %lu", __func__, + iov.iov_len)); + /* release invalid chain */ + vq_relchain(vq, req.idx, 0); + return; + } + + /* save descriptor */ + queue->events[i].iov = iov; + queue->events[i].idx = req.idx; + } + + /* + * Second iteration through eventqueue: + * Send events to guest by releasing chains + */ + for (uint32_t i = 0; i < queue->idx; ++i) { + struct vtinput_event_elem event = queue->events[i]; + memcpy(event.iov.iov_base, &event.event, + sizeof(struct vtinput_event)); + vq_relchain(vq, event.idx, sizeof(struct vtinput_event)); + } +done: + /* clear queue and send interrupt to guest */ + vtinput_eventqueue_clear(queue); + vq_endchains(vq, 1); + + return; +} + +static int +vtinput_read_event_from_host(int fd, struct input_event *event) +{ + const int len = read(fd, event, sizeof(struct input_event)); + if (len != sizeof(struct input_event)) { + if (len == -1 && errno != EAGAIN) { + WPRINTF(("%s: event read failed! len = %d, errno = %d", + __func__, len, errno)); + } + + /* host doesn't have more events for us */ + return (1); + } + + return (0); +} + +static void +vtinput_read_event(int fd __attribute((unused)), + enum ev_type t __attribute__((unused)), void *arg __attribute__((unused))) +{ + struct pci_vtinput_softc *sc = arg; + + /* skip if driver isn't ready */ + if (!(sc->vsc_vs.vs_status & VIRTIO_CONFIG_STATUS_DRIVER_OK)) + return; + + /* read all events from host */ + struct input_event event; + while (vtinput_read_event_from_host(sc->vsc_fd, &event) == 0) { + /* add events to our queue */ + vtinput_eventqueue_add_event(&sc->vsc_eventqueue, &event); + + /* only send events to guest on EV_SYN or SYN_REPORT */ + if (event.type != EV_SYN || event.type != SYN_REPORT) { + continue; + } + + /* send host events to guest */ + vtinput_eventqueue_send_events( + &sc->vsc_eventqueue, &sc->vsc_queues[VTINPUT_EVENTQ]); + } +} + +static int +pci_vtinput_legacy_config(nvlist_t *nvl, const char *opts) +{ + if (opts == NULL) + return (-1); + + /* + * parse opts: + * virtio-input,/dev/input/eventX + */ + char *cp = strchr(opts, ','); + if (cp == NULL) { + set_config_value_node(nvl, "path", opts); + return (0); + } + char *path = strndup(opts, cp - opts); + set_config_value_node(nvl, "path", path); + free(path); + + return (pci_parse_legacy_config(nvl, cp + 1)); +} + +static int +pci_vtinput_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) +{ + struct pci_vtinput_softc *sc; + + /* + * Keep it here. + * Else it's possible to access it uninitialized by jumping to failed. + */ + pthread_mutexattr_t mtx_attr = NULL; + + sc = calloc(1, sizeof(struct pci_vtinput_softc)); + + sc->vsc_evdev = get_config_value_node(nvl, "path"); + if (sc->vsc_evdev == NULL) { + WPRINTF(("%s: missing required path config value", __func__)); + goto failed; + } + + /* + * open evdev by using non blocking I/O: + * read from /dev/input/eventX would block our thread otherwise + */ + sc->vsc_fd = open(sc->vsc_evdev, O_RDWR | O_NONBLOCK); + if (sc->vsc_fd < 0) { + WPRINTF(("%s: failed to open %s", __func__, sc->vsc_evdev)); + goto failed; + } + + /* check if evdev is really a evdev */ + int evversion; + int error = ioctl(sc->vsc_fd, EVIOCGVERSION, &evversion); + if (error < 0) { + WPRINTF(("%s: %s is no evdev", __func__, sc->vsc_evdev)); + goto failed; + } + + /* gain exclusive access to evdev */ + error = ioctl(sc->vsc_fd, EVIOCGRAB, 1); + if (error < 0) { + WPRINTF(("%s: failed to grab %s", __func__, sc->vsc_evdev)); + goto failed; + } + + if (pthread_mutexattr_init(&mtx_attr)) { + WPRINTF(("%s: init mutexattr failed", __func__)); + goto failed; + } + if (pthread_mutexattr_settype(&mtx_attr, PTHREAD_MUTEX_RECURSIVE)) { + WPRINTF(("%s: settype mutexattr failed", __func__)); + goto failed; + } + if (pthread_mutex_init(&sc->vsc_mtx, &mtx_attr)) { + WPRINTF(("%s: init mutex failed", __func__)); + goto failed; + } + + /* init softc */ + sc->vsc_eventqueue.idx = 0; + sc->vsc_eventqueue.size = VTINPUT_MAX_PKT_LEN; + sc->vsc_eventqueue.events = calloc( + sc->vsc_eventqueue.size, sizeof(struct vtinput_event_elem)); + sc->vsc_config_valid = 0; + if (sc->vsc_eventqueue.events == NULL) { + WPRINTF(("%s: failed to alloc eventqueue", __func__)); + goto failed; + } + + /* register event handler */ + sc->vsc_evp = mevent_add(sc->vsc_fd, EVF_READ, vtinput_read_event, sc); + if (sc->vsc_evp == NULL) { + WPRINTF(("%s: could not register mevent", __func__)); + goto failed; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); + if (caph_rights_limit(sc->vsc_fd, &rights) == -1) { + errx(EX_OSERR, "Unable to apply rights for sandbox"); + } +#endif + + /* link virtio to softc */ + vi_softc_linkup( + &sc->vsc_vs, &vtinput_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + /* init virtio queues */ + sc->vsc_queues[VTINPUT_EVENTQ].vq_qsize = VTINPUT_RINGSZ; + sc->vsc_queues[VTINPUT_EVENTQ].vq_notify = pci_vtinput_notify_eventq; + sc->vsc_queues[VTINPUT_STATUSQ].vq_qsize = VTINPUT_RINGSZ; + sc->vsc_queues[VTINPUT_STATUSQ].vq_notify = pci_vtinput_notify_statusq; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_INPUT); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_INPUTDEV); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_INPUTDEV_OTHER); + pci_set_cfgdata8(pi, PCIR_REVID, VIRTIO_REV_INPUT); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_SUBDEV_INPUT); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_SUBVEN_INPUT); + + /* add MSI-X table BAR */ + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) + goto failed; + /* add virtio register */ + vi_set_io_bar(&sc->vsc_vs, 0); + + return (0); + +failed: + if (sc == NULL) { + return (-1); + } + + if (sc->vsc_evp) + mevent_delete(sc->vsc_evp); + if (sc->vsc_eventqueue.events) + free(sc->vsc_eventqueue.events); + if (sc->vsc_mtx) + pthread_mutex_destroy(&sc->vsc_mtx); + if (mtx_attr) + pthread_mutexattr_destroy(&mtx_attr); + if (sc->vsc_fd) + close(sc->vsc_fd); + + free(sc); + + return (-1); +} + +struct pci_devemu pci_de_vinput = { + .pe_emu = "virtio-input", + .pe_init = pci_vtinput_init, + .pe_legacy_config = pci_vtinput_legacy_config, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read, +}; +PCI_EMUL_SET(pci_de_vinput); diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c index 7f0485cbb826..a08964e28563 100644 --- a/usr.sbin/bhyve/virtio.c +++ b/usr.sbin/bhyve/virtio.c @@ -1,955 +1,961 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" /* * Functions for dealing with generalized "virtual devices" as * defined by */ /* * In case we decide to relax the "virtio softc comes at the * front of virtio-based device softc" constraint, let's use * this to convert. */ #define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and * the PCI emulation. */ void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues) { int i; /* vs and dev_softc addresses must match */ assert((void *)vs == dev_softc); vs->vs_vc = vc; vs->vs_pi = pi; pi->pi_arg = vs; vs->vs_queues = queues; for (i = 0; i < vc->vc_nvq; i++) { queues[i].vq_vs = vs; queues[i].vq_num = i; } } /* * Reset device (device-wide). This erases all queues, i.e., * all the queues become invalid (though we don't wipe out the * internal pointers, we just clear the VQ_ALLOC flag). * * It resets negotiated features to "none". * * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. */ void vi_reset_dev(struct virtio_softc *vs) { struct vqueue_info *vq; int i, nvq; if (vs->vs_mtx) assert(pthread_mutex_isowned_np(vs->vs_mtx)); nvq = vs->vs_vc->vc_nvq; for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } vs->vs_negotiated_caps = 0; vs->vs_curq = 0; /* vs->vs_status = 0; -- redundant */ if (vs->vs_isr) pci_lintr_deassert(vs->vs_pi); vs->vs_isr = 0; vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; } /* * Set I/O BAR (usually 0) to map PCI config registers. */ void vi_set_io_bar(struct virtio_softc *vs, int barnum) { size_t size; /* * ??? should we use VIRTIO_PCI_CONFIG_OFF(0) if MSI-X is disabled? * Existing code did not... */ size = VIRTIO_PCI_CONFIG_OFF(1) + vs->vs_vc->vc_cfgsize; pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); } /* * Initialize MSI-X vector capabilities if we're to use MSI-X, * or MSI capabilities if not. * * We assume we want one MSI-X vector per queue, here, plus one * for the config vec. */ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) { int nvec; if (use_msix) { vs->vs_flags |= VIRTIO_USE_MSIX; VS_LOCK(vs); vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ VS_UNLOCK(vs); nvec = vs->vs_vc->vc_nvq + 1; if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); /* Legacy interrupts are mandatory for virtio devices */ pci_lintr_request(vs->vs_pi); return (0); } /* * Initialize the currently-selected virtio queue (vs->vs_curq). * The guest just gave us a page frame number, from which we can * calculate the addresses of the queue. */ void vi_vq_init(struct virtio_softc *vs, uint32_t pfn) { struct vqueue_info *vq; uint64_t phys; size_t size; char *base; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_pfn = pfn; phys = (uint64_t)pfn << VRING_PFN; size = vring_size_aligned(vq->vq_qsize); base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); /* First page(s) are descriptors... */ vq->vq_desc = (struct vring_desc *)base; base += vq->vq_qsize * sizeof(struct vring_desc); /* ... immediately followed by "avail" ring (entirely uint16_t's) */ vq->vq_avail = (struct vring_avail *)base; base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); /* Then it's rounded up to the next page... */ base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); /* ... and the last page(s) are the used ring. */ vq->vq_used = (struct vring_used *)base; /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; } /* * Helper inline for vq_getchain(): record the i'th "real" * descriptor. */ static inline void _vq_record(int i, volatile struct vring_desc *vd, struct vmctx *ctx, struct iovec *iov, int n_iov, struct vi_req *reqp) { if (i >= n_iov) return; iov[i].iov_base = paddr_guest2host(ctx, vd->addr, vd->len); iov[i].iov_len = vd->len; if ((vd->flags & VRING_DESC_F_WRITE) == 0) reqp->readable++; else reqp->writable++; } #define VQ_MAX_DESCRIPTORS 512 /* see below */ /* * Examine the chain of descriptors starting at the "next one" to * make sure that they describe a sensible request. If so, return * the number of "real" descriptors that would be needed/used in * acting on this request. This may be smaller than the number of * available descriptors, e.g., if there are two available but * they are two separate requests, this just returns 1. Or, it * may be larger: if there are indirect descriptors involved, * there may only be one descriptor available but it may be an * indirect pointing to eight more. We return 8 in this case, * i.e., we do not count the indirect descriptors, only the "real" * ones. * * Basically, this vets the "flags" and "next" field of each * descriptor and tells you how many are involved. Since some may * be indirect, this also needs the vmctx (in the pci_devinst * at vs->vs_pi) so that it can find indirect descriptors. * * As we process each descriptor, we copy and adjust it (guest to * host address wise, also using the vmtctx) into the given iov[] * array (of the given size). If the array overflows, we stop * placing values into the array but keep processing descriptors, * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. * So you, the caller, must not assume that iov[] is as big as the * return value (you can process the same thing twice to allocate * a larger iov array if needed, or supply a zero length to find * out how much space is needed). * * If some descriptor(s) are invalid, this prints a diagnostic message * and returns -1. If no descriptors are ready now it simply returns 0. * * You are assumed to have done a vq_ring_ready() if needed (note * that vq_has_descs() does one). */ int vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov, struct vi_req *reqp) { int i; u_int ndesc, n_indir; u_int idx, next; struct vi_req req; volatile struct vring_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; const char *name; vs = vq->vq_vs; name = vs->vs_vc->vc_name; memset(&req, 0, sizeof(req)); /* * Note: it's the responsibility of the guest not to * update vq->vq_avail->idx until all of the descriptors * the guest has written are valid (including all their * "next" fields and "flags"). * * Compute (vq_avail->idx - last_avail) in integers mod 2**16. This is * the number of descriptors the device has made available * since the last time we updated vq->vq_last_avail. * * We just need to do the subtraction as an unsigned int, * then trim off excess bits. */ idx = vq->vq_last_avail; ndesc = (uint16_t)((u_int)vq->vq_avail->idx - idx); if (ndesc == 0) return (0); if (ndesc > vq->vq_qsize) { /* XXX need better way to diagnose issues */ EPRINTLN( "%s: ndesc (%u) out of range, driver confused?", name, (u_int)ndesc); return (-1); } /* * Now count/parse "involved" descriptors starting from * the head of the chain. * * To prevent loops, we could be more complicated and * check whether we're re-visiting a previously visited * index, but we just abort if the count gets excessive. */ ctx = vs->vs_pi->pi_vmctx; req.idx = next = vq->vq_avail->ring[idx & (vq->vq_qsize - 1)]; vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->next) { if (next >= vq->vq_qsize) { EPRINTLN( "%s: descriptor index %u out of range, " "driver confused?", name, next); return (-1); } vdir = &vq->vq_desc[next]; if ((vdir->flags & VRING_DESC_F_INDIRECT) == 0) { _vq_record(i, vdir, ctx, iov, niov, &req); i++; } else if ((vs->vs_vc->vc_hv_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { EPRINTLN( "%s: descriptor has forbidden INDIRECT flag, " "driver confused?", name); return (-1); } else { n_indir = vdir->len / 16; if ((vdir->len & 0xf) || n_indir == 0) { EPRINTLN( "%s: invalid indir len 0x%x, " "driver confused?", name, (u_int)vdir->len); return (-1); } vindir = paddr_guest2host(ctx, vdir->addr, vdir->len); /* * Indirects start at the 0th, then follow * their own embedded "next"s until those run * out. Each one's indirect flag must be off * (we don't really have to check, could just * ignore errors...). */ next = 0; for (;;) { vp = &vindir[next]; if (vp->flags & VRING_DESC_F_INDIRECT) { EPRINTLN( "%s: indirect desc has INDIR flag," " driver confused?", name); return (-1); } _vq_record(i, vp, ctx, iov, niov, &req); if (++i > VQ_MAX_DESCRIPTORS) goto loopy; if ((vp->flags & VRING_DESC_F_NEXT) == 0) break; next = vp->next; if (next >= n_indir) { EPRINTLN( "%s: invalid next %u > %u, " "driver confused?", name, (u_int)next, n_indir); return (-1); } } } if ((vdir->flags & VRING_DESC_F_NEXT) == 0) goto done; } loopy: EPRINTLN( "%s: descriptor loop? count > %d - driver confused?", name, i); return (-1); done: *reqp = req; return (i); } /* * Return the first n_chain request chains back to the available queue. * * (These chains are the ones you handled when you called vq_getchain() * and used its positive return value.) */ void vq_retchains(struct vqueue_info *vq, uint16_t n_chains) { vq->vq_last_avail -= n_chains; } void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { volatile struct vring_used *vuh; volatile struct vring_used_elem *vue; uint16_t mask; /* * Notes: * - mask is N-1 where N is a power of 2 so computes x % N * - vuh points to the "used" data shared with guest * - vue points to the "used" ring entry we want to update */ mask = vq->vq_qsize - 1; vuh = vq->vq_used; vue = &vuh->ring[vq->vq_next_used++ & mask]; vue->id = idx; vue->len = iolen; } void vq_relchain_publish(struct vqueue_info *vq) { /* * Ensure the used descriptor is visible before updating the index. * This is necessary on ISAs with memory ordering less strict than x86 * (and even on x86 to act as a compiler barrier). */ atomic_thread_fence_rel(); vq->vq_used->idx = vq->vq_next_used; } /* * Return specified request chain to the guest, setting its I/O length * to the provided value. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { vq_relchain_prepare(vq, idx, iolen); vq_relchain_publish(vq); } /* * Driver has finished processing "available" chains and calling * vq_relchain on each one. If driver used all the available * chains, used_all should be set. * * If the "used" index moved we may need to inform the guest, i.e., * deliver an interrupt. Even if the used index did NOT move we * may need to deliver an interrupt, if the avail ring is empty and * we are supposed to interrupt on empty. * * Note that used_all_avail is provided by the caller because it's * a snapshot of the ring state when he decided to finish interrupt * processing -- it's possible that descriptors became available after * that point. (It's also typically a constant 1/True as well.) */ void vq_endchains(struct vqueue_info *vq, int used_all_avail) { struct virtio_softc *vs; uint16_t event_idx, new_idx, old_idx; int intr; /* * Interrupt generation: if we're using EVENT_IDX, * interrupt if we've crossed the event threshold. * Otherwise interrupt is generated if we added "used" entries, * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. * * In any case, though, if NOTIFY_ON_EMPTY is set and the * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; old_idx = vq->vq_save_used; vq->vq_save_used = new_idx = vq->vq_used->idx; /* * Use full memory barrier between "idx" store from preceding * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or * "flags" field below. */ atomic_thread_fence_seq_cst(); if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { event_idx = VQ_USED_EVENT_IDX(vq); /* * This calculation is per docs and the kernel * (see src/sys/dev/virtio/virtio_ring.h). */ intr = (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx); } else { intr = new_idx != old_idx && !(vq->vq_avail->flags & VRING_AVAIL_F_NO_INTERRUPT); } if (intr) vq_interrupt(vs, vq); } /* Note: these are in sorted order to make for a fast search */ static struct config_reg { uint16_t cr_offset; /* register offset */ uint8_t cr_size; /* size (bytes) */ uint8_t cr_ro; /* true => reg is read only */ const char *cr_name; /* name of reg */ } config_regs[] = { { VIRTIO_PCI_HOST_FEATURES, 4, 1, "HOST_FEATURES" }, { VIRTIO_PCI_GUEST_FEATURES, 4, 0, "GUEST_FEATURES" }, { VIRTIO_PCI_QUEUE_PFN, 4, 0, "QUEUE_PFN" }, { VIRTIO_PCI_QUEUE_NUM, 2, 1, "QUEUE_NUM" }, { VIRTIO_PCI_QUEUE_SEL, 2, 0, "QUEUE_SEL" }, { VIRTIO_PCI_QUEUE_NOTIFY, 2, 0, "QUEUE_NOTIFY" }, { VIRTIO_PCI_STATUS, 1, 0, "STATUS" }, { VIRTIO_PCI_ISR, 1, 0, "ISR" }, { VIRTIO_MSI_CONFIG_VECTOR, 2, 0, "CONFIG_VECTOR" }, { VIRTIO_MSI_QUEUE_VECTOR, 2, 0, "QUEUE_VECTOR" }, }; static inline struct config_reg * vi_find_cr(int offset) { u_int hi, lo, mid; struct config_reg *cr; lo = 0; hi = sizeof(config_regs) / sizeof(*config_regs) - 1; while (hi >= lo) { mid = (hi + lo) >> 1; cr = &config_regs[mid]; if (cr->cr_offset == offset) return (cr); if (cr->cr_offset < offset) lo = mid + 1; else hi = mid - 1; } return (NULL); } /* * Handle pci config space reads. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct virtio_softc *vs = pi->pi_arg; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; uint32_t value; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { return (pci_emul_msix_tread(pi, offset, size)); } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; if (size != 1 && size != 2 && size != 4) goto bad; virtio_config_size = VIRTIO_PCI_CONFIG_OFF(pci_msix_enabled(pi)); if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. * If that fails, fall into general code. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; - error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); + if (vc->vc_cfgread != NULL) + error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); + else + error = 0; if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size) { if (cr != NULL) { /* offset must be OK, so size must be bad */ EPRINTLN( "%s: read from %s: bad size %d", name, cr->cr_name, size); } else { EPRINTLN( "%s: read from bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VIRTIO_PCI_HOST_FEATURES: value = vc->vc_hv_caps; break; case VIRTIO_PCI_GUEST_FEATURES: value = vs->vs_negotiated_caps; break; case VIRTIO_PCI_QUEUE_PFN: if (vs->vs_curq < vc->vc_nvq) value = vs->vs_queues[vs->vs_curq].vq_pfn; break; case VIRTIO_PCI_QUEUE_NUM: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_qsize : 0; break; case VIRTIO_PCI_QUEUE_SEL: value = vs->vs_curq; break; case VIRTIO_PCI_QUEUE_NOTIFY: value = 0; /* XXX */ break; case VIRTIO_PCI_STATUS: value = vs->vs_status; break; case VIRTIO_PCI_ISR: value = vs->vs_isr; vs->vs_isr = 0; /* a read clears this flag */ if (value) pci_lintr_deassert(pi); break; case VIRTIO_MSI_CONFIG_VECTOR: value = vs->vs_msix_cfg_idx; break; case VIRTIO_MSI_QUEUE_VECTOR: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_msix_idx : VIRTIO_MSI_NO_VECTOR; break; } done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); return (value); } /* * Handle pci config space writes. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct virtio_softc *vs = pi->pi_arg; struct vqueue_info *vq; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { pci_emul_msix_twrite(pi, offset, size, value); return; } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; if (size != 1 && size != 2 && size != 4) goto bad; virtio_config_size = VIRTIO_PCI_CONFIG_OFF(pci_msix_enabled(pi)); if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; - error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); + if (vc->vc_cfgwrite != NULL) + error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); + else + error = 0; if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size || cr->cr_ro) { if (cr != NULL) { /* offset must be OK, wrong size and/or reg is R/O */ if (cr->cr_size != size) EPRINTLN( "%s: write to %s: bad size %d", name, cr->cr_name, size); if (cr->cr_ro) EPRINTLN( "%s: write to read-only reg %s", name, cr->cr_name); } else { EPRINTLN( "%s: write to bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VIRTIO_PCI_GUEST_FEATURES: vs->vs_negotiated_caps = value & vc->vc_hv_caps; if (vc->vc_apply_features) (*vc->vc_apply_features)(DEV_SOFTC(vs), vs->vs_negotiated_caps); break; case VIRTIO_PCI_QUEUE_PFN: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vi_vq_init(vs, value); break; case VIRTIO_PCI_QUEUE_SEL: /* * Note that the guest is allowed to select an * invalid queue; we just need to return a QNUM * of 0 while the bad queue is selected. */ vs->vs_curq = value; break; case VIRTIO_PCI_QUEUE_NOTIFY: if (value >= vc->vc_nvq) { EPRINTLN("%s: queue %d notify out of range", name, (int)value); goto done; } vq = &vs->vs_queues[value]; if (vq->vq_notify) (*vq->vq_notify)(DEV_SOFTC(vs), vq); else if (vc->vc_qnotify) (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); else EPRINTLN( "%s: qnotify queue %d: missing vq/vc notify", name, (int)value); break; case VIRTIO_PCI_STATUS: vs->vs_status = value; if (value == 0) (*vc->vc_reset)(DEV_SOFTC(vs)); break; case VIRTIO_MSI_CONFIG_VECTOR: vs->vs_msix_cfg_idx = value; break; case VIRTIO_MSI_QUEUE_VECTOR: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_msix_idx = value; break; } goto done; bad_qindex: EPRINTLN( "%s: write config reg %s: curq %d >= max %d", name, cr->cr_name, vs->vs_curq, vc->vc_nvq); done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); } #ifdef BHYVE_SNAPSHOT int vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi) { struct virtio_softc *vs; struct virtio_consts *vc; vs = pi->pi_arg; vc = vs->vs_vc; vc = vs->vs_vc; assert(vc->vc_pause != NULL); (*vc->vc_pause)(DEV_SOFTC(vs)); return (0); } int vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi) { struct virtio_softc *vs; struct virtio_consts *vc; vs = pi->pi_arg; vc = vs->vs_vc; vc = vs->vs_vc; assert(vc->vc_resume != NULL); (*vc->vc_resume)(DEV_SOFTC(vs)); return (0); } static int vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, ret, done); done: return (ret); } static int vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, ret, done); done: return (ret); } static int vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta) { int i; int ret; struct virtio_consts *vc; struct vqueue_info *vq; uint64_t addr_size; vc = vs->vs_vc; /* Save virtio queue info */ for (i = 0; i < vc->vc_nvq; i++) { vq = &vs->vs_queues[i]; SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_next_used, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done); addr_size = vq->vq_qsize * sizeof(struct vring_desc); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_desc, addr_size, false, meta, ret, done); addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_avail, addr_size, false, meta, ret, done); addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_used, addr_size, false, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc, vring_size_aligned(vq->vq_qsize), meta, ret, done); } done: return (ret); } int vi_pci_snapshot(struct vm_snapshot_meta *meta) { int ret; struct pci_devinst *pi; struct virtio_softc *vs; struct virtio_consts *vc; pi = meta->dev_data; vs = pi->pi_arg; vc = vs->vs_vc; /* Save virtio softc */ ret = vi_pci_snapshot_softc(vs, meta); if (ret != 0) goto done; /* Save virtio consts */ ret = vi_pci_snapshot_consts(vc, meta); if (ret != 0) goto done; /* Save virtio queue info */ ret = vi_pci_snapshot_queues(vs, meta); if (ret != 0) goto done; /* Save device softc, if needed */ if (vc->vc_snapshot != NULL) { ret = (*vc->vc_snapshot)(DEV_SOFTC(vs), meta); if (ret != 0) goto done; } done: return (ret); } #endif diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index e03fd5f710d1..4c7b9de83125 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -1,411 +1,427 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _BHYVE_VIRTIO_H_ #define _BHYVE_VIRTIO_H_ #include #include #include #include /* * These are derived from several virtio specifications. * * Some useful links: * https://github.com/rustyrussell/virtio-spec * http://people.redhat.com/pbonzini/virtio-spec.pdf */ /* * A virtual device has zero or more "virtual queues" (virtqueue). * Each virtqueue uses at least two 4096-byte pages, laid out thus: * * +-----------------------------------------------+ * | "desc": descriptors, 16 bytes each | * | ----------------------------------------- | * | "avail": 2 uint16; uint16; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * | "used": 2 x uint16; elems; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * * The number that appears here is always a power of two and is * limited to no more than 32768 (as it must fit in a 16-bit field). * If is sufficiently large, the above will occupy more than * two pages. In any case, all pages must be physically contiguous * within the guest's physical address space. * * The 16-byte "desc" descriptors consist of a 64-bit guest * physical address , a 32-bit length , a 16-bit * , and a 16-bit field (all in guest byte order). * * There are three flags that may be set : * NEXT descriptor is chained, so use its "next" field * WRITE descriptor is for host to write into guest RAM * (else host is to read from guest RAM) * INDIRECT descriptor address field is (guest physical) * address of a linear array of descriptors * * Unless INDIRECT is set, is the number of bytes that may * be read/written from guest physical address . If * INDIRECT is set, WRITE is ignored and provides the length * of the indirect descriptors (and must be a multiple of * 16). Note that NEXT may still be set in the main descriptor * pointing to the indirect, and should be set in each indirect * descriptor that uses the next descriptor (these should generally * be numbered sequentially). However, INDIRECT must not be set * in the indirect descriptors. Upon reaching an indirect descriptor * without a NEXT bit, control returns to the direct descriptors. * * Except inside an indirect, each value must be in the * range [0 .. N) (i.e., the half-open interval). (Inside an * indirect, each must be in the range [0 .. /16).) * * The "avail" data structures reside in the same pages as the * "desc" structures since both together are used by the device to * pass information to the hypervisor's virtual driver. These * begin with a 16-bit field and 16-bit index , then * have 16-bit values, followed by one final 16-bit * field . The entries are simply indices * indices into the descriptor ring (and thus must meet the same * constraints as each value). However, is counted * up from 0 (initially) and simply wraps around after 65535; it * is taken mod to find the next available entry. * * The "used" ring occupies a separate page or pages, and contains * values written from the virtual driver back to the guest OS. * This begins with a 16-bit and 16-bit , then there * are "vring_used" elements, followed by a 16-bit . * The "vring_used" elements consist of a 32-bit and a * 32-bit (vu_tlen below). The is simply the index of * the head of a descriptor chain the guest made available * earlier, and the is the number of bytes actually written, * e.g., in the case of a network driver that provided a large * receive buffer but received only a small amount of data. * * The two event fields, and , in the * avail and used rings (respectively -- note the reversal!), are * always provided, but are used only if the virtual device * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature * negotiation. Similarly, both rings provide a flag -- * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in * their field, indicating that the guest does not need an * interrupt, or that the hypervisor driver does not need a * notify, when descriptors are added to the corresponding ring. * (These are provided only for interrupt optimization and need * not be implemented.) */ #define VRING_ALIGN 4096 /* * The address of any given virtual queue is determined by a single * Page Frame Number register. The guest writes the PFN into the * PCI config space. However, a device that has two or more * virtqueues can have a different PFN, and size, for each queue. * The number of queues is determinable via the PCI config space * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means * queue #0, 1 means queue#1, etc. Once a queue is selected, the * remaining PFN and QNUM registers refer to that queue. * * QNUM is a read-only register containing a nonzero power of two * that indicates the (hypervisor's) queue size. Or, if reading it * produces zero, the hypervisor does not have a corresponding * queue. (The number of possible queues depends on the virtual * device. The block device has just one; the network device * provides either two -- 0 = receive, 1 = transmit -- or three, * with 2 = control.) * * PFN is a read/write register giving the physical page address of * the virtqueue in guest memory (the guest must allocate enough space * based on the hypervisor's provided QNUM). * * QNOTIFY is effectively write-only: when the guest writes a queue * number to the register, the hypervisor should scan the specified * virtqueue. (Reading QNOTIFY currently always gets 0). */ /* * PFN register shift amount */ #define VRING_PFN 12 /* * PCI vendor/device IDs */ #define VIRTIO_VENDOR 0x1AF4 #define VIRTIO_DEV_NET 0x1000 #define VIRTIO_DEV_BLOCK 0x1001 #define VIRTIO_DEV_CONSOLE 0x1003 #define VIRTIO_DEV_RANDOM 0x1005 #define VIRTIO_DEV_SCSI 0x1008 #define VIRTIO_DEV_9P 0x1009 +#define VIRTIO_DEV_INPUT 0x1052 + +/* + * PCI revision IDs + */ +#define VIRTIO_REV_INPUT 1 + +/* + * PCI subvendor IDs + */ +#define VIRTIO_SUBVEN_INPUT 0x108E + +/* + * PCI subdevice IDs + */ +#define VIRTIO_SUBDEV_INPUT 0x1100 /* From section 2.3, "Virtqueue Configuration", of the virtio specification */ static inline int vring_size_aligned(u_int qsz) { return (roundup2(vring_size(qsz, VRING_ALIGN), VRING_ALIGN)); } struct vmctx; struct pci_devinst; struct vqueue_info; struct vm_snapshot_meta; /* * A virtual device, with some number (possibly 0) of virtual * queues and some size (possibly 0) of configuration-space * registers private to the device. The virtio_softc should come * at the front of each "derived class", so that a pointer to the * virtio_softc is also a pointer to the more specific, derived- * from-virtio driver's softc. * * Note: inside each hypervisor virtio driver, changes to these * data structures must be locked against other threads, if any. * Except for PCI config space register read/write, we assume each * driver does the required locking, but we need a pointer to the * lock (if there is one) for PCI config space read/write ops. * * When the guest reads or writes the device's config space, the * generic layer checks for operations on the special registers * described above. If the offset of the register(s) being read * or written is past the CFG area (CFG0 or CFG1), the request is * passed on to the virtual device, after subtracting off the * generic-layer size. (So, drivers can just use the offset as * an offset into "struct config", for instance.) * * (The virtio layer also makes sure that the read or write is to/ * from a "good" config offset, hence vc_cfgsize, and on BAR #0. * However, the driver must verify the read or write size and offset * and that no one is writing a readonly register.) * * The BROKED flag ("this thing done gone and broked") is for future * use. */ #define VIRTIO_USE_MSIX 0x01 #define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ #define VIRTIO_BROKED 0x08 /* ??? */ struct virtio_softc { struct virtio_consts *vs_vc; /* constants (see below) */ int vs_flags; /* VIRTIO_* flags from above */ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ struct pci_devinst *vs_pi; /* PCI device instance */ uint32_t vs_negotiated_caps; /* negotiated capabilities */ struct vqueue_info *vs_queues; /* one per vc_nvq */ int vs_curq; /* current queue */ uint8_t vs_status; /* value from last status write */ uint8_t vs_isr; /* ISR flags, if not MSI-X */ uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ }; #define VS_LOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_lock(vs->vs_mtx); \ } while (0) #define VS_UNLOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_unlock(vs->vs_mtx); \ } while (0) struct virtio_consts { const char *vc_name; /* name of driver (for diagnostics) */ int vc_nvq; /* number of virtual queues */ size_t vc_cfgsize; /* size of dev-specific config regs */ void (*vc_reset)(void *); /* called on virtual device reset */ void (*vc_qnotify)(void *, struct vqueue_info *); /* called on QNOTIFY if no VQ notify */ int (*vc_cfgread)(void *, int, int, uint32_t *); /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ void (*vc_apply_features)(void *, uint64_t); /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ void (*vc_pause)(void *); /* called to pause device activity */ void (*vc_resume)(void *); /* called to resume device activity */ int (*vc_snapshot)(void *, struct vm_snapshot_meta *); /* called to save / restore device state */ }; /* * Data structure allocated (statically) per virtual queue. * * Drivers may change vq_qsize after a reset. When the guest OS * requests a device reset, the hypervisor first calls * vs->vs_vc->vc_reset(); then the data structure below is * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). * * The remaining fields should only be fussed-with by the generic * code. * * Note: the addresses of vq_desc, vq_avail, and vq_used are all * computable from each other, but it's a lot simpler if we just * keep a pointer to each one. The event indices are similarly * (but more easily) computable, and this time we'll compute them: * they're just XX_ring[N]. */ #define VQ_ALLOC 0x01 /* set once we have a pfn */ #define VQ_BROKED 0x02 /* ??? */ struct vqueue_info { uint16_t vq_qsize; /* size of this queue (a power of 2) */ void (*vq_notify)(void *, struct vqueue_info *); /* called instead of vc_notify, if not NULL */ struct virtio_softc *vq_vs; /* backpointer to softc */ uint16_t vq_num; /* we're the num'th queue in the softc */ uint16_t vq_flags; /* flags (see above) */ uint16_t vq_last_avail; /* a recent value of vq_avail->idx */ uint16_t vq_next_used; /* index of the next used slot to be filled */ uint16_t vq_save_used; /* saved vq_used->idx; see vq_endchains */ uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ volatile struct vring_desc *vq_desc; /* descriptor array */ volatile struct vring_avail *vq_avail; /* the "avail" ring */ volatile struct vring_used *vq_used; /* the "used" ring */ }; /* as noted above, these are sort of backwards, name-wise */ #define VQ_AVAIL_EVENT_IDX(vq) \ (*(volatile uint16_t *)&(vq)->vq_used->ring[(vq)->vq_qsize]) #define VQ_USED_EVENT_IDX(vq) \ ((vq)->vq_avail->ring[(vq)->vq_qsize]) /* * Is this ring ready for I/O? */ static inline int vq_ring_ready(struct vqueue_info *vq) { return (vq->vq_flags & VQ_ALLOC); } /* * Are there "available" descriptors? (This does not count * how many, just returns True if there are some.) */ static inline int vq_has_descs(struct vqueue_info *vq) { return (vq_ring_ready(vq) && vq->vq_last_avail != vq->vq_avail->idx); } /* * Deliver an interrupt to guest on the given virtual queue * (if possible, or a generic MSI interrupt if not using MSI-X). */ static inline void vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) { if (pci_msix_enabled(vs->vs_pi)) pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); else { VS_LOCK(vs); vs->vs_isr |= VIRTIO_PCI_ISR_INTR; pci_generate_msi(vs->vs_pi, 0); pci_lintr_assert(vs->vs_pi); VS_UNLOCK(vs); } } static inline void vq_kick_enable(struct vqueue_info *vq) { vq->vq_used->flags &= ~VRING_USED_F_NO_NOTIFY; /* * Full memory barrier to make sure the store to vq_used->flags * happens before the load from vq_avail->idx, which results from a * subsequent call to vq_has_descs(). */ atomic_thread_fence_seq_cst(); } static inline void vq_kick_disable(struct vqueue_info *vq) { vq->vq_used->flags |= VRING_USED_F_NO_NOTIFY; } struct iovec; /* * Request description returned by vq_getchain. * * Writable iovecs start at iov[req.readable]. */ struct vi_req { int readable; /* num of readable iovecs */ int writable; /* num of writable iovecs */ unsigned int idx; /* ring index */ }; void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues); int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); void vi_reset_dev(struct virtio_softc *); void vi_set_io_bar(struct virtio_softc *, int); int vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov, struct vi_req *reqp); void vq_retchains(struct vqueue_info *vq, uint16_t n_chains); void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_relchain_publish(struct vqueue_info *vq); void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_endchains(struct vqueue_info *vq, int used_all_avail); uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); #ifdef BHYVE_SNAPSHOT int vi_pci_snapshot(struct vm_snapshot_meta *meta); int vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi); int vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi); #endif #endif /* _BHYVE_VIRTIO_H_ */