diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index 1c679a3c1578..a5a1dafebcd9 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -1,136 +1,137 @@ # # $FreeBSD$ # .include CFLAGS+=-I${.CURDIR}/../../contrib/lib9p CFLAGS+=-I${SRCTOP}/sys .PATH: ${SRCTOP}/sys/cam/ctl PROG= bhyve PACKAGE= bhyve MAN= bhyve.8 bhyve_config.5 BHYVE_SYSDIR?=${SRCTOP} SRCS= \ acpi_device.c \ atkbdc.c \ acpi.c \ audio.c \ basl.c \ bhyvegc.c \ bhyverun.c \ block_if.c \ bootrom.c \ config.c \ console.c \ ctl_util.c \ ctl_scsi_all.c \ fwctl.c \ gdb.c \ hda_codec.c \ inout.c \ ioapic.c \ kernemu_dev.c \ mem.c \ mevent.c \ mptbl.c \ net_backends.c \ net_utils.c \ pci_ahci.c \ pci_e82545.c \ pci_emul.c \ pci_hda.c \ pci_fbuf.c \ pci_hostbridge.c \ pci_irq.c \ pci_lpc.c \ pci_nvme.c \ pci_passthru.c \ pci_virtio_9p.c \ pci_virtio_block.c \ pci_virtio_console.c \ pci_virtio_input.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ pci_virtio_scsi.c \ pci_uart.c \ pci_xhci.c \ pctestdev.c \ pm.c \ post.c \ ps2kbd.c \ ps2mouse.c \ + qemu_fwcfg.c \ rfb.c \ rtc.c \ smbiostbl.c \ sockstream.c \ task_switch.c \ uart_emul.c \ usb_emul.c \ usb_mouse.c \ virtio.c \ vga.c \ vmgenc.c \ xmsr.c \ spinup_ap.c \ iov.c .if ${MK_BHYVE_SNAPSHOT} != "no" SRCS+= snapshot.c .endif CFLAGS.kernemu_dev.c+= -I${SRCTOP}/sys/amd64 .PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm SRCS+= vmm_instruction_emul.c LIBADD= vmmapi md nv pthread z util sbuf cam 9p .if ${MK_BHYVE_SNAPSHOT} != "no" LIBADD+= ucl xo .endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET .endif .if ${MK_INET6_SUPPORT} != "no" CFLAGS+=-DINET6 .endif .if ${MK_NETGRAPH_SUPPORT} != "no" CFLAGS+=-DNETGRAPH LIBADD+= netgraph .endif .if ${MK_OPENSSL} == "no" CFLAGS+=-DNO_OPENSSL .else LIBADD+= crypto .endif CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller .if ${MK_BHYVE_SNAPSHOT} != "no" CFLAGS+= -I${SRCTOP}/contrib/libucl/include # Temporary disable capsicum, until we 
integrate checkpoint code with it. CFLAGS+= -DWITHOUT_CAPSICUM CFLAGS+= -DBHYVE_SNAPSHOT .endif .ifdef GDB_LOG CFLAGS+=-DGDB_LOG .endif # Disable thread safety analysis since it only finds very simple bugs and # yields many false positives. NO_WTHREAD_SAFETY= NO_WCAST_ALIGN= SUBDIR= kbdlayout .include diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 84e031f1340c..72018912e2c5 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -1,992 +1,1017 @@ .\" Copyright (c) 2013 Peter Grehan .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. 
.\" .\" $FreeBSD$ .\" .Dd August 19, 2022 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl AaCDeHhPSuWwxY .Oo .Sm off .Fl c\~ .Oo .Op Cm cpus= .Ar numcpus .Oc .Op Cm ,sockets= Ar n .Op Cm ,cores= Ar n .Op Cm ,threads= Ar n .Oc .Sm on .Oo .Sm off .Fl G\~ .Oo Ar w Oc .Oo Ar bind_address Cm \&: Oc .Ar port .Sm on .Oc .Op Fl k Ar config_file .Op Fl K Ar layout .Oo Fl l .Sm off .Ar lpcdev Op Cm \&, Ar conf .Sm on .Oc .Sm off .Oo Fl m\~ .Ar memsize .Oo .Cm K | Cm k | Cm M | Cm m | Cm G | Cm g | Cm T | Cm t .Oc .Sm on .Oc .Op Fl o Ar var Ns Cm = Ns Ar value .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu .Op Fl r Ar file .Sm off .Oo Fl s\~ .Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf .Sm on .Oc .Op Fl U Ar uuid .Ar vmname .Nm .Fl l Cm help .Nm .Fl s Cm help .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise, it is enough to run .Nm with a boot ROM of choice. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl A Generate ACPI tables. Required for .Fx Ns /amd64 guests. .It Fl a The guest's local APIC is configured in xAPIC mode. The xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl C Include guest memory in core file. .It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. The current maximum number of guest virtual CPUs is 16. If .Ar numcpus is not specified then it will be calculated from the other arguments. 
The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. .It Fl D Destroy the VM on guest initiated power-off. .It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes. .It Fl G Xo .Sm off .Oo Ar w Oc .Oo Ar bind_address Cm \&: Oc .Ar port .Sm on .Xc Start a debug server that uses the GDB protocol to export guest state to a debugger. An IPv4 TCP socket will be bound to the supplied .Ar bind_address and .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If the option begins with .Sq w , .Nm will pause execution at the first instruction waiting for a debugger to attach. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. .It Fl h Print help message and exit. .It Fl k Ar config_file Set configuration variables from a simple, key-value config file. Each line of the config file is expected to consist of a config variable name, an equals sign .Pq Sq = , and a value. No spaces are permitted between the variable name, equals sign, or value. Blank lines and lines starting with .Sq # are ignored. See .Xr bhyve_config 5 for more details. .It Fl K Ar layout Specify the keyboard layout. The value that can be specified sets the file name in .Ar /usr/share/bhyve/kbdlayout . This specification only works when loaded with UEFI mode for VNC. When using a VNC client that supports QEMU Extended Key Event Message (e.g. TigerVNC), this option isn't needed. When using a VNC client that doesn't support QEMU Extended Key Event Message (e.g. TightVNC), the layout defaults to the US keyboard unless specified otherwise. .It Fl l Cm help Print a list of supported LPC devices. 
.It Fl l Ar lpcdev Ns Op Cm \&, Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Cm com1 , com2 , com3 , and .Cm com4 , the boot ROM device .Cm bootrom , -and the debug/test device +the +.Cm fwcfg +type and the debug/test device .Cm pc-testdev . .Pp The possible values for the .Ar conf argument are listed in the .Fl s flag description. .It Xo .Fl m Ar memsize Ns Oo .Sm off .Cm K | k | M | m | G | g | T | t .Sm on .Oc .Xc Set the guest physical memory size. This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of .Cm K , M , G or .Cm T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. .Pp The default is 256M. .It Fl o Ar var Ns Cm = Ns Ar value Set the configuration variable .Ar var to .Ar value . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. .It Fl p Ar vcpu Ns Cm \& : Ns Ar hostcpu Pin guest's virtual CPU .Em vcpu to .Em hostcpu . .It Fl r Ar file Resume a guest from a snapshot. The guest memory contents are restored from .Ar file , and the guest device and vCPU state are restored from the file .Dq Ar file Ns .kern . .Pp Note that the current snapshot file format requires that the configuration of devices in the new VM match the VM from which the snapshot was taken by specifying the same .Fl s and .Fl l options. The count of vCPUs and memory configuration are read from the snapshot. .It Fl S Wire guest memory. .It Fl s Cm help Print a list of supported PCI devices. .It Fl s Ar slot Ns Cm \&, Ns Ar emulation Ns Op Cm \&, Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. 
.Pp The .Ar slot can be specified in one of the following formats: .Pp .Bl -bullet -compact .It .Ar pcislot .It .Sm off .Ar pcislot Cm \&: Ar function .Sm on .It .Sm off .Ar bus Cm \&: Ar pcislot Cm \&: Ar function .Sm on .El .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. .Pp The .Ar emulation argument can be one of the following: .Bl -tag -width "amd_hostbridge" .It Cm hostbridge A simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. .It Cm amd_hostbridge Emulation identical to .Cm hostbridge using a PCI vendor ID of AMD. .It Cm passthru PCI pass-through device. .It Cm virtio-net Virtio network interface. .It Cm virtio-blk Virtio block storage interface. .It Cm virtio-scsi Virtio SCSI interface. .It Cm virtio-9p Virtio 9p (VirtFS) interface. .It Cm virtio-rnd Virtio RNG interface. .It Cm virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. .It Cm virtio-input Virtio input interface. .It Cm ahci AHCI controller attached to arbitrary devices. .It Cm ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Cm ahci-hd AHCI controller attached to a SATA hard drive. .It Cm e1000 Intel e82545 network interface. .It Cm uart PCI 16550 serial device. .It Cm lpc LPC PCI-ISA bridge with COM1, COM2, COM3, and COM4 16550 serial ports, a boot ROM, and, -optionally, the debug/test device. +optionally, a fwcfg type and the debug/test device. The LPC bridge emulation can only be configured on bus 0. .It Cm fbuf Raw framebuffer device attached to VNC server. .It Cm xhci eXtensible Host Controller Interface (xHCI) USB controller. .It Cm nvme NVM Express (NVMe) controller. .It Cm hda High Definition Audio Controller. 
.El .Pp The optional parameter .Ar conf describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Pp Network device backends: .Sm off .Bl -bullet .It .Xo .Cm tap Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm vmnet Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm netgraph,path= Ar ADDRESS Cm \&,peerhook= Ar HOOK .Op Cm \&,socket= Ar NAME .Op Cm \&,hook= Ar HOOK .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .El .Sm on .Pp If .Cm mac is not specified, the MAC address is derived from a fixed OUI and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .Pp With .Cm virtio-net devices, the .Cm mtu parameter can be specified to inform the guest about the largest MTU that should be allowed, expressed in bytes. .Pp With .Cm netgraph backend, the .Cm path and .Cm peerhook parameters must be specified to set the destination node and corresponding hook. The optional parameters .Cm socket and .Cm hook may be used to set the .Xr ng_socket 4 node name and source hook. The .Ar ADDRESS , .Ar HOOK , and .Ar NAME must comply with .Xr netgraph 4 addressing rules. .Pp Block storage device backends: .Sm off .Bl -bullet .It .Ar /filename Op Cm \&, Ar block-device-options .It .Ar /dev/xxx Op Cm \&, Ar block-device-options .El .Sm on .Pp The .Ar block-device-options are: .Bl -tag -width 10n .It Cm nocache Open the file with .Dv O_DIRECT . .It Cm direct Open the file using .Dv O_SYNC . .It Cm ro Force the file to be opened read-only. .It Cm sectorsize= Ns Ar logical Ns Oo Cm \&/ Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. 
.It Cm nodelete Disable emulation of guest trim requests via .Dv DIOCGDELETE requests. .El .Pp SCSI device backends: .Sm off .Bl -bullet .It .Pa /dev/cam/ctl Oo Ar pp Cm \&. Ar vp Oc Oo Cm \&, Ar scsi-device-options Oc .El .Sm on .Pp The .Ar scsi-device-options are: .Bl -tag -width 10n .It Cm iid= Ns Ar IID Initiator ID to use when sending requests to specified CTL port. The default value is 0. .El .Pp 9P device backends: .Sm off .Bl -bullet .It .Ar sharename Cm = Ar /path/to/share Op Cm \&, Ar 9p-device-options .El .Sm on .Pp The .Ar 9p-device-options are: .Bl -tag -width 10n .It Cm ro Expose the share in read-only mode. .El .Pp TTY device backends: .Bl -tag -width 10n .It Cm stdio Connect the serial port to the standard input and output of the .Nm process. .It Ar /dev/xxx Use the host TTY device for serial port I/O. .El .Pp Boot ROM device backends: .Bl -tag -width 10n .It Ar romfile Ns Op Cm \&, Ns Ar varfile Map .Ar romfile in the guest address space reserved for boot firmware. If .Ar varfile is provided, that file is also mapped in the boot firmware guest address space, and any modifications the guest makes will be saved to that file. .El .Pp +Fwcfg types: +.Bl -tag -width 10n +.It Ar fwcfg +The fwcfg interface is used to pass information such as the CPU count or ACPI tables to the guest firmware. +Supported values are +.Ql bhyve +and +.Ql qemu . +Due to backward compatibility reasons, +.Ql bhyve +is the default option. +When +.Ql bhyve +is used, bhyve's fwctl interface is used. +It currently reports only the CPU count to the guest firmware. +The +.Ql qemu +option uses QEMU's fwcfg interface. +This interface is widely used and allows user-defined information to be passed to the guest. +It is used for passing the CPU count, ACPI tables, a boot order and many other things to the guest. +Some operating systems such as Fedora CoreOS can be configured by qemu's fwcfg interface as well. 
+.El +.Pp Pass-through device backends: .Sm off .Bl -bullet .It .Cm ppt Ar N Oo , Ar passthru-device-options Oc .It .Ns Ar bus Cm \&/ Ar slot Cm \&/ Ar function .Op , Ar passthru-device-options .It .Cm pci Ar bus Cm : Ar slot Cm : Ns Ar function .Op , Ar passthru-device-options .El .Sm on .Pp Connect to a PCI device on the host either named ppt .Ns Ar N or at the selector described by .Ar slot , .Ar bus , and .Ar function numbers. .Pp The .Ar passthru-device-options are: .Bl -tag -width 10n .It Cm rom= Ns Ar romfile Add .Ar romfile as option ROM to the PCI device. The ROM will be loaded by firmware and should be capable of initializing the device. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdevs loader variable as described in .Xr vmm 4 . .Pp Virtio console device backends: .Bl -bullet .Sm off .It .Cm port1= Ns Ar /path/to/port1.sock Ns Op Cm ,port Ns Ar N Cm \&= Ns Ar /path/to/port2.sock No \~ Ar ... .Sm on .El .Pp A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet .It Due to lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It There is no way to use the .Dq console port feature, nor the console port resize at present. .It Emergency write is advertised, but no-op at present. .El .Pp Virtio input device backends: .Bl -tag -width 10n .It Ar /dev/input/eventX Send input events of .Ar /dev/input/eventX to guest by VirtIO Input Interface. 
.El .Pp Framebuffer device backends: .Bl -bullet .Sm off .It .Op Cm rfb= Ar ip-and-port .Op Cm ,w= Ar width .Op Cm ,h= Ar height .Op Cm ,vga= Ar vgaconf .Op Cm ,wait .Op Cm ,password= Ar password .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm rfb= Ns Ar ip-and-port Pq or Cm tcp= Ns Ar ip-and-port An IP address and a port VNC should listen on. There are two formats: .Pp .Bl -bullet -compact .It .Sm off .Op Ar IPv4 Cm \&: .Ar port .Sm on .It .Sm off .Cm \&[ Ar IPv6%zone Cm \&] Cm \&: Ar port .Sm on .El .Pp The default is to listen on the localhost IPv4 address and the default VNC port 5900. An IPv6 address must be enclosed in square brackets and may contain an optional zone identifier. .It Cm w= Ns Ar width No and Cm h= Ns Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. The minimum supported resolution is 640x480 pixels, and the maximum is 1920x1200 pixels. .It Cm vga= Ns Ar vgaconf Possible values for this option are .Cm io (default), .Cm on , and .Cm off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 . The default .Cm io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Cm on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Cm off option should be used for the UEFI guests that assume that VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. 
.It Cm wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It Cm password= Ns Ar password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .Pp xHCI USB device backends: .Bl -tag -width 10n .It Cm tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Pp NVMe device backends: .Bl -bullet .Sm off .It .Ar devpath .Op Cm ,maxq= Ar # .Op Cm ,qsz= Ar # .Op Cm ,ioslots= Ar # .Op Cm ,sectsz= Ar # .Op Cm ,ser= Ar # .Op Cm ,eui64= Ar # .Op Cm ,dsm= Ar opt .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Ar devpath Accepted device paths are: .Ar /dev/blockdev or .Ar /path/to/image or .Cm ram= Ns Ar size_in_MiB . .It Cm maxq Max number of queues. .It Cm qsz Max elements in each queue. .It Cm ioslots Max number of concurrent I/O requests. .It Cm sectsz Sector size (defaults to blockif sector size). .It Cm ser Serial number with maximum 20 characters. .It Cm eui64 IEEE Extended Unique Identifier (8 byte value). .It Cm dsm DataSet Management support. Supported values are: .Cm auto , enable , and .Cm disable . .El .Pp AHCI device backends: .Bl -bullet .It .Sm off .Op Oo Cm hd\&: | cd\&: Oc Ar path .Op Cm ,nmrr= Ar nmrr .Op Cm ,ser= Ar # .Op Cm ,rev= Ar # .Op Cm ,model= Ar # .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm nmrr Nominal Media Rotation Rate, known as RPM. A value of 1 indicates that the device is a Solid State Disk. The default value is 0, which means the rate is not reported. .It Cm ser Serial Number with maximum 20 characters. .It Cm rev Revision Number with maximum 8 characters. .It Cm model Model Number with maximum 40 characters. 
.El .Pp HD Audio device backends: .Bl -bullet .It .Sm off .Op Cm play= Ar playback .Op Cm ,rec= Ar recording .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm play Playback device, typically .Ar /dev/dsp0 . .It Cm rec Recording device, typically .Ar /dev/dsp0 . .El .It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl u RTC keeps UTC time. .It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl x The guest's local APIC is configured in x2APIC mode. .It Fl Y Disable MPtable generation. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El .Sh CONFIGURATION VARIABLES .Nm uses an internal tree of configuration variables to describe global and per-device settings. When .Nm starts, it parses command line options (including config files) in the order given on the command line. Each command line option sets one or more configuration variables. For example, the .Fl s option creates a new tree node for a PCI device and sets one or more variables under that node including the device model and device model-specific variables. Variables may be set multiple times during this parsing stage with the final value overriding previous values. .Pp Once all of the command line options have been processed, the configuration values are frozen. .Nm then uses the value of configuration values to initialize device models and global settings. .Pp More details on configuration variables can be found in .Xr bhyve_config 5 . .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. 
.Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read and written by the debugger. Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp Breakpoints are supported on Intel CPUs that support single stepping. Note that continuing from a breakpoint while interrupts are enabled in the guest may not work as expected due to timer interrupts firing while single stepping over the breakpoint. .Sh SIGNAL HANDLING .Nm deals with the following signals: .Pp .Bl -tag -width SIGTERM -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 triple fault .It 4 exited due to an error .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run. Otherwise, the boot loader is not needed. 
.Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -A -H -P -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -A -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device. .Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -A -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at: 0.0.0.0:5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VNC display that is bound to all IPv6 addresses on port 5900. 
.Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VARS file to save EFI variables. Note that .Nm will write guest modifications to the given VARS file. Be sure to create a per-guest copy of the template VARS file from .Pa /usr . .Bd -literal -offset indent bhyve -c 2 -m 4g -w -H \\ -s 0,hostbridge \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI_CODE.fd,BHYVE_UEFI_VARS.fd \\ uefivm .Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr netgraph 4 , .Xr ng_socket 4 , .Xr nmdm 4 , .Xr vmm 4 , .Xr bhyve_config 5 , .Xr ethers 5 , .Xr bhyvectl 8 , .Xr bhyveload 8 .Pp .Rs .%A Intel .%B 64 and IA-32 Architectures Software Developer’s Manual .%V Volume 3 .Re .Sh HISTORY .Nm first appeared in .Fx 10.0 . .Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org diff --git a/usr.sbin/bhyve/bhyve_config.5 b/usr.sbin/bhyve/bhyve_config.5 index 94a8a4d5cb1d..32658c11f9e2 100644 --- a/usr.sbin/bhyve/bhyve_config.5 +++ b/usr.sbin/bhyve/bhyve_config.5 @@ -1,676 +1,684 @@ .\" SPDX-License-Identifier: BSD-2-Clause .\" .\" Copyright (c) 2021 John H. Baldwin .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. 
.\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .Dd August 19, 2022 .Dt BHYVE_CONFIG 5 .Os .Sh NAME .Nm bhyve_config .Nd "bhyve configuration variables" .Sh DESCRIPTION .Xr bhyve 8 uses a hierarchical tree of configuration variables to describe global and per-device settings. Internal nodes in this tree do not have a value, only leaf nodes have values. This manual describes the configuration variables understood by .Xr bhyve 8 . If additional variables are defined, .Xr bhyve 8 will ignore them and will not emit errors for unknown variables. However, these additional variables can be referenced by other variables as described below. .Sh VARIABLE VALUES Configuration variable values are stored as strings. A configuration variable value may refer to one or more other configuration values by name. Instances of the pattern .Sq % Ns Pq Ar var are replaced by the value of the configuration variable .Va var . To avoid unwanted expansion, .Sq % characters can be escaped by a leading .Sq % . For example, if a configuration variable .Va disk uses the value .Pa /dev/zvol/bhyve/%(name) , then the final value of the .Va disk variable will be set to the path of a ZFS volume whose name matches the name of the virtual machine on the pool .Pa bhyve . 
.Pp Some configuration variables may be interpreted as a boolean value. For those variables the following case-insensitive values may be used to indicate true: .Pp .Bl -bullet -offset indent -compact .It true .It on .It yes .It 1 .El .Pp The following values may be used to indicate false: .Pp .Bl -bullet -offset indent -compact .It false .It off .It no .It 0 .El .Pp Some configuration variables may be interpreted as an integer. For those variables, any syntax supported by .Xr strtol 3 may be used. .Sh GLOBAL SETTINGS .Ss Architecture Neutral Settings .Bl -column "memory.guest_in_core" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va name Ta string Ta Ta The name of the VM. .It Va cpus Ta integer Ta 1 Ta The total number of virtual CPUs. .It Va cores Ta integer Ta 1 Ta The number of virtual cores in each virtual socket. .It Va threads Ta integer Ta 1 Ta The number of virtual CPUs in each virtual core. .It Va sockets Ta integer Ta 1 Ta The number of virtual sockets. .It Va memory.guest_in_core Ta bool Ta false Ta Include guest memory in core file. .It Va memory.size Ta string Ta 256M Ta Guest physical memory size in bytes. The value must be formatted as described in .Xr expand_number 3 . .It Va memory.wired Ta bool Ta false Ta Wire guest memory. .It Va acpi_tables Ta bool Ta false Ta Generate ACPI tables. .It Va destroy_on_poweroff Ta bool Ta false Ta Destroy the VM on guest-initiated power-off. .It Va gdb.address Ta string Ta localhost Ta Hostname, IP address, or IPv6 address for the debug server. .It Va gdb.port Ta integer Ta 0 Ta TCP port number for the debug server. If this is set to a non-zero value, a debug server will listen for connections on this port. .It Va gdb.wait Ta bool Ta false Ta If the debug server is enabled, wait for a debugger to connect before starting the guest. .It Va keyboard.layout Ta string Ta Ta Specify the keyboard layout name with the file name in .Ar /usr/share/bhyve/kbdlayout . 
This value only works when loaded with UEFI mode for VNC, and when using a VNC client that doesn't support QEMU Extended Key Event Message (e.g. TightVNC). .It Va rtc.use_localtime Ta bool Ta true Ta The real time clock uses the local time of the host. If this is set to false, the real time clock uses UTC. .It Va uuid Ta string Ta Ta The universally unique identifier (UUID) to use in the guest's System Management BIOS System Information structure. If an explicit value is not set, a valid UUID is generated from the host's hostname and the VM name. .It Va virtio_msix Ta bool Ta true Ta Use MSI-X interrupts for PCI VirtIO devices. If set to false, MSI interrupts are used instead. .It Va config.dump Ta bool Ta false Ta If this value is set to true after .Xr bhyve 8 has finished parsing command line options, then .Xr bhyve 8 will write all of its configuration variables to stdout and exit. No VM will be started. .It Va bios.vendor Ta string Ta BHYVE Ta This value is used for the guest's System Management BIOS System Information structure. .It Va bios.version Ta string Ta 14.0 Ta This value is used for the guest's System Management BIOS System Information structure. .It Va bios.release_date Ta string Ta 10/17/2021 Ta This value is used for the guest's System Management BIOS System Information structure. .It Va system.family_name Ta string Ta Virtual Machine Ta Family the computer belongs to. This value is used for the guest's System Management BIOS System Information structure. .It Va system.manufacturer Ta string Ta FreeBSD Ta This value is used for the guest's System Management BIOS System Information structure. .It Va system.product_name Ta string Ta BHYVE Ta This value is used for the guest's System Management BIOS System Information structure. .It Va system.serial_number Ta string Ta None Ta This value is used for the guest's System Management BIOS System Information structure. .It Va system.sku Ta string Ta None Ta Stock keeping unit of the computer. 
It's also called product ID or purchase order number. This value is used for the guest's System Management BIOS System Information structure. .It Va system.version Ta string Ta 1.0 Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.manufacturer Ta string Ta FreeBSD Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.product_name Ta string Ta BHYVE Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.version Ta string Ta 1.0 Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.serial_number Ta string Ta None Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.asset_tag Ta string Ta None Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.location Ta string Ta None Ta Describes the board's location within the chassis. This value is used for the guest's System Management BIOS System Information structure. .It Va chassis.manufacturer Ta string Ta FreeBSD Ta This value is used for the guest's System Management BIOS System Information structure. .It Va chassis.version Ta string Ta 1.0 Ta This value is used for the guest's System Management BIOS System Information structure. .It Va chassis.serial_number Ta string Ta None Ta This value is used for the guest's System Management BIOS System Information structure. .It Va chassis.asset_tag Ta string Ta None Ta This value is used for the guest's System Management BIOS System Information structure. .It Va chassis.sku Ta string Ta None Ta Stock keeping unit of the chassis. It's also called product ID or purchase order number. This value is used for the guest's System Management BIOS System Information structure. 
.El .Ss x86-Specific Settings .Bl -column "x86.vmexit_on_pause" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va x86.mptable Ta bool Ta true Ta Generate an MPTable. .It Va x86.x2apic Ta bool Ta false Ta Configure guest's local APICs in x2APIC mode. .It Va x86.strictio Ta bool Ta false Ta Exit if a guest accesses an I/O port that is not emulated. By default, writes are ignored and reads return all bits set. .It Va x86.strictmsr Ta bool Ta true Ta Inject a general protection fault if a guest accesses a Model Specific Register (MSR) that is not emulated. If this is false, writes are ignored and reads return zero. .It Va x86.vmexit_on_hlt Ta bool Ta false Ta Force a VM exit when a guest CPU executes the .Dv HLT instruction. This allows idle guest CPUs to yield the host CPU. .It Va x86.vmexit_on_pause Ta bool Ta false Ta Force a VM exit when a guest CPU executes the .Dv PAUSE instruction. .El .Sh DEVICE SETTINGS Device settings are stored under a device node. The device node's name is set by the parent bus of the device. .Ss PCI Device Settings PCI devices are described by a device node named .Dq pci . Ns Ar bus . Ns Ar slot . Ns Ar function where each of .Ar bus , .Ar slot , and .Ar function are formatted as decimal values with no padding. All PCI device nodes must contain a configuration variable named .Dq device which specifies the device model to use. The following PCI device models are supported: .Bl -tag -indent .It Li hostbridge Provide a simple PCI-Host bridge device. This is usually configured at pci0:0:0 and is required by most guest operating systems. .It Li ahci AHCI storage controller. .It Li e1000 Intel e82545 network interface. .It Li fbuf VGA framebuffer device attached to VNC server. .It Li lpc LPC PCI-ISA bridge with COM1-COM4 16550 serial ports, a boot ROM, +an optional fwcfg type, and an optional debug/test device. This device must be configured on bus 0. .It Li hda High Definition audio controller. 
.It Li nvme NVM Express (NVMe) controller. .It Li passthru PCI pass-through device. .It Li uart PCI 16550 serial device. .It Li virtio-9p VirtIO 9p (VirtFS) interface. .It Li virtio-blk VirtIO block storage interface. .It Li virtio-console VirtIO console interface. .It Li virtio-input VirtIO input interface. .It Li virtio-net VirtIO network interface. .It Li virtio-rnd VirtIO RNG interface. .It Li virtio-scsi VirtIO SCSI interface. .It Li xhci Extensible Host Controller Interface (XHCI) USB controller. .El .Ss USB Device Settings USB controller devices contain zero or more child USB devices attached to slots. Each USB device stores its settings in a node named .Dq slot. Ns Va N under the controller's device node. .Va N is the number of the slot to which the USB device is attached. Note that USB slot numbers begin at 1. All USB device nodes must contain a configuration variable named .Dq device which specifies the device model to use. The following USB device models are supported: .Bl -tag -indent .It Li tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Ss Block Device Settings Block devices use the following settings to configure their backing store. These settings are stored in the configuration node of the respective device. .Bl -column "sectorsize" "logical[/physical]" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It path Ta string Ta Ta The path of the file or disk device to use as the backing store. .It nocache Ta bool Ta false Ta Disable caching on the backing file by opening the backing file with .Dv O_DIRECT . .It nodelete Ta bool Ta false Ta Disable emulation of guest trim requests via .Dv DIOCGDELETE requests. .It sync Ta bool Ta false Ta Write changes to the backing file with synchronous writes. .It direct Ta bool Ta false Ta An alias for .Va sync . .It ro Ta bool Ta false Ta Disable writes to the backing file. 
.It sectorsize Ta Va logical Ns Op / Ns Va physical Ta Ta Specify the logical and physical sector size of the emulated disk. If the physical size is not specified, it is equal to the logical size. .El .Ss Network Backend Settings Network devices use the following settings to configure their backend. The backend is responsible for passing packets between the device model and a desired destination. Configuring a backend requires setting the .Va backend variable. The type of a backend can either be set explicitly via the .Va type variable or it can be inferred from the value of .Va backend . .Pp The following types of backends are supported: .Bl -tag -width "netgraph" .It tap Use the .Xr tap 4 interface named in .Va backend as the backend. .It netgraph Use a .Xr netgraph 4 socket hook as the backend. This backend uses the following additional variables: .Bl -column "peerhook" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va path Ta string Ta Ta The name of the .Xr netgraph 4 destination node. .It Va peerhook Ta string Ta Ta The name of the destination hook. .It Va socket Ta string Ta Ta The name of the created .Xr ng_socket 4 node. .It Va hook Ta string Ta vmlink Ta The name of the source hook on the created .Xr ng_socket 4 node. .El .It netmap Use .Xr netmap 4 either on a network interface or a port on a .Xr vale 4 bridge as the backend. The value of .Va backend is passed to .Xr nm_open to connect to a netmap port. 
.El .Pp If .Va type is not specified explicitly, then it is inferred from .Va backend based on the following patterns: .Bl -column -offset indent "valuebridge:port" .It Sy Pattern Ta Sy Type .It tap Ns Va N Ta tap .It vmnet Ns Va N Ta tap .It netgraph Ta netgraph .It netmap: Ns Va interface Ta netmap .It vale Ns Va bridge : Ns Va port Ta netmap .El .Ss UART Device Settings .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va path Ta path Ta Ta Backend device for the serial port. Either the pathname of a character device or .Dq stdio to use standard input and output of the .Xr bhyve 8 process. .El .Ss Host Bridge Settings .Bl -column "vendor" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va vendor Ta integer Ta 0x1275 Ta PCI vendor ID. .It Va devid Ta integer Ta 0x1275 Ta PCI device ID. .El .Ss AHCI Controller Settings AHCI controller devices contain zero or more ports each of which provides a storage device. Each port stores its settings in a node named .Dq port. Ns Va N under the controller's device node. The .Va N values are formatted as successive decimal values starting with 0. In addition to the block device settings described above, each port supports the following settings: .Bl -column "model" "integer" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va type Ta string Ta Ta The type of storage device to emulate. Must be set to either .Dq cd or .Dq hd . .It Va nmrr Ta integer Ta 0 Ta Nominal Media Rotation Rate, also known as RPM. A value of 1 indicates a device with no rate such as a Solid State Disk. .It Va ser Ta string Ta generated Ta Serial number of up to twenty characters. A default serial number is generated using a hash of the backing store's pathname. .It Va rev Ta string Ta 001 Ta Revision number of up to eight characters. .It Va model Ta string Ta Ta Model number of up to forty characters. 
Separate default model strings are used for .Dq cd and .Dq hd device types. .El .Ss e1000 Settings In addition to the network backend settings, Intel e82545 network interfaces support the following variables: .Bl -column "Name" "MAC address" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va mac Ta MAC address Ta generated Ta MAC address. If an explicit address is not provided, a MAC address is generated from a hash of the device's PCI address. .El .Ss Frame Buffer Settings .Bl -column "password" "[IP:]port" "127.0.0.1:5900" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va wait Ta bool Ta false Ta Wait for a remote connection before starting the VM. .It Va rfb Ta Oo Ar IP Ns : Oc Ns Ar port Ta 127.0.0.1:5900 Ta TCP address to listen on for remote connections. The IP address must be given as a numeric address. IPv6 addresses must be enclosed in square brackets and support scoped identifiers as described in .Xr getaddrinfo 3 . A bare port number may be given in which case the IPv4 localhost address is used. .It Va vga Ta string Ta io Ta VGA configuration. More details are provided in .Xr bhyve 8 . .It Va w Ta integer Ta 1024 Ta Frame buffer width in pixels. .It Va h Ta integer Ta 768 Ta Frame buffer height in pixels. .It Va password Ta string Ta Ta Password to use for VNC authentication. This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. .El .Ss High Definition Audio Settings .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va play Ta path Ta Ta Host playback device, typically .Pa /dev/dsp0 . .It Va rec Ta path Ta Ta Host recording device, typically .Pa /dev/dsp0 . .El .Ss LPC Device Settings The LPC bridge stores its configuration under a top-level .Va lpc node rather than under the PCI LPC device's node. 
The following nodes are available under .Va lpc : .Bl -column "pc-testdev" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va bootrom Ta path Ta Ta Path to a boot ROM. The contents of this file are copied into the guest's memory ending just before the 4GB physical address. If a boot ROM is present, a firmware interface device is also enabled for use by the boot ROM. .It Va bootvars Ta path Ta Ta Path to boot VARS. The contents of this file are copied beneath the boot ROM. Firmware can write to it to save variables. All variables will be persistent even on reboots of the guest. .It Va com1 Ta node Ta Ta Settings for the COM1 serial port device. .It Va com2 Ta node Ta Ta Settings for the COM2 serial port device. .It Va com3 Ta node Ta Ta Settings for the COM3 serial port device. .It Va com4 Ta node Ta Ta Settings for the COM4 serial port device. +.It Va fwcfg Ta string Ta bhyve Ta +The fwcfg type to be used. +Supported values are +.Dq bhyve +for fwctl and +.Dq qemu +for fwcfg. .It Va pc-testdev Ta bool Ta false Ta Enable the PC debug/test device. .El .Ss NVMe Controller Settings Each NVMe controller supports a single storage device. The device can be backed either by a memory disk described by the .Va ram variable, or a block device using the block device settings described above. In addition, each controller supports the following settings: .Bl -column "ioslots" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va maxq Ta integer Ta 16 Ta Maximum number of I/O submission and completion queue pairs. .It Va qsz Ta integer Ta 2058 Ta Number of elements in each I/O queue. .It Va ioslots Ta integer Ta 8 Ta Maximum number of concurrent I/O requests. .It Va sectsz Ta integer Ta Ta Sector size. Can be one of 512, 4096, or 8192. Devices backed by a memory disk use 4096 as the default. Devices backed by a block device use the block device's sector size as the default. 
.It Va ser Ta string Ta Ta Serial number of up to twenty characters. A default serial number is generated using a hash of the device's PCI address. .It Va eui64 Ta integer Ta Ta IEEE Extended Unique Identifier. If an EUI is not provided, a default is generated using a checksum of the device's PCI address. .It Va dsm Ta string Ta auto Ta Whether or not to advertise DataSet Management support. One of .Dq auto , .Dq enable , or .Dq disable . The .Dq auto setting only advertises support if the backing store supports resource freeing, for example via TRIM. .It Va ram Ta integer Ta Ta If set, allocate a memory disk as the backing store. The value of this variable is the size of the memory disk in megabytes. .El .Ss PCI Passthrough Settings The .Xr ppt 4 device driver must be attached to the PCI device being passed through. The device to pass through can be identified either by name or its host PCI bus location. .Bl -column "Name" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va bus Ta integer Ta Ta Host PCI bus address of device to pass through. .It Va slot Ta integer Ta Ta Host PCI slot address of device to pass through. .It Va func Ta integer Ta Ta Host PCI function address of device to pass through. .It Va pptdev Ta string Ta Ta Name of a .Xr ppt 4 device to pass through. .It Va rom Ta path Ta Ta ROM file of the device which will be executed by OVMF to init the device. .El .Ss VirtIO 9p Settings Each VirtIO 9p device exposes a single filesystem from a host path. .Bl -column "sharename" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va sharename Ta string Ta Ta The share name exposed to the guest. .It Va path Ta path Ta Ta The path of a directory on the host to export to the guest. .It Va ro Ta bool Ta false Ta If true, the guest filesystem is read-only. 
.El .Ss VirtIO Block Device Settings In addition to the block device settings described above, each VirtIO block device supports the following settings: .Bl -column "model" "integer" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va ser Ta string Ta generated Ta Serial number of up to twenty characters. A default serial number is generated using a hash of the backing store's pathname. .El .Ss VirtIO Console Device Settings Each VirtIO Console device contains one or more console ports. Each port stores its settings in a node named .Dq port. Ns Va N under the controller's device node. The .Va N values are formatted as successive decimal values starting with 0. Each port supports the following settings: .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va name Ta string Ta Ta The name of the port exposed to the guest. .It Va path Ta path Ta Ta The path of a UNIX domain socket providing the host connection for the port. .El .Ss VirtIO Input Interface Settings Each VirtIO Input device contains one input event device. All input events of the input event device are sent to the guest by the VirtIO Input interface. VirtIO Input Interfaces support the following variables: .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va path Ta path Ta Ta The path of the input event device exposed to the guest. .El .Ss VirtIO Network Interface Settings In addition to the network backend settings, VirtIO network interfaces support the following variables: .Bl -column "Name" "MAC address" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va mac Ta MAC address Ta generated Ta MAC address. If an explicit address is not provided, a MAC address is generated from a hash of the device's PCI address. .It Va mtu Ta integer Ta 1500 Ta The largest supported MTU advertised to the guest. 
.El .Ss VirtIO SCSI Settings .Bl -column "Name" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va dev Ta path Ta Ta The path of a CAM target layer (CTL) device to export: .Pa /dev/cam/ctl Ns Oo Ar pp . Ns Ar vp Oc . .It Va iid Ta integer Ta 0 Ta Initiator ID to use when sending requests to the CTL port. .El .Sh SEE ALSO .Xr expand_number 3 , .Xr getaddrinfo 3 , .Xr strtol 3 , .Xr netgraph 4 , .Xr netmap 4 , .Xr ng_socket 4 , .Xr tap 4 , .Xr vale 4 , .Xr vmnet 4 , .Xr bhyve 8 diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index af4d15666ae3..665fec73e48c 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1607 +1,1621 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "bootrom.h" #include "config.h" #include "inout.h" #include "debug.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "kernemu_dev.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" +#include "qemu_fwcfg.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI 
(SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = 
"RDRAND",
	[EXIT_REASON_INVPCID] = "INVPCID",
	[EXIT_REASON_VMFUNC] = "VMFUNC",
	[EXIT_REASON_ENCLS] = "ENCLS",
	[EXIT_REASON_RDSEED] = "RDSEED",
	[EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
	[EXIT_REASON_XSAVES] = "XSAVES",
	[EXIT_REASON_XRSTORS] = "XRSTORS"
};

/*
 * Signature of a VM-exit handler.  The third argument points at the vCPU
 * number; vmexit_inout() below reads it with "*pvcpu".
 */
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);

/* Number of guest vCPUs; computed by calc_topology(). */
int guest_ncpus;
/* Guest CPU topology; each value is computed by calc_topology(). */
uint16_t cpu_cores, cpu_sockets, cpu_threads;

int raw_stdio = 0;

/* Program name printed (and used for column alignment) by usage(). */
static char *progname;
/* NOTE(review): presumably the index of the boot-strap processor -- confirm. */
static const int BSP = 0;

/*
 * Set of vCPUs that have been added: fbsdrun_addcpu() sets a bit,
 * fbsdrun_deletecpu() clears it.
 */
static cpuset_t cpumask;

static void vm_loop(struct vmctx *ctx, int vcpu);

/*
 * Event counters.
 * NOTE(review): presumably bumped by the matching vmexit handlers, which
 * are outside this part of the file -- confirm.
 */
static struct bhyvestats {
	uint64_t	vmexit_bogus;
	uint64_t	vmexit_reqidle;
	uint64_t	vmexit_hlt;
	uint64_t	vmexit_pause;
	uint64_t	vmexit_mtrap;
	uint64_t	vmexit_inst_emul;
	uint64_t	cpu_switch_rotate;
	uint64_t	cpu_switch_direct;
} stats;

/*
 * Per-vCPU thread bookkeeping, indexed by vCPU number.  An entry is filled
 * in by fbsdrun_addcpu() and handed to fbsdrun_start_thread() as the
 * thread argument.
 */
static struct mt_vmm_info {
	pthread_t	mt_thr;		/* thread created for this vCPU */
	struct vmctx	*mt_ctx;	/* VM context passed to vm_loop() */
	int		mt_vcpu;	/* vCPU number */
} *mt_vmm_info;

/*
 * Per-vCPU host CPU sets, allocated by build_vcpumaps().  A NULL entry
 * means no affinity is applied by fbsdrun_start_thread().
 */
static cpuset_t **vcpumap;

/*
 * Print the command line synopsis and option summary to stderr, then
 * exit with status "code".
 */
static void
usage(int code)
{

	fprintf(stderr,
	    "Usage: %s [-AaCDeHhPSuWwxY]\n"
	    " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
	    " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n"
	    " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n"
	    " -A: create ACPI tables\n"
	    " -a: local apic is in xAPIC mode (deprecated)\n"
	    " -C: include guest memory in core file\n"
	    " -c: number of CPUs and/or topology specification\n"
	    " -D: destroy on power-off\n"
	    " -e: exit on unhandled I/O access\n"
	    " -G: start a debug server\n"
	    " -H: vmexit from the guest on HLT\n"
	    " -h: help\n"
	    " -k: key=value flat config file\n"
	    " -K: PS2 keyboard layout\n"
	    " -l: LPC device configuration\n"
	    " -m: memory size\n"
	    " -o: set config 'var' to 'value'\n"
	    " -P: vmexit from the guest on pause\n"
	    " -p: pin 'vcpu' to 'hostcpu'\n"
#ifdef BHYVE_SNAPSHOT
	    " -r: path to checkpoint file\n"
#endif
	    " -S: guest memory cannot be swapped\n"
	    " -s: PCI slot config\n"
	    " -U: UUID\n"
	    " -u: RTC keeps UTC time\n"
	    " -W: force virtio to use single-vector MSI\n"
	    " -w: ignore unimplemented MSRs\n"
	    " -x: 
local APIC is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. */ static int topology_parse(const char *opt) { char *cp, *str, *tofree; if (*opt == '\0') { set_config_value("sockets", "1"); set_config_value("cores", "1"); set_config_value("threads", "1"); set_config_value("cpus", "1"); return (0); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) set_config_value("cpus", cp + strlen("cpus=")); else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) set_config_value("sockets", cp + strlen("sockets=")); else if (strncmp(cp, "cores=", strlen("cores=")) == 0) set_config_value("cores", cp + strlen("cores=")); else if (strncmp(cp, "threads=", strlen("threads=")) == 0) set_config_value("threads", cp + strlen("threads=")); #ifdef notyet /* Do not expose this until vmm.ko implements it */ else if (strncmp(cp, "maxcpus=", strlen("maxcpus=")) == 0) set_config_value("maxcpus", cp + strlen("maxcpus=")); #endif else if (strchr(cp, '=') != NULL) goto out; else set_config_value("cpus", cp); } free(tofree); return (0); out: free(tofree); return (-1); } static int parse_int_value(const char *key, const char *value, int minval, int maxval) { char *cp; long lval; errno = 0; lval = strtol(value, &cp, 0); if (errno != 0 || *cp != '\0' || cp == value || lval < minval || lval > maxval) errx(4, "Invalid value for %s: '%s'", key, value); return (lval); } /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. 
* * The limits of UINT16_MAX are due to the types passed to * vm_set_topology(). vmm.ko may enforce tighter limits. */ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. */ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } static int pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", value != NULL ? 
"," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; int error, restart_instruction; ctx = arg; restart_instruction = 1; error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { 
return (get_config_bool_default("virtio_msix", true)); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int error, vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(mtp->mt_thr, sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vcpu); #endif gdb_cpu_add(vcpu); vm_loop(mtp->mt_ctx, vcpu); /* not reached */ exit(1); return (NULL); } static void fbsdrun_addcpu(struct vmctx *ctx, int newcpu) { int error; error = vm_activate_cpu(ctx, newcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); vm_suspend_cpu(ctx, newcpu); mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); assert(error == 0); } static int fbsdrun_deletecpu(int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx __unused, struct vm_exit *vme __unused, int *pvcpu __unused, uint32_t eax __unused) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port, vme->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason, vmexit_vmx_desc(vme->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", 
vme->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(ctx, *pvcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu __unused) { assert(vme->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu __unused) { assert(vme->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx __unused, struct vm_exit *vme __unused, int *pvcpu __unused) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. 
*/ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx __unused, struct vm_exit *vme __unused, int *pvcpu __unused) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu) { assert(vme->inst_length == 0); stats.vmexit_mtrap++; #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(*pvcpu); #endif gdb_cpu_mtrap(*pvcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(*pvcpu); #endif return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int err, i, cs_d; struct vie *vie; enum vm_cpu_mode mode; stats.vmexit_inst_emul++; vie = &vme->u.inst_emul.vie; if (!vie->decoded) { /* * Attempt to decode in userspace as a fallback. This allows * updating instruction decode in bhyve without rebooting the * kernel (rapid prototyping), albeit with much slower * emulation. */ vie_restart(vie); mode = vme->u.inst_emul.paging.cpu_mode; cs_d = vme->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; if (vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RIP, vme->rip + vie->num_processed) != 0) goto fail; } err = emulate_mem(ctx, *pvcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vme->rip); return (VMEXIT_ABORT); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { enum vm_suspend_how how; how = vme->u.suspended.how; fbsdrun_deletecpu(*pvcpu); if (*pvcpu != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); 
pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx __unused, struct vm_exit *vme __unused, int *pvcpu) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(*pvcpu); #endif gdb_cpu_suspend(*pvcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(*pvcpu); #endif return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu) { gdb_cpu_breakpoint(*pvcpu, vme); return (VMEXIT_CONTINUE); } static int vmexit_ipi(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu __unused) { int error = -1; int i; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) { error = vm_suspend_cpu(ctx, i); if (error) { warnx("%s: failed to suspend cpu %d\n", __func__, i); break; } } break; case APIC_DELMODE_STARTUP: CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) { spinup_ap(ctx, i, vme->u.ipi.vector << PAGE_SHIFT); } error = 0; break; default: break; } return (error); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, 
[VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, [VM_EXITCODE_IPI] = vmexit_ipi, }; static void vm_loop(struct vmctx *ctx, int vcpu) { struct vm_exit vme; int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); while (1) { error = vm_run(ctx, vcpu, &vme); if (error != 0) break; exitcode = vme.exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } rc = (*handler[exitcode])(ctx, &vme, &vcpu); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; if (get_config_bool_default("x86.vmexit_on_hlt", false)) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if 
(get_config_bool_default("x86.x2apic", false)) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); err = vm_set_capability(ctx, cpu, VM_CAP_IPI_EXIT, 1); assert(err == 0); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0 /* maxcpus, unimplemented */); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } static void spinup_vcpu(struct vmctx *ctx, int vcpu) { int error; if (vcpu != BSP) { fbsdrun_set_capabilities(ctx, vcpu); /* * Enable the 'unrestricted guest' mode for APs. * * APs startup in power-on 16-bit mode. 
/*
 * Parse one "path=value" configuration option and store it.
 *
 * Returns false if 'option' has no '=' or nothing follows the '='.
 * Otherwise the text before the first '=' is used as the config path, the
 * rest as its value, and true is returned.  Allocation failure is fatal
 * (exit code 4).
 */
static bool
parse_config_option(const char *option)
{
	const char *value;
	char *path;

	value = strchr(option, '=');
	if (value == NULL || value[1] == '\0')
		return (false);

	path = strndup(option, value - option);
	if (path == NULL)
		err(4, "Failed to allocate memory");
	set_config_value(path, value + 1);

	/*
	 * set_config_value() does not keep the key pointer (other callers
	 * pass stack buffers or free the key right away), so release our
	 * temporary copy instead of leaking it.
	 */
	free(path);
	return (true);
}
optstr)) != -1) { switch (c) { case 'a': set_config_bool("x86.x2apic", false); break; case 'A': set_config_bool("acpi_tables", true); break; case 'D': set_config_bool("destroy_on_poweroff", true); break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': set_config_bool("memory.guest_in_core", true); break; case 'G': parse_gdb_options(optarg); break; case 'k': parse_simple_config_file(optarg); break; case 'K': set_config_value("keyboard.layout", optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': set_config_bool("memory.wired", true); break; case 'm': set_config_value("memory.size", optarg); break; case 'o': if (!parse_config_option(optarg)) errx(EX_USAGE, "invalid configuration option '%s'", optarg); break; case 'H': set_config_bool("x86.vmexit_on_hlt", true); break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. 
*/ break; case 'P': set_config_bool("x86.vmexit_on_pause", true); break; case 'e': set_config_bool("x86.strictio", true); break; case 'u': set_config_bool("rtc.use_localtime", false); break; case 'U': set_config_value("uuid", optarg); break; case 'w': set_config_bool("x86.strictmsr", false); break; case 'W': set_config_bool("virtio_msix", false); break; case 'x': set_config_bool("x86.x2apic", true); break; case 'Y': set_config_bool("x86.mptable", false); break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc > 1) usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(ctx, BSP); memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); error = vm_setup_memory(ctx, memsize, 
VM_MMAP_ALL); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(guest_ncpus); init_inout(); kernemu_dev_init(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx); sci_init(ctx); + if (qemu_fwcfg_init(ctx) != 0) { + fprintf(stderr, "qemu fwcfg initialization error"); + exit(4); + } + + if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), + &guest_ncpus) != 0) { + fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu"); + exit(4); + } + /* * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); init_gdb(ctx); if (lpc_bootrom()) { if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(ctx, BSP); assert(error == 0); } /* Allocate per-VCPU resources. */ mt_vmm_info = calloc(guest_ncpus, sizeof(*mt_vmm_info)); /* * Add all vCPUs. 
*/ for (int vcpu = 0; vcpu < guest_ncpus; vcpu++) { spinup_vcpu(ctx, vcpu); } #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { fprintf(stdout, "Pausing pci devs...\r\n"); if (vm_pause_user_devs() != 0) { fprintf(stderr, "Failed to pause PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring vm mem...\r\n"); if (restore_vm_mem(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore VM memory.\n"); exit(1); } fprintf(stdout, "Restoring pci devs...\r\n"); if (vm_restore_user_devs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring kernel structs...\r\n"); if (vm_restore_kern_structs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore kernel structs.\n"); exit(1); } fprintf(stdout, "Resuming pci devs...\r\n"); if (vm_resume_user_devs() != 0) { fprintf(stderr, "Failed to resume PCI device state.\n"); exit(1); } } #endif error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); if (error != 0) exit(4); if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } - if (lpc_bootrom()) + if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) { fwctl_init(); + } /* * Change the proc title to include the VM name. 
*/ setproctitle("%s", vmname); #ifdef BHYVE_SNAPSHOT /* initialize mutex/cond variables */ init_snapshot(); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) != 0) errx(EX_OSERR, "Failed to start checkpoint thread"); #endif #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { destroy_restore_state(&rstate); if (vm_restore_time(ctx) < 0) err(EX_OSERR, "Unable to restore time"); for (int i = 0; i < guest_ncpus; i++) { if (i == BSP) continue; vm_resume_cpu(ctx, i); } } #endif vm_resume_cpu(ctx, BSP); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c index 548726e27d0d..ad47230c005e 100644 --- a/usr.sbin/bhyve/pci_lpc.c +++ b/usr.sbin/bhyve/pci_lpc.c @@ -1,527 +1,543 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Neel Natu * Copyright (c) 2013 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
/*
 * Per-port state for the LPC UARTs, one entry per name in lpc_uart_names[].
 */
static struct lpc_uart_softc {
	struct uart_softc *uart_softc;	/* handle returned by uart_init() */
	int	iobase;			/* first I/O port; set by uart_legacy_alloc() */
	int	irq;			/* interrupt line; set by uart_legacy_alloc() */
	int	enabled;		/* set to 1 by lpc_init() once the port handler is registered */
} lpc_uart_softc[LPC_UART_NUM];
"com1,stdio" or "bootrom,/var/romfile" */ int lpc_device_parse(const char *opts) { int unit, error; char *str, *cpy, *lpcdev, *node_name; const char *romfile, *varfile; error = -1; str = cpy = strdup(opts); lpcdev = strsep(&str, ","); if (lpcdev != NULL) { if (strcasecmp(lpcdev, "bootrom") == 0) { romfile = strsep(&str, ","); if (romfile == NULL) { errx(4, "invalid bootrom option \"%s\"", opts); } set_config_value("lpc.bootrom", romfile); varfile = strsep(&str, ","); - if (varfile != NULL) { + if (varfile == NULL) { + error = 0; + goto done; + } + if (strchr(varfile, '=') == NULL) { set_config_value("lpc.bootvars", varfile); + } else { + /* varfile doesn't exist, it's another config + * option */ + pci_parse_legacy_config(find_config_node("lpc"), + varfile); } + pci_parse_legacy_config(find_config_node("lpc"), str); error = 0; goto done; } for (unit = 0; unit < LPC_UART_NUM; unit++) { if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { asprintf(&node_name, "lpc.%s.path", lpc_uart_names[unit]); set_config_value(node_name, str); free(node_name); error = 0; goto done; } } if (strcasecmp(lpcdev, pctestdev_getname()) == 0) { asprintf(&node_name, "lpc.%s", pctestdev_getname()); set_config_bool(node_name, true); free(node_name); error = 0; goto done; } } done: free(cpy); return (error); } void lpc_print_supported_devices(void) { size_t i; printf("bootrom\n"); for (i = 0; i < LPC_UART_NUM; i++) printf("%s\n", lpc_uart_names[i]); printf("%s\n", pctestdev_getname()); } const char * lpc_bootrom(void) { return (get_config_value("lpc.bootrom")); } +const char * +lpc_fwcfg(void) +{ + return (get_config_value("lpc.fwcfg")); +} + static void lpc_uart_intr_assert(void *arg) { struct lpc_uart_softc *sc = arg; assert(sc->irq >= 0); vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq); } static void lpc_uart_intr_deassert(void *arg __unused) { /* * The COM devices on the LPC bus generate edge triggered interrupts, * so nothing more to do here. 
*/ } static int lpc_uart_io_handler(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg) { int offset; struct lpc_uart_softc *sc = arg; offset = port - sc->iobase; switch (bytes) { case 1: if (in) *eax = uart_read(sc->uart_softc, offset); else uart_write(sc->uart_softc, offset, *eax); break; case 2: if (in) { *eax = uart_read(sc->uart_softc, offset); *eax |= uart_read(sc->uart_softc, offset + 1) << 8; } else { uart_write(sc->uart_softc, offset, *eax); uart_write(sc->uart_softc, offset + 1, *eax >> 8); } break; default: return (-1); } return (0); } static int lpc_init(struct vmctx *ctx) { struct lpc_uart_softc *sc; struct inout_port iop; const char *backend, *name; char *node_name; int unit, error; const nvlist_t *nvl; nvl = find_config_node("lpc"); if (nvl != NULL && nvlist_exists(nvl, "bootrom")) { error = bootrom_loadrom(ctx, nvl); if (error) return (error); } /* COM1 and COM2 */ for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; name = lpc_uart_names[unit]; if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) { EPRINTLN("Unable to allocate resources for " "LPC device %s", name); return (-1); } pci_irq_reserve(sc->irq); sc->uart_softc = uart_init(lpc_uart_intr_assert, lpc_uart_intr_deassert, sc); asprintf(&node_name, "lpc.%s.path", name); backend = get_config_value(node_name); free(node_name); if (uart_set_backend(sc->uart_softc, backend) != 0) { EPRINTLN("Unable to initialize backend '%s' " "for LPC device %s", backend, name); return (-1); } bzero(&iop, sizeof(struct inout_port)); iop.name = name; iop.port = sc->iobase; iop.size = UART_IO_BAR_SIZE; iop.flags = IOPORT_F_INOUT; iop.handler = lpc_uart_io_handler; iop.arg = sc; error = register_inout(&iop); assert(error == 0); sc->enabled = 1; } /* pc-testdev */ asprintf(&node_name, "lpc.%s", pctestdev_getname()); if (get_config_bool_default(node_name, false)) { error = pctestdev_init(ctx); if (error) return (error); } free(node_name); return (0); } static 
void pci_lpc_write_dsdt(struct pci_devinst *pi) { struct lpc_dsdt **ldpp, *ldp; dsdt_line(""); dsdt_line("Device (ISA)"); dsdt_line("{"); dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); dsdt_line(" {"); dsdt_line(" Offset (0x60),"); dsdt_line(" PIRA, 8,"); dsdt_line(" PIRB, 8,"); dsdt_line(" PIRC, 8,"); dsdt_line(" PIRD, 8,"); dsdt_line(" Offset (0x68),"); dsdt_line(" PIRE, 8,"); dsdt_line(" PIRF, 8,"); dsdt_line(" PIRG, 8,"); dsdt_line(" PIRH, 8"); dsdt_line(" }"); dsdt_line(""); dsdt_indent(1); SET_FOREACH(ldpp, lpc_dsdt_set) { ldp = *ldpp; ldp->handler(); } dsdt_line(""); dsdt_line("Device (PIC)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_ICU1, 2); dsdt_fixed_ioport(IO_ICU2, 2); dsdt_fixed_irq(2); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_line(""); dsdt_line("Device (TIMR)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_TIMER1_PORT, 4); dsdt_fixed_irq(0); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_unindent(1); dsdt_line("}"); } static void pci_lpc_sysres_dsdt(void) { struct lpc_sysres **lspp, *lsp; dsdt_line(""); dsdt_line("Device (SIO)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); SET_FOREACH(lspp, lpc_sysres_set) { lsp = *lspp; switch (lsp->type) { case LPC_SYSRES_IO: dsdt_fixed_ioport(lsp->base, lsp->length); break; case LPC_SYSRES_MEM: dsdt_fixed_mem32(lsp->base, lsp->length); break; } } dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } LPC_DSDT(pci_lpc_sysres_dsdt); static void pci_lpc_uart_dsdt(void) { struct lpc_uart_softc *sc; int unit; for (unit = 
0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; if (!sc->enabled) continue; dsdt_line(""); dsdt_line("Device (%s)", lpc_uart_acpi_names[unit]); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))"); dsdt_line(" Name (_UID, %d)", unit + 1); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE); dsdt_fixed_irq(sc->irq); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } } LPC_DSDT(pci_lpc_uart_dsdt); static int pci_lpc_cfgwrite(struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int pirq_pin; if (bytes == 1) { pirq_pin = 0; if (coff >= 0x60 && coff <= 0x63) pirq_pin = coff - 0x60 + 1; if (coff >= 0x68 && coff <= 0x6b) pirq_pin = coff - 0x68 + 5; if (pirq_pin != 0) { pirq_write(pi->pi_vmctx, pirq_pin, val); pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); return (0); } } return (-1); } static void pci_lpc_write(struct pci_devinst *pi __unused, int baridx __unused, uint64_t offset __unused, int size __unused, uint64_t value __unused) { } static uint64_t pci_lpc_read(struct pci_devinst *pi __unused, int baridx __unused, uint64_t offset __unused, int size __unused) { return (0); } #define LPC_DEV 0x7000 #define LPC_VENDOR 0x8086 static int pci_lpc_init(struct pci_devinst *pi, nvlist_t *nvl __unused) { /* * Do not allow more than one LPC bridge to be configured. */ if (lpc_bridge != NULL) { EPRINTLN("Only one LPC bridge is allowed."); return (-1); } /* * Enforce that the LPC can only be configured on bus 0. This * simplifies the ACPI DSDT because it can provide a decode for * all legacy i/o ports behind bus 0. 
*/ if (pi->pi_bus != 0) { EPRINTLN("LPC bridge can be present only on bus 0."); return (-1); } if (lpc_init(pi->pi_vmctx) != 0) return (-1); /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV); pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); lpc_bridge = pi; return (0); } char * lpc_pirq_name(int pin) { char *name; if (lpc_bridge == NULL) return (NULL); asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1); return (name); } void lpc_pirq_routed(void) { int pin; if (lpc_bridge == NULL) return; for (pin = 0; pin < 4; pin++) pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1)); for (pin = 0; pin < 4; pin++) pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); } #ifdef BHYVE_SNAPSHOT static int pci_lpc_snapshot(struct vm_snapshot_meta *meta) { int unit, ret; struct uart_softc *sc; for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = lpc_uart_softc[unit].uart_softc; ret = uart_snapshot(sc, meta); if (ret != 0) goto done; } done: return (ret); } #endif static const struct pci_devemu pci_de_lpc = { .pe_emu = "lpc", .pe_init = pci_lpc_init, .pe_write_dsdt = pci_lpc_write_dsdt, .pe_cfgwrite = pci_lpc_cfgwrite, .pe_barwrite = pci_lpc_write, .pe_barread = pci_lpc_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_lpc_snapshot, #endif }; PCI_EMUL_SET(pci_de_lpc); diff --git a/usr.sbin/bhyve/pci_lpc.h b/usr.sbin/bhyve/pci_lpc.h index 611b025d4386..ff3ea98b4f9c 100644 --- a/usr.sbin/bhyve/pci_lpc.h +++ b/usr.sbin/bhyve/pci_lpc.h @@ -1,76 +1,77 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LPC_H_ #define _LPC_H_ #include typedef void (*lpc_write_dsdt_t)(void); struct lpc_dsdt { lpc_write_dsdt_t handler; }; #define LPC_DSDT(handler) \ static struct lpc_dsdt __CONCAT(__lpc_dsdt, __LINE__) = { \ (handler), \ }; \ DATA_SET(lpc_dsdt_set, __CONCAT(__lpc_dsdt, __LINE__)) enum lpc_sysres_type { LPC_SYSRES_IO, LPC_SYSRES_MEM }; struct lpc_sysres { enum lpc_sysres_type type; uint32_t base; uint32_t length; }; #define LPC_SYSRES(type, base, length) \ static struct lpc_sysres __CONCAT(__lpc_sysres, __LINE__) = { \ (type), \ (base), \ (length) \ }; \ DATA_SET(lpc_sysres_set, __CONCAT(__lpc_sysres, __LINE__)) #define SYSRES_IO(base, length) LPC_SYSRES(LPC_SYSRES_IO, base, length) #define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length) int lpc_device_parse(const char *opt); void lpc_print_supported_devices(void); char *lpc_pirq_name(int pin); void lpc_pirq_routed(void); const char *lpc_bootrom(void); +const char *lpc_fwcfg(void); #endif diff --git a/usr.sbin/bhyve/qemu_fwcfg.c 
b/usr.sbin/bhyve/qemu_fwcfg.c index 5ee35d6764e6..1b0b5e3e9931 100644 --- a/usr.sbin/bhyve/qemu_fwcfg.c +++ b/usr.sbin/bhyve/qemu_fwcfg.c @@ -1,407 +1,420 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG * Author: Corvin Köhne */ #include #include #include #include #include #include #include #include "acpi_device.h" #include "inout.h" +#include "pci_lpc.h" #include "qemu_fwcfg.h" #define QEMU_FWCFG_ACPI_DEVICE_NAME "FWCF" #define QEMU_FWCFG_ACPI_HARDWARE_ID "QEMU0002" #define QEMU_FWCFG_SELECTOR_PORT_NUMBER 0x510 #define QEMU_FWCFG_SELECTOR_PORT_SIZE 1 #define QEMU_FWCFG_SELECTOR_PORT_FLAGS IOPORT_F_INOUT #define QEMU_FWCFG_DATA_PORT_NUMBER 0x511 #define QEMU_FWCFG_DATA_PORT_SIZE 1 #define QEMU_FWCFG_DATA_PORT_FLAGS \ IOPORT_F_INOUT /* QEMU v2.4+ ignores writes */ #define QEMU_FWCFG_ARCHITECTURE_MASK 0x0001 #define QEMU_FWCFG_INDEX_MASK 0x3FFF #define QEMU_FWCFG_SELECT_READ 0 #define QEMU_FWCFG_SELECT_WRITE 1 #define QEMU_FWCFG_ARCHITECTURE_GENERIC 0 #define QEMU_FWCFG_ARCHITECTURE_SPECIFIC 1 #define QEMU_FWCFG_INDEX_SIGNATURE 0x00 #define QEMU_FWCFG_INDEX_ID 0x01 #define QEMU_FWCFG_INDEX_FILE_DIR 0x19 #define QEMU_FWCFG_FIRST_FILE_INDEX 0x20 #define QEMU_FWCFG_MIN_FILES 10 #pragma pack(1) union qemu_fwcfg_selector { struct { uint16_t index : 14; uint16_t writeable : 1; uint16_t architecture : 1; }; uint16_t bits; }; struct qemu_fwcfg_signature { uint8_t signature[4]; }; struct qemu_fwcfg_id { uint32_t interface : 1; /* always set */ uint32_t DMA : 1; uint32_t reserved : 30; }; struct qemu_fwcfg_file { uint32_t be_size; uint16_t be_selector; uint16_t reserved; uint8_t name[QEMU_FWCFG_MAX_NAME]; }; struct qemu_fwcfg_directory { uint32_t be_count; struct qemu_fwcfg_file files[0]; }; #pragma pack() struct qemu_fwcfg_softc { struct acpi_device *acpi_dev; uint32_t data_offset; union qemu_fwcfg_selector selector; struct qemu_fwcfg_item items[QEMU_FWCFG_MAX_ARCHS] [QEMU_FWCFG_MAX_ENTRIES]; struct 
qemu_fwcfg_directory *directory; }; static struct qemu_fwcfg_softc fwcfg_sc; static int qemu_fwcfg_selector_port_handler(struct vmctx *const ctx __unused, const int in, const int port __unused, const int bytes, uint32_t *const eax, void *const arg __unused) { if (bytes != sizeof(uint16_t)) { warnx("%s: invalid size (%d) of IO port access", __func__, bytes); return (-1); } if (in) { *eax = htole16(fwcfg_sc.selector.bits); return (0); } fwcfg_sc.data_offset = 0; fwcfg_sc.selector.bits = le16toh(*eax); return (0); } static int qemu_fwcfg_data_port_handler(struct vmctx *const ctx __unused, const int in, const int port __unused, const int bytes, uint32_t *const eax, void *const arg __unused) { if (bytes != sizeof(uint8_t)) { warnx("%s: invalid size (%d) of IO port access", __func__, bytes); return (-1); } if (!in) { warnx("%s: Writes to qemu fwcfg data port aren't allowed", __func__); return (-1); } /* get fwcfg item */ struct qemu_fwcfg_item *const item = &fwcfg_sc.items[fwcfg_sc.selector.architecture] [fwcfg_sc.selector.index]; if (item->data == NULL) { warnx( "%s: qemu fwcfg item doesn't exist (architecture %s index 0x%x)", __func__, fwcfg_sc.selector.architecture ? "specific" : "generic", fwcfg_sc.selector.index); *eax = 0x00; return (0); } else if (fwcfg_sc.data_offset >= item->size) { warnx( "%s: qemu fwcfg item read exceeds size (architecture %s index 0x%x size 0x%x offset 0x%x)", __func__, fwcfg_sc.selector.architecture ? 
"specific" : "generic", fwcfg_sc.selector.index, item->size, fwcfg_sc.data_offset); *eax = 0x00; return (0); } /* return item data */ *eax = item->data[fwcfg_sc.data_offset]; fwcfg_sc.data_offset++; return (0); } static int qemu_fwcfg_add_item(const uint16_t architecture, const uint16_t index, const uint32_t size, void *const data) { /* truncate architecture and index to their desired size */ const uint16_t arch = architecture & QEMU_FWCFG_ARCHITECTURE_MASK; const uint16_t idx = index & QEMU_FWCFG_INDEX_MASK; /* get pointer to item specified by selector */ struct qemu_fwcfg_item *const fwcfg_item = &fwcfg_sc.items[arch][idx]; /* check if item is already used */ if (fwcfg_item->data != NULL) { warnx("%s: qemu fwcfg item exists (architecture %s index 0x%x)", __func__, arch ? "specific" : "generic", idx); return (-1); } /* save data of the item */ fwcfg_item->size = size; fwcfg_item->data = data; return (0); } static int qemu_fwcfg_add_item_file_dir(void) { const size_t size = sizeof(struct qemu_fwcfg_directory) + QEMU_FWCFG_MIN_FILES * sizeof(struct qemu_fwcfg_file); struct qemu_fwcfg_directory *const fwcfg_directory = calloc(1, size); if (fwcfg_directory == NULL) { return (ENOMEM); } fwcfg_sc.directory = fwcfg_directory; return (qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC, QEMU_FWCFG_INDEX_FILE_DIR, sizeof(struct qemu_fwcfg_directory), (uint8_t *)fwcfg_sc.directory)); } static int qemu_fwcfg_add_item_id(void) { struct qemu_fwcfg_id *const fwcfg_id = calloc(1, sizeof(struct qemu_fwcfg_id)); if (fwcfg_id == NULL) { return (ENOMEM); } fwcfg_id->interface = 1; fwcfg_id->DMA = 0; uint32_t *const le_fwcfg_id_ptr = (uint32_t *)fwcfg_id; *le_fwcfg_id_ptr = htole32(*le_fwcfg_id_ptr); return (qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC, QEMU_FWCFG_INDEX_ID, sizeof(struct qemu_fwcfg_id), (uint8_t *)fwcfg_id)); } static int qemu_fwcfg_add_item_signature(void) { struct qemu_fwcfg_signature *const fwcfg_signature = calloc(1, sizeof(struct qemu_fwcfg_signature)); 
if (fwcfg_signature == NULL) { return (ENOMEM); } fwcfg_signature->signature[0] = 'Q'; fwcfg_signature->signature[1] = 'E'; fwcfg_signature->signature[2] = 'M'; fwcfg_signature->signature[3] = 'U'; return (qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC, QEMU_FWCFG_INDEX_SIGNATURE, sizeof(struct qemu_fwcfg_signature), (uint8_t *)fwcfg_signature)); } static int qemu_fwcfg_register_port(const char *const name, const int port, const int size, const int flags, const inout_func_t handler) { struct inout_port iop; bzero(&iop, sizeof(iop)); iop.name = name; iop.port = port; iop.size = size; iop.flags = flags; iop.handler = handler; return (register_inout(&iop)); } int qemu_fwcfg_add_file(const uint8_t name[QEMU_FWCFG_MAX_NAME], const uint32_t size, void *const data) { /* * QEMU specifies count as big endian. * Convert it to host endian to work with it. */ const uint32_t count = be32toh(fwcfg_sc.directory->be_count) + 1; /* add file to items list */ const uint32_t index = QEMU_FWCFG_FIRST_FILE_INDEX + count - 1; const int error = qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC, index, size, data); if (error != 0) { return (error); } /* * files should be sorted alphabetical, get index for new file */ uint32_t file_index; for (file_index = 0; file_index < count - 1; ++file_index) { if (strcmp(name, fwcfg_sc.directory->files[file_index].name) < 0) break; } if (count > QEMU_FWCFG_MIN_FILES) { /* alloc new file directory */ const uint64_t new_size = sizeof(struct qemu_fwcfg_directory) + count * sizeof(struct qemu_fwcfg_file); struct qemu_fwcfg_directory *const new_directory = calloc(1, new_size); if (new_directory == NULL) { warnx( "%s: Unable to allocate a new qemu fwcfg files directory (count %d)", __func__, count); return (-ENOMEM); } /* copy files below file_index to new directory */ memcpy(new_directory->files, fwcfg_sc.directory->files, file_index * sizeof(struct qemu_fwcfg_file)); /* copy files above file_index to directory */ 
memcpy(&new_directory->files[file_index + 1], &fwcfg_sc.directory->files[file_index], (count - file_index) * sizeof(struct qemu_fwcfg_file)); /* free old directory */ free(fwcfg_sc.directory); /* set directory pointer to new directory */ fwcfg_sc.directory = new_directory; /* adjust directory pointer */ fwcfg_sc.items[0][QEMU_FWCFG_INDEX_FILE_DIR].data = (uint8_t *)fwcfg_sc.directory; } else { /* shift files behind file_index */ for (uint32_t i = QEMU_FWCFG_MIN_FILES - 1; i > file_index; --i) { memcpy(&fwcfg_sc.directory->files[i], &fwcfg_sc.directory->files[i - 1], sizeof(struct qemu_fwcfg_file)); } } /* * QEMU specifies count, size and index as big endian. * Save these values in big endian to simplify guest reads of these * values. */ fwcfg_sc.directory->be_count = htobe32(count); fwcfg_sc.directory->files[file_index].be_size = htobe32(size); fwcfg_sc.directory->files[file_index].be_selector = htobe16(index); strcpy(fwcfg_sc.directory->files[file_index].name, name); /* set new size for the fwcfg_file_directory */ fwcfg_sc.items[0][QEMU_FWCFG_INDEX_FILE_DIR].size = sizeof(struct qemu_fwcfg_directory) + count * sizeof(struct qemu_fwcfg_file); return (0); } int qemu_fwcfg_init(struct vmctx *const ctx) { int error; - error = acpi_device_create(&fwcfg_sc.acpi_dev, ctx, - QEMU_FWCFG_ACPI_DEVICE_NAME, QEMU_FWCFG_ACPI_HARDWARE_ID); - if (error) { - warnx("%s: failed to create ACPI device for QEMU FwCfg", - __func__); - goto done; - } + /* + * Bhyve supports fwctl (bhyve) and fwcfg (qemu) as firmware interfaces. + * Both are using the same ports. So, it's not possible to provide both + * interfaces at the same time to the guest. Therefore, only create acpi + * tables and register io ports for fwcfg, if it's used. 
+ */ + if (strcmp(lpc_fwcfg(), "qemu") == 0) { + error = acpi_device_create(&fwcfg_sc.acpi_dev, ctx, + QEMU_FWCFG_ACPI_DEVICE_NAME, QEMU_FWCFG_ACPI_HARDWARE_ID); + if (error) { + warnx("%s: failed to create ACPI device for QEMU FwCfg", + __func__); + goto done; + } - error = acpi_device_add_res_fixed_ioport(fwcfg_sc.acpi_dev, - QEMU_FWCFG_SELECTOR_PORT_NUMBER, 2); - if (error) { - warnx("%s: failed to add fixed IO port for QEMU FwCfg", - __func__); - goto done; - } + error = acpi_device_add_res_fixed_ioport(fwcfg_sc.acpi_dev, + QEMU_FWCFG_SELECTOR_PORT_NUMBER, 2); + if (error) { + warnx("%s: failed to add fixed IO port for QEMU FwCfg", + __func__); + goto done; + } - /* add handlers for fwcfg ports */ - if ((error = qemu_fwcfg_register_port("qemu_fwcfg_selector", - QEMU_FWCFG_SELECTOR_PORT_NUMBER, QEMU_FWCFG_SELECTOR_PORT_SIZE, - QEMU_FWCFG_SELECTOR_PORT_FLAGS, - qemu_fwcfg_selector_port_handler)) != 0) { - warnx("%s: Unable to register qemu fwcfg selector port 0x%x", - __func__, QEMU_FWCFG_SELECTOR_PORT_NUMBER); - goto done; - } - if ((error = qemu_fwcfg_register_port("qemu_fwcfg_data", - QEMU_FWCFG_DATA_PORT_NUMBER, QEMU_FWCFG_DATA_PORT_SIZE, - QEMU_FWCFG_DATA_PORT_FLAGS, qemu_fwcfg_data_port_handler)) != 0) { - warnx("%s: Unable to register qemu fwcfg data port 0x%x", - __func__, QEMU_FWCFG_DATA_PORT_NUMBER); - goto done; + /* add handlers for fwcfg ports */ + if ((error = qemu_fwcfg_register_port("qemu_fwcfg_selector", + QEMU_FWCFG_SELECTOR_PORT_NUMBER, + QEMU_FWCFG_SELECTOR_PORT_SIZE, + QEMU_FWCFG_SELECTOR_PORT_FLAGS, + qemu_fwcfg_selector_port_handler)) != 0) { + warnx( + "%s: Unable to register qemu fwcfg selector port 0x%x", + __func__, QEMU_FWCFG_SELECTOR_PORT_NUMBER); + goto done; + } + if ((error = qemu_fwcfg_register_port("qemu_fwcfg_data", + QEMU_FWCFG_DATA_PORT_NUMBER, QEMU_FWCFG_DATA_PORT_SIZE, + QEMU_FWCFG_DATA_PORT_FLAGS, + qemu_fwcfg_data_port_handler)) != 0) { + warnx( + "%s: Unable to register qemu fwcfg data port 0x%x", + __func__, 
QEMU_FWCFG_DATA_PORT_NUMBER); + goto done; + } } /* add common fwcfg items */ if ((error = qemu_fwcfg_add_item_signature()) != 0) { warnx("%s: Unable to add signature item", __func__); goto done; } if ((error = qemu_fwcfg_add_item_id()) != 0) { warnx("%s: Unable to add id item", __func__); goto done; } if ((error = qemu_fwcfg_add_item_file_dir()) != 0) { warnx("%s: Unable to add file_dir item", __func__); goto done; } done: if (error) { acpi_device_destroy(fwcfg_sc.acpi_dev); } return (error); }