diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index 9b8a7274d793..d4909fb87b76 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -1,141 +1,142 @@ # # $FreeBSD$ # .include CFLAGS+=-I${.CURDIR}/../../contrib/lib9p CFLAGS+=-I${SRCTOP}/sys .PATH: ${SRCTOP}/sys/libkern ${SRCTOP}/sys/cam/ctl PROG= bhyve PACKAGE= bhyve MAN= bhyve.8 bhyve_config.5 BHYVE_SYSDIR?=${SRCTOP} SRCS= \ acpi.c \ acpi_device.c \ atkbdc.c \ audio.c \ basl.c \ bhyvegc.c \ bhyverun.c \ block_if.c \ bootrom.c \ config.c \ console.c \ crc16.c \ ctl_scsi_all.c \ ctl_util.c \ e820.c \ fwctl.c \ gdb.c \ hda_codec.c \ inout.c \ ioapic.c \ iov.c \ kernemu_dev.c \ mem.c \ mevent.c \ mptbl.c \ net_backends.c \ net_utils.c \ pci_ahci.c \ pci_e82545.c \ pci_emul.c \ pci_fbuf.c \ pci_gvt-d.c \ pci_hda.c \ pci_hostbridge.c \ pci_irq.c \ pci_lpc.c \ pci_nvme.c \ pci_passthru.c \ pci_uart.c \ pci_virtio_9p.c \ pci_virtio_block.c \ pci_virtio_console.c \ pci_virtio_input.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ pci_virtio_scsi.c \ pci_xhci.c \ pctestdev.c \ pm.c \ post.c \ ps2kbd.c \ ps2mouse.c \ qemu_fwcfg.c \ qemu_loader.c \ rfb.c \ rtc.c \ smbiostbl.c \ sockstream.c \ spinup_ap.c \ task_switch.c \ tpm_device.c \ tpm_emul_passthru.c \ tpm_intf_crb.c \ uart_emul.c \ usb_emul.c \ usb_mouse.c \ vga.c \ virtio.c \ vmgenc.c \ xmsr.c .if ${MK_BHYVE_SNAPSHOT} != "no" SRCS+= snapshot.c +SRCS+= migration.c .endif CFLAGS.kernemu_dev.c+= -I${SRCTOP}/sys/amd64 .PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm SRCS+= vmm_instruction_emul.c LIBADD= vmmapi md nv pthread z util sbuf cam 9p .if ${MK_BHYVE_SNAPSHOT} != "no" LIBADD+= ucl xo .endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET .endif .if ${MK_INET6_SUPPORT} != "no" CFLAGS+=-DINET6 .endif .if ${MK_NETGRAPH_SUPPORT} != "no" CFLAGS+=-DNETGRAPH LIBADD+= netgraph .endif .if ${MK_OPENSSL} == "no" CFLAGS+=-DNO_OPENSSL .else LIBADD+= crypto CFLAGS+=-DOPENSSL_API_COMPAT=0x10100000L .endif CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller .if ${MK_BHYVE_SNAPSHOT} != "no" CFLAGS+= -I${SRCTOP}/contrib/libucl/include CFLAGS+= -DBHYVE_SNAPSHOT .endif .ifdef GDB_LOG CFLAGS+=-DGDB_LOG .endif # Disable thread safety analysis since it only finds very simple bugs and # yields many false positives. NO_WTHREAD_SAFETY= NO_WCAST_ALIGN= SUBDIR= kbdlayout .include diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 38891ab33c40..479ab75be60f 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -1,1036 +1,1048 @@ .\" Copyright (c) 2013 Peter Grehan .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd May 5, 2023 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl AaCDeHhPSuWwxY .Oo .Sm off .Fl c\~ .Oo .Op Cm cpus= .Ar numcpus .Oc .Op Cm ,sockets= Ar n .Op Cm ,cores= Ar n .Op Cm ,threads= Ar n .Oc .Sm on .Oo Fl f .Sm off .Ar name Cm \&, .Oo .Cm string No | Cm file .Oc .Cm \&= Ar data .Sm on .Oc .Oo .Sm off .Fl G\~ .Oo Ar w Oc .Oo Ar bind_address Cm \&: Oc .Ar port .Sm on .Oc .Op Fl k Ar config_file .Op Fl K Ar layout .Oo Fl l .Sm off .Ar lpcdev Op Cm \&, Ar conf .Sm on .Oc .Sm off .Oo Fl m\~ .Ar memsize .Oo .Cm K | Cm k | Cm M | Cm m | Cm G | Cm g | Cm T | Cm t .Oc .Sm on .Oc .Op Fl o Ar var Ns Cm = Ns Ar value .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu .Op Fl r Ar file +.Oo Fl R +.Sm off +.Ar host Op Cm \&: Ar port +.Sm on +.Oc .Sm off .Oo Fl s\~ .Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf .Sm on .Oc .Op Fl U Ar uuid .Ar vmname .Nm .Fl l Cm help .Nm .Fl s Cm help .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise, it is enough to run .Nm with a boot ROM of choice. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl A Generate ACPI tables. Required for .Fx Ns /amd64 guests. .It Fl a The guest's local APIC is configured in xAPIC mode. The xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl C Include guest memory in core file. .It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. The current maximum number of guest virtual CPUs is 16. If .Ar numcpus is not specified then it will be calculated from the other arguments. The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. .It Fl D Destroy the VM on guest initiated power-off. .It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes. .It Fl f Ar name Ns Cm \&, Ns Oo Cm string Ns No | Ns Cm file Ns Oc Ns Cm \&= Ns Ar data Add a fw_cfg file .Ar name to the fw_cfg interface. If a .Cm string is specified, the fw_cfg file contains the string as data. If a .Cm file is specified, bhyve reads the file and adds the file content as fw_cfg data. .It Fl G Xo .Sm off .Oo Ar w Oc .Oo Ar bind_address Cm \&: Oc .Ar port .Sm on .Xc Start a debug server that uses the GDB protocol to export guest state to a debugger. An IPv4 TCP socket will be bound to the supplied .Ar bind_address and .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If the option begins with .Sq w , .Nm will pause execution at the first instruction waiting for a debugger to attach. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. .It Fl h Print help message and exit. .It Fl k Ar config_file Set configuration variables from a simple, key-value config file. Each line of the config file is expected to consist of a config variable name, an equals sign .It Fl K Ar layout Specify the keyboard layout. The value that can be specified sets the file name in .Ar /usr/share/bhyve/kbdlayout . This specification only works when loaded with UEFI mode for VNC. When using a VNC client that supports QEMU Extended Key Event Message (e.g. TigerVNC), this option isn't needed. When using a VNC client that doesn't support QEMU Extended Key Event Message (e.g. tightVNC), the layout defaults to the US keyboard unless specified otherwise. .Pq Sq = , and a value. No spaces are permitted between the variable name, equals sign, or value. Blank lines and lines starting with .Sq # are ignored. See .Xr bhyve_config 5 for more details. .It Fl l Cm help Print a list of supported LPC devices. .It Fl l Ar lpcdev Ns Op Cm \&, Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Cm com1 , com2 , com3 , and .Cm com4 , the boot ROM device .Cm bootrom , the .Cm fwcfg type and the debug/test device .Cm pc-testdev . .Pp The possible values for the .Ar conf argument are listed in the .Fl s flag description. .It Xo .Fl m Ar memsize Ns Oo .Sm off .Cm K | k | M | m | G | g | T | t .Sm on .Oc .Xc Set the guest physical memory size This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of .Cm K , M , G or .Cm T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. .Pp The default is 256M. .It Fl o Ar var Ns Cm = Ns Ar value Set the configuration variable .Ar var to .Ar value . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. .It Fl p Ar vcpu Ns Cm \& : Ns Ar hostcpu Pin guest's virtual CPU .Em vcpu to .Em hostcpu . .It Fl r Ar file Resume a guest from a snapshot. The guest memory contents are restored from .Ar file , and the guest device and vCPU state are restored from the file .Dq Ar file Ns .kern . .Pp Note that the current snapshot file format requires that the configuration of devices in the new VM match the VM from which the snapshot was taken by specifying the same .Fl s and .Fl l options. The count of vCPUs and memory configuration are read from the snapshot. +.It Fl R Ar host Ns Op Cm \&: Ns Ar port +Receive migration from a source guest. +Await for a connection from +.Ar host +on the specified +.Ar port +and resume execution. The default migration port is 24983. .It Fl S Wire guest memory. .It Fl s Cm help Print a list of supported PCI devices. .It Fl s Ar slot Ns Cm \&, Ns Ar emulation Ns Op Cm \&, Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. .Pp The .Ar slot can be specified in one of the following formats: .Pp .Bl -bullet -compact .It .Ar pcislot .It .Sm off .Ar pcislot Cm \&: Ar function .Sm on .It .Sm off .Ar bus Cm \&: Ar pcislot Cm \&: Ar function .Sm on .El .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. .Pp The .Ar emulation argument can be one of the following: .Bl -tag -width "amd_hostbridge" .It Cm hostbridge A simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. .It Cm amd_hostbridge Emulation identical to .Cm hostbridge using a PCI vendor ID of AMD. .It Cm passthru PCI pass-through device. .It Cm virtio-net Virtio network interface. .It Cm virtio-blk Virtio block storage interface. .It Cm virtio-scsi Virtio SCSI interface. .It Cm virtio-9p Virtio 9p (VirtFS) interface. .It Cm virtio-rnd Virtio RNG interface. .It Cm virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. .It Cm virtio-input Virtio input interface. .It Cm ahci AHCI controller attached to arbitrary devices. .It Cm ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Cm ahci-hd AHCI controller attached to a SATA hard drive. .It Cm e1000 Intel e82545 network interface. .It Cm uart PCI 16550 serial device. .It Cm lpc LPC PCI-ISA bridge with COM1, COM2, COM3, and COM4 16550 serial ports, a boot ROM, and, optionally, a fwcfg type and the debug/test device. The LPC bridge emulation can only be configured on bus 0. .It Cm fbuf Raw framebuffer device attached to VNC server. .It Cm xhci eXtensible Host Controller Interface (xHCI) USB controller. .It Cm nvme NVM Express (NVMe) controller. .It Cm hda High Definition Audio Controller. .El .Pp The optional parameter .Ar conf describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Pp Network device backends: .Sm off .Bl -bullet .It .Xo .Cm tap Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm vmnet Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm netgraph,path= Ar ADDRESS Cm \&,peerhook= Ar HOOK .Op Cm \&,socket= Ar NAME .Op Cm \&,hook= Ar HOOK .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .El .Sm on .Pp If .Cm mac is not specified, the MAC address is derived from a fixed OUI and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .Pp With .Cm virtio-net devices, the .Cm mtu parameter can be specified to inform the guest about the largest MTU that should be allowed, expressed in bytes. .Pp With .Cm netgraph backend, the .Cm path and .Cm peerhook parameters must be specified to set the destination node and corresponding hook. The optional parameters .Cm socket and .Cm hook may be used to set the .Xr ng_socket 4 node name and source hook. The .Ar ADDRESS , .Ar HOOK , and .Ar NAME must comply with .Xr netgraph 4 addressing rules. .Pp Block storage device backends: .Sm off .Bl -bullet .It .Ar /filename Op Cm \&, Ar block-device-options .It .Ar /dev/xxx Op Cm \&, Ar block-device-options .El .Sm on .Pp The .Ar block-device-options are: .Bl -tag -width 10n .It Cm nocache Open the file with .Dv O_DIRECT . .It Cm direct Open the file using .Dv O_SYNC . .It Cm ro Force the file to be opened read-only. .It Cm sectorsize= Ns Ar logical Ns Oo Cm \&/ Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. .It Cm nodelete Disable emulation of guest trim requests via .Dv DIOCGDELETE requests. .El .Pp SCSI device backends: .Sm off .Bl -bullet .It .Pa /dev/cam/ctl Oo Ar pp Cm \&. Ar vp Oc Oo Cm \&, Ar scsi-device-options Oc .El .Sm on .Pp The .Ar scsi-device-options are: .Bl -tag -width 10n .It Cm iid= Ns Ar IID Initiator ID to use when sending requests to specified CTL port. The default value is 0. .El .Pp 9P device backends: .Sm off .Bl -bullet .It .Ar sharename Cm = Ar /path/to/share Op Cm \&, Ar 9p-device-options .El .Sm on .Pp The .Ar 9p-device-options are: .Bl -tag -width 10n .It Cm ro Expose the share in read-only mode. .El .Pp TTY device backends: .Bl -tag -width 10n .It Cm stdio Connect the serial port to the standard input and output of the .Nm process. .It Ar /dev/xxx Use the host TTY device for serial port I/O. .El .Pp Boot ROM device backends: .Bl -tag -width 10n .It Ar romfile Ns Op Cm \&, Ns Ar varfile Map .Ar romfile in the guest address space reserved for boot firmware. If .Ar varfile is provided, that file is also mapped in the boot firmware guest address space, and any modifications the guest makes will be saved to that file. .El .Pp Fwcfg types: .Bl -tag -width 10n .It Ar fwcfg The fwcfg interface is used to pass information such as the CPU count or ACPI tables to the guest firmware. Supported values are .Ql bhyve and .Ql qemu . Due to backward compatibility reasons, .Ql bhyve is the default option. When .Ql bhyve is used, bhyve's fwctl interface is used. It currently reports only the CPU count to the guest firmware. The .Ql qemu option uses QEMU's fwcfg interface. This interface is widely used and allows user-defined information to be passed to the guest. It is used for passing the CPU count, ACPI tables, a boot order and many other things to the guest. Some operating systems such as Fedora CoreOS can be configured by qemu's fwcfg interface as well. .El .Pp Pass-through device backends: .Sm off .Bl -bullet .It .Cm ppt Ar N Oo , Ar passthru-device-options Oc .It .Ns Ar bus Cm \&/ Ar slot Cm \&/ Ar function .Op , Ar passthru-device-options .It .Cm pci Ar bus Cm : Ar slot Cm : Ns Ar function .Op , Ar passthru-device-options .El .Sm on .Pp Connect to a PCI device on the host either named ppt .Ns Ar N or at the selector described by .Ar slot , .Ar bus , and .Ar function numbers. .Pp The .Ar passthru-device-options are: .Bl -tag -width 10n .It Cm rom= Ns Ar romfile Add .Ar romfile as option ROM to the PCI device. The ROM will be loaded by firmware and should be capable of initializing the device. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdevs loader variable as described in .Xr vmm 4 . .Pp Virtio console device backends: .Bl -bullet .Sm off .It .Cm port1= Ns Ar /path/to/port1.sock Ns Op Cm ,port Ns Ar N Cm \&= Ns Ar /path/to/port2.sock No \~ Ar ... .Sm on .El .Pp A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet .It Due to lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It There is no way to use the .Dq console port feature, nor the console port resize at present. .It Emergency write is advertised, but no-op at present. .El .Pp Virtio input device backends: .Bl -tag -width 10n .It Ar /dev/input/eventX Send input events of .Ar /dev/input/eventX to guest by VirtIO Input Interface. .El .Pp Framebuffer devices backends: .Bl -bullet .Sm off .It .Op Cm rfb= Ar ip-and-port .Op Cm ,w= Ar width .Op Cm ,h= Ar height .Op Cm ,vga= Ar vgaconf .Op Cm ,wait .Op Cm ,password= Ar password .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm rfb= Ns Ar ip-and-port Pq or Cm tcp= Ns Ar ip-and-port An IP address and a port VNC should listen on. There are two formats: .Pp .Bl -bullet -compact .It .Sm off .Op Ar IPv4 Cm \&: .Ar port .Sm on .It .Sm off .Cm \&[ Ar IPv6%zone Cm \&] Cm \&: Ar port .Sm on .El .Pp The default is to listen on localhost IPv4 address and default VNC port 5900. An IPv6 address must be enclosed in square brackets and may contain an optional zone identifier. .It Cm w= Ns Ar width No and Cm h= Ns Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. Minimal supported resolution is 640x480 pixels, and maximum is 1920x1200 pixels. .It Cm vga= Ns Ar vgaconf Possible values for this option are .Cm io (default), .Cm on , and .Cm off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 . The default .Cm io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Cm on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Cm off option should be used for the UEFI guests that assume that VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. .It Cm wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It Cm password= Ns Ar password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .Pp xHCI USB device backends: .Bl -tag -width 10n .It Cm tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Pp NVMe device backends: .Bl -bullet .Sm off .It .Ar devpath .Op Cm ,maxq= Ar # .Op Cm ,qsz= Ar # .Op Cm ,ioslots= Ar # .Op Cm ,sectsz= Ar # .Op Cm ,ser= Ar # .Op Cm ,eui64= Ar # .Op Cm ,dsm= Ar opt .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Ar devpath Accepted device paths are: .Ar /dev/blockdev or .Ar /path/to/image or .Cm ram= Ns Ar size_in_MiB . .It Cm maxq Max number of queues. .It Cm qsz Max elements in each queue. .It Cm ioslots Max number of concurrent I/O requests. .It Cm sectsz Sector size (defaults to blockif sector size). .It Cm ser Serial number with maximum 20 characters. .It Cm eui64 IEEE Extended Unique Identifier (8 byte value). .It Cm dsm DataSet Management support. Supported values are: .Cm auto , enable , and .Cm disable . .El .Pp AHCI device backends: .Bl -bullet .It .Sm off .Op Oo Cm hd\&: | cd\&: Oc Ar path .Op Cm ,nmrr= Ar nmrr .Op Cm ,ser= Ar # .Op Cm ,rev= Ar # .Op Cm ,model= Ar # .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm nmrr Nominal Media Rotation Rate, known as RPM. Value 1 will indicate device as Solid State Disk. Default value is 0, not report. .It Cm ser Serial Number with maximum 20 characters. .It Cm rev Revision Number with maximum 8 characters. .It Cm model Model Number with maximum 40 characters. .El .Pp HD Audio device backends: .Bl -bullet .It .Sm off .Op Cm play= Ar playback .Op Cm ,rec= Ar recording .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm play Playback device, typically .Ar /dev/dsp0 . .It Cm rec Recording device, typically .Ar /dev/dsp0 . .El .It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl u RTC keeps UTC time. .It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl x The guest's local APIC is configured in x2APIC mode. .It Fl Y Disable MPtable generation. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El .Sh CONFIGURATION VARIABLES .Nm uses an internal tree of configuration variables to describe global and per-device settings. When .Nm starts, it parses command line options (including config files) in the order given on the command line. Each command line option sets one or more configuration variables. For example, the .Fl s option creates a new tree node for a PCI device and sets one or more variables under that node including the device model and device model-specific variables. Variables may be set multiple times during this parsing stage with the final value overriding previous values. .Pp Once all of the command line options have been processed, the configuration values are frozen. .Nm then uses the value of configuration values to initialize device models and global settings. .Pp More details on configuration variables can be found in .Xr bhyve_config 5 . .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. .Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read and written by the debugger. Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp Breakpoints are supported on Intel CPUs that support single stepping. Note that continuing from a breakpoint while interrupts are enabled in the guest may not work as expected due to timer interrupts firing while single stepping over the breakpoint. .Sh SIGNAL HANDLING .Nm deals with the following signals: .Pp .Bl -tag -width SIGTERM -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 triple fault .It 4 exited due to an error .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run. Otherwise, the boot loader is not needed. .Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -A -H -P -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -A -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device. .Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -A -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at: 0.0.0.0:5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VNC display that is bound to all IPv6 addresses on port 5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VARS file to save EFI variables. Note that .Nm will write guest modifications to the given VARS file. Be sure to create a per-guest copy of the template VARS file from .Pa /usr . .Bd -literal -offset indent bhyve -c 2 -m 4g -w -H \\ -s 0,hostbridge \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI_CODE.fd,BHYVE_UEFI_VARS.fd uefivm .Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr netgraph 4 , .Xr ng_socket 4 , .Xr nmdm 4 , .Xr vmm 4 , .Xr bhyve_config 5 , .Xr ethers 5 , .Xr bhyvectl 8 , .Xr bhyveload 8 .Pp .Rs .%A Intel .%B 64 and IA-32 Architectures Software Developer’s Manual .%V Volume 3 .Re .Sh HISTORY .Nm first appeared in .Fx 10.0 . .Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 5ced5df0b4cb..f5365a352d07 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1629 +1,1657 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "bootrom.h" #include "config.h" #include "inout.h" #include "debug.h" #include "e820.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "kernemu_dev.h" #include "mem.h" #include "mevent.h" +#ifdef BHYVE_SNAPSHOT +#include "migration.h" +#endif #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "qemu_fwcfg.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", [EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_run *); int guest_ncpus; uint16_t cpu_cores, cpu_sockets, cpu_threads; int raw_stdio = 0; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); static struct vcpu_info { struct vmctx *ctx; struct vcpu *vcpu; int vcpuid; } *vcpu_info; static cpuset_t **vcpumap; static void usage(int code) { fprintf(stderr, "Usage: %s [-AaCDeHhPSuWwxY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" " -A: create ACPI tables\n" " -a: local apic is in xAPIC mode (deprecated)\n" " -C: include guest memory in core file\n" " -c: number of CPUs and/or topology specification\n" " -D: destroy on power-off\n" " -e: exit on unhandled I/O access\n" " -G: start a debug server\n" " -H: vmexit from the guest on HLT\n" " -h: help\n" " -k: key=value flat config file\n" " -K: PS2 keyboard layout\n" " -l: LPC device configuration\n" " -m: memory size\n" " -o: set config 'var' to 'value'\n" " -P: vmexit from the guest on pause\n" " -p: pin 'vcpu' to 'hostcpu'\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" + " -R: the source vm host and port for migration\n" #endif " -S: guest memory cannot be swapped\n" " -s: PCI slot config\n" " -U: UUID\n" " -u: RTC keeps UTC time\n" " -W: force virtio to use single-vector MSI\n" " -w: ignore unimplemented MSRs\n" " -x: local APIC is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. */ static int topology_parse(const char *opt) { char *cp, *str, *tofree; if (*opt == '\0') { set_config_value("sockets", "1"); set_config_value("cores", "1"); set_config_value("threads", "1"); set_config_value("cpus", "1"); return (0); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) set_config_value("cpus", cp + strlen("cpus=")); else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) set_config_value("sockets", cp + strlen("sockets=")); else if (strncmp(cp, "cores=", strlen("cores=")) == 0) set_config_value("cores", cp + strlen("cores=")); else if (strncmp(cp, "threads=", strlen("threads=")) == 0) set_config_value("threads", cp + strlen("threads=")); else if (strchr(cp, '=') != NULL) goto out; else set_config_value("cpus", cp); } free(tofree); return (0); out: free(tofree); return (-1); } static int parse_int_value(const char *key, const char *value, int minval, int maxval) { char *cp; long lval; errno = 0; lval = strtol(value, &cp, 0); if (errno != 0 || *cp != '\0' || cp == value || lval < minval || lval > maxval) errx(4, "Invalid value for %s: '%s'", key, value); return (lval); } /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. * * The limits of UINT16_MAX are due to the types passed to * vm_set_topology(). vmm.ko may enforce tighter limits. */ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. */ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } static int pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", value != NULL ? "," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { return (get_config_bool_default("virtio_msix", true)); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct vcpu_info *vi = param; int error; snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); pthread_set_name_np(pthread_self(), tname); if (vcpumap[vi->vcpuid] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vi->vcpuid]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vi->vcpuid); #endif gdb_cpu_add(vi->vcpu); vm_loop(vi->ctx, vi->vcpu); /* not reached */ exit(1); return (NULL); } static void fbsdrun_addcpu(struct vcpu_info *vi) { pthread_t thr; int error; error = vm_activate_cpu(vi->vcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); CPU_SET_ATOMIC(vi->vcpuid, &cpumask); vm_suspend_cpu(vi->vcpu); error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); assert(error == 0); } static int fbsdrun_deletecpu(int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; int error; int bytes, port, in; vme = vmrun->vm_exit; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; error = emulate_inout(ctx, vcpu, vme); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vme->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; uint64_t val; uint32_t eax, edx; int error; vme = vmrun->vm_exit; val = 0; error = emulate_rdmsr(vcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; int error; vme = vmrun->vm_exit; error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason, vmexit_vmx_desc(vme->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vme->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(vcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_mtrap(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; struct vie *vie; int err, i, cs_d; enum vm_cpu_mode mode; vme = vmrun->vm_exit; vie = &vme->u.inst_emul.vie; if (!vie->decoded) { /* * Attempt to decode in userspace as a fallback. This allows * updating instruction decode in bhyve without rebooting the * kernel (rapid prototyping), albeit with much slower * emulation. */ vie_restart(vie); mode = vme->u.inst_emul.paging.cpu_mode; cs_d = vme->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; if (vm_set_register(vcpu, VM_REG_GUEST_RIP, vme->rip + vie->num_processed) != 0) goto fail; } err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vme->rip); return (VMEXIT_ABORT); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; enum vm_suspend_how how; int vcpuid = vcpu_id(vcpu); vme = vmrun->vm_exit; how = vme->u.suspended.how; fbsdrun_deletecpu(vcpuid); if (vcpuid != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun __unused) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_suspend(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif /* * XXX-MJ sleep for a short period to avoid chewing up the CPU in the * window between activation of the vCPU thread and the STARTUP IPI. */ usleep(1000); return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { gdb_cpu_breakpoint(vcpu, vmrun->vm_exit); return (VMEXIT_CONTINUE); } static int vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { struct vm_exit *vme; cpuset_t *dmask; int error = -1; int i; dmask = vmrun->cpuset; vme = vmrun->vm_exit; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: CPU_FOREACH_ISSET(i, dmask) { error = vm_suspend_cpu(vcpu_info[i].vcpu); if (error) { warnx("%s: failed to suspend cpu %d\n", __func__, i); break; } } break; case APIC_DELMODE_STARTUP: CPU_FOREACH_ISSET(i, dmask) { spinup_ap(vcpu_info[i].vcpu, vme->u.ipi.vector << PAGE_SHIFT); } error = 0; break; default: break; } return (error); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, [VM_EXITCODE_IPI] = vmexit_ipi, }; static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu) { struct vm_exit vme; struct vm_run vmrun; int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus, dmask; error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); vmrun.vm_exit = &vme; vmrun.cpuset = &dmask; vmrun.cpusetsize = sizeof(dmask); while (1) { error = vm_run(vcpu, &vmrun); if (error != 0) break; exitcode = vme.exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } rc = (*handler[exitcode])(ctx, vcpu, &vmrun); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static void fbsdrun_set_capabilities(struct vcpu *vcpu, bool bsp) { int err, tmp; if (get_config_bool_default("x86.vmexit_on_hlt", false)) { err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1); if (bsp) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1); if (bsp) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (get_config_bool_default("x86.x2apic", false)) err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1); err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1); assert(err == 0); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { +#ifndef BHYVE_SNAPSHOT if (!romboot) { +#else + if (!romboot && !get_config_bool_default("is_migrated", false)) { +#endif /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } static void spinup_vcpu(struct vcpu_info *vi, bool bsp) { int error; if (!bsp) { fbsdrun_set_capabilities(vi->vcpu, false); /* * Enable the 'unrestricted guest' mode for APs. * * APs startup in power-on 16-bit mode. */ error = vm_set_capability(vi->vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); } fbsdrun_addcpu(vi); } static bool parse_config_option(const char *option) { const char *value; char *path; value = strchr(option, '='); if (value == NULL || value[1] == '\0') return (false); path = strndup(option, value - option); if (path == NULL) err(4, "Failed to allocate memory"); set_config_value(path, value + 1); return (true); } static void parse_simple_config_file(const char *path) { FILE *fp; char *line, *cp; size_t linecap; unsigned int lineno; fp = fopen(path, "r"); if (fp == NULL) err(4, "Failed to open configuration file %s", path); line = NULL; linecap = 0; lineno = 1; for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { if (*line == '#' || *line == '\n') continue; cp = strchr(line, '\n'); if (cp != NULL) *cp = '\0'; if (!parse_config_option(line)) errx(4, "%s line %u: invalid config option '%s'", path, lineno, line); } free(line); fclose(fp); } static void parse_gdb_options(const char *opt) { const char *sport; char *colon; if (opt[0] == 'w') { set_config_bool("gdb.wait", true); opt++; } colon = strrchr(opt, ':'); if (colon == NULL) { sport = opt; } else { *colon = '\0'; colon++; sport = colon; set_config_value("gdb.address", opt); } set_config_value("gdb.port", sport); } static void set_defaults(void) { set_config_bool("acpi_tables", false); set_config_value("memory.size", "256M"); set_config_bool("x86.strictmsr", true); set_config_value("lpc.fwcfg", "bhyve"); } int main(int argc, char *argv[]) { int c, error; int max_vcpus, memflags; struct vcpu *bsp; struct vmctx *ctx; struct qemu_fwcfg_item *e820_fwcfg_item; uint64_t rip; size_t memsize; const char *optstr, *value, *vmname; #ifdef BHYVE_SNAPSHOT char *restore_file; + char *migration_host; struct restore_state rstate; restore_file = NULL; + migration_host = NULL; #endif init_config(); set_defaults(); progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT - optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:"; + optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:R:"; #else optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': set_config_bool("x86.x2apic", false); break; case 'A': set_config_bool("acpi_tables", true); break; case 'D': set_config_bool("destroy_on_poweroff", true); break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': set_config_bool("memory.guest_in_core", true); break; case 'f': if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) { errx(EX_USAGE, "invalid fwcfg item '%s'", optarg); } break; case 'G': parse_gdb_options(optarg); break; case 'k': parse_simple_config_file(optarg); break; case 'K': set_config_value("keyboard.layout", optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; + case 'R': + migration_host = optarg; + set_config_bool("is_migrated", true); + break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': set_config_bool("memory.wired", true); break; case 'm': set_config_value("memory.size", optarg); break; case 'o': if (!parse_config_option(optarg)) errx(EX_USAGE, "invalid configuration option '%s'", optarg); break; case 'H': set_config_bool("x86.vmexit_on_hlt", true); break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': set_config_bool("x86.vmexit_on_pause", true); break; case 'e': set_config_bool("x86.strictio", true); break; case 'u': set_config_bool("rtc.use_localtime", false); break; case 'U': set_config_value("uuid", optarg); break; case 'w': set_config_bool("x86.strictmsr", false); break; case 'W': set_config_bool("virtio_msix", false); break; case 'x': set_config_bool("x86.x2apic", true); break; case 'Y': set_config_bool("x86.mptable", false); break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc > 1) usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif bsp = vm_vcpu_open(ctx, BSP); max_vcpus = num_vcpus_allowed(ctx, bsp); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(bsp, true); /* Allocate per-VCPU resources. */ vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { vcpu_info[vcpuid].ctx = ctx; vcpu_info[vcpuid].vcpuid = vcpuid; if (vcpuid == BSP) vcpu_info[vcpuid].vcpu = bsp; else vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); } memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(guest_ncpus); init_inout(); kernemu_dev_init(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx); sci_init(ctx); if (qemu_fwcfg_init(ctx) != 0) { fprintf(stderr, "qemu fwcfg initialization error"); exit(4); } if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), &guest_ncpus) != 0) { fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu"); exit(4); } if (e820_init(ctx) != 0) { fprintf(stderr, "Unable to setup E820"); exit(4); } /* * Exit if a device emulation finds an error in its initialization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); init_gdb(ctx); if (lpc_bootrom()) { if (vm_set_capability(bsp, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(bsp); assert(error == 0); } /* * Add all vCPUs. */ for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) spinup_vcpu(&vcpu_info[vcpuid], vcpuid == BSP); #ifdef BHYVE_SNAPSHOT - if (restore_file != NULL) { - fprintf(stdout, "Pausing pci devs...\r\n"); + if (restore_file != NULL || migration_host != NULL) { + fprintf(stdout, "Pausing pci devs...\n"); if (vm_pause_devices() != 0) { fprintf(stderr, "Failed to pause PCI device state.\n"); exit(1); } - fprintf(stdout, "Restoring vm mem...\r\n"); - if (restore_vm_mem(ctx, &rstate) != 0) { - fprintf(stderr, "Failed to restore VM memory.\n"); - exit(1); - } + if (restore_file != NULL) { + fprintf(stdout, "Restoring vm mem...\n"); + if (restore_vm_mem(ctx, &rstate) != 0) { + fprintf(stderr, + "Failed to restore VM memory.\n"); + exit(1); + } - fprintf(stdout, "Restoring pci devs...\r\n"); - if (vm_restore_devices(&rstate) != 0) { - fprintf(stderr, "Failed to restore PCI device state.\n"); - exit(1); + fprintf(stdout, "Restoring pci devs...\n"); + if (vm_restore_devices(&rstate) != 0) { + fprintf(stderr, + "Failed to restore PCI device state.\n"); + exit(1); + } + + fprintf(stdout, "Restoring kernel structs...\n"); + if (vm_restore_kern_structs(ctx, &rstate) != 0) { + fprintf(stderr, + "Failed to restore kernel structs.\n"); + exit(1); + } } - fprintf(stdout, "Restoring kernel structs...\r\n"); - if (vm_restore_kern_structs(ctx, &rstate) != 0) { - fprintf(stderr, "Failed to restore kernel structs.\n"); - exit(1); + if (migration_host != NULL) { + fprintf(stdout, "Starting the migration process...\n"); + if (receive_vm_migration(ctx, migration_host) != 0) { + fprintf(stderr, "Failed to migrate the vm.\n"); + exit(1); + } } - fprintf(stdout, "Resuming pci devs...\r\n"); + fprintf(stdout, "Resuming pci devs...\n"); if (vm_resume_devices() != 0) { fprintf(stderr, "Failed to resume PCI device state.\n"); exit(1); } } -#endif +#endif /* BHYVE_SNAPSHOT */ error = vm_get_register(bsp, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); if (error != 0) exit(4); if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } e820_fwcfg_item = e820_get_fwcfg_item(); if (e820_fwcfg_item == NULL) { fprintf(stderr, "invalid e820 table"); exit(4); } if (qemu_fwcfg_add_file("etc/e820", e820_fwcfg_item->size, e820_fwcfg_item->data) != 0) { fprintf(stderr, "could not add qemu fwcfg etc/e820"); exit(4); } free(e820_fwcfg_item); if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) { fwctl_init(); } /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifdef BHYVE_SNAPSHOT /* initialize mutex/cond variables */ init_snapshot(); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) != 0) errx(EX_OSERR, "Failed to start checkpoint thread"); #endif #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT - if (restore_file != NULL) { + if (restore_file != NULL) destroy_restore_state(&rstate); + if (restore_file != NULL || migration_host != NULL) { if (vm_restore_time(ctx) < 0) err(EX_OSERR, "Unable to restore time"); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) vm_resume_cpu(vcpu_info[vcpuid].vcpu); } else #endif vm_resume_cpu(bsp); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/migration.c b/usr.sbin/bhyve/migration.c new file mode 100644 index 000000000000..daa67b370d47 --- /dev/null +++ b/usr.sbin/bhyve/migration.c @@ -0,0 +1,98 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2017-2020 Elena Mihailescu + * Copyright (c) 2017-2020 Darius Mihai + * Copyright (c) 2017-2020 Mihai Carabas + * + * The migration feature was developed under sponsorships + * from Matthew Grooms. + * + */ + +#include +#include +#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +#include "migration.h" +#include "pci_emul.h" +#include "snapshot.h" + + +#ifdef BHYVE_DEBUG +#define DPRINTF(FMT, ...) \ +({ \ + fprintf(stderr, "%s: " FMT "\n", __func__, ##__VA_ARGS__); \ + }) +#else +#define DPRINTF(FMT, ...) +#endif + +#define EPRINTF(FMT, ...) \ +({ \ + fprintf(stderr, "%s: " FMT "\n", __func__, ##__VA_ARGS__); \ + }) + +int +receive_vm_migration(struct vmctx *ctx, char *migration_data) +{ + struct migrate_req req; + size_t len; + char *hostname, *pos; + unsigned int port = DEFAULT_MIGRATION_PORT; + int rc; + + assert(ctx != NULL); + assert(migration_data != NULL); + + memset(req.host, 0, MAXHOSTNAMELEN); + hostname = strdup(migration_data); + + if ((pos = strchr(hostname, ':')) != NULL) { + *pos = '\0'; + pos = pos + 1; + + rc = sscanf(pos, "%u", &port); + + if (rc <= 0) { + EPRINTF("Could not parse the port"); + free(hostname); + return (EINVAL); + } + } + req.port = port; + + len = strlen(hostname); + if (len > MAXHOSTNAMELEN - 1) { + EPRINTF("Hostname length %lu bigger than maximum allowed %d", + len, MAXHOSTNAMELEN - 1); + free(hostname); + return (EINVAL); + } + + strlcpy(req.host, hostname, MAXHOSTNAMELEN); + + // rc = vm_recv_migrate_req(ctx, req); + rc = EOPNOTSUPP; + EPRINTF("Migration not implemented yet"); + + free(hostname); + return (rc); +} diff --git a/usr.sbin/bhyve/migration.h b/usr.sbin/bhyve/migration.h new file mode 100644 index 000000000000..b72a4aa3611f --- /dev/null +++ b/usr.sbin/bhyve/migration.h @@ -0,0 +1,27 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2017-2020 Elena Mihailescu + * Copyright (c) 2017-2020 Darius Mihai + * Copyright (c) 2017-2020 Mihai Carabas + * + * The migration feature was developed under sponsorships + * from Matthew Grooms. + * + */ + +#pragma once + +#include +#include + +#define DEFAULT_MIGRATION_PORT 24983 + +struct vmctx; + +struct migrate_req { + char host[MAXHOSTNAMELEN]; + unsigned int port; +}; + +int receive_vm_migration(struct vmctx *ctx, char *migration_data); diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c index 5569ffcb2e24..58ae29782b43 100644 --- a/usr.sbin/bhyve/snapshot.c +++ b/usr.sbin/bhyve/snapshot.c @@ -1,1618 +1,1653 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2016 Flavius Anton * Copyright (c) 2016 Mihai Tiganus * Copyright (c) 2016-2019 Mihai Carabas * Copyright (c) 2017-2019 Darius Mihai * Copyright (c) 2017-2019 Elena Mihailescu * Copyright (c) 2018-2019 Sergiu Weisz * All rights reserved. * The bhyve-snapshot feature was developed under sponsorships * from Matthew Grooms. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "debug.h" #include "inout.h" #include "ipc.h" #include "fwctl.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" +#include "migration.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #include "snapshot.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include #include struct spinner_info { const size_t *crtval; const size_t maxval; const size_t total; }; extern int guest_ncpus; static struct winsize winsize; static sig_t old_winch_handler; #define KB (1024UL) #define MB (1024UL * KB) #define GB (1024UL * MB) #define SNAPSHOT_CHUNK (4 * MB) #define PROG_BUF_SZ (8192) #define SNAPSHOT_BUFFER_SIZE (20 * MB) #define JSON_KERNEL_ARR_KEY "kern_structs" #define JSON_DEV_ARR_KEY "devices" #define JSON_BASIC_METADATA_KEY "basic metadata" #define JSON_SNAPSHOT_REQ_KEY "device" #define JSON_SIZE_KEY "size" #define JSON_FILE_OFFSET_KEY "file_offset" #define JSON_NCPUS_KEY "ncpus" #define JSON_VMNAME_KEY "vmname" #define JSON_MEMSIZE_KEY "memsize" #define JSON_MEMFLAGS_KEY "memflags" #define min(a,b) \ ({ \ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a < _b ? _a : _b; \ }) static const struct vm_snapshot_kern_info snapshot_kern_structs[] = { { "vhpet", STRUCT_VHPET }, { "vm", STRUCT_VM }, { "vioapic", STRUCT_VIOAPIC }, { "vlapic", STRUCT_VLAPIC }, { "vmcx", STRUCT_VMCX }, { "vatpit", STRUCT_VATPIT }, { "vatpic", STRUCT_VATPIC }, { "vpmtmr", STRUCT_VPMTMR }, { "vrtc", STRUCT_VRTC }, }; static cpuset_t vcpus_active, vcpus_suspended; static pthread_mutex_t vcpu_lock; static pthread_cond_t vcpus_idle, vcpus_can_run; static bool checkpoint_active; /* * TODO: Harden this function and all of its callers since 'base_str' is a user * provided string. */ static char * strcat_extension(const char *base_str, const char *ext) { char *res; size_t base_len, ext_len; base_len = strnlen(base_str, NAME_MAX); ext_len = strnlen(ext, NAME_MAX); if (base_len + ext_len > NAME_MAX) { fprintf(stderr, "Filename exceeds maximum length.\n"); return (NULL); } res = malloc(base_len + ext_len + 1); if (res == NULL) { perror("Failed to allocate memory."); return (NULL); } memcpy(res, base_str, base_len); memcpy(res + base_len, ext, ext_len); res[base_len + ext_len] = 0; return (res); } void destroy_restore_state(struct restore_state *rstate) { if (rstate == NULL) { fprintf(stderr, "Attempting to destroy NULL restore struct.\n"); return; } if (rstate->kdata_map != MAP_FAILED) munmap(rstate->kdata_map, rstate->kdata_len); if (rstate->kdata_fd > 0) close(rstate->kdata_fd); if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); if (rstate->meta_root_obj != NULL) ucl_object_unref(rstate->meta_root_obj); if (rstate->meta_parser != NULL) ucl_parser_free(rstate->meta_parser); } static int load_vmmem_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->vmmem_fd = open(filename, O_RDONLY); if (rstate->vmmem_fd < 0) { perror("Failed to open restore file"); return (-1); } err = fstat(rstate->vmmem_fd, &sb); if (err < 0) { perror("Failed to stat restore file"); goto err_load_vmmem; } if (sb.st_size == 0) { fprintf(stderr, "Restore file is empty.\n"); goto err_load_vmmem; } rstate->vmmem_len = sb.st_size; return (0); err_load_vmmem: if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); return (-1); } static int load_kdata_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->kdata_fd = open(filename, O_RDONLY); if (rstate->kdata_fd < 0) { perror("Failed to open kernel data file"); return (-1); } err = fstat(rstate->kdata_fd, &sb); if (err < 0) { perror("Failed to stat kernel data file"); goto err_load_kdata; } if (sb.st_size == 0) { fprintf(stderr, "Kernel data file is empty.\n"); goto err_load_kdata; } rstate->kdata_len = sb.st_size; rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ, MAP_SHARED, rstate->kdata_fd, 0); if (rstate->kdata_map == MAP_FAILED) { perror("Failed to map restore file"); goto err_load_kdata; } return (0); err_load_kdata: if (rstate->kdata_fd > 0) close(rstate->kdata_fd); return (-1); } static int load_metadata_file(const char *filename, struct restore_state *rstate) { ucl_object_t *obj; struct ucl_parser *parser; int err; parser = ucl_parser_new(UCL_PARSER_DEFAULT); if (parser == NULL) { fprintf(stderr, "Failed to initialize UCL parser.\n"); err = -1; goto err_load_metadata; } err = ucl_parser_add_file(parser, filename); if (err == 0) { fprintf(stderr, "Failed to parse metadata file: '%s'\n", filename); err = -1; goto err_load_metadata; } obj = ucl_parser_get_object(parser); if (obj == NULL) { fprintf(stderr, "Failed to parse object.\n"); err = -1; goto err_load_metadata; } rstate->meta_parser = parser; rstate->meta_root_obj = (ucl_object_t *)obj; return (0); err_load_metadata: if (parser != NULL) ucl_parser_free(parser); return (err); } int load_restore_file(const char *filename, struct restore_state *rstate) { int err = 0; char *kdata_filename = NULL, *meta_filename = NULL; assert(filename != NULL); assert(rstate != NULL); memset(rstate, 0, sizeof(*rstate)); rstate->kdata_map = MAP_FAILED; err = load_vmmem_file(filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest RAM file.\n"); goto err_restore; } kdata_filename = strcat_extension(filename, ".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); goto err_restore; } err = load_kdata_file(kdata_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest kernel data file.\n"); goto err_restore; } meta_filename = strcat_extension(filename, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct kernel metadata filename.\n"); goto err_restore; } err = load_metadata_file(meta_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest metadata file.\n"); goto err_restore; } return (0); err_restore: destroy_restore_state(rstate); if (kdata_filename != NULL) free(kdata_filename); if (meta_filename != NULL) free(meta_filename); return (-1); } #define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_toint_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to int.", key); \ return (ret); \ } \ } while(0) #define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_tostring_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to string.", key); \ return (ret); \ } \ } while(0) static void * lookup_check_dev(const char *dev_name, struct restore_state *rstate, const ucl_object_t *obj, size_t *data_size) { const char *snapshot_req; int64_t size, file_offset; snapshot_req = NULL; JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, &snapshot_req, NULL); assert(snapshot_req != NULL); if (!strcmp(snapshot_req, dev_name)) { JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, &size, NULL); assert(size >= 0); JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, &file_offset, NULL); assert(file_offset >= 0); assert((uint64_t)file_offset + size <= rstate->kdata_len); *data_size = (size_t)size; return ((uint8_t *)rstate->kdata_map + file_offset); } return (NULL); } static void * lookup_dev(const char *dev_name, const char *key, struct restore_state *rstate, size_t *data_size) { const ucl_object_t *devs = NULL, *obj = NULL; ucl_object_iter_t it = NULL; void *ret; devs = ucl_object_lookup(rstate->meta_root_obj, key); if (devs == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_DEV_ARR_KEY); return (NULL); } if (ucl_object_type(devs) != UCL_ARRAY) { fprintf(stderr, "Object '%s' is not an array.\n", JSON_DEV_ARR_KEY); return (NULL); } while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) { ret = lookup_check_dev(dev_name, rstate, obj, data_size); if (ret != NULL) return (ret); } return (NULL); } static const ucl_object_t * lookup_basic_metadata_object(struct restore_state *rstate) { const ucl_object_t *basic_meta_obj = NULL; basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj, JSON_BASIC_METADATA_KEY); if (basic_meta_obj == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } if (ucl_object_type(basic_meta_obj) != UCL_OBJECT) { fprintf(stderr, "Object '%s' is not a JSON object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } return (basic_meta_obj); } const char * lookup_vmname(struct restore_state *rstate) { const char *vmname; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (NULL); JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL); return (vmname); } int lookup_memflags(struct restore_state *rstate) { int64_t memflags; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0); return ((int)memflags); } size_t lookup_memsize(struct restore_state *rstate) { int64_t memsize; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0); if (memsize < 0) memsize = 0; return ((size_t)memsize); } int lookup_guest_ncpus(struct restore_state *rstate) { int64_t ncpus; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0); return ((int)ncpus); } static void winch_handler(int signal __unused) { #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ } static int print_progress(size_t crtval, const size_t maxval) { size_t rc; double crtval_gb, maxval_gb; size_t i, win_width, prog_start, prog_done, prog_end; int mval_len; static char prog_buf[PROG_BUF_SZ]; static const size_t len = sizeof(prog_buf); static size_t div; static const char *div_str; static char wip_bar[] = { '/', '-', '\\', '|' }; static int wip_idx = 0; if (maxval == 0) { printf("[0B / 0B]\r\n"); return (0); } if (crtval > maxval) crtval = maxval; if (maxval > 10 * GB) { div = GB; div_str = "GiB"; } else if (maxval > 10 * MB) { div = MB; div_str = "MiB"; } else { div = KB; div_str = "KiB"; } crtval_gb = (double) crtval / div; maxval_gb = (double) maxval / div; rc = snprintf(prog_buf, len, "%.03lf", maxval_gb); if (rc == len) { fprintf(stderr, "Maxval too big\n"); return (-1); } mval_len = rc; rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |", mval_len, crtval_gb, div_str, maxval_gb, div_str); if (rc == len) { fprintf(stderr, "Buffer too small to print progress\n"); return (-1); } win_width = min(winsize.ws_col, len); prog_start = rc; if (prog_start < (win_width - 2)) { prog_end = win_width - prog_start - 2; prog_done = prog_end * (crtval_gb / maxval_gb); for (i = prog_start; i < prog_start + prog_done; i++) prog_buf[i] = '#'; if (crtval != maxval) { prog_buf[i] = wip_bar[wip_idx]; wip_idx = (wip_idx + 1) % sizeof(wip_bar); i++; } else { prog_buf[i++] = '#'; } for (; i < win_width - 2; i++) prog_buf[i] = '_'; prog_buf[win_width - 2] = '|'; } prog_buf[win_width - 1] = '\0'; write(STDOUT_FILENO, prog_buf, win_width); return (0); } static void * snapshot_spinner_cb(void *arg) { int rc; size_t crtval, maxval, total; struct spinner_info *si; struct timespec ts; si = arg; if (si == NULL) pthread_exit(NULL); ts.tv_sec = 0; ts.tv_nsec = 50 * 1000 * 1000; /* 50 ms sleep time */ do { crtval = *si->crtval; maxval = si->maxval; total = si->total; rc = print_progress(crtval, total); if (rc < 0) { fprintf(stderr, "Failed to parse progress\n"); break; } nanosleep(&ts, NULL); } while (crtval < maxval); pthread_exit(NULL); return NULL; } static int vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src, const size_t len, const size_t totalmem, const bool op_wr) { int rc; size_t part_done, todo, rem; ssize_t done; bool show_progress; pthread_t spinner_th; struct spinner_info *si; if (lseek(snapfd, foff, SEEK_SET) < 0) { perror("Failed to change file offset"); return (-1); } show_progress = false; if (isatty(STDIN_FILENO) && (winsize.ws_col != 0)) show_progress = true; part_done = foff; rem = len; if (show_progress) { si = &(struct spinner_info) { .crtval = &part_done, .maxval = foff + len, .total = totalmem }; rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, si); if (rc) { perror("Unable to create spinner thread"); show_progress = false; } } while (rem > 0) { if (show_progress) todo = min(SNAPSHOT_CHUNK, rem); else todo = rem; if (op_wr) done = write(snapfd, src, todo); else done = read(snapfd, src, todo); if (done < 0) { perror("Failed to write in file"); return (-1); } src = (uint8_t *)src + done; part_done += done; rem -= done; } if (show_progress) { rc = pthread_join(spinner_th, NULL); if (rc) perror("Unable to end spinner thread"); } return (0); } static size_t vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr) { int ret; size_t lowmem, highmem, totalmem; char *baseaddr; ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem); if (ret) { fprintf(stderr, "%s: unable to retrieve guest memory size\r\n", __func__); return (0); } totalmem = lowmem + highmem; if ((op_wr == false) && (totalmem != memsz)) { fprintf(stderr, "%s: mem size mismatch: %ld vs %ld\r\n", __func__, totalmem, memsz); return (0); } winsize.ws_col = 80; #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ old_winch_handler = signal(SIGWINCH, winch_handler); ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem, totalmem, op_wr); if (ret) { fprintf(stderr, "%s: Could not %s lowmem\r\n", __func__, op_wr ? "write" : "read"); totalmem = 0; goto done; } if (highmem == 0) goto done; ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4*GB, highmem, totalmem, op_wr); if (ret) { fprintf(stderr, "%s: Could not %s highmem\r\n", __func__, op_wr ? "write" : "read"); totalmem = 0; goto done; } done: printf("\r\n"); signal(SIGWINCH, old_winch_handler); return (totalmem); } int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate) { size_t restored; restored = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len, false); if (restored != rstate->vmmem_len) return (-1); return (0); } int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate) { for (unsigned i = 0; i < nitems(snapshot_kern_structs); i++) { const struct vm_snapshot_kern_info *info; struct vm_snapshot_meta *meta; void *data; size_t size; info = &snapshot_kern_structs[i]; data = lookup_dev(info->struct_name, JSON_KERNEL_ARR_KEY, rstate, &size); if (data == NULL) errx(EX_DATAERR, "Cannot find kern struct %s", info->struct_name); if (size == 0) errx(EX_DATAERR, "data with zero size for %s", info->struct_name); meta = &(struct vm_snapshot_meta) { .dev_name = info->struct_name, .dev_req = info->req, .buffer.buf_start = data, .buffer.buf_size = size, .buffer.buf = data, .buffer.buf_rem = size, .op = VM_SNAPSHOT_RESTORE, }; if (vm_snapshot_req(ctx, meta)) err(EX_DATAERR, "Failed to restore %s", info->struct_name); } return (0); } static int vm_restore_device(struct restore_state *rstate, vm_snapshot_dev_cb func, const char *name, void *data) { void *dev_ptr; size_t dev_size; int ret; struct vm_snapshot_meta *meta; dev_ptr = lookup_dev(name, JSON_DEV_ARR_KEY, rstate, &dev_size); if (dev_ptr == NULL) { EPRINTLN("Failed to lookup dev: %s", name); return (EINVAL); } if (dev_size == 0) { EPRINTLN("Restore device size is 0: %s", name); return (EINVAL); } meta = &(struct vm_snapshot_meta) { .dev_name = name, .dev_data = data, .buffer.buf_start = dev_ptr, .buffer.buf_size = dev_size, .buffer.buf = dev_ptr, .buffer.buf_rem = dev_size, .op = VM_SNAPSHOT_RESTORE, }; ret = func(meta); if (ret != 0) { EPRINTLN("Failed to restore dev: %s %d", name, ret); return (ret); } return (0); } int vm_restore_devices(struct restore_state *rstate) { int ret; struct pci_devinst *pdi = NULL; while ((pdi = pci_next(pdi)) != NULL) { ret = vm_restore_device(rstate, pci_snapshot, pdi->pi_name, pdi); if (ret) return (ret); } return (vm_restore_device(rstate, atkbdc_snapshot, "atkbdc", NULL)); } int vm_pause_devices(void) { int ret; struct pci_devinst *pdi = NULL; while ((pdi = pci_next(pdi)) != NULL) { ret = pci_pause(pdi); if (ret) { EPRINTLN("Cannot pause dev %s: %d", pdi->pi_name, ret); return (ret); } } return (0); } int vm_resume_devices(void) { int ret; struct pci_devinst *pdi = NULL; while ((pdi = pci_next(pdi)) != NULL) { ret = pci_resume(pdi); if (ret) { EPRINTLN("Cannot resume '%s': %d", pdi->pi_name, ret); return (ret); } } return (0); } static int vm_save_kern_struct(struct vmctx *ctx, int data_fd, xo_handle_t *xop, const char *array_key, struct vm_snapshot_meta *meta, off_t *offset) { int ret; size_t data_size; ssize_t write_cnt; ret = vm_snapshot_req(ctx, meta); if (ret != 0) { fprintf(stderr, "%s: Failed to snapshot struct %s\r\n", __func__, meta->dev_name); ret = -1; goto done; } data_size = vm_get_snapshot_size(meta); /* XXX-MJ no handling for short writes. */ write_cnt = write(data_fd, meta->buffer.buf_start, data_size); if (write_cnt < 0 || (size_t)write_cnt != data_size) { perror("Failed to write all snapshotted data."); ret = -1; goto done; } /* Write metadata. */ xo_open_instance_h(xop, array_key); xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name); xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); xo_close_instance_h(xop, JSON_KERNEL_ARR_KEY); *offset += data_size; done: return (ret); } static int vm_save_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) { int ret, error; size_t buf_size, i, offset; char *buffer; struct vm_snapshot_meta *meta; error = 0; offset = 0; buf_size = SNAPSHOT_BUFFER_SIZE; buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char)); if (buffer == NULL) { error = ENOMEM; perror("Failed to allocate memory for snapshot buffer"); goto err_vm_snapshot_kern_data; } meta = &(struct vm_snapshot_meta) { .buffer.buf_start = buffer, .buffer.buf_size = buf_size, .op = VM_SNAPSHOT_SAVE, }; xo_open_list_h(xop, JSON_KERNEL_ARR_KEY); for (i = 0; i < nitems(snapshot_kern_structs); i++) { meta->dev_name = snapshot_kern_structs[i].struct_name; meta->dev_req = snapshot_kern_structs[i].req; memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); meta->buffer.buf = meta->buffer.buf_start; meta->buffer.buf_rem = meta->buffer.buf_size; ret = vm_save_kern_struct(ctx, data_fd, xop, JSON_DEV_ARR_KEY, meta, &offset); if (ret != 0) { error = -1; goto err_vm_snapshot_kern_data; } } xo_close_list_h(xop, JSON_KERNEL_ARR_KEY); err_vm_snapshot_kern_data: if (buffer != NULL) free(buffer); return (error); } static int vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz) { xo_open_container_h(xop, JSON_BASIC_METADATA_KEY); xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus); xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vm_get_name(ctx)); xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz); xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", vm_get_memflags(ctx)); xo_close_container_h(xop, JSON_BASIC_METADATA_KEY); return (0); } static int vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key, struct vm_snapshot_meta *meta, off_t *offset) { ssize_t ret; size_t data_size; data_size = vm_get_snapshot_size(meta); /* XXX-MJ no handling for short writes. */ ret = write(data_fd, meta->buffer.buf_start, data_size); if (ret < 0 || (size_t)ret != data_size) { perror("Failed to write all snapshotted data."); return (-1); } /* Write metadata. */ xo_open_instance_h(xop, array_key); xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name); xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); xo_close_instance_h(xop, array_key); *offset += data_size; return (0); } static int vm_snapshot_device(vm_snapshot_dev_cb func, const char *dev_name, void *devdata, int data_fd, xo_handle_t *xop, struct vm_snapshot_meta *meta, off_t *offset) { int ret; memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); meta->buffer.buf = meta->buffer.buf_start; meta->buffer.buf_rem = meta->buffer.buf_size; meta->dev_name = dev_name; meta->dev_data = devdata; ret = func(meta); if (ret != 0) { EPRINTLN("Failed to snapshot %s; ret=%d", dev_name, ret); return (ret); } ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta, offset); if (ret != 0) return (ret); return (0); } static int vm_snapshot_devices(int data_fd, xo_handle_t *xop) { int ret; off_t offset; void *buffer; size_t buf_size; struct vm_snapshot_meta *meta; struct pci_devinst *pdi; buf_size = SNAPSHOT_BUFFER_SIZE; offset = lseek(data_fd, 0, SEEK_CUR); if (offset < 0) { perror("Failed to get data file current offset."); return (-1); } buffer = malloc(buf_size); if (buffer == NULL) { perror("Failed to allocate memory for snapshot buffer"); ret = ENOSPC; goto snapshot_err; } meta = &(struct vm_snapshot_meta) { .buffer.buf_start = buffer, .buffer.buf_size = buf_size, .op = VM_SNAPSHOT_SAVE, }; xo_open_list_h(xop, JSON_DEV_ARR_KEY); /* Save PCI devices */ pdi = NULL; while ((pdi = pci_next(pdi)) != NULL) { ret = vm_snapshot_device(pci_snapshot, pdi->pi_name, pdi, data_fd, xop, meta, &offset); if (ret != 0) goto snapshot_err; } ret = vm_snapshot_device(atkbdc_snapshot, "atkbdc", NULL, data_fd, xop, meta, &offset); xo_close_list_h(xop, JSON_DEV_ARR_KEY); snapshot_err: if (buffer != NULL) free(buffer); return (ret); } void checkpoint_cpu_add(int vcpu) { pthread_mutex_lock(&vcpu_lock); CPU_SET(vcpu, &vcpus_active); if (checkpoint_active) { CPU_SET(vcpu, &vcpus_suspended); while (checkpoint_active) pthread_cond_wait(&vcpus_can_run, &vcpu_lock); CPU_CLR(vcpu, &vcpus_suspended); } pthread_mutex_unlock(&vcpu_lock); } /* * When a vCPU is suspended for any reason, it calls * checkpoint_cpu_suspend(). This records that the vCPU is idle. * Before returning from suspension, checkpoint_cpu_resume() is * called. In suspend we note that the vCPU is idle. In resume we * pause the vCPU thread until the checkpoint is complete. The reason * for the two-step process is that vCPUs might already be stopped in * the debug server when a checkpoint is requested. This approach * allows us to account for and handle those vCPUs. */ void checkpoint_cpu_suspend(int vcpu) { pthread_mutex_lock(&vcpu_lock); CPU_SET(vcpu, &vcpus_suspended); if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0) pthread_cond_signal(&vcpus_idle); pthread_mutex_unlock(&vcpu_lock); } void checkpoint_cpu_resume(int vcpu) { pthread_mutex_lock(&vcpu_lock); while (checkpoint_active) pthread_cond_wait(&vcpus_can_run, &vcpu_lock); CPU_CLR(vcpu, &vcpus_suspended); pthread_mutex_unlock(&vcpu_lock); } static void vm_vcpu_pause(struct vmctx *ctx) { pthread_mutex_lock(&vcpu_lock); checkpoint_active = true; vm_suspend_all_cpus(ctx); while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0) pthread_cond_wait(&vcpus_idle, &vcpu_lock); pthread_mutex_unlock(&vcpu_lock); } static void vm_vcpu_resume(struct vmctx *ctx) { pthread_mutex_lock(&vcpu_lock); checkpoint_active = false; pthread_mutex_unlock(&vcpu_lock); vm_resume_all_cpus(ctx); pthread_cond_broadcast(&vcpus_can_run); } static int vm_checkpoint(struct vmctx *ctx, int fddir, const char *checkpoint_file, bool stop_vm) { int fd_checkpoint = 0, kdata_fd = 0, fd_meta; int ret = 0; int error = 0; size_t memsz; xo_handle_t *xop = NULL; char *meta_filename = NULL; char *kdata_filename = NULL; FILE *meta_file = NULL; kdata_filename = strcat_extension(checkpoint_file, ".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); return (-1); } kdata_fd = openat(fddir, kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); if (kdata_fd < 0) { perror("Failed to open kernel data snapshot file."); error = -1; goto done; } fd_checkpoint = openat(fddir, checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); if (fd_checkpoint < 0) { perror("Failed to create checkpoint file"); error = -1; goto done; } meta_filename = strcat_extension(checkpoint_file, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct vm metadata filename.\n"); goto done; } fd_meta = openat(fddir, meta_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); if (fd_meta != -1) meta_file = fdopen(fd_meta, "w"); if (meta_file == NULL) { perror("Failed to open vm metadata snapshot file."); close(fd_meta); goto done; } xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY); if (xop == NULL) { perror("Failed to get libxo handle on metadata file."); goto done; } vm_vcpu_pause(ctx); ret = vm_pause_devices(); if (ret != 0) { fprintf(stderr, "Could not pause devices\r\n"); error = ret; goto done; } memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true); if (memsz == 0) { perror("Could not write guest memory to file"); error = -1; goto done; } ret = vm_snapshot_basic_metadata(ctx, xop, memsz); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm basic metadata.\n"); error = -1; goto done; } ret = vm_save_kern_structs(ctx, kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm kernel data.\n"); error = -1; goto done; } ret = vm_snapshot_devices(kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot device state.\n"); error = -1; goto done; } xo_finish_h(xop); if (stop_vm) { vm_destroy(ctx); exit(0); } done: ret = vm_resume_devices(); if (ret != 0) fprintf(stderr, "Could not resume devices\r\n"); vm_vcpu_resume(ctx); if (fd_checkpoint > 0) close(fd_checkpoint); if (meta_filename != NULL) free(meta_filename); if (kdata_filename != NULL) free(kdata_filename); if (xop != NULL) xo_destroy(xop); if (meta_file != NULL) fclose(meta_file); if (kdata_fd > 0) close(kdata_fd); return (error); } static int handle_message(struct vmctx *ctx, nvlist_t *nvl) { const char *cmd; struct ipc_command **ipc_cmd; if (!nvlist_exists_string(nvl, "cmd")) return (EINVAL); cmd = nvlist_get_string(nvl, "cmd"); IPC_COMMAND_FOREACH(ipc_cmd, ipc_cmd_set) { if (strcmp(cmd, (*ipc_cmd)->name) == 0) return ((*ipc_cmd)->handler(ctx, nvl)); } return (EOPNOTSUPP); } /* * Listen for commands from bhyvectl */ void * checkpoint_thread(void *param) { int fd; struct checkpoint_thread_info *thread_info; nvlist_t *nvl; pthread_set_name_np(pthread_self(), "checkpoint thread"); thread_info = (struct checkpoint_thread_info *)param; while ((fd = accept(thread_info->socket_fd, NULL, NULL)) != -1) { nvl = nvlist_recv(fd, 0); if (nvl != NULL) handle_message(thread_info->ctx, nvl); else EPRINTLN("nvlist_recv() failed: %s", strerror(errno)); close(fd); nvlist_destroy(nvl); } return (NULL); } static int vm_do_checkpoint(struct vmctx *ctx, const nvlist_t *nvl) { int error; if (!nvlist_exists_string(nvl, "filename") || !nvlist_exists_bool(nvl, "suspend") || !nvlist_exists_descriptor(nvl, "fddir")) error = EINVAL; else error = vm_checkpoint(ctx, nvlist_get_descriptor(nvl, "fddir"), nvlist_get_string(nvl, "filename"), nvlist_get_bool(nvl, "suspend")); return (error); } IPC_COMMAND(ipc_cmd_set, checkpoint, vm_do_checkpoint); +static int +vm_do_migrate(struct vmctx __unused *ctx, const nvlist_t *nvl) +{ + size_t len; + struct migrate_req req; + + if (!nvlist_exists_string(nvl, "hostname") || + !nvlist_exists_number(nvl, "port")) + return (EINVAL); + + memset(&req, 0, sizeof(struct migrate_req)); + req.port = nvlist_get_number(nvl, "port"); + + len = strlen(nvlist_get_string(nvl, "hostname")); + if (len > MAXHOSTNAMELEN - 1) { + EPRINTLN("Hostname length %lu bigger than maximum allowed %d", + len, MAXHOSTNAMELEN - 1); + return (EINVAL); + } + + strlcpy(req.host, nvlist_get_string(nvl, "hostname"), MAXHOSTNAMELEN); + + printf("%s: IP address used for migration: %s;\n" + "Port used for migration: %d\n", + __func__, + req.host, + req.port); + + // return (vm_send_migrate_req(ctx, req, nvlist_get_bool(nvl, "live"))); + EPRINTLN("Migration operation not implemented yet\n"); + return (EOPNOTSUPP); +} +IPC_COMMAND(ipc_cmd_set, migrate, vm_do_migrate); + void init_snapshot(void) { int err; err = pthread_mutex_init(&vcpu_lock, NULL); if (err != 0) errc(1, err, "checkpoint mutex init"); err = pthread_cond_init(&vcpus_idle, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_idle)"); err = pthread_cond_init(&vcpus_can_run, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_can_run)"); } /* * Create the listening socket for IPC with bhyvectl */ int init_checkpoint_thread(struct vmctx *ctx) { struct checkpoint_thread_info *checkpoint_info = NULL; struct sockaddr_un addr; int socket_fd; pthread_t checkpoint_pthread; int err; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif memset(&addr, 0, sizeof(addr)); socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); if (socket_fd < 0) { EPRINTLN("Socket creation failed: %s", strerror(errno)); err = -1; goto fail; } addr.sun_family = AF_UNIX; snprintf(addr.sun_path, sizeof(addr.sun_path), "%s%s", BHYVE_RUN_DIR, vm_get_name(ctx)); addr.sun_len = SUN_LEN(&addr); unlink(addr.sun_path); if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) { EPRINTLN("Failed to bind socket \"%s\": %s\n", addr.sun_path, strerror(errno)); err = -1; goto fail; } if (listen(socket_fd, 10) < 0) { EPRINTLN("ipc socket listen: %s\n", strerror(errno)); err = errno; goto fail; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_RECV, CAP_WRITE, CAP_SEND, CAP_GETSOCKOPT); if (caph_rights_limit(socket_fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif checkpoint_info = calloc(1, sizeof(*checkpoint_info)); checkpoint_info->ctx = ctx; checkpoint_info->socket_fd = socket_fd; err = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread, checkpoint_info); if (err != 0) goto fail; return (0); fail: free(checkpoint_info); if (socket_fd > 0) close(socket_fd); unlink(addr.sun_path); return (err); } void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) { const char *__op; if (op == VM_SNAPSHOT_SAVE) __op = "save"; else if (op == VM_SNAPSHOT_RESTORE) __op = "restore"; else __op = "unknown"; fprintf(stderr, "%s: snapshot-%s failed for %s\r\n", __func__, __op, bufname); } int vm_snapshot_buf(void *data, size_t data_size, struct vm_snapshot_meta *meta) { struct vm_snapshot_buffer *buffer; int op; buffer = &meta->buffer; op = meta->op; if (buffer->buf_rem < data_size) { fprintf(stderr, "%s: buffer too small\r\n", __func__); return (E2BIG); } if (op == VM_SNAPSHOT_SAVE) memcpy(buffer->buf, data, data_size); else if (op == VM_SNAPSHOT_RESTORE) memcpy(data, buffer->buf, data_size); else return (EINVAL); buffer->buf += data_size; buffer->buf_rem -= data_size; return (0); } size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta) { size_t length; struct vm_snapshot_buffer *buffer; buffer = &meta->buffer; if (buffer->buf_size < buffer->buf_rem) { fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n", __func__, buffer->buf_size, buffer->buf_rem); length = 0; } else { length = buffer->buf_size - buffer->buf_rem; } return (length); } int vm_snapshot_guest2host_addr(struct vmctx *ctx, void **addrp, size_t len, bool restore_null, struct vm_snapshot_meta *meta) { int ret; vm_paddr_t gaddr; if (meta->op == VM_SNAPSHOT_SAVE) { gaddr = paddr_host2guest(ctx, *addrp); if (gaddr == (vm_paddr_t) -1) { if (!restore_null || (restore_null && (*addrp != NULL))) { ret = EFAULT; goto done; } } SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); } else if (meta->op == VM_SNAPSHOT_RESTORE) { SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); if (gaddr == (vm_paddr_t) -1) { if (!restore_null) { ret = EFAULT; goto done; } } *addrp = paddr_guest2host(ctx, gaddr, len); } else { ret = EINVAL; } done: return (ret); } int vm_snapshot_buf_cmp(void *data, size_t data_size, struct vm_snapshot_meta *meta) { struct vm_snapshot_buffer *buffer; int op; int ret; buffer = &meta->buffer; op = meta->op; if (buffer->buf_rem < data_size) { fprintf(stderr, "%s: buffer too small\r\n", __func__); ret = E2BIG; goto done; } if (op == VM_SNAPSHOT_SAVE) { ret = 0; memcpy(buffer->buf, data, data_size); } else if (op == VM_SNAPSHOT_RESTORE) { ret = memcmp(data, buffer->buf, data_size); } else { ret = EINVAL; goto done; } buffer->buf += data_size; buffer->buf_rem -= data_size; done: return (ret); } diff --git a/usr.sbin/bhyvectl/bhyvectl.8 b/usr.sbin/bhyvectl/bhyvectl.8 index 636529a7a53b..9b917f14270e 100644 --- a/usr.sbin/bhyvectl/bhyvectl.8 +++ b/usr.sbin/bhyvectl/bhyvectl.8 @@ -1,114 +1,133 @@ .\" Copyright (c) 2015 Christian Brueffer .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd May 4, 2020 .Dt BHYVECTL 8 .Os .Sh NAME .Nm bhyvectl .Nd "control utility for bhyve instances" .Sh SYNOPSIS .Nm .Fl -vm= Ns Ar .Op Fl -create .Op Fl -destroy .Op Fl -get-stats .Op Fl -inject-nmi .Op Fl -force-reset .Op Fl -force-poweroff .Op Fl -checkpoint= Ns Ar .Op Fl -suspend= Ns Ar +.Oo +.Fl -migrate= Ns Ar host Ns Op Cm \&: Ns Ar port +| +.Fl -migrate-live= Ns Ar host Ns Op Cm \&: Ns Ar port +.Oc .Sh DESCRIPTION The .Nm command is a control utility for active .Xr bhyve 8 virtual machine instances. .Pp .Em Note : Most .Nm flags are intended for querying and setting the state of an active instance. These commands are intended for development purposes, and are not documented here. A complete list can be obtained by executing .Nm without any arguments. .Pp The user-facing options are as follows: .Bl -tag -width ".Fl d Ar argument" .It Fl -vm= Ns Ar Operate on the virtual machine .Ar . .It Fl -create Create the specified VM. .It Fl -destroy Destroy the specified VM. .It Fl -get-stats Retrieve statistics for the specified VM. .It Fl -inject-nmi Inject a non-maskable interrupt (NMI) into the VM. .It Fl -force-reset Force the VM to reset. .It Fl -force-poweroff Force the VM to power off. .It Fl -checkpoint= Ns Ar Save a snapshot of a virtual machine. The guest memory contents are saved in the file given in .Ar . The guest device and vCPU state are saved in the file .Ar .kern . .It Fl -suspend= Ns Ar Save a snapshot of a virtual machine similar to .Fl -checkpoint . The virtual machine will terminate after the snapshot has been saved. +.It Fl -migrate= Ns Ar host Ns Op Cm \&: Ns Ar port +Warm migrate the virtual machine to a +.Ar host +on the specified +.Ar port . +The default migration port is 24983. +The virtual machine will be destroyed after the migration finishes. +.It Fl -migrate-live= Ns Ar host Ns Op Cm \&: Ns Ar port +Live migrate the virtual machine to a +.Ar host +on the specified +.Ar port . +The default migration port is 24983. +The virtual machine will be destroyed after the migration finishes. .El .Sh EXIT STATUS .Ex -std .Sh EXAMPLES Destroy the VM called fbsd10: .Pp .Dl "bhyvectl --vm=fbsd10 --destroy" .Sh COMPATIBILITY The snapshot file format is not yet stable and is subject to future changes. Backwards compatibility support for the current snapshot file format is not guaranteed when future changes are made. .Sh SEE ALSO .Xr bhyve 8 , .Xr bhyveload 8 .Sh HISTORY The .Nm command first appeared in .Fx 10.1 . .Sh AUTHORS .An -nosplit The .Nm utility was written by .An Peter Grehan and .An Neel Natu . diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c index f723ff1f2e82..3c41938d0e18 100644 --- a/usr.sbin/bhyvectl/bhyvectl.c +++ b/usr.sbin/bhyvectl/bhyvectl.c @@ -1,2421 +1,2474 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "amd/vmcb.h" #include "intel/vmcs.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" +#include "migration.h" #endif #define MB (1UL << 20) #define GB (1UL << 30) #define REQ_ARG required_argument #define NO_ARG no_argument #define OPT_ARG optional_argument static const char *progname; static void usage(bool cpu_intel) { (void)fprintf(stderr, "Usage: %s --vm=\n" " [--cpu=]\n" " [--create]\n" " [--destroy]\n" #ifdef BHYVE_SNAPSHOT " [--checkpoint= | --suspend=]\n" + " [--migrate=[:] | --migrate-live=[:]]\n" #endif " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" " [--get-desc-ds]\n" " [--set-desc-es]\n" " [--get-desc-es]\n" " [--set-desc-gs]\n" " [--get-desc-gs]\n" " [--set-desc-fs]\n" " [--get-desc-fs]\n" " [--set-desc-cs]\n" " [--get-desc-cs]\n" " [--set-desc-ss]\n" " [--get-desc-ss]\n" " [--set-desc-tr]\n" " [--get-desc-tr]\n" " [--set-desc-ldtr]\n" " [--get-desc-ldtr]\n" " [--set-desc-gdtr]\n" " [--get-desc-gdtr]\n" " [--set-desc-idtr]\n" " [--get-desc-idtr]\n" " [--run]\n" " [--capname=]\n" " [--getcap]\n" " [--setcap=<0|1>]\n" " [--desc-base=]\n" " [--desc-limit=]\n" " [--desc-access=]\n" " [--set-cr0=]\n" " [--get-cr0]\n" " [--set-cr2=]\n" " [--get-cr2]\n" " [--set-cr3=]\n" " [--get-cr3]\n" " [--set-cr4=]\n" " [--get-cr4]\n" " [--set-dr0=]\n" " [--get-dr0]\n" " [--set-dr1=]\n" " [--get-dr1]\n" " [--set-dr2=]\n" " [--get-dr2]\n" " [--set-dr3=]\n" " [--get-dr3]\n" " [--set-dr6=]\n" " [--get-dr6]\n" " [--set-dr7=]\n" " [--get-dr7]\n" " [--set-rsp=]\n" " [--get-rsp]\n" " [--set-rip=]\n" " [--get-rip]\n" " [--get-rax]\n" " [--set-rax=]\n" " [--get-rbx]\n" " [--get-rcx]\n" " [--get-rdx]\n" " [--get-rsi]\n" " [--get-rdi]\n" " [--get-rbp]\n" " [--get-r8]\n" " [--get-r9]\n" " [--get-r10]\n" " [--get-r11]\n" " [--get-r12]\n" " [--get-r13]\n" " [--get-r14]\n" " [--get-r15]\n" " [--set-rflags=]\n" " [--get-rflags]\n" " [--set-cs]\n" " [--get-cs]\n" " [--set-ds]\n" " [--get-ds]\n" " [--set-es]\n" " [--get-es]\n" " [--set-fs]\n" " [--get-fs]\n" " [--set-gs]\n" " [--get-gs]\n" " [--set-ss]\n" " [--get-ss]\n" " [--get-tr]\n" " [--get-ldtr]\n" " [--set-x2apic-state=]\n" " [--get-x2apic-state]\n" " [--unassign-pptdev=]\n" " [--set-mem=]\n" " [--get-lowmem]\n" " [--get-highmem]\n" " [--get-gpa-pmap]\n" " [--assert-lapic-lvt=]\n" " [--inject-nmi]\n" " [--force-reset]\n" " [--force-poweroff]\n" " [--get-rtc-time]\n" " [--set-rtc-time=]\n" " [--get-rtc-nvram]\n" " [--set-rtc-nvram=]\n" " [--rtc-nvram-offset=]\n" " [--get-active-cpus]\n" " [--get-suspended-cpus]\n" " [--get-intinfo]\n" " [--get-eptp]\n" " [--set-exception-bitmap]\n" " [--get-exception-bitmap]\n" " [--get-tsc-offset]\n" " [--get-guest-pat]\n" " [--get-io-bitmap-address]\n" " [--get-msr-bitmap]\n" " [--get-msr-bitmap-address]\n" " [--get-guest-sysenter]\n" " [--get-exit-reason]\n" " [--get-cpu-topology]\n", progname); if (cpu_intel) { (void)fprintf(stderr, " [--get-vmcs-pinbased-ctls]\n" " [--get-vmcs-procbased-ctls]\n" " [--get-vmcs-procbased-ctls2]\n" " [--get-vmcs-entry-interruption-info]\n" " [--set-vmcs-entry-interruption-info=]\n" " [--get-vmcs-guest-physical-address\n" " [--get-vmcs-guest-linear-address\n" " [--get-vmcs-host-pat]\n" " [--get-vmcs-host-cr0]\n" " [--get-vmcs-host-cr3]\n" " [--get-vmcs-host-cr4]\n" " [--get-vmcs-host-rip]\n" " [--get-vmcs-host-rsp]\n" " [--get-vmcs-cr0-mask]\n" " [--get-vmcs-cr0-shadow]\n" " [--get-vmcs-cr4-mask]\n" " [--get-vmcs-cr4-shadow]\n" " [--get-vmcs-cr3-targets]\n" " [--get-vmcs-apic-access-address]\n" " [--get-vmcs-virtual-apic-address]\n" " [--get-vmcs-tpr-threshold]\n" " [--get-vmcs-vpid]\n" " [--get-vmcs-instruction-error]\n" " [--get-vmcs-exit-ctls]\n" " [--get-vmcs-entry-ctls]\n" " [--get-vmcs-link]\n" " [--get-vmcs-exit-qualification]\n" " [--get-vmcs-exit-interruption-info]\n" " [--get-vmcs-exit-interruption-error]\n" " [--get-vmcs-interruptibility]\n" ); } else { (void)fprintf(stderr, " [--get-vmcb-intercepts]\n" " [--get-vmcb-asid]\n" " [--get-vmcb-exit-details]\n" " [--get-vmcb-tlb-ctrl]\n" " [--get-vmcb-virq]\n" " [--get-avic-apic-bar]\n" " [--get-avic-backing-page]\n" " [--get-avic-table]\n" ); } exit(1); } static int get_rtc_time, set_rtc_time; static int get_rtc_nvram, set_rtc_nvram; static int rtc_nvram_offset; static uint8_t rtc_nvram_value; static time_t rtc_secs; static int get_stats, getcap, setcap, capval, get_gpa_pmap; static int inject_nmi, assert_lapic_lvt; static int force_reset, force_poweroff; static const char *capname; static int create, destroy, get_memmap, get_memseg; static int get_intinfo; static int get_active_cpus, get_suspended_cpus; static uint64_t memsize; static int set_cr0, get_cr0, set_cr2, get_cr2, set_cr3, get_cr3; static int set_cr4, get_cr4; static int set_efer, get_efer; static int set_dr0, get_dr0; static int set_dr1, get_dr1; static int set_dr2, get_dr2; static int set_dr3, get_dr3; static int set_dr6, get_dr6; static int set_dr7, get_dr7; static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; static int set_rax, get_rax; static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp; static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15; static int set_desc_ds, get_desc_ds; static int set_desc_es, get_desc_es; static int set_desc_fs, get_desc_fs; static int set_desc_gs, get_desc_gs; static int set_desc_cs, get_desc_cs; static int set_desc_ss, get_desc_ss; static int set_desc_gdtr, get_desc_gdtr; static int set_desc_idtr, get_desc_idtr; static int set_desc_tr, get_desc_tr; static int set_desc_ldtr, get_desc_ldtr; static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; static int set_x2apic_state, get_x2apic_state; static enum x2apic_state x2apic_state; static int unassign_pptdev, bus, slot, func; static int run; static int get_cpu_topology; #ifdef BHYVE_SNAPSHOT static int vm_suspend_opt; +static int vm_migrate_live; #endif /* * VMCB specific. */ static int get_vmcb_intercept, get_vmcb_exit_details, get_vmcb_tlb_ctrl; static int get_vmcb_virq, get_avic_table; /* * VMCS-specific fields */ static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2; static int get_eptp, get_io_bitmap, get_tsc_offset; static int get_vmcs_entry_interruption_info; static int get_vmcs_interruptibility; static int get_vmcs_gpa, get_vmcs_gla; static int get_exception_bitmap; static int get_cr0_mask, get_cr0_shadow; static int get_cr4_mask, get_cr4_shadow; static int get_cr3_targets; static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; static int get_msr_bitmap, get_msr_bitmap_address; static int get_vpid_asid; static int get_inst_err, get_exit_ctls, get_entry_ctls; static int get_host_cr0, get_host_cr3, get_host_cr4; static int get_host_rip, get_host_rsp; static int get_guest_pat, get_host_pat; static int get_guest_sysenter, get_vmcs_link; static int get_exit_reason, get_vmcs_exit_qualification; static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; static int get_vmcs_exit_inst_length; static uint64_t desc_base; static uint32_t desc_limit, desc_access; static int get_all; static void dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) { printf("vm exit[%d]\n", vcpu); printf("\trip\t\t0x%016lx\n", vmexit->rip); printf("\tinst_length\t%d\n", vmexit->inst_length); switch (vmexit->exitcode) { case VM_EXITCODE_INOUT: printf("\treason\t\tINOUT\n"); printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT"); printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes); printf("\tflags\t\t%s%s\n", vmexit->u.inout.string ? "STRING " : "", vmexit->u.inout.rep ? "REP " : ""); printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); break; case VM_EXITCODE_VMX: printf("\treason\t\tVMX\n"); printf("\tstatus\t\t%d\n", vmexit->u.vmx.status); printf("\texit_reason\t0x%08x (%u)\n", vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason); printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); break; case VM_EXITCODE_SVM: printf("\treason\t\tSVM\n"); printf("\texit_reason\t\t%#lx\n", vmexit->u.svm.exitcode); printf("\texitinfo1\t\t%#lx\n", vmexit->u.svm.exitinfo1); printf("\texitinfo2\t\t%#lx\n", vmexit->u.svm.exitinfo2); break; default: printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); break; } } /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000 #define MSR_AMD6TH_END 0xC0001FFF /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000 #define MSR_AMD7TH_END 0xC0011FFF static const char * msr_name(uint32_t msr) { static char buf[32]; switch(msr) { case MSR_TSC: return ("MSR_TSC"); case MSR_EFER: return ("MSR_EFER"); case MSR_STAR: return ("MSR_STAR"); case MSR_LSTAR: return ("MSR_LSTAR"); case MSR_CSTAR: return ("MSR_CSTAR"); case MSR_SF_MASK: return ("MSR_SF_MASK"); case MSR_FSBASE: return ("MSR_FSBASE"); case MSR_GSBASE: return ("MSR_GSBASE"); case MSR_KGSBASE: return ("MSR_KGSBASE"); case MSR_SYSENTER_CS_MSR: return ("MSR_SYSENTER_CS_MSR"); case MSR_SYSENTER_ESP_MSR: return ("MSR_SYSENTER_ESP_MSR"); case MSR_SYSENTER_EIP_MSR: return ("MSR_SYSENTER_EIP_MSR"); case MSR_PAT: return ("MSR_PAT"); } snprintf(buf, sizeof(buf), "MSR %#08x", msr); return (buf); } static inline void print_msr_pm(uint64_t msr, int vcpu, int readable, int writeable) { if (readable || writeable) { printf("%-20s[%d]\t\t%c%c\n", msr_name(msr), vcpu, readable ? 'R' : '-', writeable ? 'W' : '-'); } } /* * Reference APM vol2, section 15.11 MSR Intercepts. */ static void dump_amd_msr_pm(const char *bitmap, int vcpu) { int byte, bit, readable, writeable; uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 4; bit = (msr % 4) * 2; /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 2048; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, writeable); /* MSR 0xC0010000 to 0xC0011FF is only for AMD */ byte += 4096; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD7TH_START, vcpu, readable, writeable); } } /* * Reference Intel SDM Vol3 Section 24.6.9 MSR-Bitmap Address */ static void dump_intel_msr_pm(const char *bitmap, int vcpu) { int byte, bit, readable, writeable; uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 8; bit = msr & 0x7; /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 1024; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, writeable); } } static int dump_msr_bitmap(int vcpu, uint64_t addr, bool cpu_intel) { char *bitmap; int error, fd, map_size; error = -1; bitmap = MAP_FAILED; fd = open("/dev/mem", O_RDONLY, 0); if (fd < 0) { perror("Couldn't open /dev/mem"); goto done; } if (cpu_intel) map_size = PAGE_SIZE; else map_size = 2 * PAGE_SIZE; bitmap = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, addr); if (bitmap == MAP_FAILED) { perror("mmap failed"); goto done; } if (cpu_intel) dump_intel_msr_pm(bitmap, vcpu); else dump_amd_msr_pm(bitmap, vcpu); error = 0; done: if (bitmap != MAP_FAILED) munmap((void *)bitmap, map_size); if (fd >= 0) close(fd); return (error); } static int vm_get_vmcs_field(struct vcpu *vcpu, int field, uint64_t *ret_val) { return (vm_get_register(vcpu, VMCS_IDENT(field), ret_val)); } static int vm_get_vmcb_field(struct vcpu *vcpu, int off, int bytes, uint64_t *ret_val) { return (vm_get_register(vcpu, VMCB_ACCESS(off, bytes), ret_val)); } enum { VMNAME = 1000, /* avoid collision with return values from getopt */ VCPU, SET_MEM, SET_EFER, SET_CR0, SET_CR2, SET_CR3, SET_CR4, SET_DR0, SET_DR1, SET_DR2, SET_DR3, SET_DR6, SET_DR7, SET_RSP, SET_RIP, SET_RAX, SET_RFLAGS, DESC_BASE, DESC_LIMIT, DESC_ACCESS, SET_CS, SET_DS, SET_ES, SET_FS, SET_GS, SET_SS, SET_TR, SET_LDTR, SET_X2APIC_STATE, SET_CAP, CAPNAME, UNASSIGN_PPTDEV, GET_GPA_PMAP, ASSERT_LAPIC_LVT, SET_RTC_TIME, SET_RTC_NVRAM, RTC_NVRAM_OFFSET, #ifdef BHYVE_SNAPSHOT SET_CHECKPOINT_FILE, SET_SUSPEND_FILE, + MIGRATE_VM, + MIGRATE_VM_LIVE, #endif }; static void print_cpus(const char *banner, const cpuset_t *cpus) { int i, first; first = 1; printf("%s:\t", banner); if (!CPU_EMPTY(cpus)) { for (i = 0; i < CPU_SETSIZE; i++) { if (CPU_ISSET(i, cpus)) { printf("%s%d", first ? " " : ", ", i); first = 0; } } } else printf(" (none)"); printf("\n"); } static void print_intinfo(const char *banner, uint64_t info) { int type; printf("%s:\t", banner); if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; switch (type) { case VM_INTINFO_HWINTR: printf("extint"); break; case VM_INTINFO_NMI: printf("nmi"); break; case VM_INTINFO_SWINTR: printf("swint"); break; default: printf("exception"); break; } printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); if (info & VM_INTINFO_DEL_ERRCODE) printf(" errcode %#x", (u_int)(info >> 32)); } else { printf("n/a"); } printf("\n"); } static bool cpu_vendor_intel(void) { u_int regs[4], v[3]; do_cpuid(0, regs); v[0] = regs[1]; v[1] = regs[3]; v[2] = regs[2]; if (memcmp(v, "GenuineIntel", sizeof(v)) == 0) return (true); if (memcmp(v, "AuthenticAMD", sizeof(v)) == 0 || memcmp(v, "HygonGenuine", sizeof(v)) == 0) return (false); fprintf(stderr, "Unknown cpu vendor \"%s\"\n", (const char *)v); exit(1); } static int get_all_registers(struct vcpu *vcpu, int vcpuid) { uint64_t cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; uint64_t rsp, rip, rflags, efer; uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; uint64_t r8, r9, r10, r11, r12, r13, r14, r15; int error = 0; if (!error && (get_efer || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_EFER, &efer); if (error == 0) printf("efer[%d]\t\t0x%016lx\n", vcpuid, efer); } if (!error && (get_cr0 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_CR0, &cr0); if (error == 0) printf("cr0[%d]\t\t0x%016lx\n", vcpuid, cr0); } if (!error && (get_cr2 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_CR2, &cr2); if (error == 0) printf("cr2[%d]\t\t0x%016lx\n", vcpuid, cr2); } if (!error && (get_cr3 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_CR3, &cr3); if (error == 0) printf("cr3[%d]\t\t0x%016lx\n", vcpuid, cr3); } if (!error && (get_cr4 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_CR4, &cr4); if (error == 0) printf("cr4[%d]\t\t0x%016lx\n", vcpuid, cr4); } if (!error && (get_dr0 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_DR0, &dr0); if (error == 0) printf("dr0[%d]\t\t0x%016lx\n", vcpuid, dr0); } if (!error && (get_dr1 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_DR1, &dr1); if (error == 0) printf("dr1[%d]\t\t0x%016lx\n", vcpuid, dr1); } if (!error && (get_dr2 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_DR2, &dr2); if (error == 0) printf("dr2[%d]\t\t0x%016lx\n", vcpuid, dr2); } if (!error && (get_dr3 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_DR3, &dr3); if (error == 0) printf("dr3[%d]\t\t0x%016lx\n", vcpuid, dr3); } if (!error && (get_dr6 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_DR6, &dr6); if (error == 0) printf("dr6[%d]\t\t0x%016lx\n", vcpuid, dr6); } if (!error && (get_dr7 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_DR7, &dr7); if (error == 0) printf("dr7[%d]\t\t0x%016lx\n", vcpuid, dr7); } if (!error && (get_rsp || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_RSP, &rsp); if (error == 0) printf("rsp[%d]\t\t0x%016lx\n", vcpuid, rsp); } if (!error && (get_rip || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip); if (error == 0) printf("rip[%d]\t\t0x%016lx\n", vcpuid, rip); } if (!error && (get_rax || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_RAX, &rax); if (error == 0) printf("rax[%d]\t\t0x%016lx\n", vcpuid, rax); } if (!error && (get_rbx || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_RBX, &rbx); if (error == 0) printf("rbx[%d]\t\t0x%016lx\n", vcpuid, rbx); } if (!error && (get_rcx || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_RCX, &rcx); if (error == 0) printf("rcx[%d]\t\t0x%016lx\n", vcpuid, rcx); } if (!error && (get_rdx || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_RDX, &rdx); if (error == 0) printf("rdx[%d]\t\t0x%016lx\n", vcpuid, rdx); } if (!error && (get_rsi || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_RSI, &rsi); if (error == 0) printf("rsi[%d]\t\t0x%016lx\n", vcpuid, rsi); } if (!error && (get_rdi || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_RDI, &rdi); if (error == 0) printf("rdi[%d]\t\t0x%016lx\n", vcpuid, rdi); } if (!error && (get_rbp || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_RBP, &rbp); if (error == 0) printf("rbp[%d]\t\t0x%016lx\n", vcpuid, rbp); } if (!error && (get_r8 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_R8, &r8); if (error == 0) printf("r8[%d]\t\t0x%016lx\n", vcpuid, r8); } if (!error && (get_r9 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_R9, &r9); if (error == 0) printf("r9[%d]\t\t0x%016lx\n", vcpuid, r9); } if (!error && (get_r10 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_R10, &r10); if (error == 0) printf("r10[%d]\t\t0x%016lx\n", vcpuid, r10); } if (!error && (get_r11 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_R11, &r11); if (error == 0) printf("r11[%d]\t\t0x%016lx\n", vcpuid, r11); } if (!error && (get_r12 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_R12, &r12); if (error == 0) printf("r12[%d]\t\t0x%016lx\n", vcpuid, r12); } if (!error && (get_r13 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_R13, &r13); if (error == 0) printf("r13[%d]\t\t0x%016lx\n", vcpuid, r13); } if (!error && (get_r14 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_R14, &r14); if (error == 0) printf("r14[%d]\t\t0x%016lx\n", vcpuid, r14); } if (!error && (get_r15 || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_R15, &r15); if (error == 0) printf("r15[%d]\t\t0x%016lx\n", vcpuid, r15); } if (!error && (get_rflags || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error == 0) printf("rflags[%d]\t0x%016lx\n", vcpuid, rflags); } return (error); } static int get_all_segments(struct vcpu *vcpu, int vcpuid) { uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; int error = 0; if (!error && (get_desc_ds || get_all)) { error = vm_get_desc(vcpu, VM_REG_GUEST_DS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_es || get_all)) { error = vm_get_desc(vcpu, VM_REG_GUEST_ES, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_fs || get_all)) { error = vm_get_desc(vcpu, VM_REG_GUEST_FS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_gs || get_all)) { error = vm_get_desc(vcpu, VM_REG_GUEST_GS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_ss || get_all)) { error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_cs || get_all)) { error = vm_get_desc(vcpu, VM_REG_GUEST_CS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_tr || get_all)) { error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_ldtr || get_all)) { error = vm_get_desc(vcpu, VM_REG_GUEST_LDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_gdtr || get_all)) { error = vm_get_desc(vcpu, VM_REG_GUEST_GDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", vcpuid, desc_base, desc_limit); } } if (!error && (get_desc_idtr || get_all)) { error = vm_get_desc(vcpu, VM_REG_GUEST_IDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("idtr[%d]\t\t0x%016lx/0x%08x\n", vcpuid, desc_base, desc_limit); } } if (!error && (get_cs || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_CS, &cs); if (error == 0) printf("cs[%d]\t\t0x%04lx\n", vcpuid, cs); } if (!error && (get_ds || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_DS, &ds); if (error == 0) printf("ds[%d]\t\t0x%04lx\n", vcpuid, ds); } if (!error && (get_es || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_ES, &es); if (error == 0) printf("es[%d]\t\t0x%04lx\n", vcpuid, es); } if (!error && (get_fs || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_FS, &fs); if (error == 0) printf("fs[%d]\t\t0x%04lx\n", vcpuid, fs); } if (!error && (get_gs || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_GS, &gs); if (error == 0) printf("gs[%d]\t\t0x%04lx\n", vcpuid, gs); } if (!error && (get_ss || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_SS, &ss); if (error == 0) printf("ss[%d]\t\t0x%04lx\n", vcpuid, ss); } if (!error && (get_tr || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_TR, &tr); if (error == 0) printf("tr[%d]\t\t0x%04lx\n", vcpuid, tr); } if (!error && (get_ldtr || get_all)) { error = vm_get_register(vcpu, VM_REG_GUEST_LDTR, &ldtr); if (error == 0) printf("ldtr[%d]\t\t0x%04lx\n", vcpuid, ldtr); } return (error); } static int get_misc_vmcs(struct vcpu *vcpu, int vcpuid) { uint64_t ctl, cr0, cr3, cr4, rsp, rip, pat, addr, u64; int error = 0; if (!error && (get_cr0_mask || get_all)) { uint64_t cr0mask; error = vm_get_vmcs_field(vcpu, VMCS_CR0_MASK, &cr0mask); if (error == 0) printf("cr0_mask[%d]\t\t0x%016lx\n", vcpuid, cr0mask); } if (!error && (get_cr0_shadow || get_all)) { uint64_t cr0shadow; error = vm_get_vmcs_field(vcpu, VMCS_CR0_SHADOW, &cr0shadow); if (error == 0) printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpuid, cr0shadow); } if (!error && (get_cr4_mask || get_all)) { uint64_t cr4mask; error = vm_get_vmcs_field(vcpu, VMCS_CR4_MASK, &cr4mask); if (error == 0) printf("cr4_mask[%d]\t\t0x%016lx\n", vcpuid, cr4mask); } if (!error && (get_cr4_shadow || get_all)) { uint64_t cr4shadow; error = vm_get_vmcs_field(vcpu, VMCS_CR4_SHADOW, &cr4shadow); if (error == 0) printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpuid, cr4shadow); } if (!error && (get_cr3_targets || get_all)) { uint64_t target_count, target_addr; error = vm_get_vmcs_field(vcpu, VMCS_CR3_TARGET_COUNT, &target_count); if (error == 0) { printf("cr3_target_count[%d]\t0x%016lx\n", vcpuid, target_count); } error = vm_get_vmcs_field(vcpu, VMCS_CR3_TARGET0, &target_addr); if (error == 0) { printf("cr3_target0[%d]\t\t0x%016lx\n", vcpuid, target_addr); } error = vm_get_vmcs_field(vcpu, VMCS_CR3_TARGET1, &target_addr); if (error == 0) { printf("cr3_target1[%d]\t\t0x%016lx\n", vcpuid, target_addr); } error = vm_get_vmcs_field(vcpu, VMCS_CR3_TARGET2, &target_addr); if (error == 0) { printf("cr3_target2[%d]\t\t0x%016lx\n", vcpuid, target_addr); } error = vm_get_vmcs_field(vcpu, VMCS_CR3_TARGET3, &target_addr); if (error == 0) { printf("cr3_target3[%d]\t\t0x%016lx\n", vcpuid, target_addr); } } if (!error && (get_pinbased_ctls || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_PIN_BASED_CTLS, &ctl); if (error == 0) printf("pinbased_ctls[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_procbased_ctls || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_PRI_PROC_BASED_CTLS, &ctl); if (error == 0) printf("procbased_ctls[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_procbased_ctls2 || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_SEC_PROC_BASED_CTLS, &ctl); if (error == 0) printf("procbased_ctls2[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_vmcs_gla || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_GUEST_LINEAR_ADDRESS, &u64); if (error == 0) printf("gla[%d]\t\t0x%016lx\n", vcpuid, u64); } if (!error && (get_vmcs_gpa || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_GUEST_PHYSICAL_ADDRESS, &u64); if (error == 0) printf("gpa[%d]\t\t0x%016lx\n", vcpuid, u64); } if (!error && (get_vmcs_entry_interruption_info || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_ENTRY_INTR_INFO,&u64); if (error == 0) { printf("entry_interruption_info[%d]\t0x%016lx\n", vcpuid, u64); } } if (!error && (get_tpr_threshold || get_all)) { uint64_t threshold; error = vm_get_vmcs_field(vcpu, VMCS_TPR_THRESHOLD, &threshold); if (error == 0) printf("tpr_threshold[%d]\t0x%016lx\n", vcpuid, threshold); } if (!error && (get_inst_err || get_all)) { uint64_t insterr; error = vm_get_vmcs_field(vcpu, VMCS_INSTRUCTION_ERROR, &insterr); if (error == 0) { printf("instruction_error[%d]\t0x%016lx\n", vcpuid, insterr); } } if (!error && (get_exit_ctls || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_EXIT_CTLS, &ctl); if (error == 0) printf("exit_ctls[%d]\t\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_entry_ctls || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_ENTRY_CTLS, &ctl); if (error == 0) printf("entry_ctls[%d]\t\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_host_pat || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_HOST_IA32_PAT, &pat); if (error == 0) printf("host_pat[%d]\t\t0x%016lx\n", vcpuid, pat); } if (!error && (get_host_cr0 || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_HOST_CR0, &cr0); if (error == 0) printf("host_cr0[%d]\t\t0x%016lx\n", vcpuid, cr0); } if (!error && (get_host_cr3 || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_HOST_CR3, &cr3); if (error == 0) printf("host_cr3[%d]\t\t0x%016lx\n", vcpuid, cr3); } if (!error && (get_host_cr4 || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_HOST_CR4, &cr4); if (error == 0) printf("host_cr4[%d]\t\t0x%016lx\n", vcpuid, cr4); } if (!error && (get_host_rip || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_HOST_RIP, &rip); if (error == 0) printf("host_rip[%d]\t\t0x%016lx\n", vcpuid, rip); } if (!error && (get_host_rsp || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_HOST_RSP, &rsp); if (error == 0) printf("host_rsp[%d]\t\t0x%016lx\n", vcpuid, rsp); } if (!error && (get_vmcs_link || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_LINK_POINTER, &addr); if (error == 0) printf("vmcs_pointer[%d]\t0x%016lx\n", vcpuid, addr); } if (!error && (get_vmcs_exit_interruption_info || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_EXIT_INTR_INFO, &u64); if (error == 0) { printf("vmcs_exit_interruption_info[%d]\t0x%016lx\n", vcpuid, u64); } } if (!error && (get_vmcs_exit_interruption_error || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_EXIT_INTR_ERRCODE, &u64); if (error == 0) { printf("vmcs_exit_interruption_error[%d]\t0x%016lx\n", vcpuid, u64); } } if (!error && (get_vmcs_interruptibility || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_GUEST_INTERRUPTIBILITY, &u64); if (error == 0) { printf("vmcs_guest_interruptibility[%d]\t0x%016lx\n", vcpuid, u64); } } if (!error && (get_vmcs_exit_inst_length || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_EXIT_INSTRUCTION_LENGTH, &u64); if (error == 0) printf("vmcs_exit_inst_length[%d]\t0x%08x\n", vcpuid, (uint32_t)u64); } if (!error && (get_vmcs_exit_qualification || get_all)) { error = vm_get_vmcs_field(vcpu, VMCS_EXIT_QUALIFICATION, &u64); if (error == 0) printf("vmcs_exit_qualification[%d]\t0x%016lx\n", vcpuid, u64); } return (error); } static int get_misc_vmcb(struct vcpu *vcpu, int vcpuid) { uint64_t ctl, addr; int error = 0; if (!error && (get_vmcb_intercept || get_all)) { error = vm_get_vmcb_field(vcpu, VMCB_OFF_CR_INTERCEPT, 4, &ctl); if (error == 0) printf("cr_intercept[%d]\t0x%08x\n", vcpuid, (int)ctl); error = vm_get_vmcb_field(vcpu, VMCB_OFF_DR_INTERCEPT, 4, &ctl); if (error == 0) printf("dr_intercept[%d]\t0x%08x\n", vcpuid, (int)ctl); error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXC_INTERCEPT, 4, &ctl); if (error == 0) printf("exc_intercept[%d]\t0x%08x\n", vcpuid, (int)ctl); error = vm_get_vmcb_field(vcpu, VMCB_OFF_INST1_INTERCEPT, 4, &ctl); if (error == 0) printf("inst1_intercept[%d]\t0x%08x\n", vcpuid, (int)ctl); error = vm_get_vmcb_field(vcpu, VMCB_OFF_INST2_INTERCEPT, 4, &ctl); if (error == 0) printf("inst2_intercept[%d]\t0x%08x\n", vcpuid, (int)ctl); } if (!error && (get_vmcb_tlb_ctrl || get_all)) { error = vm_get_vmcb_field(vcpu, VMCB_OFF_TLB_CTRL, 4, &ctl); if (error == 0) printf("TLB ctrl[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_vmcb_exit_details || get_all)) { error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXITINFO1, 8, &ctl); if (error == 0) printf("exitinfo1[%d]\t0x%016lx\n", vcpuid, ctl); error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXITINFO2, 8, &ctl); if (error == 0) printf("exitinfo2[%d]\t0x%016lx\n", vcpuid, ctl); error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXITINTINFO, 8, &ctl); if (error == 0) printf("exitintinfo[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_vmcb_virq || get_all)) { error = vm_get_vmcb_field(vcpu, VMCB_OFF_VIRQ, 8, &ctl); if (error == 0) printf("v_irq/tpr[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_apic_access_addr || get_all)) { error = vm_get_vmcb_field(vcpu, VMCB_OFF_AVIC_BAR, 8, &addr); if (error == 0) printf("AVIC apic_bar[%d]\t0x%016lx\n", vcpuid, addr); } if (!error && (get_virtual_apic_addr || get_all)) { error = vm_get_vmcb_field(vcpu, VMCB_OFF_AVIC_PAGE, 8, &addr); if (error == 0) printf("AVIC backing page[%d]\t0x%016lx\n", vcpuid, addr); } if (!error && (get_avic_table || get_all)) { error = vm_get_vmcb_field(vcpu, VMCB_OFF_AVIC_LT, 8, &addr); if (error == 0) printf("AVIC logical table[%d]\t0x%016lx\n", vcpuid, addr); error = vm_get_vmcb_field(vcpu, VMCB_OFF_AVIC_PT, 8, &addr); if (error == 0) printf("AVIC physical table[%d]\t0x%016lx\n", vcpuid, addr); } return (error); } static struct option * setup_options(bool cpu_intel) { const struct option common_opts[] = { { "vm", REQ_ARG, 0, VMNAME }, { "cpu", REQ_ARG, 0, VCPU }, { "set-mem", REQ_ARG, 0, SET_MEM }, { "set-efer", REQ_ARG, 0, SET_EFER }, { "set-cr0", REQ_ARG, 0, SET_CR0 }, { "set-cr2", REQ_ARG, 0, SET_CR2 }, { "set-cr3", REQ_ARG, 0, SET_CR3 }, { "set-cr4", REQ_ARG, 0, SET_CR4 }, { "set-dr0", REQ_ARG, 0, SET_DR0 }, { "set-dr1", REQ_ARG, 0, SET_DR1 }, { "set-dr2", REQ_ARG, 0, SET_DR2 }, { "set-dr3", REQ_ARG, 0, SET_DR3 }, { "set-dr6", REQ_ARG, 0, SET_DR6 }, { "set-dr7", REQ_ARG, 0, SET_DR7 }, { "set-rsp", REQ_ARG, 0, SET_RSP }, { "set-rip", REQ_ARG, 0, SET_RIP }, { "set-rax", REQ_ARG, 0, SET_RAX }, { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, { "desc-base", REQ_ARG, 0, DESC_BASE }, { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, { "desc-access",REQ_ARG, 0, DESC_ACCESS }, { "set-cs", REQ_ARG, 0, SET_CS }, { "set-ds", REQ_ARG, 0, SET_DS }, { "set-es", REQ_ARG, 0, SET_ES }, { "set-fs", REQ_ARG, 0, SET_FS }, { "set-gs", REQ_ARG, 0, SET_GS }, { "set-ss", REQ_ARG, 0, SET_SS }, { "set-tr", REQ_ARG, 0, SET_TR }, { "set-ldtr", REQ_ARG, 0, SET_LDTR }, { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, { "capname", REQ_ARG, 0, CAPNAME }, { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, { "setcap", REQ_ARG, 0, SET_CAP }, { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, { "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT }, { "get-rtc-time", NO_ARG, &get_rtc_time, 1 }, { "set-rtc-time", REQ_ARG, 0, SET_RTC_TIME }, { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET }, { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 }, { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM }, { "getcap", NO_ARG, &getcap, 1 }, { "get-stats", NO_ARG, &get_stats, 1 }, { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, { "get-desc-es",NO_ARG, &get_desc_es, 1 }, { "set-desc-es",NO_ARG, &set_desc_es, 1 }, { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, { "get-memmap", NO_ARG, &get_memmap, 1 }, { "get-memseg", NO_ARG, &get_memseg, 1 }, { "get-efer", NO_ARG, &get_efer, 1 }, { "get-cr0", NO_ARG, &get_cr0, 1 }, { "get-cr2", NO_ARG, &get_cr2, 1 }, { "get-cr3", NO_ARG, &get_cr3, 1 }, { "get-cr4", NO_ARG, &get_cr4, 1 }, { "get-dr0", NO_ARG, &get_dr0, 1 }, { "get-dr1", NO_ARG, &get_dr1, 1 }, { "get-dr2", NO_ARG, &get_dr2, 1 }, { "get-dr3", NO_ARG, &get_dr3, 1 }, { "get-dr6", NO_ARG, &get_dr6, 1 }, { "get-dr7", NO_ARG, &get_dr7, 1 }, { "get-rsp", NO_ARG, &get_rsp, 1 }, { "get-rip", NO_ARG, &get_rip, 1 }, { "get-rax", NO_ARG, &get_rax, 1 }, { "get-rbx", NO_ARG, &get_rbx, 1 }, { "get-rcx", NO_ARG, &get_rcx, 1 }, { "get-rdx", NO_ARG, &get_rdx, 1 }, { "get-rsi", NO_ARG, &get_rsi, 1 }, { "get-rdi", NO_ARG, &get_rdi, 1 }, { "get-rbp", NO_ARG, &get_rbp, 1 }, { "get-r8", NO_ARG, &get_r8, 1 }, { "get-r9", NO_ARG, &get_r9, 1 }, { "get-r10", NO_ARG, &get_r10, 1 }, { "get-r11", NO_ARG, &get_r11, 1 }, { "get-r12", NO_ARG, &get_r12, 1 }, { "get-r13", NO_ARG, &get_r13, 1 }, { "get-r14", NO_ARG, &get_r14, 1 }, { "get-r15", NO_ARG, &get_r15, 1 }, { "get-rflags", NO_ARG, &get_rflags, 1 }, { "get-cs", NO_ARG, &get_cs, 1 }, { "get-ds", NO_ARG, &get_ds, 1 }, { "get-es", NO_ARG, &get_es, 1 }, { "get-fs", NO_ARG, &get_fs, 1 }, { "get-gs", NO_ARG, &get_gs, 1 }, { "get-ss", NO_ARG, &get_ss, 1 }, { "get-tr", NO_ARG, &get_tr, 1 }, { "get-ldtr", NO_ARG, &get_ldtr, 1 }, { "get-eptp", NO_ARG, &get_eptp, 1 }, { "get-exception-bitmap", NO_ARG, &get_exception_bitmap, 1 }, { "get-io-bitmap-address", NO_ARG, &get_io_bitmap, 1 }, { "get-tsc-offset", NO_ARG, &get_tsc_offset, 1 }, { "get-msr-bitmap", NO_ARG, &get_msr_bitmap, 1 }, { "get-msr-bitmap-address", NO_ARG, &get_msr_bitmap_address, 1 }, { "get-guest-pat", NO_ARG, &get_guest_pat, 1 }, { "get-guest-sysenter", NO_ARG, &get_guest_sysenter, 1 }, { "get-exit-reason", NO_ARG, &get_exit_reason, 1 }, { "get-x2apic-state", NO_ARG, &get_x2apic_state, 1 }, { "get-all", NO_ARG, &get_all, 1 }, { "run", NO_ARG, &run, 1 }, { "create", NO_ARG, &create, 1 }, { "destroy", NO_ARG, &destroy, 1 }, { "inject-nmi", NO_ARG, &inject_nmi, 1 }, { "force-reset", NO_ARG, &force_reset, 1 }, { "force-poweroff", NO_ARG, &force_poweroff, 1 }, { "get-active-cpus", NO_ARG, &get_active_cpus, 1 }, { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, { "get-intinfo", NO_ARG, &get_intinfo, 1 }, { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, #ifdef BHYVE_SNAPSHOT { "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE}, { "suspend", REQ_ARG, 0, SET_SUSPEND_FILE}, + { "migrate", REQ_ARG, 0, MIGRATE_VM}, + { "migrate-live", REQ_ARG, 0, MIGRATE_VM_LIVE}, #endif }; const struct option intel_opts[] = { { "get-vmcs-pinbased-ctls", NO_ARG, &get_pinbased_ctls, 1 }, { "get-vmcs-procbased-ctls", NO_ARG, &get_procbased_ctls, 1 }, { "get-vmcs-procbased-ctls2", NO_ARG, &get_procbased_ctls2, 1 }, { "get-vmcs-guest-linear-address", NO_ARG, &get_vmcs_gla, 1 }, { "get-vmcs-guest-physical-address", NO_ARG, &get_vmcs_gpa, 1 }, { "get-vmcs-entry-interruption-info", NO_ARG, &get_vmcs_entry_interruption_info, 1}, { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, { "get-vmcs-cr4-shadow", NO_ARG, &get_cr4_shadow, 1 }, { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1 }, { "get-vmcs-tpr-threshold", NO_ARG, &get_tpr_threshold, 1 }, { "get-vmcs-vpid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, { "get-vmcs-entry-ctls", NO_ARG, &get_entry_ctls, 1 }, { "get-vmcs-instruction-error", NO_ARG, &get_inst_err, 1 }, { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, { "get-vmcs-host-cr0", NO_ARG, &get_host_cr0, 1 }, { "get-vmcs-exit-qualification", NO_ARG, &get_vmcs_exit_qualification, 1 }, { "get-vmcs-exit-inst-length", NO_ARG, &get_vmcs_exit_inst_length, 1 }, { "get-vmcs-interruptibility", NO_ARG, &get_vmcs_interruptibility, 1 }, { "get-vmcs-exit-interruption-error", NO_ARG, &get_vmcs_exit_interruption_error, 1 }, { "get-vmcs-exit-interruption-info", NO_ARG, &get_vmcs_exit_interruption_info, 1 }, { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, { "get-vmcs-host-cr3", NO_ARG, &get_host_cr3, 1 }, { "get-vmcs-host-cr4", NO_ARG, &get_host_cr4, 1 }, { "get-vmcs-host-rip", NO_ARG, &get_host_rip, 1 }, { "get-vmcs-host-rsp", NO_ARG, &get_host_rsp, 1 }, { "get-apic-access-address", NO_ARG, &get_apic_access_addr, 1}, { "get-virtual-apic-address", NO_ARG, &get_virtual_apic_addr, 1} }; const struct option amd_opts[] = { { "get-vmcb-intercepts", NO_ARG, &get_vmcb_intercept, 1 }, { "get-vmcb-asid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcb-exit-details", NO_ARG, &get_vmcb_exit_details, 1 }, { "get-vmcb-tlb-ctrl", NO_ARG, &get_vmcb_tlb_ctrl, 1 }, { "get-vmcb-virq", NO_ARG, &get_vmcb_virq, 1 }, { "get-avic-apic-bar", NO_ARG, &get_apic_access_addr, 1 }, { "get-avic-backing-page", NO_ARG, &get_virtual_apic_addr, 1 }, { "get-avic-table", NO_ARG, &get_avic_table, 1 } }; const struct option null_opt = { NULL, 0, NULL, 0 }; struct option *all_opts; char *cp; int optlen; optlen = sizeof(common_opts); if (cpu_intel) optlen += sizeof(intel_opts); else optlen += sizeof(amd_opts); optlen += sizeof(null_opt); all_opts = malloc(optlen); cp = (char *)all_opts; memcpy(cp, common_opts, sizeof(common_opts)); cp += sizeof(common_opts); if (cpu_intel) { memcpy(cp, intel_opts, sizeof(intel_opts)); cp += sizeof(intel_opts); } else { memcpy(cp, amd_opts, sizeof(amd_opts)); cp += sizeof(amd_opts); } memcpy(cp, &null_opt, sizeof(null_opt)); cp += sizeof(null_opt); return (all_opts); } static const char * wday_str(int idx) { static const char *weekdays[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }; if (idx >= 0 && idx < 7) return (weekdays[idx]); else return ("UNK"); } static const char * mon_str(int idx) { static const char *months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; if (idx >= 0 && idx < 12) return (months[idx]); else return ("UNK"); } static int show_memmap(struct vmctx *ctx) { char name[SPECNAMELEN + 1], numbuf[8]; vm_ooffset_t segoff; vm_paddr_t gpa; size_t maplen, seglen; int error, flags, prot, segid, delim; printf("Address Length Segment Offset "); printf("Prot Flags\n"); gpa = 0; while (1) { error = vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen, &prot, &flags); if (error) return (errno == ENOENT ? 0 : error); error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); if (error) return (error); printf("%-12lX", gpa); humanize_number(numbuf, sizeof(numbuf), maplen, "B", HN_AUTOSCALE, HN_NOSPACE); printf("%-12s", numbuf); printf("%-12s", name[0] ? name : "sysmem"); printf("%-12lX", segoff); printf("%c%c%c ", prot & PROT_READ ? 'R' : '-', prot & PROT_WRITE ? 'W' : '-', prot & PROT_EXEC ? 'X' : '-'); delim = '\0'; if (flags & VM_MEMMAP_F_WIRED) { printf("%cwired", delim); delim = '/'; } if (flags & VM_MEMMAP_F_IOMMU) { printf("%ciommu", delim); delim = '/'; } printf("\n"); gpa += maplen; } } static int show_memseg(struct vmctx *ctx) { char name[SPECNAMELEN + 1], numbuf[8]; size_t seglen; int error, segid; printf("ID Length Name\n"); segid = 0; while (1) { error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); if (error) return (errno == EINVAL ? 0 : error); if (seglen) { printf("%-4d", segid); humanize_number(numbuf, sizeof(numbuf), seglen, "B", HN_AUTOSCALE, HN_NOSPACE); printf("%-12s", numbuf); printf("%s", name[0] ? name : "sysmem"); printf("\n"); } segid++; } } #ifdef BHYVE_SNAPSHOT static int send_message(const char *vmname, nvlist_t *nvl) { struct sockaddr_un addr; int err = 0, socket_fd; socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); if (socket_fd < 0) { perror("Error creating bhyvectl socket"); err = errno; goto done; } memset(&addr, 0, sizeof(struct sockaddr_un)); snprintf(addr.sun_path, sizeof(addr.sun_path), "%s%s", BHYVE_RUN_DIR, vmname); addr.sun_family = AF_UNIX; addr.sun_len = SUN_LEN(&addr); if (connect(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) { perror("connect() failed"); err = errno; goto done; } if (nvlist_send(socket_fd, nvl) < 0) { perror("nvlist_send() failed"); err = errno; } done: nvlist_destroy(nvl); if (socket_fd >= 0) close(socket_fd); return (err); } static int open_directory(const char *file) { char *path; int fd; if ((path = strdup(file)) == NULL) return (-1); dirname(path); fd = open(path, O_DIRECTORY); free(path); return (fd); } static int snapshot_request(const char *vmname, char *file, bool suspend) { nvlist_t *nvl; int fd; if ((fd = open_directory(file)) < 0) return (errno); nvl = nvlist_create(0); nvlist_add_string(nvl, "cmd", "checkpoint"); nvlist_add_string(nvl, "filename", basename(file)); nvlist_add_bool(nvl, "suspend", suspend); nvlist_move_descriptor(nvl, "fddir", fd); return (send_message(vmname, nvl)); } -#endif + +static int +migration_request(const char *vmname, const char *migrate_vm, bool live) +{ + nvlist_t *nvl; + char *hostname, *pos; + int rc; + unsigned int port = DEFAULT_MIGRATION_PORT; + + hostname = strdup(migrate_vm); + + if ((pos = strchr(hostname, ':')) != NULL) { + *pos = '\0'; + pos = pos + 1; + + rc = sscanf(pos, "%u", &port); + + if (rc <= 0) { + fprintf(stderr, "Could not parse the port\n"); + free(hostname); + return (EINVAL); + } + } + + nvl = nvlist_create(0); + nvlist_add_string(nvl, "cmd", "migrate"); + nvlist_add_string(nvl, "hostname", hostname); + nvlist_add_number(nvl, "port", port); + nvlist_add_bool(nvl, "live", live); + + free(hostname); + + return (send_message(vmname, nvl)); +} + +#endif /* BHYVE_SNAPSHOT */ int main(int argc, char *argv[]) { char *vmname; int error, ch, vcpuid, ptenum; vm_paddr_t gpa_pmap; struct vm_run vmrun; uint64_t rax, cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; uint64_t rsp, rip, rflags, efer, pat; uint64_t eptp, bm, addr, u64, pteval[4], *pte, info[2]; struct vmctx *ctx; struct vcpu *vcpu; cpuset_t cpus; bool cpu_intel; uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; struct tm tm; struct option *opts; #ifdef BHYVE_SNAPSHOT - char *checkpoint_file = NULL; + char *checkpoint_file = NULL, *migrate_host = NULL; #endif cpu_intel = cpu_vendor_intel(); opts = setup_options(cpu_intel); vcpuid = 0; vmname = NULL; assert_lapic_lvt = -1; progname = basename(argv[0]); while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { switch (ch) { case 0: break; case VMNAME: vmname = optarg; break; case VCPU: vcpuid = atoi(optarg); break; case SET_MEM: memsize = atoi(optarg) * MB; memsize = roundup(memsize, 2 * MB); break; case SET_EFER: efer = strtoul(optarg, NULL, 0); set_efer = 1; break; case SET_CR0: cr0 = strtoul(optarg, NULL, 0); set_cr0 = 1; break; case SET_CR2: cr2 = strtoul(optarg, NULL, 0); set_cr2 = 1; break; case SET_CR3: cr3 = strtoul(optarg, NULL, 0); set_cr3 = 1; break; case SET_CR4: cr4 = strtoul(optarg, NULL, 0); set_cr4 = 1; break; case SET_DR0: dr0 = strtoul(optarg, NULL, 0); set_dr0 = 1; break; case SET_DR1: dr1 = strtoul(optarg, NULL, 0); set_dr1 = 1; break; case SET_DR2: dr2 = strtoul(optarg, NULL, 0); set_dr2 = 1; break; case SET_DR3: dr3 = strtoul(optarg, NULL, 0); set_dr3 = 1; break; case SET_DR6: dr6 = strtoul(optarg, NULL, 0); set_dr6 = 1; break; case SET_DR7: dr7 = strtoul(optarg, NULL, 0); set_dr7 = 1; break; case SET_RSP: rsp = strtoul(optarg, NULL, 0); set_rsp = 1; break; case SET_RIP: rip = strtoul(optarg, NULL, 0); set_rip = 1; break; case SET_RAX: rax = strtoul(optarg, NULL, 0); set_rax = 1; break; case SET_RFLAGS: rflags = strtoul(optarg, NULL, 0); set_rflags = 1; break; case DESC_BASE: desc_base = strtoul(optarg, NULL, 0); break; case DESC_LIMIT: desc_limit = strtoul(optarg, NULL, 0); break; case DESC_ACCESS: desc_access = strtoul(optarg, NULL, 0); break; case SET_CS: cs = strtoul(optarg, NULL, 0); set_cs = 1; break; case SET_DS: ds = strtoul(optarg, NULL, 0); set_ds = 1; break; case SET_ES: es = strtoul(optarg, NULL, 0); set_es = 1; break; case SET_FS: fs = strtoul(optarg, NULL, 0); set_fs = 1; break; case SET_GS: gs = strtoul(optarg, NULL, 0); set_gs = 1; break; case SET_SS: ss = strtoul(optarg, NULL, 0); set_ss = 1; break; case SET_TR: tr = strtoul(optarg, NULL, 0); set_tr = 1; break; case SET_LDTR: ldtr = strtoul(optarg, NULL, 0); set_ldtr = 1; break; case SET_X2APIC_STATE: x2apic_state = strtol(optarg, NULL, 0); set_x2apic_state = 1; break; case SET_CAP: capval = strtoul(optarg, NULL, 0); setcap = 1; break; case SET_RTC_TIME: rtc_secs = strtoul(optarg, NULL, 0); set_rtc_time = 1; break; case SET_RTC_NVRAM: rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0); set_rtc_nvram = 1; break; case RTC_NVRAM_OFFSET: rtc_nvram_offset = strtoul(optarg, NULL, 0); break; case GET_GPA_PMAP: gpa_pmap = strtoul(optarg, NULL, 0); get_gpa_pmap = 1; break; case CAPNAME: capname = optarg; break; case UNASSIGN_PPTDEV: unassign_pptdev = 1; if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) usage(cpu_intel); break; case ASSERT_LAPIC_LVT: assert_lapic_lvt = atoi(optarg); break; #ifdef BHYVE_SNAPSHOT case SET_CHECKPOINT_FILE: case SET_SUSPEND_FILE: if (checkpoint_file != NULL) usage(cpu_intel); checkpoint_file = optarg; vm_suspend_opt = (ch == SET_SUSPEND_FILE); break; + case MIGRATE_VM: + case MIGRATE_VM_LIVE: + if (migrate_host != NULL) + usage(cpu_intel); + + migrate_host = optarg; + vm_migrate_live = (ch == MIGRATE_VM_LIVE); + break; #endif default: usage(cpu_intel); } } argc -= optind; argv += optind; if (vmname == NULL) usage(cpu_intel); error = 0; if (!error && create) error = vm_create(vmname); if (!error) { ctx = vm_open(vmname); if (ctx == NULL) { fprintf(stderr, "vm_open: %s could not be opened: %s\n", vmname, strerror(errno)); exit (1); } vcpu = vm_vcpu_open(ctx, vcpuid); } if (!error && memsize) error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (!error && set_efer) error = vm_set_register(vcpu, VM_REG_GUEST_EFER, efer); if (!error && set_cr0) error = vm_set_register(vcpu, VM_REG_GUEST_CR0, cr0); if (!error && set_cr2) error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2); if (!error && set_cr3) error = vm_set_register(vcpu, VM_REG_GUEST_CR3, cr3); if (!error && set_cr4) error = vm_set_register(vcpu, VM_REG_GUEST_CR4, cr4); if (!error && set_dr0) error = vm_set_register(vcpu, VM_REG_GUEST_DR0, dr0); if (!error && set_dr1) error = vm_set_register(vcpu, VM_REG_GUEST_DR1, dr1); if (!error && set_dr2) error = vm_set_register(vcpu, VM_REG_GUEST_DR2, dr2); if (!error && set_dr3) error = vm_set_register(vcpu, VM_REG_GUEST_DR3, dr3); if (!error && set_dr6) error = vm_set_register(vcpu, VM_REG_GUEST_DR6, dr6); if (!error && set_dr7) error = vm_set_register(vcpu, VM_REG_GUEST_DR7, dr7); if (!error && set_rsp) error = vm_set_register(vcpu, VM_REG_GUEST_RSP, rsp); if (!error && set_rip) error = vm_set_register(vcpu, VM_REG_GUEST_RIP, rip); if (!error && set_rax) error = vm_set_register(vcpu, VM_REG_GUEST_RAX, rax); if (!error && set_rflags) { error = vm_set_register(vcpu, VM_REG_GUEST_RFLAGS, rflags); } if (!error && set_desc_ds) { error = vm_set_desc(vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); } if (!error && set_desc_es) { error = vm_set_desc(vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); } if (!error && set_desc_ss) { error = vm_set_desc(vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); } if (!error && set_desc_cs) { error = vm_set_desc(vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); } if (!error && set_desc_fs) { error = vm_set_desc(vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); } if (!error && set_desc_gs) { error = vm_set_desc(vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); } if (!error && set_desc_tr) { error = vm_set_desc(vcpu, VM_REG_GUEST_TR, desc_base, desc_limit, desc_access); } if (!error && set_desc_ldtr) { error = vm_set_desc(vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); } if (!error && set_desc_gdtr) { error = vm_set_desc(vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, 0); } if (!error && set_desc_idtr) { error = vm_set_desc(vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, 0); } if (!error && set_cs) error = vm_set_register(vcpu, VM_REG_GUEST_CS, cs); if (!error && set_ds) error = vm_set_register(vcpu, VM_REG_GUEST_DS, ds); if (!error && set_es) error = vm_set_register(vcpu, VM_REG_GUEST_ES, es); if (!error && set_fs) error = vm_set_register(vcpu, VM_REG_GUEST_FS, fs); if (!error && set_gs) error = vm_set_register(vcpu, VM_REG_GUEST_GS, gs); if (!error && set_ss) error = vm_set_register(vcpu, VM_REG_GUEST_SS, ss); if (!error && set_tr) error = vm_set_register(vcpu, VM_REG_GUEST_TR, tr); if (!error && set_ldtr) error = vm_set_register(vcpu, VM_REG_GUEST_LDTR, ldtr); if (!error && set_x2apic_state) error = vm_set_x2apic_state(vcpu, x2apic_state); if (!error && unassign_pptdev) error = vm_unassign_pptdev(ctx, bus, slot, func); if (!error && inject_nmi) { error = vm_inject_nmi(vcpu); } if (!error && assert_lapic_lvt != -1) { error = vm_lapic_local_irq(vcpu, assert_lapic_lvt); } if (!error && (get_memseg || get_all)) error = show_memseg(ctx); if (!error && (get_memmap || get_all)) error = show_memmap(ctx); if (!error) error = get_all_registers(vcpu, vcpuid); if (!error) error = get_all_segments(vcpu, vcpuid); if (!error) { if (cpu_intel) error = get_misc_vmcs(vcpu, vcpuid); else error = get_misc_vmcb(vcpu, vcpuid); } if (!error && (get_x2apic_state || get_all)) { error = vm_get_x2apic_state(vcpu, &x2apic_state); if (error == 0) printf("x2apic_state[%d]\t%d\n", vcpuid, x2apic_state); } if (!error && (get_eptp || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(vcpu, VMCS_EPTP, &eptp); else error = vm_get_vmcb_field(vcpu, VMCB_OFF_NPT_BASE, 8, &eptp); if (error == 0) printf("%s[%d]\t\t0x%016lx\n", cpu_intel ? "eptp" : "rvi/npt", vcpuid, eptp); } if (!error && (get_exception_bitmap || get_all)) { if(cpu_intel) error = vm_get_vmcs_field(vcpu, VMCS_EXCEPTION_BITMAP, &bm); else error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXC_INTERCEPT, 4, &bm); if (error == 0) printf("exception_bitmap[%d]\t%#lx\n", vcpuid, bm); } if (!error && (get_io_bitmap || get_all)) { if (cpu_intel) { error = vm_get_vmcs_field(vcpu, VMCS_IO_BITMAP_A, &bm); if (error == 0) printf("io_bitmap_a[%d]\t%#lx\n", vcpuid, bm); error = vm_get_vmcs_field(vcpu, VMCS_IO_BITMAP_B, &bm); if (error == 0) printf("io_bitmap_b[%d]\t%#lx\n", vcpuid, bm); } else { error = vm_get_vmcb_field(vcpu, VMCB_OFF_IO_PERM, 8, &bm); if (error == 0) printf("io_bitmap[%d]\t%#lx\n", vcpuid, bm); } } if (!error && (get_tsc_offset || get_all)) { uint64_t tscoff; if (cpu_intel) error = vm_get_vmcs_field(vcpu, VMCS_TSC_OFFSET, &tscoff); else error = vm_get_vmcb_field(vcpu, VMCB_OFF_TSC_OFFSET, 8, &tscoff); if (error == 0) printf("tsc_offset[%d]\t0x%016lx\n", vcpuid, tscoff); } if (!error && (get_msr_bitmap_address || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(vcpu, VMCS_MSR_BITMAP, &addr); else error = vm_get_vmcb_field(vcpu, VMCB_OFF_MSR_PERM, 8, &addr); if (error == 0) printf("msr_bitmap[%d]\t\t%#lx\n", vcpuid, addr); } if (!error && (get_msr_bitmap || get_all)) { if (cpu_intel) { error = vm_get_vmcs_field(vcpu, VMCS_MSR_BITMAP, &addr); } else { error = vm_get_vmcb_field(vcpu, VMCB_OFF_MSR_PERM, 8, &addr); } if (error == 0) error = dump_msr_bitmap(vcpuid, addr, cpu_intel); } if (!error && (get_vpid_asid || get_all)) { uint64_t vpid; if (cpu_intel) error = vm_get_vmcs_field(vcpu, VMCS_VPID, &vpid); else error = vm_get_vmcb_field(vcpu, VMCB_OFF_ASID, 4, &vpid); if (error == 0) printf("%s[%d]\t\t0x%04lx\n", cpu_intel ? "vpid" : "asid", vcpuid, vpid); } if (!error && (get_guest_pat || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(vcpu, VMCS_GUEST_IA32_PAT, &pat); else error = vm_get_vmcb_field(vcpu, VMCB_OFF_GUEST_PAT, 8, &pat); if (error == 0) printf("guest_pat[%d]\t\t0x%016lx\n", vcpuid, pat); } if (!error && (get_guest_sysenter || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(vcpu, VMCS_GUEST_IA32_SYSENTER_CS, &cs); else error = vm_get_vmcb_field(vcpu, VMCB_OFF_SYSENTER_CS, 8, &cs); if (error == 0) printf("guest_sysenter_cs[%d]\t%#lx\n", vcpuid, cs); if (cpu_intel) error = vm_get_vmcs_field(vcpu, VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); else error = vm_get_vmcb_field(vcpu, VMCB_OFF_SYSENTER_ESP, 8, &rsp); if (error == 0) printf("guest_sysenter_sp[%d]\t%#lx\n", vcpuid, rsp); if (cpu_intel) error = vm_get_vmcs_field(vcpu, VMCS_GUEST_IA32_SYSENTER_EIP, &rip); else error = vm_get_vmcb_field(vcpu, VMCB_OFF_SYSENTER_EIP, 8, &rip); if (error == 0) printf("guest_sysenter_ip[%d]\t%#lx\n", vcpuid, rip); } if (!error && (get_exit_reason || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(vcpu, VMCS_EXIT_REASON, &u64); else error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXIT_REASON, 8, &u64); if (error == 0) printf("exit_reason[%d]\t%#lx\n", vcpuid, u64); } if (!error && setcap) { int captype; captype = vm_capability_name2type(capname); error = vm_set_capability(vcpu, captype, capval); if (error != 0 && errno == ENOENT) printf("Capability \"%s\" is not available\n", capname); } if (!error && get_gpa_pmap) { error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum); if (error == 0) { printf("gpa %#lx:", gpa_pmap); pte = &pteval[0]; while (ptenum-- > 0) printf(" %#lx", *pte++); printf("\n"); } } if (!error && set_rtc_nvram) error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value); if (!error && (get_rtc_nvram || get_all)) { error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value); if (error == 0) { printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset, rtc_nvram_value); } } if (!error && set_rtc_time) error = vm_rtc_settime(ctx, rtc_secs); if (!error && (get_rtc_time || get_all)) { error = vm_rtc_gettime(ctx, &rtc_secs); if (error == 0) { gmtime_r(&rtc_secs, &tm); printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n", rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon), tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, 1900 + tm.tm_year); } } if (!error && (getcap || get_all)) { int captype, val, getcaptype; if (getcap && capname) getcaptype = vm_capability_name2type(capname); else getcaptype = -1; for (captype = 0; captype < VM_CAP_MAX; captype++) { if (getcaptype >= 0 && captype != getcaptype) continue; error = vm_get_capability(vcpu, captype, &val); if (error == 0) { printf("Capability \"%s\" is %s on vcpu %d\n", vm_capability_type2name(captype), val ? "set" : "not set", vcpuid); } else if (errno == ENOENT) { error = 0; printf("Capability \"%s\" is not available\n", vm_capability_type2name(captype)); } else { break; } } } if (!error && (get_active_cpus || get_all)) { error = vm_active_cpus(ctx, &cpus); if (!error) print_cpus("active cpus", &cpus); } if (!error && (get_suspended_cpus || get_all)) { error = vm_suspended_cpus(ctx, &cpus); if (!error) print_cpus("suspended cpus", &cpus); } if (!error && (get_intinfo || get_all)) { error = vm_get_intinfo(vcpu, &info[0], &info[1]); if (!error) { print_intinfo("pending", info[0]); print_intinfo("current", info[1]); } } if (!error && (get_stats || get_all)) { int i, num_stats; uint64_t *stats; struct timeval tv; const char *desc; stats = vm_get_stats(vcpu, &tv, &num_stats); if (stats != NULL) { printf("vcpu%d stats:\n", vcpuid); for (i = 0; i < num_stats; i++) { desc = vm_get_stat_desc(ctx, i); printf("%-40s\t%ld\n", desc, stats[i]); } } } if (!error && (get_cpu_topology || get_all)) { uint16_t sockets, cores, threads, maxcpus; vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); printf("cpu_topology:\tsockets=%hu, cores=%hu, threads=%hu, " "maxcpus=%hu\n", sockets, cores, threads, maxcpus); } if (!error && run) { struct vm_exit vmexit; cpuset_t cpuset; vmrun.vm_exit = &vmexit; vmrun.cpuset = &cpuset; vmrun.cpusetsize = sizeof(cpuset); error = vm_run(vcpu, &vmrun); if (error == 0) dump_vm_run_exitcode(&vmexit, vcpuid); else printf("vm_run error %d\n", error); } if (!error && force_reset) error = vm_suspend(ctx, VM_SUSPEND_RESET); if (!error && force_poweroff) error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); if (error) printf("errno = %d\n", errno); if (!error && destroy) vm_destroy(ctx); #ifdef BHYVE_SNAPSHOT if (!error && checkpoint_file) error = snapshot_request(vmname, checkpoint_file, vm_suspend_opt); + + if (!error && migrate_host) + error = migration_request(vmname, migrate_host, vm_migrate_live); #endif free (opts); exit(error); }