diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index 86039a8c45da..499c0d1bb89d 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -1,137 +1,135 @@ # # .include PROG= bhyve PACKAGE= bhyve MAN= bhyve.8 bhyve_config.5 BHYVE_SYSDIR?=${SRCTOP} .PATH: ${.CURDIR}/${MACHINE_CPUARCH} \ ${SRCTOP}/sys/libkern \ ${SRCTOP}/sys/cam/ctl SRCS= \ acpi.c \ acpi_device.c \ audio.c \ basl.c \ bhyvegc.c \ bhyverun.c \ block_if.c \ bootrom.c \ config.c \ console.c \ crc16.c \ ctl_scsi_all.c \ ctl_util.c \ - e820.c \ gdb.c \ hda_codec.c \ inout.c \ ioapic.c \ iov.c \ kernemu_dev.c \ mem.c \ mevent.c \ - mptbl.c \ net_backends.c \ net_utils.c \ pci_ahci.c \ pci_e82545.c \ pci_emul.c \ pci_fbuf.c \ pci_gvt-d.c \ pci_hda.c \ pci_hostbridge.c \ pci_irq.c \ pci_lpc.c \ pci_nvme.c \ pci_passthru.c \ pci_uart.c \ pci_virtio_9p.c \ pci_virtio_block.c \ pci_virtio_console.c \ pci_virtio_input.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ pci_virtio_scsi.c \ pci_xhci.c \ pctestdev.c \ pm.c \ qemu_fwcfg.c \ qemu_loader.c \ rfb.c \ rtc.c \ smbiostbl.c \ sockstream.c \ spinup_ap.c \ tpm_device.c \ tpm_emul_passthru.c \ tpm_intf_crb.c \ tpm_ppi_qemu.c \ uart_emul.c \ usb_emul.c \ usb_mouse.c \ vga.c \ virtio.c \ vmgenc.c \ xmsr.c .if ${MK_BHYVE_SNAPSHOT} != "no" SRCS+= snapshot.c .endif .include "${MACHINE_CPUARCH}/Makefile.inc" CFLAGS.kernemu_dev.c+= -I${SRCTOP}/sys/amd64 CFLAGS+=-I${.CURDIR} \ -I${.CURDIR}/../../contrib/lib9p \ -I${SRCTOP}/sys LIBADD= vmmapi md nv pthread z util sbuf cam 9p .if ${MK_BHYVE_SNAPSHOT} != "no" LIBADD+= ucl xo .endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET .endif .if ${MK_INET6_SUPPORT} != "no" CFLAGS+=-DINET6 .endif .if ${MK_NETGRAPH_SUPPORT} != "no" CFLAGS+=-DNETGRAPH LIBADD+= netgraph .endif .if ${MK_OPENSSL} == "no" CFLAGS+=-DNO_OPENSSL .else LIBADD+= crypto CFLAGS+=-DOPENSSL_API_COMPAT=0x10100000L .endif CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller .if ${MK_BHYVE_SNAPSHOT} != "no" CFLAGS+= -I${SRCTOP}/contrib/libucl/include CFLAGS+= -DBHYVE_SNAPSHOT .endif .ifdef GDB_LOG CFLAGS+=-DGDB_LOG .endif # Disable thread safety analysis since it only finds very simple bugs and # yields many false positives. NO_WTHREAD_SAFETY= NO_WCAST_ALIGN= .include diff --git a/usr.sbin/bhyve/amd64/Makefile.inc b/usr.sbin/bhyve/amd64/Makefile.inc index c52219cb8dd1..e0cb84e32b85 100644 --- a/usr.sbin/bhyve/amd64/Makefile.inc +++ b/usr.sbin/bhyve/amd64/Makefile.inc @@ -1,13 +1,15 @@ SRCS+= \ atkbdc.c \ + e820.c \ fwctl.c \ + mptbl.c \ post.c \ ps2kbd.c \ ps2mouse.c \ task_switch.c .PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm SRCS+= vmm_instruction_emul.c SUBDIR+= kbdlayout diff --git a/usr.sbin/bhyve/e820.c b/usr.sbin/bhyve/amd64/e820.c similarity index 95% rename from usr.sbin/bhyve/e820.c rename to usr.sbin/bhyve/amd64/e820.c index 99a66645f70f..545878aad39f 100644 --- a/usr.sbin/bhyve/e820.c +++ b/usr.sbin/bhyve/amd64/e820.c @@ -1,468 +1,490 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG * Author: Corvin Köhne */ #include #include #include #include #include #include #include #include #include #include "e820.h" #include "qemu_fwcfg.h" /* * E820 always uses 64 bit entries. Emulation code will use vm_paddr_t since it * works on physical addresses. If vm_paddr_t is larger than uint64_t E820 can't * hold all possible physical addresses and we can get into trouble. 
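 *
 * For reference, each table entry uses the firmware E820 record layout
 * declared in e820.h, a packed tuple that any 64-bit physical address
 * fits into:
 *
 *	struct e820_entry {
 *		uint64_t base;		(first byte of the range)
 *		uint64_t length;	(size in bytes)
 *		uint32_t type;		(enum e820_memory_type value)
 *	} __packed;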
*/ static_assert(sizeof(vm_paddr_t) <= sizeof(uint64_t), "Unable to represent physical memory by E820 table"); #define E820_FWCFG_FILE_NAME "etc/e820" #define KB (1024UL) #define MB (1024 * KB) #define GB (1024 * MB) /* * Fix E820 memory holes: * [ A0000, C0000) VGA * [ C0000, 100000) ROM */ #define E820_VGA_MEM_BASE 0xA0000 #define E820_VGA_MEM_END 0xC0000 #define E820_ROM_MEM_BASE 0xC0000 #define E820_ROM_MEM_END 0x100000 struct e820_element { TAILQ_ENTRY(e820_element) chain; uint64_t base; uint64_t end; enum e820_memory_type type; }; static TAILQ_HEAD(e820_table, e820_element) e820_table = TAILQ_HEAD_INITIALIZER( e820_table); static struct e820_element * e820_element_alloc(uint64_t base, uint64_t end, enum e820_memory_type type) { struct e820_element *element; element = calloc(1, sizeof(*element)); if (element == NULL) { return (NULL); } element->base = base; element->end = end; element->type = type; return (element); } static const char * e820_get_type_name(const enum e820_memory_type type) { switch (type) { case E820_TYPE_MEMORY: return ("RAM"); case E820_TYPE_RESERVED: return ("Reserved"); case E820_TYPE_ACPI: return ("ACPI"); case E820_TYPE_NVS: return ("NVS"); default: return ("Unknown"); } } void e820_dump_table(void) { struct e820_element *element; uint64_t i; fprintf(stderr, "E820 map:\n"); i = 0; TAILQ_FOREACH(element, &e820_table, chain) { fprintf(stderr, " (%4lu) [%16lx, %16lx] %s\n", i, element->base, element->end, e820_get_type_name(element->type)); ++i; } } -struct qemu_fwcfg_item * +static struct qemu_fwcfg_item * e820_get_fwcfg_item(void) { struct qemu_fwcfg_item *fwcfg_item; struct e820_element *element; struct e820_entry *entries; int count, i; count = 0; TAILQ_FOREACH(element, &e820_table, chain) { ++count; } if (count == 0) { warnx("%s: E820 table empty", __func__); return (NULL); } fwcfg_item = calloc(1, sizeof(struct qemu_fwcfg_item)); if (fwcfg_item == NULL) { return (NULL); } fwcfg_item->size = count * sizeof(struct e820_entry); fwcfg_item->data = calloc(count, sizeof(struct e820_entry)); if (fwcfg_item->data == NULL) { free(fwcfg_item); return (NULL); } i = 0; entries = (struct e820_entry *)fwcfg_item->data; TAILQ_FOREACH(element, &e820_table, chain) { struct e820_entry *entry = &entries[i]; entry->base = element->base; entry->length = element->end - element->base; entry->type = element->type; ++i; } return (fwcfg_item); } static int e820_add_entry(const uint64_t base, const uint64_t end, const enum e820_memory_type type) { struct e820_element *new_element; struct e820_element *element; struct e820_element *ram_element; assert(end >= base); new_element = e820_element_alloc(base, end, type); if (new_element == NULL) { return (ENOMEM); } /* * E820 table should always be sorted in ascending order. Therefore, * search for a range whose end is larger than the base parameter. */ TAILQ_FOREACH(element, &e820_table, chain) { if (element->end > base) { break; } } /* * System memory requires special handling. */ if (type == E820_TYPE_MEMORY) { /* * base is larger than of any existing element. Add new system * memory at the end of the table. */ if (element == NULL) { TAILQ_INSERT_TAIL(&e820_table, new_element, chain); return (0); } /* * System memory shouldn't overlap with any existing element. */ assert(end >= element->base); TAILQ_INSERT_BEFORE(element, new_element, chain); return (0); } /* * If some one tries to allocate a specific address, it could happen, that * this address is not allocatable. Therefore, do some checks. 
If the * address is not allocatable, don't panic. The user may have a fallback and * tries to allocate another address. This is true for the GVT-d emulation * which tries to reuse the host address of the graphics stolen memory and * falls back to allocating the highest address below 4 GB. */ if (element == NULL || element->type != E820_TYPE_MEMORY || (base < element->base || end > element->end)) return (ENOMEM); if (base == element->base) { /* * New element at system memory base boundary. Add new * element before current and adjust the base of the old * element. * * Old table: * [ 0x1000, 0x4000] RAM <-- element * New table: * [ 0x1000, 0x2000] Reserved * [ 0x2000, 0x4000] RAM <-- element */ TAILQ_INSERT_BEFORE(element, new_element, chain); element->base = end; } else if (end == element->end) { /* * New element at system memory end boundary. Add new * element after current and adjust the end of the * current element. * * Old table: * [ 0x1000, 0x4000] RAM <-- element * New table: * [ 0x1000, 0x3000] RAM <-- element * [ 0x3000, 0x4000] Reserved */ TAILQ_INSERT_AFTER(&e820_table, element, new_element, chain); element->end = base; } else { /* * New element inside system memory entry. Split it by * adding a system memory element and the new element * before current. * * Old table: * [ 0x1000, 0x4000] RAM <-- element * New table: * [ 0x1000, 0x2000] RAM * [ 0x2000, 0x3000] Reserved * [ 0x3000, 0x4000] RAM <-- element */ ram_element = e820_element_alloc(element->base, base, E820_TYPE_MEMORY); if (ram_element == NULL) { return (ENOMEM); } TAILQ_INSERT_BEFORE(element, ram_element, chain); TAILQ_INSERT_BEFORE(element, new_element, chain); element->base = end; } return (0); } static int e820_add_memory_hole(const uint64_t base, const uint64_t end) { struct e820_element *element; struct e820_element *ram_element; assert(end >= base); /* * E820 table should be always sorted in ascending order. Therefore, * search for an element which end is larger than the base parameter. */ TAILQ_FOREACH(element, &e820_table, chain) { if (element->end > base) { break; } } if (element == NULL || end <= element->base) { /* Nothing to do. Hole already exists */ return (0); } /* Memory holes are only allowed in system memory */ assert(element->type == E820_TYPE_MEMORY); if (base == element->base) { /* * New hole at system memory base boundary. * * Old table: * [ 0x1000, 0x4000] RAM * New table: * [ 0x2000, 0x4000] RAM */ element->base = end; } else if (end == element->end) { /* * New hole at system memory end boundary. * * Old table: * [ 0x1000, 0x4000] RAM * New table: * [ 0x1000, 0x3000] RAM */ element->end = base; } else { /* * New hole inside system memory entry. Split the system memory. * * Old table: * [ 0x1000, 0x4000] RAM <-- element * New table: * [ 0x1000, 0x2000] RAM * [ 0x3000, 0x4000] RAM <-- element */ ram_element = e820_element_alloc(element->base, base, E820_TYPE_MEMORY); if (ram_element == NULL) { return (ENOMEM); } TAILQ_INSERT_BEFORE(element, ram_element, chain); element->base = end; } return (0); } static uint64_t e820_alloc_highest(const uint64_t max_address, const uint64_t length, const uint64_t alignment, const enum e820_memory_type type) { struct e820_element *element; TAILQ_FOREACH_REVERSE(element, &e820_table, e820_table, chain) { uint64_t address, base, end; end = MIN(max_address, element->end); base = roundup2(element->base, alignment); /* * If end - length == 0, we would allocate memory at address 0. This * address is mostly unusable and we should avoid allocating it. 
* Therefore, search for another block in that case. */ if (element->type != E820_TYPE_MEMORY || end < base || end - base < length || end - length == 0) { continue; } address = rounddown2(end - length, alignment); if (e820_add_entry(address, address + length, type) != 0) { return (0); } return (address); } return (0); } static uint64_t e820_alloc_lowest(const uint64_t min_address, const uint64_t length, const uint64_t alignment, const enum e820_memory_type type) { struct e820_element *element; TAILQ_FOREACH(element, &e820_table, chain) { uint64_t base, end; end = element->end; base = MAX(min_address, roundup2(element->base, alignment)); /* * If base == 0, we would allocate memory at address 0. This * address is mostly unusable and we should avoid allocating it. * Therefore, search for another block in that case. */ if (element->type != E820_TYPE_MEMORY || end < base || end - base < length || base == 0) { continue; } if (e820_add_entry(base, base + length, type) != 0) { return (0); } return (base); } return (0); } uint64_t e820_alloc(const uint64_t address, const uint64_t length, const uint64_t alignment, const enum e820_memory_type type, const enum e820_allocation_strategy strategy) { assert(powerof2(alignment)); assert((address & (alignment - 1)) == 0); switch (strategy) { case E820_ALLOCATE_ANY: /* * Allocate any address. Therefore, ignore the address parameter * and reuse the code path for allocating the lowest address. */ return (e820_alloc_lowest(0, length, alignment, type)); case E820_ALLOCATE_LOWEST: return (e820_alloc_lowest(address, length, alignment, type)); case E820_ALLOCATE_HIGHEST: return (e820_alloc_highest(address, length, alignment, type)); case E820_ALLOCATE_SPECIFIC: if (e820_add_entry(address, address + length, type) != 0) { return (0); } return (address); } return (0); } int e820_init(struct vmctx *const ctx) { uint64_t lowmem_size, highmem_size; int error; TAILQ_INIT(&e820_table); lowmem_size = vm_get_lowmem_size(ctx); error = e820_add_entry(0, lowmem_size, E820_TYPE_MEMORY); if (error) { warnx("%s: Could not add lowmem", __func__); return (error); } highmem_size = vm_get_highmem_size(ctx); if (highmem_size != 0) { error = e820_add_entry(4 * GB, 4 * GB + highmem_size, E820_TYPE_MEMORY); if (error) { warnx("%s: Could not add highmem", __func__); return (error); } } error = e820_add_memory_hole(E820_VGA_MEM_BASE, E820_VGA_MEM_END); if (error) { warnx("%s: Could not add VGA memory", __func__); return (error); } error = e820_add_memory_hole(E820_ROM_MEM_BASE, E820_ROM_MEM_END); if (error) { warnx("%s: Could not add ROM area", __func__); return (error); } return (0); } + +int +e820_finalize(void) +{ + struct qemu_fwcfg_item *e820_fwcfg_item; + int error; + + e820_fwcfg_item = e820_get_fwcfg_item(); + if (e820_fwcfg_item == NULL) { + warnx("invalid e820 table"); + return (ENOMEM); + } + error = qemu_fwcfg_add_file("etc/e820", + e820_fwcfg_item->size, e820_fwcfg_item->data); + if (error != 0) { + warnx("could not add qemu fwcfg etc/e820"); + return (error); + } + free(e820_fwcfg_item); + + return (0); +} diff --git a/usr.sbin/bhyve/e820.h b/usr.sbin/bhyve/amd64/e820.h similarity index 95% rename from usr.sbin/bhyve/e820.h rename to usr.sbin/bhyve/amd64/e820.h index 8703a55115cd..ae68fe9040a9 100644 --- a/usr.sbin/bhyve/e820.h +++ b/usr.sbin/bhyve/amd64/e820.h @@ -1,45 +1,45 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Beckhoff Automation GmbH & Co. 
KG * Author: Corvin Köhne */ #pragma once #include #include "qemu_fwcfg.h" enum e820_memory_type { E820_TYPE_MEMORY = 1, E820_TYPE_RESERVED = 2, E820_TYPE_ACPI = 3, E820_TYPE_NVS = 4 }; enum e820_allocation_strategy { /* allocate any address */ E820_ALLOCATE_ANY, /* allocate lowest address larger than address */ E820_ALLOCATE_LOWEST, /* allocate highest address lower than address */ E820_ALLOCATE_HIGHEST, /* allocate a specific address */ E820_ALLOCATE_SPECIFIC }; struct e820_entry { uint64_t base; uint64_t length; uint32_t type; } __packed; #define E820_ALIGNMENT_NONE 1 uint64_t e820_alloc(const uint64_t address, const uint64_t length, const uint64_t alignment, const enum e820_memory_type type, const enum e820_allocation_strategy strategy); void e820_dump_table(void); -struct qemu_fwcfg_item *e820_get_fwcfg_item(void); int e820_init(struct vmctx *const ctx); +int e820_finalize(void); diff --git a/usr.sbin/bhyve/mptbl.c b/usr.sbin/bhyve/amd64/mptbl.c similarity index 100% rename from usr.sbin/bhyve/mptbl.c rename to usr.sbin/bhyve/amd64/mptbl.c diff --git a/usr.sbin/bhyve/mptbl.h b/usr.sbin/bhyve/amd64/mptbl.h similarity index 100% rename from usr.sbin/bhyve/mptbl.h rename to usr.sbin/bhyve/amd64/mptbl.h diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 6bebfddc3985..a61ebcda0003 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1633 +1,1631 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #ifdef __amd64__ #include "amd64/atkbdc.h" #endif #include "bootrom.h" #include "config.h" #include "inout.h" #include "debug.h" -#include "e820.h" #ifdef __amd64__ +#include "amd64/e820.h" #include "amd64/fwctl.h" #endif #include "gdb.h" #include "ioapic.h" #include "kernemu_dev.h" #include "mem.h" #include "mevent.h" -#include "mptbl.h" +#ifdef __amd64__ +#include "amd64/mptbl.h" +#endif #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "qemu_fwcfg.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "tpm_device.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] 
= "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", [EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_run *); int guest_ncpus; uint16_t cpu_cores, cpu_sockets, cpu_threads; int raw_stdio = 0; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); static struct vcpu_info { struct vmctx *ctx; struct vcpu *vcpu; int vcpuid; } *vcpu_info; static cpuset_t **vcpumap; static void usage(int code) { fprintf(stderr, "Usage: %s [-AaCDeHhPSuWwxY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" " -A: create ACPI tables\n" " -a: local apic is in xAPIC mode (deprecated)\n" " -C: include guest memory in core file\n" " -c: number of CPUs and/or topology specification\n" " -D: destroy on power-off\n" " -e: exit on unhandled I/O access\n" " -G: start a debug server\n" " -H: vmexit from the guest on HLT\n" " -h: help\n" " -k: key=value flat config file\n" " -K: PS2 keyboard layout\n" " -l: LPC device configuration\n" " -m: memory size\n" " -o: set config 'var' to 'value'\n" " -P: vmexit from the guest on pause\n" " -p: pin 'vcpu' to 'hostcpu'\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" #endif " -S: guest memory cannot be swapped\n" " -s: PCI slot config\n" " -U: UUID\n" " -u: RTC keeps UTC time\n" " -W: force virtio to use single-vector MSI\n" " -w: ignore unimplemented MSRs\n" " -x: local APIC is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. 
*/ static int topology_parse(const char *opt) { char *cp, *str, *tofree; if (*opt == '\0') { set_config_value("sockets", "1"); set_config_value("cores", "1"); set_config_value("threads", "1"); set_config_value("cpus", "1"); return (0); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) set_config_value("cpus", cp + strlen("cpus=")); else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) set_config_value("sockets", cp + strlen("sockets=")); else if (strncmp(cp, "cores=", strlen("cores=")) == 0) set_config_value("cores", cp + strlen("cores=")); else if (strncmp(cp, "threads=", strlen("threads=")) == 0) set_config_value("threads", cp + strlen("threads=")); else if (strchr(cp, '=') != NULL) goto out; else set_config_value("cpus", cp); } free(tofree); return (0); out: free(tofree); return (-1); } static int parse_int_value(const char *key, const char *value, int minval, int maxval) { char *cp; long lval; errno = 0; lval = strtol(value, &cp, 0); if (errno != 0 || *cp != '\0' || cp == value || lval < minval || lval > maxval) errx(4, "Invalid value for %s: '%s'", key, value); return (lval); } /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. * * The limits of UINT16_MAX are due to the types passed to * vm_set_topology(). vmm.ko may enforce tighter limits. */ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. */ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } static int pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", value != NULL ? 
"," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { return (get_config_bool_default("virtio_msix", true)); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct vcpu_info *vi = param; int error; snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); pthread_set_name_np(pthread_self(), tname); if (vcpumap[vi->vcpuid] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vi->vcpuid]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vi->vcpuid); #endif gdb_cpu_add(vi->vcpu); vm_loop(vi->ctx, vi->vcpu); /* not reached */ exit(1); return (NULL); } static void fbsdrun_addcpu(struct vcpu_info *vi) { pthread_t thr; int error; error = vm_activate_cpu(vi->vcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); CPU_SET_ATOMIC(vi->vcpuid, &cpumask); vm_suspend_cpu(vi->vcpu); error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); assert(error == 0); } static void fbsdrun_deletecpu(int vcpu) { static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; pthread_mutex_lock(&resetcpu_mtx); if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR(vcpu, &cpumask); if (vcpu != BSP) { pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); /* NOTREACHED */ } while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); } static int vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct 
vm_exit *vme; int error; int bytes, port, in; vme = vmrun->vm_exit; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; error = emulate_inout(ctx, vcpu, vme); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vme->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; uint64_t val; uint32_t eax, edx; int error; vme = vmrun->vm_exit; val = 0; error = emulate_rdmsr(vcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; int error; vme = vmrun->vm_exit; error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason, vmexit_vmx_desc(vme->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vme->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(vcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", 
vme->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_mtrap(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; struct vie *vie; int err, i, cs_d; enum vm_cpu_mode mode; vme = vmrun->vm_exit; vie = &vme->u.inst_emul.vie; if (!vie->decoded) { /* * Attempt to decode in userspace as a fallback. This allows * updating instruction decode in bhyve without rebooting the * kernel (rapid prototyping), albeit with much slower * emulation. */ vie_restart(vie); mode = vme->u.inst_emul.paging.cpu_mode; cs_d = vme->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; if (vm_set_register(vcpu, VM_REG_GUEST_RIP, vme->rip + vie->num_processed) != 0) goto fail; } err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vme->rip); return (VMEXIT_ABORT); } static int vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; enum vm_suspend_how how; int vcpuid = vcpu_id(vcpu); vme = vmrun->vm_exit; how = vme->u.suspended.how; fbsdrun_deletecpu(vcpuid); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun __unused) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_suspend(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif /* * XXX-MJ sleep for a short period to avoid chewing up the CPU in the * window between activation of the vCPU thread and the STARTUP IPI. 
*/ usleep(1000); return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { gdb_cpu_breakpoint(vcpu, vmrun->vm_exit); return (VMEXIT_CONTINUE); } static int vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { struct vm_exit *vme; cpuset_t *dmask; int error = -1; int i; dmask = vmrun->cpuset; vme = vmrun->vm_exit; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: CPU_FOREACH_ISSET(i, dmask) { error = vm_suspend_cpu(vcpu_info[i].vcpu); if (error) { warnx("%s: failed to suspend cpu %d\n", __func__, i); break; } } break; case APIC_DELMODE_STARTUP: CPU_FOREACH_ISSET(i, dmask) { spinup_ap(vcpu_info[i].vcpu, vme->u.ipi.vector << PAGE_SHIFT); } error = 0; break; default: break; } return (error); } int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_run *); static const vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, [VM_EXITCODE_IPI] = vmexit_ipi, [VM_EXITCODE_HLT] = vmexit_hlt, [VM_EXITCODE_PAUSE] = vmexit_pause, }; static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu) { struct vm_exit vme; struct vm_run vmrun; int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus, dmask; error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); vmrun.vm_exit = &vme; vmrun.cpuset = &dmask; vmrun.cpusetsize = sizeof(dmask); while (1) { error = vm_run(vcpu, &vmrun); if (error != 0) break; exitcode = vme.exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } rc = (*handler[exitcode])(ctx, vcpu, &vmrun); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. 
*/ error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static void fbsdrun_set_capabilities(struct vcpu *vcpu) { int err, tmp; if (get_config_bool_default("x86.vmexit_on_hlt", false)) { err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1); } if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1); } if (get_config_bool_default("x86.x2apic", false)) err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1); err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1); assert(err == 0); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } static void spinup_vcpu(struct vcpu_info *vi, bool bsp) { int error; if (!bsp) { fbsdrun_set_capabilities(vi->vcpu); /* * Enable the 'unrestricted guest' mode for APs. * * APs startup in power-on 16-bit mode. 
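	 * Without this capability, VT-x requires guests to execute in
	 * paged protected mode, which the APs' initial real-mode state
	 * would violate.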
*/ error = vm_set_capability(vi->vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); } fbsdrun_addcpu(vi); } static bool parse_config_option(const char *option) { const char *value; char *path; value = strchr(option, '='); if (value == NULL || value[1] == '\0') return (false); path = strndup(option, value - option); if (path == NULL) err(4, "Failed to allocate memory"); set_config_value(path, value + 1); return (true); } static void parse_simple_config_file(const char *path) { FILE *fp; char *line, *cp; size_t linecap; unsigned int lineno; fp = fopen(path, "r"); if (fp == NULL) err(4, "Failed to open configuration file %s", path); line = NULL; linecap = 0; lineno = 1; for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { if (*line == '#' || *line == '\n') continue; cp = strchr(line, '\n'); if (cp != NULL) *cp = '\0'; if (!parse_config_option(line)) errx(4, "%s line %u: invalid config option '%s'", path, lineno, line); } free(line); fclose(fp); } static void parse_gdb_options(const char *opt) { const char *sport; char *colon; if (opt[0] == 'w') { set_config_bool("gdb.wait", true); opt++; } colon = strrchr(opt, ':'); if (colon == NULL) { sport = opt; } else { *colon = '\0'; colon++; sport = colon; set_config_value("gdb.address", opt); } set_config_value("gdb.port", sport); } static void set_defaults(void) { set_config_bool("acpi_tables", false); set_config_bool("acpi_tables_in_memory", true); set_config_value("memory.size", "256M"); set_config_bool("x86.strictmsr", true); set_config_value("lpc.fwcfg", "bhyve"); } int main(int argc, char *argv[]) { int c, error; int max_vcpus, memflags; struct vcpu *bsp; struct vmctx *ctx; - struct qemu_fwcfg_item *e820_fwcfg_item; size_t memsize; const char *optstr, *value, *vmname; #ifdef BHYVE_SNAPSHOT char *restore_file; struct restore_state rstate; restore_file = NULL; #endif init_config(); set_defaults(); progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:"; #else optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': set_config_bool("x86.x2apic", false); break; case 'A': set_config_bool("acpi_tables", true); break; case 'D': set_config_bool("destroy_on_poweroff", true); break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': set_config_bool("memory.guest_in_core", true); break; case 'f': if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) { errx(EX_USAGE, "invalid fwcfg item '%s'", optarg); } break; case 'G': parse_gdb_options(optarg); break; case 'k': parse_simple_config_file(optarg); break; case 'K': set_config_value("keyboard.layout", optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': set_config_bool("memory.wired", true); break; case 'm': set_config_value("memory.size", optarg); break; case 'o': if (!parse_config_option(optarg)) errx(EX_USAGE, "invalid configuration option '%s'", optarg); break; case 'H': 
set_config_bool("x86.vmexit_on_hlt", true); break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': set_config_bool("x86.vmexit_on_pause", true); break; case 'e': set_config_bool("x86.strictio", true); break; case 'u': set_config_bool("rtc.use_localtime", false); break; case 'U': set_config_value("uuid", optarg); break; case 'w': set_config_bool("x86.strictmsr", false); break; case 'W': set_config_bool("virtio_msix", false); break; case 'x': set_config_bool("x86.x2apic", true); break; +#ifdef __amd64__ case 'Y': set_config_bool("x86.mptable", false); break; +#endif case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc > 1) usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif bsp = vm_vcpu_open(ctx, BSP); max_vcpus = num_vcpus_allowed(ctx, bsp); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(bsp); /* Allocate per-VCPU resources. 
*/ vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { vcpu_info[vcpuid].ctx = ctx; vcpu_info[vcpuid].vcpuid = vcpuid; if (vcpuid == BSP) vcpu_info[vcpuid].vcpu = bsp; else vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); } memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(guest_ncpus); init_inout(); kernemu_dev_init(); init_bootrom(ctx); #ifdef __amd64__ atkbdc_init(ctx); #endif pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx); sci_init(ctx); if (qemu_fwcfg_init(ctx) != 0) { fprintf(stderr, "qemu fwcfg initialization error"); exit(4); } if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), &guest_ncpus) != 0) { fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu"); exit(4); } +#ifdef __amd64__ if (e820_init(ctx) != 0) { fprintf(stderr, "Unable to setup E820"); exit(4); } +#endif /* * Exit if a device emulation finds an error in its initialization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } if (init_tpm(ctx) != 0) { fprintf(stderr, "Failed to init TPM device"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); init_gdb(ctx); if (lpc_bootrom()) { if (vm_set_capability(bsp, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(bsp); assert(error == 0); } /* * Add all vCPUs. */ for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) spinup_vcpu(&vcpu_info[vcpuid], vcpuid == BSP); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { fprintf(stdout, "Pausing pci devs...\r\n"); if (vm_pause_devices() != 0) { fprintf(stderr, "Failed to pause PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring vm mem...\r\n"); if (restore_vm_mem(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore VM memory.\n"); exit(1); } fprintf(stdout, "Restoring pci devs...\r\n"); if (vm_restore_devices(&rstate) != 0) { fprintf(stderr, "Failed to restore PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring kernel structs...\r\n"); if (vm_restore_kern_structs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore kernel structs.\n"); exit(1); } fprintf(stdout, "Resuming pci devs...\r\n"); if (vm_resume_devices() != 0) { fprintf(stderr, "Failed to resume PCI device state.\n"); exit(1); } } #endif - /* - * build the guest tables, MP etc. 
- */ +#ifdef __amd64__ if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } +#endif error = smbios_build(ctx); if (error != 0) exit(4); if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } - e820_fwcfg_item = e820_get_fwcfg_item(); - if (e820_fwcfg_item == NULL) { - fprintf(stderr, "invalid e820 table"); - exit(4); - } - if (qemu_fwcfg_add_file("etc/e820", e820_fwcfg_item->size, - e820_fwcfg_item->data) != 0) { - fprintf(stderr, "could not add qemu fwcfg etc/e820"); +#ifdef __amd64__ + error = e820_finalize(); + if (error != 0) exit(4); - } - free(e820_fwcfg_item); +#endif #ifdef __amd64__ if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) { fwctl_init(); } #endif /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifdef BHYVE_SNAPSHOT /* initialize mutex/cond variables */ init_snapshot(); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) != 0) errx(EX_OSERR, "Failed to start checkpoint thread"); #endif #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { destroy_restore_state(&rstate); if (vm_restore_time(ctx) < 0) err(EX_OSERR, "Unable to restore time"); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) vm_resume_cpu(vcpu_info[vcpuid].vcpu); } else #endif vm_resume_cpu(bsp); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/pci_gvt-d.c b/usr.sbin/bhyve/pci_gvt-d.c index 78fa9878358e..f64cc5984352 100644 --- a/usr.sbin/bhyve/pci_gvt-d.c +++ b/usr.sbin/bhyve/pci_gvt-d.c @@ -1,304 +1,304 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2020 Beckhoff Automation GmbH & Co. KG * Author: Corvin Köhne */ #include #include #include #include #include #include #include #include #include -#include "e820.h" +#include "amd64/e820.h" #include "pci_gvt-d-opregion.h" #include "pci_passthru.h" #define KB (1024UL) #define MB (1024 * KB) #define GB (1024 * MB) #ifndef _PATH_MEM #define _PATH_MEM "/dev/mem" #endif #define PCI_VENDOR_INTEL 0x8086 #define PCIR_BDSM 0x5C /* Base of Data Stolen Memory register */ #define PCIR_ASLS_CTL 0xFC /* Opregion start address register */ #define PCIM_BDSM_GSM_ALIGNMENT \ 0x00100000 /* Graphics Stolen Memory is 1 MB aligned */ #define GVT_D_MAP_GSM 0 #define GVT_D_MAP_OPREGION 1 static int gvt_d_probe(struct pci_devinst *const pi) { struct passthru_softc *sc; uint16_t vendor; uint8_t class; sc = pi->pi_arg; vendor = pci_host_read_config(passthru_get_sel(sc), PCIR_VENDOR, 0x02); if (vendor != PCI_VENDOR_INTEL) return (ENXIO); class = pci_host_read_config(passthru_get_sel(sc), PCIR_CLASS, 0x01); if (class != PCIC_DISPLAY) return (ENXIO); return (0); } static vm_paddr_t gvt_d_alloc_mmio_memory(const vm_paddr_t host_address, const vm_paddr_t length, const vm_paddr_t alignment, const enum e820_memory_type type) { vm_paddr_t address; /* Try to reuse host address. */ address = e820_alloc(host_address, length, E820_ALIGNMENT_NONE, type, E820_ALLOCATE_SPECIFIC); if (address != 0) { return (address); } /* * We're not able to reuse the host address. Fall back to the highest usable * address below 4 GB. 
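 *
 * As a worked example: with 2 GB of lowmem, i.e. guest RAM at
 * [0x100000, 2 GB), requesting a 4 MB region this way yields
 * rounddown2(2 GB - 4 MB, alignment), the topmost fitting address
 * below 4 GB, instead of failing outright.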
*/ return ( e820_alloc(4 * GB, length, alignment, type, E820_ALLOCATE_HIGHEST)); } /* * Note that the graphics stolen memory is somehow confusing. On the one hand * the Intel Open Source HD Graphics Programmers' Reference Manual states that * it's only GPU accessible. As the CPU can't access the area, the guest * shouldn't need it. On the other hand, the Intel GOP driver refuses to work * properly, if it's not set to a proper address. * * Intel itself maps it into the guest by EPT [1]. At the moment, we're not * aware of any situation where this EPT mapping is required, so we don't do it * yet. * * Intel also states that the Windows driver for Tiger Lake reads the address of * the graphics stolen memory [2]. As the GVT-d code doesn't support Tiger Lake * in its first implementation, we can't check how it behaves. We should keep an * eye on it. * * [1] * https://github.com/projectacrn/acrn-hypervisor/blob/e28d6fbfdfd556ff1bc3ff330e41d4ddbaa0f897/devicemodel/hw/pci/passthrough.c#L655-L657 * [2] * https://github.com/projectacrn/acrn-hypervisor/blob/e28d6fbfdfd556ff1bc3ff330e41d4ddbaa0f897/devicemodel/hw/pci/passthrough.c#L626-L629 */ static int gvt_d_setup_gsm(struct pci_devinst *const pi) { struct passthru_softc *sc; struct passthru_mmio_mapping *gsm; size_t sysctl_len; uint32_t bdsm; int error; sc = pi->pi_arg; gsm = passthru_get_mmio(sc, GVT_D_MAP_GSM); if (gsm == NULL) { warnx("%s: Unable to access gsm", __func__); return (-1); } sysctl_len = sizeof(gsm->hpa); error = sysctlbyname("hw.intel_graphics_stolen_base", &gsm->hpa, &sysctl_len, NULL, 0); if (error) { warn("%s: Unable to get graphics stolen memory base", __func__); return (-1); } sysctl_len = sizeof(gsm->len); error = sysctlbyname("hw.intel_graphics_stolen_size", &gsm->len, &sysctl_len, NULL, 0); if (error) { warn("%s: Unable to get graphics stolen memory length", __func__); return (-1); } gsm->hva = NULL; /* unused */ gsm->gva = NULL; /* unused */ gsm->gpa = gvt_d_alloc_mmio_memory(gsm->hpa, gsm->len, PCIM_BDSM_GSM_ALIGNMENT, E820_TYPE_RESERVED); if (gsm->gpa == 0) { warnx( "%s: Unable to add Graphics Stolen Memory to E820 table (hpa 0x%lx len 0x%lx)", __func__, gsm->hpa, gsm->len); e820_dump_table(); return (-1); } if (gsm->gpa != gsm->hpa) { /* * ACRN source code implies that graphics driver for newer Intel * platforms like Tiger Lake will read the Graphics Stolen Memory * address from an MMIO register. We have three options to solve this * issue: * 1. Patch the value in the MMIO register * This could have unintended side effects. Without any * documentation how this register is used by the GPU, don't do * it. * 2. Trap the MMIO register * It's not possible to trap a single MMIO register. We need to * trap a whole page. Trapping a bunch of MMIO register could * degrade the performance noticeably. We have to test it. * 3. Use an 1:1 host to guest mapping * Maybe not always possible. As far as we know, no supported * platform requires a 1:1 mapping. For that reason, just log a * warning. */ warnx( "Warning: Unable to reuse host address of Graphics Stolen Memory. 
GPU passthrough might not work properly."); } bdsm = pci_host_read_config(passthru_get_sel(sc), PCIR_BDSM, 4); pci_set_cfgdata32(pi, PCIR_BDSM, gsm->gpa | (bdsm & (PCIM_BDSM_GSM_ALIGNMENT - 1))); return (set_pcir_handler(sc, PCIR_BDSM, 4, passthru_cfgread_emulate, passthru_cfgwrite_emulate)); } static int gvt_d_setup_opregion(struct pci_devinst *const pi) { struct passthru_softc *sc; struct passthru_mmio_mapping *opregion; struct igd_opregion_header *header; uint64_t asls; int memfd; sc = pi->pi_arg; memfd = open(_PATH_MEM, O_RDONLY, 0); if (memfd < 0) { warn("%s: Failed to open %s", __func__, _PATH_MEM); return (-1); } opregion = passthru_get_mmio(sc, GVT_D_MAP_OPREGION); if (opregion == NULL) { warnx("%s: Unable to access opregion", __func__); close(memfd); return (-1); } asls = pci_host_read_config(passthru_get_sel(sc), PCIR_ASLS_CTL, 4); header = mmap(NULL, sizeof(*header), PROT_READ, MAP_SHARED, memfd, asls); if (header == MAP_FAILED) { warn("%s: Unable to map OpRegion header", __func__); close(memfd); return (-1); } if (memcmp(header->sign, IGD_OPREGION_HEADER_SIGN, sizeof(header->sign)) != 0) { warnx("%s: Invalid OpRegion signature", __func__); munmap(header, sizeof(*header)); close(memfd); return (-1); } opregion->hpa = asls; opregion->len = header->size * KB; munmap(header, sizeof(*header)); opregion->hva = mmap(NULL, opregion->len, PROT_READ, MAP_SHARED, memfd, opregion->hpa); if (opregion->hva == MAP_FAILED) { warn("%s: Unable to map host OpRegion", __func__); close(memfd); return (-1); } close(memfd); opregion->gpa = gvt_d_alloc_mmio_memory(opregion->hpa, opregion->len, E820_ALIGNMENT_NONE, E820_TYPE_NVS); if (opregion->gpa == 0) { warnx( "%s: Unable to add OpRegion to E820 table (hpa 0x%lx len 0x%lx)", __func__, opregion->hpa, opregion->len); e820_dump_table(); return (-1); } opregion->gva = vm_map_gpa(pi->pi_vmctx, opregion->gpa, opregion->len); if (opregion->gva == NULL) { warnx("%s: Unable to map guest OpRegion", __func__); return (-1); } if (opregion->gpa != opregion->hpa) { /* * A 1:1 host to guest mapping is not required but this could * change in the future. */ warnx( "Warning: Unable to reuse host address of OpRegion. 
static int
gvt_d_init(struct pci_devinst *const pi, nvlist_t *const nvl __unused)
{
	int error;

	if ((error = gvt_d_setup_gsm(pi)) != 0) {
		warnx("%s: Unable to setup Graphics Stolen Memory", __func__);
		goto done;
	}

	if ((error = gvt_d_setup_opregion(pi)) != 0) {
		warnx("%s: Unable to setup OpRegion", __func__);
		goto done;
	}

done:
	return (error);
}

static void
gvt_d_deinit(struct pci_devinst *const pi)
{
	struct passthru_softc *sc;
	struct passthru_mmio_mapping *opregion;

	sc = pi->pi_arg;

	opregion = passthru_get_mmio(sc, GVT_D_MAP_OPREGION);

	/* The HVA is only set if it was successfully initialized. */
	if (opregion->hva)
		munmap((void *)opregion->hva, opregion->len);
}

static struct passthru_dev gvt_d_dev = {
	.probe = gvt_d_probe,
	.init = gvt_d_init,
	.deinit = gvt_d_deinit,
};
PASSTHRU_DEV_SET(gvt_d_dev);
diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c
index 593930c7f8a3..2daef0ef74f0 100644
--- a/usr.sbin/bhyve/snapshot.c
+++ b/usr.sbin/bhyve/snapshot.c
@@ -1,1625 +1,1624 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2016 Flavius Anton
 * Copyright (c) 2016 Mihai Tiganus
 * Copyright (c) 2016-2019 Mihai Carabas
 * Copyright (c) 2017-2019 Darius Mihai
 * Copyright (c) 2017-2019 Elena Mihailescu
 * Copyright (c) 2018-2019 Sergiu Weisz
 * All rights reserved.
 * The bhyve-snapshot feature was developed under sponsorships
 * from Matthew Grooms.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
*/ #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #ifdef __amd64__ #include "amd64/atkbdc.h" #endif #include "debug.h" #include "inout.h" #include "ipc.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" -#include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #include "snapshot.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include #include struct spinner_info { const size_t *crtval; const size_t maxval; const size_t total; }; extern int guest_ncpus; static struct winsize winsize; static sig_t old_winch_handler; #define KB (1024UL) #define MB (1024UL * KB) #define GB (1024UL * MB) #define SNAPSHOT_CHUNK (4 * MB) #define PROG_BUF_SZ (8192) #define SNAPSHOT_BUFFER_SIZE (20 * MB) #define JSON_KERNEL_ARR_KEY "kern_structs" #define JSON_DEV_ARR_KEY "devices" #define JSON_BASIC_METADATA_KEY "basic metadata" #define JSON_SNAPSHOT_REQ_KEY "device" #define JSON_SIZE_KEY "size" #define JSON_FILE_OFFSET_KEY "file_offset" #define JSON_NCPUS_KEY "ncpus" #define JSON_VMNAME_KEY "vmname" #define JSON_MEMSIZE_KEY "memsize" #define JSON_MEMFLAGS_KEY "memflags" #define min(a,b) \ ({ \ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a < _b ? _a : _b; \ }) static const struct vm_snapshot_kern_info snapshot_kern_structs[] = { { "vhpet", STRUCT_VHPET }, { "vm", STRUCT_VM }, { "vioapic", STRUCT_VIOAPIC }, { "vlapic", STRUCT_VLAPIC }, { "vmcx", STRUCT_VMCX }, { "vatpit", STRUCT_VATPIT }, { "vatpic", STRUCT_VATPIC }, { "vpmtmr", STRUCT_VPMTMR }, { "vrtc", STRUCT_VRTC }, }; static cpuset_t vcpus_active, vcpus_suspended; static pthread_mutex_t vcpu_lock; static pthread_cond_t vcpus_idle, vcpus_can_run; static bool checkpoint_active; /* * TODO: Harden this function and all of its callers since 'base_str' is a user * provided string. 
*/ static char * strcat_extension(const char *base_str, const char *ext) { char *res; size_t base_len, ext_len; base_len = strnlen(base_str, NAME_MAX); ext_len = strnlen(ext, NAME_MAX); if (base_len + ext_len > NAME_MAX) { fprintf(stderr, "Filename exceeds maximum length.\n"); return (NULL); } res = malloc(base_len + ext_len + 1); if (res == NULL) { perror("Failed to allocate memory."); return (NULL); } memcpy(res, base_str, base_len); memcpy(res + base_len, ext, ext_len); res[base_len + ext_len] = 0; return (res); } void destroy_restore_state(struct restore_state *rstate) { if (rstate == NULL) { fprintf(stderr, "Attempting to destroy NULL restore struct.\n"); return; } if (rstate->kdata_map != MAP_FAILED) munmap(rstate->kdata_map, rstate->kdata_len); if (rstate->kdata_fd > 0) close(rstate->kdata_fd); if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); if (rstate->meta_root_obj != NULL) ucl_object_unref(rstate->meta_root_obj); if (rstate->meta_parser != NULL) ucl_parser_free(rstate->meta_parser); } static int load_vmmem_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->vmmem_fd = open(filename, O_RDONLY); if (rstate->vmmem_fd < 0) { perror("Failed to open restore file"); return (-1); } err = fstat(rstate->vmmem_fd, &sb); if (err < 0) { perror("Failed to stat restore file"); goto err_load_vmmem; } if (sb.st_size == 0) { fprintf(stderr, "Restore file is empty.\n"); goto err_load_vmmem; } rstate->vmmem_len = sb.st_size; return (0); err_load_vmmem: if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); return (-1); } static int load_kdata_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->kdata_fd = open(filename, O_RDONLY); if (rstate->kdata_fd < 0) { perror("Failed to open kernel data file"); return (-1); } err = fstat(rstate->kdata_fd, &sb); if (err < 0) { perror("Failed to stat kernel data file"); goto err_load_kdata; } if (sb.st_size == 0) { fprintf(stderr, "Kernel data file is empty.\n"); goto err_load_kdata; } rstate->kdata_len = sb.st_size; rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ, MAP_SHARED, rstate->kdata_fd, 0); if (rstate->kdata_map == MAP_FAILED) { perror("Failed to map restore file"); goto err_load_kdata; } return (0); err_load_kdata: if (rstate->kdata_fd > 0) close(rstate->kdata_fd); return (-1); } static int load_metadata_file(const char *filename, struct restore_state *rstate) { ucl_object_t *obj; struct ucl_parser *parser; int err; parser = ucl_parser_new(UCL_PARSER_DEFAULT); if (parser == NULL) { fprintf(stderr, "Failed to initialize UCL parser.\n"); err = -1; goto err_load_metadata; } err = ucl_parser_add_file(parser, filename); if (err == 0) { fprintf(stderr, "Failed to parse metadata file: '%s'\n", filename); err = -1; goto err_load_metadata; } obj = ucl_parser_get_object(parser); if (obj == NULL) { fprintf(stderr, "Failed to parse object.\n"); err = -1; goto err_load_metadata; } rstate->meta_parser = parser; rstate->meta_root_obj = (ucl_object_t *)obj; return (0); err_load_metadata: if (parser != NULL) ucl_parser_free(parser); return (err); } int load_restore_file(const char *filename, struct restore_state *rstate) { int err = 0; char *kdata_filename = NULL, *meta_filename = NULL; assert(filename != NULL); assert(rstate != NULL); memset(rstate, 0, sizeof(*rstate)); rstate->kdata_map = MAP_FAILED; err = load_vmmem_file(filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest RAM file.\n"); goto err_restore; } kdata_filename = strcat_extension(filename, 
".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); goto err_restore; } err = load_kdata_file(kdata_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest kernel data file.\n"); goto err_restore; } meta_filename = strcat_extension(filename, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct kernel metadata filename.\n"); goto err_restore; } err = load_metadata_file(meta_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest metadata file.\n"); goto err_restore; } return (0); err_restore: destroy_restore_state(rstate); if (kdata_filename != NULL) free(kdata_filename); if (meta_filename != NULL) free(meta_filename); return (-1); } #define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_toint_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to int.", key); \ return (ret); \ } \ } while(0) #define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_tostring_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to string.", key); \ return (ret); \ } \ } while(0) static void * lookup_check_dev(const char *dev_name, struct restore_state *rstate, const ucl_object_t *obj, size_t *data_size) { const char *snapshot_req; int64_t size, file_offset; snapshot_req = NULL; JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, &snapshot_req, NULL); assert(snapshot_req != NULL); if (!strcmp(snapshot_req, dev_name)) { JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, &size, NULL); assert(size >= 0); JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, &file_offset, NULL); assert(file_offset >= 0); assert((uint64_t)file_offset + size <= rstate->kdata_len); *data_size = (size_t)size; return ((uint8_t *)rstate->kdata_map + file_offset); } return (NULL); } static void * lookup_dev(const char *dev_name, const char *key, struct restore_state *rstate, size_t *data_size) { const ucl_object_t *devs = NULL, *obj = NULL; ucl_object_iter_t it = NULL; void *ret; devs = ucl_object_lookup(rstate->meta_root_obj, key); if (devs == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_DEV_ARR_KEY); return (NULL); } if (ucl_object_type(devs) != UCL_ARRAY) { fprintf(stderr, "Object '%s' is not an array.\n", JSON_DEV_ARR_KEY); return (NULL); } while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) { ret = lookup_check_dev(dev_name, rstate, obj, data_size); if (ret != NULL) return (ret); } return (NULL); } static const ucl_object_t * lookup_basic_metadata_object(struct restore_state *rstate) { const ucl_object_t *basic_meta_obj = NULL; basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj, JSON_BASIC_METADATA_KEY); if (basic_meta_obj == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } if (ucl_object_type(basic_meta_obj) != UCL_OBJECT) { fprintf(stderr, "Object '%s' is not a JSON object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } return (basic_meta_obj); } const char * lookup_vmname(struct restore_state *rstate) { const char *vmname; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (NULL); 
JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL); return (vmname); } int lookup_memflags(struct restore_state *rstate) { int64_t memflags; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0); return ((int)memflags); } size_t lookup_memsize(struct restore_state *rstate) { int64_t memsize; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0); if (memsize < 0) memsize = 0; return ((size_t)memsize); } int lookup_guest_ncpus(struct restore_state *rstate) { int64_t ncpus; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0); return ((int)ncpus); } static void winch_handler(int signal __unused) { #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ } static int print_progress(size_t crtval, const size_t maxval) { size_t rc; double crtval_gb, maxval_gb; size_t i, win_width, prog_start, prog_done, prog_end; int mval_len; static char prog_buf[PROG_BUF_SZ]; static const size_t len = sizeof(prog_buf); static size_t div; static const char *div_str; static char wip_bar[] = { '/', '-', '\\', '|' }; static int wip_idx = 0; if (maxval == 0) { printf("[0B / 0B]\r\n"); return (0); } if (crtval > maxval) crtval = maxval; if (maxval > 10 * GB) { div = GB; div_str = "GiB"; } else if (maxval > 10 * MB) { div = MB; div_str = "MiB"; } else { div = KB; div_str = "KiB"; } crtval_gb = (double) crtval / div; maxval_gb = (double) maxval / div; rc = snprintf(prog_buf, len, "%.03lf", maxval_gb); if (rc == len) { fprintf(stderr, "Maxval too big\n"); return (-1); } mval_len = rc; rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |", mval_len, crtval_gb, div_str, maxval_gb, div_str); if (rc == len) { fprintf(stderr, "Buffer too small to print progress\n"); return (-1); } win_width = min(winsize.ws_col, len); prog_start = rc; if (prog_start < (win_width - 2)) { prog_end = win_width - prog_start - 2; prog_done = prog_end * (crtval_gb / maxval_gb); for (i = prog_start; i < prog_start + prog_done; i++) prog_buf[i] = '#'; if (crtval != maxval) { prog_buf[i] = wip_bar[wip_idx]; wip_idx = (wip_idx + 1) % sizeof(wip_bar); i++; } else { prog_buf[i++] = '#'; } for (; i < win_width - 2; i++) prog_buf[i] = '_'; prog_buf[win_width - 2] = '|'; } prog_buf[win_width - 1] = '\0'; write(STDOUT_FILENO, prog_buf, win_width); return (0); } static void * snapshot_spinner_cb(void *arg) { int rc; size_t crtval, maxval, total; struct spinner_info *si; struct timespec ts; si = arg; if (si == NULL) pthread_exit(NULL); ts.tv_sec = 0; ts.tv_nsec = 50 * 1000 * 1000; /* 50 ms sleep time */ do { crtval = *si->crtval; maxval = si->maxval; total = si->total; rc = print_progress(crtval, total); if (rc < 0) { fprintf(stderr, "Failed to parse progress\n"); break; } nanosleep(&ts, NULL); } while (crtval < maxval); pthread_exit(NULL); return NULL; } static int vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src, const size_t len, const size_t totalmem, const bool op_wr) { int rc; size_t part_done, todo, rem; ssize_t done; bool show_progress; pthread_t spinner_th; struct spinner_info *si; if (lseek(snapfd, foff, SEEK_SET) < 0) { perror("Failed to change file offset"); return (-1); } show_progress = false; if (isatty(STDIN_FILENO) && (winsize.ws_col != 0)) show_progress = true; part_done = 
	    foff;
	rem = len;

	if (show_progress) {
		si = &(struct spinner_info) {
			.crtval = &part_done,
			.maxval = foff + len,
			.total = totalmem
		};

		rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, si);
		if (rc) {
			perror("Unable to create spinner thread");
			show_progress = false;
		}
	}

	while (rem > 0) {
		if (show_progress)
			todo = min(SNAPSHOT_CHUNK, rem);
		else
			todo = rem;

		if (op_wr)
			done = write(snapfd, src, todo);
		else
			done = read(snapfd, src, todo);

		if (done < 0) {
			perror(op_wr ? "Failed to write to file" :
			    "Failed to read from file");
			return (-1);
		}

		src = (uint8_t *)src + done;
		part_done += done;
		rem -= done;
	}

	if (show_progress) {
		rc = pthread_join(spinner_th, NULL);
		if (rc)
			perror("Unable to end spinner thread");
	}

	return (0);
}

static size_t
vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr)
{
	int ret;
	size_t lowmem, highmem, totalmem;
	char *baseaddr;

	ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem);
	if (ret) {
		fprintf(stderr, "%s: unable to retrieve guest memory size\r\n",
		    __func__);
		return (0);
	}
	totalmem = lowmem + highmem;

	if ((op_wr == false) && (totalmem != memsz)) {
		fprintf(stderr, "%s: mem size mismatch: %ld vs %ld\r\n",
		    __func__, totalmem, memsz);
		return (0);
	}

	winsize.ws_col = 80;
#ifdef TIOCGWINSZ
	ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize);
#endif /* TIOCGWINSZ */
	old_winch_handler = signal(SIGWINCH, winch_handler);

	ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem,
	    totalmem, op_wr);
	if (ret) {
		fprintf(stderr, "%s: Could not %s lowmem\r\n",
		    __func__, op_wr ? "write" : "read");
		totalmem = 0;
		goto done;
	}

	if (highmem == 0)
		goto done;

	ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4 * GB,
	    highmem, totalmem, op_wr);
	if (ret) {
		fprintf(stderr, "%s: Could not %s highmem\r\n",
		    __func__, op_wr ? "write" : "read");
		totalmem = 0;
		goto done;
	}

done:
	printf("\r\n");
	signal(SIGWINCH, old_winch_handler);

	return (totalmem);
}

int
restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate)
{
	size_t restored;

	restored = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len,
	    false);

	if (restored != rstate->vmmem_len)
		return (-1);

	return (0);
}

int
vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate)
{
	for (unsigned i = 0; i < nitems(snapshot_kern_structs); i++) {
		const struct vm_snapshot_kern_info *info;
		struct vm_snapshot_meta *meta;
		void *data;
		size_t size;

		info = &snapshot_kern_structs[i];
		data = lookup_dev(info->struct_name, JSON_KERNEL_ARR_KEY,
		    rstate, &size);
		if (data == NULL)
			errx(EX_DATAERR, "Cannot find kern struct %s",
			    info->struct_name);

		if (size == 0)
			errx(EX_DATAERR, "data with zero size for %s",
			    info->struct_name);

		meta = &(struct vm_snapshot_meta) {
			.dev_name = info->struct_name,
			.dev_req = info->req,

			.buffer.buf_start = data,
			.buffer.buf_size = size,

			.buffer.buf = data,
			.buffer.buf_rem = size,

			.op = VM_SNAPSHOT_RESTORE,
		};

		if (vm_snapshot_req(ctx, meta))
			err(EX_DATAERR, "Failed to restore %s",
			    info->struct_name);
	}
	return (0);
}
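/*
 * A standalone sketch (not part of the build) of the file layout produced by
 * vm_snapshot_mem() above: lowmem is written at file offset 0 and highmem,
 * which lives at guest-physical 4 GiB, follows at file offset `lowmem`. The
 * sizes below are illustrative assumptions.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t gb = 1024ull * 1024 * 1024;
	uint64_t lowmem = 3 * gb;	/* assumed guest lowmem size */
	uint64_t highmem = 5 * gb;	/* assumed guest highmem size */

	/* file offset / guest-physical address / length for each part */
	printf("lowmem:  foff 0x%llx gpa 0x%llx len 0x%llx\n",
	    0ull, 0ull, (unsigned long long)lowmem);
	printf("highmem: foff 0x%llx gpa 0x%llx len 0x%llx\n",
	    (unsigned long long)lowmem, (unsigned long long)(4 * gb),
	    (unsigned long long)highmem);
	return (0);
}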
static int
vm_restore_device(struct restore_state *rstate, vm_snapshot_dev_cb func,
    const char *name, void *data)
{
	void *dev_ptr;
	size_t dev_size;
	int ret;
	struct vm_snapshot_meta *meta;

	dev_ptr = lookup_dev(name, JSON_DEV_ARR_KEY, rstate, &dev_size);
	if (dev_ptr == NULL) {
		EPRINTLN("Failed to lookup dev: %s", name);
		return (EINVAL);
	}

	if (dev_size == 0) {
		EPRINTLN("Restore device size is 0: %s", name);
		return (EINVAL);
	}

	meta = &(struct vm_snapshot_meta) {
		.dev_name = name,
		.dev_data = data,

		.buffer.buf_start = dev_ptr,
		.buffer.buf_size = dev_size,

		.buffer.buf = dev_ptr,
		.buffer.buf_rem = dev_size,

		.op = VM_SNAPSHOT_RESTORE,
	};

	ret = func(meta);
	if (ret != 0) {
		EPRINTLN("Failed to restore dev: %s %d", name, ret);
		return (ret);
	}

	return (0);
}

int
vm_restore_devices(struct restore_state *rstate)
{
	int ret;
	struct pci_devinst *pdi = NULL;

	while ((pdi = pci_next(pdi)) != NULL) {
		ret = vm_restore_device(rstate, pci_snapshot, pdi->pi_name,
		    pdi);
		if (ret)
			return (ret);
	}

#ifdef __amd64__
	ret = vm_restore_device(rstate, atkbdc_snapshot, "atkbdc", NULL);
#else
	ret = 0;
#endif
	return (ret);
}

int
vm_pause_devices(void)
{
	int ret;
	struct pci_devinst *pdi = NULL;

	while ((pdi = pci_next(pdi)) != NULL) {
		ret = pci_pause(pdi);
		if (ret) {
			EPRINTLN("Cannot pause dev %s: %d", pdi->pi_name,
			    ret);
			return (ret);
		}
	}

	return (0);
}

int
vm_resume_devices(void)
{
	int ret;
	struct pci_devinst *pdi = NULL;

	while ((pdi = pci_next(pdi)) != NULL) {
		ret = pci_resume(pdi);
		if (ret) {
			EPRINTLN("Cannot resume '%s': %d", pdi->pi_name, ret);
			return (ret);
		}
	}

	return (0);
}

static int
vm_save_kern_struct(struct vmctx *ctx, int data_fd, xo_handle_t *xop,
    const char *array_key, struct vm_snapshot_meta *meta, off_t *offset)
{
	int ret;
	size_t data_size;
	ssize_t write_cnt;

	ret = vm_snapshot_req(ctx, meta);
	if (ret != 0) {
		fprintf(stderr, "%s: Failed to snapshot struct %s\r\n",
		    __func__, meta->dev_name);
		ret = -1;
		goto done;
	}

	data_size = vm_get_snapshot_size(meta);

	/* XXX-MJ no handling for short writes. */
	write_cnt = write(data_fd, meta->buffer.buf_start, data_size);
	if (write_cnt < 0 || (size_t)write_cnt != data_size) {
		perror("Failed to write all snapshotted data.");
		ret = -1;
		goto done;
	}

	/* Write metadata. */
	xo_open_instance_h(xop, array_key);
	xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name);
	xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size);
	xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset);
	xo_close_instance_h(xop, array_key);

	*offset += data_size;

done:
	return (ret);
}

static int
vm_save_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
{
	int ret, error;
	size_t buf_size, i, offset;
	char *buffer;
	struct vm_snapshot_meta *meta;

	error = 0;
	offset = 0;
	buf_size = SNAPSHOT_BUFFER_SIZE;

	buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char));
	if (buffer == NULL) {
		error = ENOMEM;
		perror("Failed to allocate memory for snapshot buffer");
		goto err_vm_snapshot_kern_data;
	}

	meta = &(struct vm_snapshot_meta) {
		.buffer.buf_start = buffer,
		.buffer.buf_size = buf_size,

		.op = VM_SNAPSHOT_SAVE,
	};

	xo_open_list_h(xop, JSON_KERNEL_ARR_KEY);
	for (i = 0; i < nitems(snapshot_kern_structs); i++) {
		meta->dev_name = snapshot_kern_structs[i].struct_name;
		meta->dev_req = snapshot_kern_structs[i].req;

		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
		meta->buffer.buf = meta->buffer.buf_start;
		meta->buffer.buf_rem = meta->buffer.buf_size;

		ret = vm_save_kern_struct(ctx, data_fd, xop,
		    JSON_KERNEL_ARR_KEY, meta, &offset);
		if (ret != 0) {
			error = -1;
			goto err_vm_snapshot_kern_data;
		}
	}
	xo_close_list_h(xop, JSON_KERNEL_ARR_KEY);

err_vm_snapshot_kern_data:
	if (buffer != NULL)
		free(buffer);
	return (error);
}

static int
vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz)
{
	xo_open_container_h(xop, JSON_BASIC_METADATA_KEY);
	xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus);
	xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vm_get_name(ctx));
	xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz);
	xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", vm_get_memflags(ctx));
	xo_close_container_h(xop, JSON_BASIC_METADATA_KEY);

	return (0);
}
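/*
 * For orientation, the .meta file emitted via libxo above is JSON shaped
 * roughly like the sketch below; the key names come from the JSON_*_KEY
 * defines, while the values here are illustrative, not from a real snapshot:
 *
 *	{
 *	  "basic metadata": {
 *	    "ncpus": 2,
 *	    "vmname": "testvm",
 *	    "memsize": 1073741824,
 *	    "memflags": 0
 *	  },
 *	  "kern_structs": [
 *	    { "device": "vhpet", "size": 280, "file_offset": 0 },
 *	    ...
 *	  ],
 *	  "devices": [
 *	    { "device": "virtio-net", "size": 3584, "file_offset": 8192 },
 *	    ...
 *	  ]
 *	}
 *
 * Each "size"/"file_offset" pair indexes into the flat .kern data file,
 * which is how lookup_dev() earlier locates each blob on restore.
 */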
static int
vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop,
    const char *array_key, struct vm_snapshot_meta *meta, off_t *offset)
{
	ssize_t ret;
	size_t data_size;

	data_size = vm_get_snapshot_size(meta);

	/* XXX-MJ no handling for short writes. */
	ret = write(data_fd, meta->buffer.buf_start, data_size);
	if (ret < 0 || (size_t)ret != data_size) {
		perror("Failed to write all snapshotted data.");
		return (-1);
	}

	/* Write metadata. */
	xo_open_instance_h(xop, array_key);
	xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name);
	xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size);
	xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset);
	xo_close_instance_h(xop, array_key);

	*offset += data_size;

	return (0);
}

static int
vm_snapshot_device(vm_snapshot_dev_cb func, const char *dev_name,
    void *devdata, int data_fd, xo_handle_t *xop,
    struct vm_snapshot_meta *meta, off_t *offset)
{
	int ret;

	memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
	meta->buffer.buf = meta->buffer.buf_start;
	meta->buffer.buf_rem = meta->buffer.buf_size;
	meta->dev_name = dev_name;
	meta->dev_data = devdata;

	ret = func(meta);
	if (ret != 0) {
		EPRINTLN("Failed to snapshot %s; ret=%d", dev_name, ret);
		return (ret);
	}

	ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta,
	    offset);
	if (ret != 0)
		return (ret);

	return (0);
}

static int
vm_snapshot_devices(int data_fd, xo_handle_t *xop)
{
	int ret;
	off_t offset;
	void *buffer;
	size_t buf_size;
	struct vm_snapshot_meta *meta;
	struct pci_devinst *pdi;

	buf_size = SNAPSHOT_BUFFER_SIZE;

	offset = lseek(data_fd, 0, SEEK_CUR);
	if (offset < 0) {
		perror("Failed to get data file current offset.");
		return (-1);
	}

	buffer = malloc(buf_size);
	if (buffer == NULL) {
		perror("Failed to allocate memory for snapshot buffer");
		ret = ENOMEM;
		goto snapshot_err;
	}

	meta = &(struct vm_snapshot_meta) {
		.buffer.buf_start = buffer,
		.buffer.buf_size = buf_size,

		.op = VM_SNAPSHOT_SAVE,
	};

	xo_open_list_h(xop, JSON_DEV_ARR_KEY);

	/* Save PCI devices */
	pdi = NULL;
	while ((pdi = pci_next(pdi)) != NULL) {
		ret = vm_snapshot_device(pci_snapshot, pdi->pi_name, pdi,
		    data_fd, xop, meta, &offset);
		if (ret != 0)
			goto snapshot_err;
	}

#ifdef __amd64__
	ret = vm_snapshot_device(atkbdc_snapshot, "atkbdc", NULL,
	    data_fd, xop, meta, &offset);
#else
	ret = 0;
#endif

	xo_close_list_h(xop, JSON_DEV_ARR_KEY);

snapshot_err:
	if (buffer != NULL)
		free(buffer);
	return (ret);
}

void
checkpoint_cpu_add(int vcpu)
{
	pthread_mutex_lock(&vcpu_lock);
	CPU_SET(vcpu, &vcpus_active);

	if (checkpoint_active) {
		CPU_SET(vcpu, &vcpus_suspended);
		while (checkpoint_active)
			pthread_cond_wait(&vcpus_can_run, &vcpu_lock);
		CPU_CLR(vcpu, &vcpus_suspended);
	}
	pthread_mutex_unlock(&vcpu_lock);
}

/*
 * When a vCPU is suspended for any reason, it calls
 * checkpoint_cpu_suspend(), which records that the vCPU is idle.
 * Before returning from suspension, checkpoint_cpu_resume() is
 * called. In resume we pause the vCPU thread until the checkpoint is
 * complete. The reason for the two-step process is that vCPUs might
 * already be stopped in the debug server when a checkpoint is
 * requested. This approach allows us to account for and handle those
 * vCPUs.
 */
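/*
 * A condensed, standalone sketch of the two-condvar handshake described
 * above (compile with -lpthread). The names and the fixed worker count are
 * illustrative; the real code tracks vCPUs in cpusets and is driven by the
 * checkpoint IPC thread.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NWORKERS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t all_idle = PTHREAD_COND_INITIALIZER;
static pthread_cond_t can_run = PTHREAD_COND_INITIALIZER;
static int nidle;
static bool checkpointing;

static void *
worker(void *arg)
{
	(void)arg;
	/* ... do guest work, then suspend ... */
	pthread_mutex_lock(&lock);
	/* "suspend" half: record that this worker is idle. */
	if (++nidle == NWORKERS)
		pthread_cond_signal(&all_idle);
	/* "resume" half: block until the checkpoint completes. */
	while (checkpointing)
		pthread_cond_wait(&can_run, &lock);
	nidle--;
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t threads[NWORKERS];

	pthread_mutex_lock(&lock);
	checkpointing = true;
	pthread_mutex_unlock(&lock);

	for (int i = 0; i < NWORKERS; i++)
		pthread_create(&threads[i], NULL, worker, NULL);

	pthread_mutex_lock(&lock);
	while (nidle != NWORKERS)	/* wait until every worker is idle */
		pthread_cond_wait(&all_idle, &lock);
	printf("all workers idle; checkpoint would run here\n");
	checkpointing = false;
	pthread_mutex_unlock(&lock);
	pthread_cond_broadcast(&can_run);

	for (int i = 0; i < NWORKERS; i++)
		pthread_join(threads[i], NULL);
	return (0);
}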
void
checkpoint_cpu_suspend(int vcpu)
{
	pthread_mutex_lock(&vcpu_lock);
	CPU_SET(vcpu, &vcpus_suspended);
	if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0)
		pthread_cond_signal(&vcpus_idle);
	pthread_mutex_unlock(&vcpu_lock);
}

void
checkpoint_cpu_resume(int vcpu)
{
	pthread_mutex_lock(&vcpu_lock);
	while (checkpoint_active)
		pthread_cond_wait(&vcpus_can_run, &vcpu_lock);
	CPU_CLR(vcpu, &vcpus_suspended);
	pthread_mutex_unlock(&vcpu_lock);
}

static void
vm_vcpu_pause(struct vmctx *ctx)
{
	pthread_mutex_lock(&vcpu_lock);
	checkpoint_active = true;
	vm_suspend_all_cpus(ctx);
	while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0)
		pthread_cond_wait(&vcpus_idle, &vcpu_lock);
	pthread_mutex_unlock(&vcpu_lock);
}

static void
vm_vcpu_resume(struct vmctx *ctx)
{
	pthread_mutex_lock(&vcpu_lock);
	checkpoint_active = false;
	pthread_mutex_unlock(&vcpu_lock);
	vm_resume_all_cpus(ctx);
	pthread_cond_broadcast(&vcpus_can_run);
}

static int
vm_checkpoint(struct vmctx *ctx, int fddir, const char *checkpoint_file,
    bool stop_vm)
{
	int fd_checkpoint = 0, kdata_fd = 0, fd_meta;
	int ret = 0;
	int error = 0;
	size_t memsz;
	xo_handle_t *xop = NULL;
	char *meta_filename = NULL;
	char *kdata_filename = NULL;
	FILE *meta_file = NULL;

	kdata_filename = strcat_extension(checkpoint_file, ".kern");
	if (kdata_filename == NULL) {
		fprintf(stderr, "Failed to construct kernel data filename.\n");
		return (-1);
	}

	kdata_fd = openat(fddir, kdata_filename, O_WRONLY | O_CREAT | O_TRUNC,
	    0700);
	if (kdata_fd < 0) {
		perror("Failed to open kernel data snapshot file.");
		error = -1;
		goto done;
	}

	fd_checkpoint = openat(fddir, checkpoint_file,
	    O_RDWR | O_CREAT | O_TRUNC, 0700);
	if (fd_checkpoint < 0) {
		perror("Failed to create checkpoint file");
		error = -1;
		goto done;
	}

	meta_filename = strcat_extension(checkpoint_file, ".meta");
	if (meta_filename == NULL) {
		fprintf(stderr, "Failed to construct vm metadata filename.\n");
		error = -1;
		goto done;
	}

	fd_meta = openat(fddir, meta_filename, O_WRONLY | O_CREAT | O_TRUNC,
	    0700);
	if (fd_meta != -1)
		meta_file = fdopen(fd_meta, "w");
	if (meta_file == NULL) {
		perror("Failed to open vm metadata snapshot file.");
		close(fd_meta);
		error = -1;
		goto done;
	}

	xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY);
	if (xop == NULL) {
		perror("Failed to get libxo handle on metadata file.");
		error = -1;
		goto done;
	}

	vm_vcpu_pause(ctx);

	ret = vm_pause_devices();
	if (ret != 0) {
		fprintf(stderr, "Could not pause devices\r\n");
		error = ret;
		goto done;
	}

	memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true);
	if (memsz == 0) {
		perror("Could not write guest memory to file");
		error = -1;
		goto done;
	}

	ret = vm_snapshot_basic_metadata(ctx, xop, memsz);
	if (ret != 0) {
		fprintf(stderr, "Failed to snapshot vm basic metadata.\n");
		error = -1;
		goto done;
	}

	ret = vm_save_kern_structs(ctx, kdata_fd, xop);
	if (ret != 0) {
		fprintf(stderr, "Failed to snapshot vm kernel data.\n");
		error = -1;
		goto done;
	}

	ret = vm_snapshot_devices(kdata_fd, xop);
	if (ret != 0) {
		fprintf(stderr, "Failed to snapshot device state.\n");
		error = -1;
		goto done;
	}

	xo_finish_h(xop);

	if (stop_vm) {
		vm_destroy(ctx);
		exit(0);
	}

done:
	ret = vm_resume_devices();
	if (ret != 0)
		fprintf(stderr, "Could not resume devices\r\n");
	vm_vcpu_resume(ctx);
	if (fd_checkpoint > 0)
		close(fd_checkpoint);
	if (meta_filename != NULL)
		free(meta_filename);
	if (kdata_filename != NULL)
		free(kdata_filename);
	if (xop != NULL)
		xo_destroy(xop);
	if (meta_file != NULL)
		fclose(meta_file);
	if (kdata_fd > 0)
		close(kdata_fd);
	return (error);
}
static int
handle_message(struct vmctx *ctx, nvlist_t *nvl)
{
	const char *cmd;
	struct ipc_command **ipc_cmd;

	if (!nvlist_exists_string(nvl, "cmd"))
		return (EINVAL);

	cmd = nvlist_get_string(nvl, "cmd");
	IPC_COMMAND_FOREACH(ipc_cmd, ipc_cmd_set) {
		if (strcmp(cmd, (*ipc_cmd)->name) == 0)
			return ((*ipc_cmd)->handler(ctx, nvl));
	}

	return (EOPNOTSUPP);
}

/*
 * Listen for commands from bhyvectl
 */
void *
checkpoint_thread(void *param)
{
	int fd;
	struct checkpoint_thread_info *thread_info;
	nvlist_t *nvl;

	pthread_set_name_np(pthread_self(), "checkpoint thread");
	thread_info = (struct checkpoint_thread_info *)param;

	while ((fd = accept(thread_info->socket_fd, NULL, NULL)) != -1) {
		nvl = nvlist_recv(fd, 0);
		if (nvl != NULL)
			handle_message(thread_info->ctx, nvl);
		else
			EPRINTLN("nvlist_recv() failed: %s", strerror(errno));

		close(fd);
		nvlist_destroy(nvl);
	}

	return (NULL);
}

static int
vm_do_checkpoint(struct vmctx *ctx, const nvlist_t *nvl)
{
	int error;

	if (!nvlist_exists_string(nvl, "filename") ||
	    !nvlist_exists_bool(nvl, "suspend") ||
	    !nvlist_exists_descriptor(nvl, "fddir"))
		error = EINVAL;
	else
		error = vm_checkpoint(ctx,
		    nvlist_get_descriptor(nvl, "fddir"),
		    nvlist_get_string(nvl, "filename"),
		    nvlist_get_bool(nvl, "suspend"));

	return (error);
}
IPC_COMMAND(ipc_cmd_set, checkpoint, vm_do_checkpoint);

void
init_snapshot(void)
{
	int err;

	err = pthread_mutex_init(&vcpu_lock, NULL);
	if (err != 0)
		errc(1, err, "checkpoint mutex init");
	err = pthread_cond_init(&vcpus_idle, NULL);
	if (err != 0)
		errc(1, err, "checkpoint cv init (vcpus_idle)");
	err = pthread_cond_init(&vcpus_can_run, NULL);
	if (err != 0)
		errc(1, err, "checkpoint cv init (vcpus_can_run)");
}

/*
 * Create the listening socket for IPC with bhyvectl
 */
int
init_checkpoint_thread(struct vmctx *ctx)
{
	struct checkpoint_thread_info *checkpoint_info = NULL;
	struct sockaddr_un addr;
	int socket_fd;
	pthread_t checkpoint_pthread;
	int err;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	memset(&addr, 0, sizeof(addr));

	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
	if (socket_fd < 0) {
		EPRINTLN("Socket creation failed: %s", strerror(errno));
		err = -1;
		goto fail;
	}

	addr.sun_family = AF_UNIX;

	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s%s",
	    BHYVE_RUN_DIR, vm_get_name(ctx));
	addr.sun_len = SUN_LEN(&addr);
	unlink(addr.sun_path);

	if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) {
		EPRINTLN("Failed to bind socket \"%s\": %s\n",
		    addr.sun_path, strerror(errno));
		err = -1;
		goto fail;
	}

	if (listen(socket_fd, 10) < 0) {
		EPRINTLN("ipc socket listen: %s\n", strerror(errno));
		err = errno;
		goto fail;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_RECV, CAP_WRITE,
	    CAP_SEND, CAP_GETSOCKOPT);

	if (caph_rights_limit(socket_fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif
	checkpoint_info = calloc(1, sizeof(*checkpoint_info));
	if (checkpoint_info == NULL) {
		err = ENOMEM;
		goto fail;
	}
	checkpoint_info->ctx = ctx;
	checkpoint_info->socket_fd = socket_fd;

	err = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread,
	    checkpoint_info);
	if (err != 0)
		goto fail;

	return (0);
fail:
	free(checkpoint_info);
	if (socket_fd > 0)
		close(socket_fd);
	unlink(addr.sun_path);

	return (err);
}

void
vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op)
{
	const char *__op;

	if (op == VM_SNAPSHOT_SAVE)
		__op = "save";
	else if (op == VM_SNAPSHOT_RESTORE)
		__op = "restore";
	else
		__op = "unknown";

	fprintf(stderr, "%s: snapshot-%s failed for %s\r\n",
	    __func__, __op, bufname);
}
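/*
 * A sketch of what a client such as bhyvectl sends over the socket created
 * above: an nvlist carrying a "cmd" string plus the command's arguments,
 * matching what handle_message() and vm_do_checkpoint() expect. The socket
 * path, directory, and snapshot name are illustrative assumptions. Compile
 * with -lnv on FreeBSD.
 */
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/nv.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_un addr;
	nvlist_t *nvl;
	int dirfd, fd;

	fd = socket(PF_UNIX, SOCK_STREAM, 0);
	memset(&addr, 0, sizeof(addr));
	addr.sun_family = AF_UNIX;
	/* assumed path: BHYVE_RUN_DIR followed by the VM name */
	strlcpy(addr.sun_path, "/var/run/bhyve/testvm",
	    sizeof(addr.sun_path));
	if (connect(fd, (struct sockaddr *)&addr, SUN_LEN(&addr)) != 0) {
		perror("connect");
		return (1);
	}

	dirfd = open(".", O_DIRECTORY);	/* where the snapshot files land */
	nvl = nvlist_create(0);
	nvlist_add_string(nvl, "cmd", "checkpoint");
	nvlist_add_string(nvl, "filename", "testvm.ckp");
	nvlist_add_bool(nvl, "suspend", false);
	nvlist_move_descriptor(nvl, "fddir", dirfd);
	if (nvlist_send(fd, nvl) != 0)
		perror("nvlist_send");

	nvlist_destroy(nvl);
	close(fd);
	return (0);
}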
int
vm_snapshot_buf(void *data, size_t data_size, struct vm_snapshot_meta *meta)
{
	struct vm_snapshot_buffer *buffer;
	int op;

	buffer = &meta->buffer;
	op = meta->op;

	if (buffer->buf_rem < data_size) {
		fprintf(stderr, "%s: buffer too small\r\n", __func__);
		return (E2BIG);
	}

	if (op == VM_SNAPSHOT_SAVE)
		memcpy(buffer->buf, data, data_size);
	else if (op == VM_SNAPSHOT_RESTORE)
		memcpy(data, buffer->buf, data_size);
	else
		return (EINVAL);

	buffer->buf += data_size;
	buffer->buf_rem -= data_size;

	return (0);
}

size_t
vm_get_snapshot_size(struct vm_snapshot_meta *meta)
{
	size_t length;
	struct vm_snapshot_buffer *buffer;

	buffer = &meta->buffer;

	if (buffer->buf_size < buffer->buf_rem) {
		fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n",
		    __func__, buffer->buf_size, buffer->buf_rem);
		length = 0;
	} else {
		length = buffer->buf_size - buffer->buf_rem;
	}

	return (length);
}

int
vm_snapshot_guest2host_addr(struct vmctx *ctx, void **addrp, size_t len,
    bool restore_null, struct vm_snapshot_meta *meta)
{
	int ret;
	vm_paddr_t gaddr;

	if (meta->op == VM_SNAPSHOT_SAVE) {
		gaddr = paddr_host2guest(ctx, *addrp);
		if (gaddr == (vm_paddr_t)-1) {
			if (!restore_null || *addrp != NULL) {
				ret = EFAULT;
				goto done;
			}
		}

		SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done);
	} else if (meta->op == VM_SNAPSHOT_RESTORE) {
		SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done);
		if (gaddr == (vm_paddr_t)-1) {
			if (!restore_null) {
				ret = EFAULT;
				goto done;
			}
		}

		*addrp = paddr_guest2host(ctx, gaddr, len);
	} else {
		ret = EINVAL;
	}

done:
	return (ret);
}

int
vm_snapshot_buf_cmp(void *data, size_t data_size, struct vm_snapshot_meta *meta)
{
	struct vm_snapshot_buffer *buffer;
	int op;
	int ret;

	buffer = &meta->buffer;
	op = meta->op;

	if (buffer->buf_rem < data_size) {
		fprintf(stderr, "%s: buffer too small\r\n", __func__);
		ret = E2BIG;
		goto done;
	}

	if (op == VM_SNAPSHOT_SAVE) {
		ret = 0;
		memcpy(buffer->buf, data, data_size);
	} else if (op == VM_SNAPSHOT_RESTORE) {
		ret = memcmp(data, buffer->buf, data_size);
	} else {
		ret = EINVAL;
		goto done;
	}

	buffer->buf += data_size;
	buffer->buf_rem -= data_size;

done:
	return (ret);
}
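/*
 * A standalone sketch of the save/restore cursor protocol implemented by
 * vm_snapshot_buf() above: SAVE appends each field to the buffer and RESTORE
 * consumes the fields again in the same order. The simplified types below
 * mirror struct vm_snapshot_buffer; device code normally goes through the
 * SNAPSHOT_VAR_OR_LEAVE() wrapper rather than calling the helper directly.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct buf_sketch {
	uint8_t	*buf;		/* cursor into the backing buffer */
	size_t	 buf_rem;	/* bytes remaining after the cursor */
};

static int
snapshot_var(struct buf_sketch *b, void *data, size_t size, int save)
{
	if (b->buf_rem < size)
		return (-1);	/* E2BIG in the real code */
	if (save)
		memcpy(b->buf, data, size);
	else
		memcpy(data, b->buf, size);
	b->buf += size;
	b->buf_rem -= size;
	return (0);
}

int
main(void)
{
	uint8_t backing[64];
	uint32_t reg = 0xdeadbeef, reg2 = 0;
	struct buf_sketch b = { backing, sizeof(backing) };

	assert(snapshot_var(&b, &reg, sizeof(reg), 1) == 0);	/* save */

	b = (struct buf_sketch){ backing, sizeof(backing) };	/* rewind */
	assert(snapshot_var(&b, &reg2, sizeof(reg2), 0) == 0);	/* restore */
	printf("restored 0x%x\n", reg2);
	return (0);
}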