diff --git a/usr.sbin/bhyve/basl.c b/usr.sbin/bhyve/basl.c index 8a4f2c4f311e..c20a52571937 100644 --- a/usr.sbin/bhyve/basl.c +++ b/usr.sbin/bhyve/basl.c @@ -1,679 +1,702 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Beckhoff Automation GmbH & Co. KG */ #include #include #include #include #include #include #include #include #include #include #include #include #include "basl.h" +#include "config.h" #include "qemu_loader.h" struct basl_table_checksum { STAILQ_ENTRY(basl_table_checksum) chain; uint32_t off; uint32_t start; uint32_t len; }; struct basl_table_length { STAILQ_ENTRY(basl_table_length) chain; uint32_t off; uint8_t size; }; struct basl_table_pointer { STAILQ_ENTRY(basl_table_pointer) chain; uint8_t src_signature[ACPI_NAMESEG_SIZE]; uint32_t off; uint8_t size; }; struct basl_table { STAILQ_ENTRY(basl_table) chain; struct vmctx *ctx; uint8_t fwcfg_name[QEMU_FWCFG_MAX_NAME]; void *data; uint32_t len; uint32_t off; uint32_t alignment; STAILQ_HEAD(basl_table_checksum_list, basl_table_checksum) checksums; STAILQ_HEAD(basl_table_length_list, basl_table_length) lengths; STAILQ_HEAD(basl_table_pointer_list, basl_table_pointer) pointers; }; static STAILQ_HEAD(basl_table_list, basl_table) basl_tables = STAILQ_HEAD_INITIALIZER( basl_tables); static struct qemu_loader *basl_loader; static struct basl_table *rsdt; static struct basl_table *xsdt; +static bool load_into_memory; static __inline uint64_t basl_le_dec(void *pp, size_t len) { assert(len <= 8); switch (len) { case 1: return ((uint8_t *)pp)[0]; case 2: return le16dec(pp); case 4: return le32dec(pp); case 8: return le64dec(pp); } return 0; } static __inline void basl_le_enc(void *pp, uint64_t val, size_t len) { char buf[8]; assert(len <= 8); le64enc(buf, val); memcpy(pp, buf, len); } static int basl_dump_table(const struct basl_table *const table, const bool mem) { const ACPI_TABLE_HEADER *const header = table->data; const uint8_t *data; if (!mem) { data = table->data; } else { data = 
vm_map_gpa(table->ctx, BHYVE_ACPI_BASE + table->off, table->len); if (data == NULL) { return (ENOMEM); } } printf("%.4s @ %8x (%s)\n", header->Signature, BHYVE_ACPI_BASE + table->off, mem ? "Memory" : "FwCfg"); hexdump(data, table->len, NULL, 0); return (0); } static int __unused basl_dump(const bool mem) { struct basl_table *table; STAILQ_FOREACH(table, &basl_tables, chain) { BASL_EXEC(basl_dump_table(table, mem)); } return (0); } void basl_fill_gas(ACPI_GENERIC_ADDRESS *const gas, const uint8_t space_id, const uint8_t bit_width, const uint8_t bit_offset, const uint8_t access_width, const uint64_t address) { assert(gas != NULL); gas->SpaceId = space_id; gas->BitWidth = bit_width; gas->BitOffset = bit_offset; gas->AccessWidth = access_width; gas->Address = htole64(address); } static int basl_finish_install_guest_tables(struct basl_table *const table, uint32_t *const off) { void *gva; table->off = roundup2(*off, table->alignment); *off = table->off + table->len; if (*off <= table->off) { warnx("%s: invalid table length 0x%8x @ offset 0x%8x", __func__, table->len, table->off); return (EFAULT); } + /* Cause guest BIOS to copy the ACPI table into guest memory. */ + BASL_EXEC( + qemu_fwcfg_add_file(table->fwcfg_name, table->len, table->data)); + BASL_EXEC(qemu_loader_alloc(basl_loader, table->fwcfg_name, + table->alignment, QEMU_LOADER_ALLOC_HIGH)); + + if (!load_into_memory) { + return (0); + } + /* * Install ACPI tables directly in guest memory for use by guests which * do not boot via EFI. EFI ROMs provide a pointer to the firmware * generated ACPI tables instead, but it doesn't hurt to install the * tables always. 
*/ gva = vm_map_gpa(table->ctx, BHYVE_ACPI_BASE + table->off, table->len); if (gva == NULL) { warnx("%s: could not map gpa [ 0x%16lx, 0x%16lx ]", __func__, (uint64_t)BHYVE_ACPI_BASE + table->off, (uint64_t)BHYVE_ACPI_BASE + table->off + table->len); return (ENOMEM); } memcpy(gva, table->data, table->len); - /* Cause guest bios to copy the ACPI table into guest memory. */ - BASL_EXEC( - qemu_fwcfg_add_file(table->fwcfg_name, table->len, table->data)); - BASL_EXEC(qemu_loader_alloc(basl_loader, table->fwcfg_name, - table->alignment, QEMU_LOADER_ALLOC_HIGH)); - return (0); } static int basl_finish_patch_checksums(struct basl_table *const table) { struct basl_table_checksum *checksum; STAILQ_FOREACH(checksum, &table->checksums, chain) { uint8_t *gva, *checksum_gva; uint64_t gpa; uint32_t len; uint8_t sum; len = checksum->len; if (len == BASL_TABLE_CHECKSUM_LEN_FULL_TABLE) { len = table->len; } assert(checksum->off < table->len); assert(checksum->start < table->len); assert(checksum->start + len <= table->len); + /* Cause guest BIOS to patch the checksum. */ + BASL_EXEC(qemu_loader_add_checksum(basl_loader, + table->fwcfg_name, checksum->off, checksum->start, len)); + + if (!load_into_memory) { + continue; + } + /* * Install ACPI tables directly in guest memory for use by * guests which do not boot via EFI. EFI ROMs provide a pointer * to the firmware generated ACPI tables instead, but it doesn't * hurt to install the tables always. 
*/ gpa = BHYVE_ACPI_BASE + table->off + checksum->start; if ((gpa < BHYVE_ACPI_BASE) || (gpa < BHYVE_ACPI_BASE + table->off)) { warnx("%s: invalid gpa (off 0x%8x start 0x%8x)", __func__, table->off, checksum->start); return (EFAULT); } gva = vm_map_gpa(table->ctx, gpa, len); if (gva == NULL) { warnx("%s: could not map gpa [ 0x%16lx, 0x%16lx ]", __func__, gpa, gpa + len); return (ENOMEM); } checksum_gva = gva + checksum->off; if (checksum_gva < gva) { warnx("%s: invalid checksum offset 0x%8x", __func__, checksum->off); return (EFAULT); } sum = 0; for (uint32_t i = 0; i < len; ++i) { sum += *(gva + i); } *checksum_gva = -sum; - - /* Cause guest bios to patch the checksum. */ - BASL_EXEC(qemu_loader_add_checksum(basl_loader, - table->fwcfg_name, checksum->off, checksum->start, len)); } return (0); } static struct basl_table * basl_get_table_by_signature(const uint8_t signature[ACPI_NAMESEG_SIZE]) { struct basl_table *table; STAILQ_FOREACH(table, &basl_tables, chain) { const ACPI_TABLE_HEADER *const header = (const ACPI_TABLE_HEADER *)table->data; if (strncmp(header->Signature, signature, sizeof(header->Signature)) == 0) { return (table); } } warnx("%s: %.4s not found", __func__, signature); return (NULL); } static int basl_finish_patch_pointers(struct basl_table *const table) { struct basl_table_pointer *pointer; STAILQ_FOREACH(pointer, &table->pointers, chain) { const struct basl_table *src_table; uint8_t *gva; uint64_t gpa, val; assert(pointer->off < table->len); assert(pointer->off + pointer->size <= table->len); src_table = basl_get_table_by_signature(pointer->src_signature); if (src_table == NULL) { warnx("%s: could not find ACPI table %.4s", __func__, pointer->src_signature); return (EFAULT); } + /* Cause guest BIOS to patch the pointer. 
*/ + BASL_EXEC( + qemu_loader_add_pointer(basl_loader, table->fwcfg_name, + src_table->fwcfg_name, pointer->off, pointer->size)); + + if (!load_into_memory) { + continue; + } + /* * Install ACPI tables directly in guest memory for use by * guests which do not boot via EFI. EFI ROMs provide a pointer * to the firmware generated ACPI tables instead, but it doesn't * hurt to install the tables always. */ gpa = BHYVE_ACPI_BASE + table->off; if (gpa < BHYVE_ACPI_BASE) { warnx("%s: table offset of 0x%8x is too large", __func__, table->off); return (EFAULT); } gva = vm_map_gpa(table->ctx, gpa, table->len); if (gva == NULL) { warnx("%s: could not map gpa [ 0x%16lx, 0x%16lx ]", __func__, gpa, gpa + table->len); return (ENOMEM); } val = basl_le_dec(gva + pointer->off, pointer->size); val += BHYVE_ACPI_BASE + src_table->off; basl_le_enc(gva + pointer->off, val, pointer->size); - - /* Cause guest bios to patch the pointer. */ - BASL_EXEC( - qemu_loader_add_pointer(basl_loader, table->fwcfg_name, - src_table->fwcfg_name, pointer->off, pointer->size)); } return (0); } static int basl_finish_set_length(struct basl_table *const table) { struct basl_table_length *length; STAILQ_FOREACH(length, &table->lengths, chain) { assert(length->off < table->len); assert(length->off + length->size <= table->len); basl_le_enc((uint8_t *)table->data + length->off, table->len, length->size); } return (0); } int basl_finish(void) { struct basl_table *table; uint32_t off = 0; if (STAILQ_EMPTY(&basl_tables)) { warnx("%s: no ACPI tables found", __func__); return (EINVAL); } + /* + * If we install ACPI tables by FwCfg and by memory, Windows will use + * the tables from memory. This can cause issues when using advanced + * features like a TPM log because we aren't able to patch the memory + * tables accordingly. + */ + load_into_memory = get_config_bool_default("acpi_tables_in_memory", + true); + /* * We have to install all tables before we can patch them. Therefore, * use two loops. 
The first one installs all tables and the second one * patches them. */ STAILQ_FOREACH(table, &basl_tables, chain) { BASL_EXEC(basl_finish_set_length(table)); BASL_EXEC(basl_finish_install_guest_tables(table, &off)); } STAILQ_FOREACH(table, &basl_tables, chain) { BASL_EXEC(basl_finish_patch_pointers(table)); /* * Calculate the checksum as last step! */ BASL_EXEC(basl_finish_patch_checksums(table)); } BASL_EXEC(qemu_loader_finish(basl_loader)); return (0); } static int basl_init_rsdt(struct vmctx *const ctx) { BASL_EXEC( basl_table_create(&rsdt, ctx, ACPI_SIG_RSDT, BASL_TABLE_ALIGNMENT)); /* Header */ BASL_EXEC(basl_table_append_header(rsdt, ACPI_SIG_RSDT, 1, 1)); /* Pointers (added by basl_table_register_to_rsdt) */ return (0); } static int basl_init_xsdt(struct vmctx *const ctx) { BASL_EXEC( basl_table_create(&xsdt, ctx, ACPI_SIG_XSDT, BASL_TABLE_ALIGNMENT)); /* Header */ BASL_EXEC(basl_table_append_header(xsdt, ACPI_SIG_XSDT, 1, 1)); /* Pointers (added by basl_table_register_to_rsdt) */ return (0); } int basl_init(struct vmctx *const ctx) { BASL_EXEC(basl_init_rsdt(ctx)); BASL_EXEC(basl_init_xsdt(ctx)); BASL_EXEC( qemu_loader_create(&basl_loader, QEMU_FWCFG_FILE_TABLE_LOADER)); return (0); } int basl_table_add_checksum(struct basl_table *const table, const uint32_t off, const uint32_t start, const uint32_t len) { struct basl_table_checksum *checksum; assert(table != NULL); checksum = calloc(1, sizeof(struct basl_table_checksum)); if (checksum == NULL) { warnx("%s: failed to allocate checksum", __func__); return (ENOMEM); } checksum->off = off; checksum->start = start; checksum->len = len; STAILQ_INSERT_TAIL(&table->checksums, checksum, chain); return (0); } int basl_table_add_length(struct basl_table *const table, const uint32_t off, const uint8_t size) { struct basl_table_length *length; assert(table != NULL); assert(size == 4 || size == 8); length = calloc(1, sizeof(struct basl_table_length)); if (length == NULL) { warnx("%s: failed to allocate length", 
__func__); return (ENOMEM); } length->off = off; length->size = size; STAILQ_INSERT_TAIL(&table->lengths, length, chain); return (0); } int basl_table_add_pointer(struct basl_table *const table, const uint8_t src_signature[ACPI_NAMESEG_SIZE], const uint32_t off, const uint8_t size) { struct basl_table_pointer *pointer; assert(table != NULL); assert(size == 4 || size == 8); pointer = calloc(1, sizeof(struct basl_table_pointer)); if (pointer == NULL) { warnx("%s: failed to allocate pointer", __func__); return (ENOMEM); } memcpy(pointer->src_signature, src_signature, sizeof(pointer->src_signature)); pointer->off = off; pointer->size = size; STAILQ_INSERT_TAIL(&table->pointers, pointer, chain); return (0); } int basl_table_append_bytes(struct basl_table *const table, const void *const bytes, const uint32_t len) { void *end; assert(table != NULL); assert(bytes != NULL); if (table->len + len <= table->len) { warnx("%s: table too large (table->len 0x%8x len 0x%8x)", __func__, table->len, len); return (EFAULT); } table->data = reallocf(table->data, table->len + len); if (table->data == NULL) { warnx("%s: failed to realloc table to length 0x%8x", __func__, table->len + len); table->len = 0; return (ENOMEM); } end = (uint8_t *)table->data + table->len; table->len += len; memcpy(end, bytes, len); return (0); } int basl_table_append_checksum(struct basl_table *const table, const uint32_t start, const uint32_t len) { assert(table != NULL); BASL_EXEC(basl_table_add_checksum(table, table->len, start, len)); BASL_EXEC(basl_table_append_int(table, 0, 1)); return (0); } int basl_table_append_content(struct basl_table *table, void *data, uint32_t len) { assert(data != NULL); assert(len >= sizeof(ACPI_TABLE_HEADER)); return (basl_table_append_bytes(table, (void *)((uintptr_t)(data) + sizeof(ACPI_TABLE_HEADER)), len - sizeof(ACPI_TABLE_HEADER))); } int basl_table_append_fwcfg(struct basl_table *const table, const uint8_t *fwcfg_name, const uint32_t alignment, const uint8_t size) { 
assert(table != NULL); assert(fwcfg_name != NULL); assert(size <= sizeof(uint64_t)); BASL_EXEC(qemu_loader_alloc(basl_loader, fwcfg_name, alignment, QEMU_LOADER_ALLOC_HIGH)); BASL_EXEC(qemu_loader_add_pointer(basl_loader, table->fwcfg_name, fwcfg_name, table->len, size)); BASL_EXEC(basl_table_append_int(table, 0, size)); return (0); } int basl_table_append_gas(struct basl_table *const table, const uint8_t space_id, const uint8_t bit_width, const uint8_t bit_offset, const uint8_t access_width, const uint64_t address) { ACPI_GENERIC_ADDRESS gas_le = { .SpaceId = space_id, .BitWidth = bit_width, .BitOffset = bit_offset, .AccessWidth = access_width, .Address = htole64(address), }; return (basl_table_append_bytes(table, &gas_le, sizeof(gas_le))); } int basl_table_append_header(struct basl_table *const table, const uint8_t signature[ACPI_NAMESEG_SIZE], const uint8_t revision, const uint32_t oem_revision) { ACPI_TABLE_HEADER header_le; /* + 1 is required for the null terminator */ char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; assert(table != NULL); assert(table->len == 0); memcpy(header_le.Signature, signature, ACPI_NAMESEG_SIZE); header_le.Length = 0; /* patched by basl_finish */ header_le.Revision = revision; header_le.Checksum = 0; /* patched by basl_finish */ memcpy(header_le.OemId, "BHYVE ", ACPI_OEM_ID_SIZE); snprintf(oem_table_id, ACPI_OEM_TABLE_ID_SIZE, "BV%.4s ", signature); memcpy(header_le.OemTableId, oem_table_id, sizeof(header_le.OemTableId)); header_le.OemRevision = htole32(oem_revision); memcpy(header_le.AslCompilerId, "BASL", ACPI_NAMESEG_SIZE); header_le.AslCompilerRevision = htole32(0x20220504); BASL_EXEC( basl_table_append_bytes(table, &header_le, sizeof(header_le))); BASL_EXEC(basl_table_add_length(table, offsetof(ACPI_TABLE_HEADER, Length), sizeof(header_le.Length))); BASL_EXEC(basl_table_add_checksum(table, offsetof(ACPI_TABLE_HEADER, Checksum), 0, BASL_TABLE_CHECKSUM_LEN_FULL_TABLE)); return (0); } int basl_table_append_int(struct basl_table *const 
table, const uint64_t val, const uint8_t size) { char buf[8]; assert(size <= sizeof(val)); basl_le_enc(buf, val, size); return (basl_table_append_bytes(table, buf, size)); } int basl_table_append_length(struct basl_table *const table, const uint8_t size) { assert(table != NULL); assert(size <= sizeof(table->len)); BASL_EXEC(basl_table_add_length(table, table->len, size)); BASL_EXEC(basl_table_append_int(table, 0, size)); return (0); } int basl_table_append_pointer(struct basl_table *const table, const uint8_t src_signature[ACPI_NAMESEG_SIZE], const uint8_t size) { assert(table != NULL); assert(size == 4 || size == 8); BASL_EXEC(basl_table_add_pointer(table, src_signature, table->len, size)); BASL_EXEC(basl_table_append_int(table, 0, size)); return (0); } int basl_table_create(struct basl_table **const table, struct vmctx *ctx, const uint8_t *const name, const uint32_t alignment) { struct basl_table *new_table; assert(table != NULL); new_table = calloc(1, sizeof(struct basl_table)); if (new_table == NULL) { warnx("%s: failed to allocate table", __func__); return (ENOMEM); } new_table->ctx = ctx; snprintf(new_table->fwcfg_name, sizeof(new_table->fwcfg_name), "etc/acpi/%s", name); new_table->alignment = alignment; STAILQ_INIT(&new_table->checksums); STAILQ_INIT(&new_table->lengths); STAILQ_INIT(&new_table->pointers); STAILQ_INSERT_TAIL(&basl_tables, new_table, chain); *table = new_table; return (0); } int basl_table_register_to_rsdt(struct basl_table *table) { const ACPI_TABLE_HEADER *header; assert(table != NULL); header = (const ACPI_TABLE_HEADER *)table->data; BASL_EXEC(basl_table_append_pointer(rsdt, header->Signature, ACPI_RSDT_ENTRY_SIZE)); BASL_EXEC(basl_table_append_pointer(xsdt, header->Signature, ACPI_XSDT_ENTRY_SIZE)); return (0); } diff --git a/usr.sbin/bhyve/bhyve_config.5 b/usr.sbin/bhyve/bhyve_config.5 index d074d4503894..6904ad096c0d 100644 --- a/usr.sbin/bhyve/bhyve_config.5 +++ b/usr.sbin/bhyve/bhyve_config.5 @@ -1,710 +1,717 @@ .\" 
SPDX-License-Identifier: BSD-2-Clause .\" .\" Copyright (c) 2021 John H. Baldwin .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .Dd August 19, 2022 .Dt BHYVE_CONFIG 5 .Os .Sh NAME .Nm bhyve_config .Nd "bhyve configuration variables" .Sh DESCRIPTION .Xr bhyve 8 uses a hierarchical tree of configuration variables to describe global and per-device settings. Internal nodes in this tree do not have a value, only leaf nodes have values. This manual describes the configuration variables understood by .Xr bhyve 8 . If additional variables are defined, .Xr bhyve 8 will ignore them and will not emit errors for unknown variables. However, these additional variables can be referenced by other variables as described below. 
.Sh VARIABLE VALUES Configuration variable values are stored as strings. A configuration variable value may refer to one or more other configuration values by name. Instances of the pattern .Sq % Ns Pq Ar var are replaced by the value of the configuration variable .Va var . To avoid unwanted expansion, .Sq % characters can be escaped by a leading .Sq % . For example, if a configuration variable .Va disk uses the value .Pa /dev/zvol/bhyve/%(name) , then the final value of the .Va disk variable will be set to the path of a ZFS volume whose name matches the name of the virtual machine on the pool .Pa bhyve . .Pp Some configuration variables may be interpreted as a boolean value. For those variables the following case-insensitive values may be used to indicate true: .Pp .Bl -bullet -offset indent -compact .It true .It on .It yes .It 1 .El .Pp The following values may be used to indicate false: .Pp .Bl -bullet -offset indent -compact .It false .It off .It no .It 0 .El .Pp Some configuration variables may be interperted as an integer. For those variables, any syntax supported by .Xr strtol 3 may be used. .Sh GLOBAL SETTINGS .Ss Architecture Neutral Settings .Bl -column "memory.guest_in_core" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va name Ta string Ta Ta The name of the VM. .It Va cpus Ta integer Ta 1 Ta The total number of virtual CPUs. .It Va cores Ta integer Ta 1 Ta The number of virtual cores in each virtual socket. .It Va threads Ta integer Ta 1 Ta The number of virtual CPUs in each virtual core. .It Va sockets Ta integer Ta 1 Ta The number of virtual sockets. .It Va memory.guest_in_core Ta bool Ta false Ta Include guest memory in core file. .It Va memory.size Ta string Ta 256M Ta Guest physical memory size in bytes. The value must be formatted as described in .Xr expand_number 3 . .It Va memory.wired Ta bool Ta false Ta Wire guest memory. .It Va acpi_tables Ta bool Ta false Ta Generate ACPI tables. 
+.It Va acpi_tables_in_memory Ta bool Ta true Ta +.Xr bhyve 8 +always exposes ACPI tables via FwCfg. +For backward compatibility, it also copies them into guest memory. +This can cause problems if the guest uses the in-memory copy, since certain +advanced features, such as TPM emulation, are exposed only via FwCfg. +Therefore, it is recommended to set this flag to false when running Windows guests. .It Va destroy_on_poweroff Ta bool Ta false Ta Destroy the VM on guest-initiated power-off. .It Va gdb.address Ta string Ta localhost Ta Hostname, IP address, or IPv6 address for the debug server. .It Va gdb.port Ta integer Ta 0 Ta TCP port number for the debug server. If this is set to a non-zero value, a debug server will listen for connections on this port. .It Va gdb.wait Ta bool Ta false Ta If the debug server is enabled, wait for a debugger to connect before starting the guest. .It Va keyboard.layout Ta string Ta Ta Specify the keyboard layout name with the file name in .Ar /usr/share/bhyve/kbdlayout . This value only works when loaded with UEFI mode for VNC, and used a VNC client that don't support QEMU Extended Key Event Message (e.g. TightVNC). .It Va tpm.path Ta string Ta Ta Path to the host TPM device. This is typically /dev/tpm0. .It Va tpm.type Ta string Ta Ta Type of the TPM device passed to the guest. Currently, only "passthru" is supported. .It Va tpm.version Ta string Ta 2.0 Ta Version of the TPM device according to the TCG specification. Currently, only version 2.0 is supported. .It Va rtc.use_localtime Ta bool Ta true Ta The real time clock uses the local time of the host. If this is set to false, the real time clock uses UTC. .It Va uuid Ta string Ta Ta The universally unique identifier (UUID) to use in the guest's System Management BIOS System Information structure. If an explicit value is not set, a valid UUID is generated from the host's hostname and the VM name.
.It Va virtio_msix Ta bool Ta true Ta Use MSI-X interrupts for PCI VirtIO devices. If set to false, MSI interrupts are used instead. .It Va config.dump Ta bool Ta false Ta If this value is set to true after .Xr bhyve 8 has finished parsing command line options, then .Xr bhyve 8 will write all of its configuration variables to stdout and exit. No VM will be started. .It Va bios.vendor Ta string Ta BHYVE Ta This value is used for the guest's System Management BIOS System Information structure. .It Va bios.version Ta string Ta 14.0 Ta This value is used for the guest's System Management BIOS System Information structure. .It Va bios.release_date Ta string Ta 10/17/2021 Ta This value is used for the guest's System Management BIOS System Information structure. .It Va system.family_name Ta string Ta Virtual Machine Ta Family the computer belongs to. This value is used for the guest's System Management BIOS System Information structure. .It Va system.manufacturer Ta string Ta FreeBSD Ta This value is used for the guest's System Management BIOS System Information structure. .It Va system.product_name Ta string Ta BHYVE Ta This value is used for the guest's System Management BIOS System Information structure. .It Va system.serial_number Ta string Ta None Ta This value is used for the guest's System Management BIOS System Information structure. .It Va system.sku Ta string Ta None Ta Stock keeping unit of the computer. It's also called product ID or purchase order number. This value is used for the guest's System Management BIOS System Information structure. .It Va system.version Ta string Ta 1.0 Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.manufacturer Ta string Ta FreeBSD Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.product_name Ta string Ta BHYVE Ta This value is used for the guest's System Management BIOS System Information structure. 
.It Va board.version Ta string Ta 1.0 Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.serial_number Ta string Ta None Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.asset_tag Ta string Ta None Ta This value is used for the guest's System Management BIOS System Information structure. .It Va board.location Ta string Ta None Ta Describes the board's location within the chassis. This value is used for the guest's System Management BIOS System Information structure. .It Va chassis.manufacturer Ta string Ta FreeBSD Ta This value is used for the guest's System Management BIOS System Information structure. .It Va chassis.version Ta string Ta 1.0 Ta This value is used for the guest's System Management BIOS System Information structure. .It Va chassis.serial_number Ta string Ta None Ta This value is used for the guest's System Management BIOS System Information structure. .It Va chassis.asset_tag Ta string Ta None Ta This value is used for the guest's System Management BIOS System Information structure. .It Va chassis.sku Ta string Ta None Ta Stock keeping unit of the chassis. It's also called product ID or purchase order number. This value is used for the guest's System Management BIOS System Information structure. .El .Ss x86-Specific Settings .Bl -column "x86.vmexit_on_pause" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va x86.mptable Ta bool Ta true Ta Generate an MPTable. .It Va x86.x2apic Ta bool Ta false Ta Configure guest's local APICs in x2APIC mode. .It Va x86.strictio Ta bool Ta false Ta Exit if a guest accesses an I/O port that is not emulated. By default, writes are ignored and reads return all bits set. .It Va x86.strictmsr Ta bool Ta true Ta Inject a general protection fault if a guest accesses a Model Specific Register (MSR) that is not emulated. If this is false, writes are ignored and reads return zero. 
.It Va x86.vmexit_on_hlt Ta bool Ta false Ta Force a VM exit when a guest CPU executes the .Dv HLT instruction. This allows idle guest CPUs to yield the host CPU. .It Va x86.vmexit_on_pause Ta bool Ta false Ta Force a VM exit when a guest CPU executes the .Dv PAUSE instruction. .El .Sh DEVICE SETTINGS Device settings are stored under a device node. The device node's name is set by the parent bus of the device. .Ss PCI Device Settings PCI devices are described by a device node named .Dq pci . Ns Ar bus . Ns Ar slot . Ns Ar function where each of .Ar bus , .Ar slot , and .Ar function are formatted as decimal values with no padding. All PCI device nodes must contain a configuration variable named .Dq device which specifies the device model to use. The following PCI device models are supported: .Bl -tag -indent .It Li hostbridge Provide a simple PCI-Host bridge device. This is usually configured at pci0:0:0 and is required by most guest operating systems. .It Li ahci AHCI storage controller. .It Li e1000 Intel e82545 network interface. .It Li fbuf VGA framebuffer device attached to VNC server. .It Li lpc LPC PCI-ISA bridge with COM1-COM4 16550 serial ports, a boot ROM, an optional fwcfg type, and an optional debug/test device. This device must be configured on bus 0. .It Li hda High Definition audio controller. .It Li nvme NVM Express (NVMe) controller. .It Li passthru PCI pass-through device. .It Li uart PCI 16550 serial device. .It Li virtio-9p VirtIO 9p (VirtFS) interface. .It Li virtio-blk VirtIO block storage interface. .It Li virtio-console VirtIO console interface. .It Li virtio-input VirtIO input interface. .It Li virtio-net VirtIO network interface. .It Li virtio-rnd VirtIO RNG interface. .It Li virtio-scsi VirtIO SCSI interface. .It Li xhci Extensible Host Controller Interface (XHCI) USB controller. .El .Ss USB Device Settings USB controller devices contain zero or more child USB devices attached to slots. 
Each USB device stores its settings in a node named .Dq slot. Ns Va N under the controller's device node. .Va N is the number of the slot to which the USB device is attached. Note that USB slot numbers begin at 1. All USB device nodes must contain a configuration variable named .Dq device which specifies the device model to use. The following USB device models are supported: .Bl -tag -indent .It Li tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Ss Block Device Settings Block devices use the following settings to configure their backing store. These settings are stored in the configuration node of the respective device. .Bl -column "sectorsize" "logical[/physical]" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It path Ta string Ta Ta The path of the file or disk device to use as the backing store. .It nocache Ta bool Ta false Ta Disable caching on the backing file by opening the backing file with .Dv O_DIRECT . .It nodelete Ta bool Ta false Ta Disable emulation of guest trim requests via .Dv DIOCGDELETE requests. .It sync Ta bool Ta false Ta Write changes to the backing file with synchronous writes. .It direct Ta bool Ta false Ta An alias for .Va sync . .It ro Ta bool Ta false Ta Disable writes to the backing file. .It sectorsize Ta Va logical Ns Op / Ns Va physical Ta Ta Specify the logical and physical sector size of the emulated disk. If the physical size is not specified, it is equal to the logical size. .El .Ss Network Backend Settings Network devices use the following settings to configure their backend. The backend is responsible for passing packets between the device model and a desired destination. Configuring a backend requires setting the .Va backend variable. The type of a backend can either be set explicitly via the .Va type variable or it can be inferred from the value of .Va backend . 
.Pp The following types of backends are supported: .Bl -tag -width "netgraph" .It tap Use the .Xr tap 4 interface named in .Va backend as the backend. .It netgraph Use a .Xr netgraph 4 socket hook as the backend. This backend uses the following additional variables: .Bl -column "peerhook" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va path Ta string Ta Ta The name of the .Xr netgraph 4 destination node. .It Va peerhook Ta string Ta Ta The name of the destination hook. .It Va socket Ta string Ta Ta The name of the created .Xr ng_socket 4 node. .It Va hook Ta string Ta vmlink Ta The name of the source hook on the created .Xr ng_socket 4 node. .El .It netmap Use .Xr netmap 4 either on a network interface or a port on a .Xr vale 4 bridge as the backend. The value of .Va backend is passed to .Xr nm_open to connect to a netmap port. .El .Pp If .Va type is not specified explicitly, then it is inferred from .Va backend based on the following patterns: .Bl -column -offset indent "valuebridge:port" .It Sy Pattern Ta Sy Type .It tap Ns Va N Ta tap .It vmnet Ns Va N Ta tap .It netgraph Ta netgraph .It netmap: Ns Va interface Ta netmap .It vale Ns Va bridge : Ns Va port Ta netmap .El .Ss UART Device Settings .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va path Ta path Ta Ta Backend device for the serial port. Either the pathname of a character device or .Dq stdio to use standard input and output of the .Xr bhyve 8 process. .El .Ss Host Bridge Settings .Bl -column "pcireg.*" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va pcireg.* Ta integer Ta Ta Values of PCI register. .Bl -column "device" "Default" .It Sy Name Ta Sy Default .It Va vendor Ta integer Ta 0x1275 Ta .It Va device Ta integer Ta 0x1275 Ta .El .El .Ss AHCI Controller Settings AHCI controller devices contain zero or more ports each of which provides a storage device. 
Each port stores its settings in a node named .Dq port. Ns Va N under the controller's device node. The .Va N values are formatted as successive decimal values starting with 0. In addition to the block device settings described above, each port supports the following settings: .Bl -column "model" "integer" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va type Ta string Ta Ta The type of storage device to emulate. Must be set to either .Dq cd or .Dq hd . .It Va nmrr Ta integer Ta 0 Ta Nominal Media Rotation Rate, also known as RPM. A value of 1 indicates a device with no rate, such as a Solid State Disk. .It Va ser Ta string Ta generated Ta Serial number of up to twenty characters. A default serial number is generated using a hash of the backing store's pathname. .It Va rev Ta string Ta 001 Ta Revision number of up to eight characters. .It Va model Ta string Ta Ta Model number of up to forty characters. Separate default model strings are used for .Dq cd and .Dq hd device types. .El .Ss e1000 Settings In addition to the network backend settings, Intel e82545 network interfaces support the following variables: .Bl -column "Name" "MAC address" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va mac Ta MAC address Ta generated Ta MAC address. If an explicit address is not provided, a MAC address is generated from a hash of the device's PCI address. .El .Ss Frame Buffer Settings .Bl -column "password" "[IP:]port" "127.0.0.1:5900" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va wait Ta bool Ta false Ta Wait for a remote connection before starting the VM. .It Va rfb Ta Oo Ar IP Ns : Oc Ns Ar port Ta 127.0.0.1:5900 Ta TCP address to listen on for remote connections. The IP address must be given as a numeric address. IPv6 addresses must be enclosed in square brackets and support scoped identifiers as described in .Xr getaddrinfo 3 .
A bare port number may be given in which case the IPv4 localhost address is used. .It Va vga Ta string Ta io Ta VGA configuration. More details are provided in .Xr bhyve 8 . .It Va w Ta integer Ta 1024 Ta Frame buffer width in pixels. .It Va h Ta integer Ta 768 Ta Frame buffer height in pixels. .It Va password Ta string Ta Ta Password to use for VNC authentication. This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. .El .Ss High Definition Audio Settings .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va play Ta path Ta Ta Host playback device, typically .Pa /dev/dsp0 . .It Va rec Ta path Ta Ta Host recording device, typically .Pa /dev/dsp0 . .El .Ss LPC Device Settings The LPC bridge stores its configuration under a top-level .Va lpc node rather than under the PCI LPC device's node. The following nodes are available under .Va lpc : .Bl -column "pc-testdev" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va bootrom Ta path Ta Ta Path to a boot ROM. The contents of this file are copied into the guest's memory ending just before the 4GB physical address. If a boot ROM is present, a firmware interface device is also enabled for use by the boot ROM. .It Va bootvars Ta path Ta Ta Path to boot VARS. The contents of this file are copied beneath the boot ROM. Firmware can write to it to save variables. All variables will be persistent even on reboots of the guest. .It Va com1 Ta node Ta Ta Settings for the COM1 serial port device. .It Va com2 Ta node Ta Ta Settings for the COM2 serial port device. .It Va com3 Ta node Ta Ta Settings for the COM3 serial port device. .It Va com4 Ta node Ta Ta Settings for the COM4 serial port device. .It Va fwcfg Ta string Ta bhyve Ta The fwcfg type to be used. Supported values are .Dq bhyve for fwctl and .Dq qemu for fwcfg. .It Va pc-testdev Ta bool Ta false Ta Enable the PC debug/test device. 
.It Va pcireg.* Ta integer Ta Ta Values of PCI register. It also accepts the value .Ar host to use the pci id of the host system. This value is required for the Intel GOP driver to work properly. .Bl -column "subvendor" "Default" .It Sy Name Ta Sy Default .It Va vendor Ta 0x8086 .It Va device Ta 0x7000 .It Va revid Ta 0 .It Va subvendor Ta 0 .It Va subdevice Ta 0 .El .El .Ss NVMe Controller Settings Each NVMe controller supports a single storage device. The device can be backed either by a memory disk described by the .Va ram variable, or a block device using the block device settings described above. In addition, each controller supports the following settings: .Bl -column "ioslots" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va maxq Ta integer Ta 16 Ta Maximum number of I/O submission and completion queue pairs. .It Va qsz Ta integer Ta 2058 Ta Number of elements in each I/O queue. .It Va ioslots Ta integer Ta 8 Ta Maximum number of concurrent I/O requests. .It Va sectsz Ta integer Ta Ta Sector size. Can be one of 512, 4096, or 8192. Devices backed by a memory disk use 4096 as the default. Devices backed by a block device use the block device's sector size as the default. .It Va ser Ta string Ta Ta Serial number of up to twenty characters. A default serial number is generated using a hash of the device's PCI address. .It Va eui64 Ta integer Ta Ta IEEE Extended Unique Identifier. If an EUI is not provided, a default is generated using a checksum of the device's PCI address. .It Va dsm Ta string Ta auto Ta Whether or not to advertise DataSet Management support. One of .Dq auto , .Dq enable , or .Dq disable . The .Dq auto setting only advertises support if the backing store supports resource freeing, for example via TRIM. .It Va ram Ta integer Ta Ta If set, allocate a memory disk as the backing store. The value of this variable is the size of the memory disk in megabytes. 
.El .Ss PCI Passthrough Settings The .Xr ppt 4 device driver must be attached to the PCI device being passed through. The device to pass through can be identified either by name or its host PCI bus location. .Bl -column "Name" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va bus Ta integer Ta Ta Host PCI bus address of device to pass through. .It Va slot Ta integer Ta Ta Host PCI slot address of device to pass through. .It Va func Ta integer Ta Ta Host PCI function address of device to pass through. .It Va pptdev Ta string Ta Ta Name of a .Xr ppt 4 device to pass through. .It Va rom Ta path Ta Ta ROM file of the device which will be executed by OVMF to init the device. .El .Ss VirtIO 9p Settings Each VirtIO 9p device exposes a single filesystem from a host path. .Bl -column "sharename" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va sharename Ta string Ta Ta The share name exposed to the guest. .It Va path Ta path Ta Ta The path of a directory on the host to export to the guest. .It Va ro Ta bool Ta false Ta If true, the guest filesystem is read-only. .El .Ss VirtIO Block Device Settings In addition to the block device settings described above, each VirtIO block device supports the following settings: .Bl -column "model" "integer" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va ser Ta string Ta generated Ta Serial number of up to twenty characters. A default serial number is generated using a hash of the backing store's pathname. .El .Ss VirtIO Console Device Settings Each VirtIO Console device contains one or more console ports. Each port stores its settings in a node named .Dq port. Ns Va N under the controller's device node. The .Va N values are formatted as successive decimal values starting with 0. 
Each port supports the following settings: .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va name Ta string Ta Ta The name of the port exposed to the guest. .It Va path Ta path Ta Ta The path of a UNIX domain socket providing the host connection for the port. .El .Ss VirtIO Input Interface Settings Each VirtIO Input device contains one input event device. All input events of the input event device are sent to the guest by the VirtIO Input interface. VirtIO Input Interfaces support the following variables: .Bl -column "Name" "Format" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va path Ta path Ta Ta The path of the input event device exposed to the guest. .El .Ss VirtIO Network Interface Settings In addition to the network backend settings, VirtIO network interfaces support the following variables: .Bl -column "Name" "MAC address" "generated" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va mac Ta MAC address Ta generated Ta MAC address. If an explicit address is not provided, a MAC address is generated from a hash of the device's PCI address. .It Va mtu Ta integer Ta 1500 Ta The largest supported MTU advertised to the guest. .El .Ss VirtIO SCSI Settings .Bl -column "Name" "integer" "Default" .It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description .It Va dev Ta path Ta Ta The path of a CAM target layer (CTL) device to export: .Pa /dev/cam/ctl Ns Oo Ar pp . Ns Ar vp Oc . .It Va iid Ta integer Ta 0 Ta Initiator ID to use when sending requests to the CTL port.
.El .Sh SEE ALSO .Xr expand_number 3 , .Xr getaddrinfo 3 , .Xr strtol 3 , .Xr netgraph 4 , .Xr netmap 4 , .Xr ng_socket 4 , .Xr tap 4 , .Xr vale 4 , .Xr vmnet 4 , .Xr bhyve 8 diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 3d32bfd35408..3db796c65a28 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1622 +1,1623 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
/*
 * Human-readable descriptions of VMX exit reasons, indexed by the
 * EXIT_REASON_* code.  Codes without a designated initializer are left
 * NULL; vmexit_vmx_desc() reports those, and any code past the end of the
 * array, as "Unknown".
 */
static const char * const vmx_exit_reason_desc[] = {
	[EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
	[EXIT_REASON_EXT_INTR] = "External interrupt",
	[EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
	[EXIT_REASON_INIT] = "INIT signal",
	[EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
	[EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
	[EXIT_REASON_SMI] = "Other SMI",
	[EXIT_REASON_INTR_WINDOW] = "Interrupt window",
	[EXIT_REASON_NMI_WINDOW] = "NMI window",
	[EXIT_REASON_TASK_SWITCH] = "Task switch",
	[EXIT_REASON_CPUID] = "CPUID",
	[EXIT_REASON_GETSEC] = "GETSEC",
	[EXIT_REASON_HLT] = "HLT",
	[EXIT_REASON_INVD] = "INVD",
	[EXIT_REASON_INVLPG] = "INVLPG",
	[EXIT_REASON_RDPMC] = "RDPMC",
	[EXIT_REASON_RDTSC] = "RDTSC",
	[EXIT_REASON_RSM] = "RSM",
	[EXIT_REASON_VMCALL] = "VMCALL",
	[EXIT_REASON_VMCLEAR] = "VMCLEAR",
	[EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
	[EXIT_REASON_VMPTRLD] = "VMPTRLD",
	[EXIT_REASON_VMPTRST] = "VMPTRST",
	[EXIT_REASON_VMREAD] = "VMREAD",
	[EXIT_REASON_VMRESUME] = "VMRESUME",
	[EXIT_REASON_VMWRITE] = "VMWRITE",
	[EXIT_REASON_VMXOFF] = "VMXOFF",
	[EXIT_REASON_VMXON] = "VMXON",
	[EXIT_REASON_CR_ACCESS] = "Control-register accesses",
	[EXIT_REASON_DR_ACCESS] = "MOV DR",
	[EXIT_REASON_INOUT] = "I/O instruction",
	[EXIT_REASON_RDMSR] = "RDMSR",
	[EXIT_REASON_WRMSR] = "WRMSR",
	[EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state",
	[EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
	[EXIT_REASON_MWAIT] = "MWAIT",
	[EXIT_REASON_MTF] = "Monitor trap flag",
	[EXIT_REASON_MONITOR] = "MONITOR",
	[EXIT_REASON_PAUSE] = "PAUSE",
	[EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event",
	[EXIT_REASON_TPR] = "TPR below threshold",
	[EXIT_REASON_APIC_ACCESS] = "APIC access",
	[EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
	[EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
	[EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
	[EXIT_REASON_EPT_FAULT] = "EPT violation",
	[EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
	[EXIT_REASON_INVEPT] = "INVEPT",
	[EXIT_REASON_RDTSCP] = "RDTSCP",
	[EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
	[EXIT_REASON_INVVPID] = "INVVPID",
	[EXIT_REASON_WBINVD] = "WBINVD",
	[EXIT_REASON_XSETBV] = "XSETBV",
	[EXIT_REASON_APIC_WRITE] = "APIC write",
	[EXIT_REASON_RDRAND] = "RDRAND",
	[EXIT_REASON_INVPCID] = "INVPCID",
	[EXIT_REASON_VMFUNC] = "VMFUNC",
	[EXIT_REASON_ENCLS] = "ENCLS",
	[EXIT_REASON_RDSEED] = "RDSEED",
	[EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
	[EXIT_REASON_XSAVES] = "XSAVES",
	[EXIT_REASON_XRSTORS] = "XRSTORS"
};
struct vcpu_info { struct vmctx *ctx; struct vcpu *vcpu; int vcpuid; } *vcpu_info; static cpuset_t **vcpumap; static void usage(int code) { fprintf(stderr, "Usage: %s [-AaCDeHhPSuWwxY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" " -A: create ACPI tables\n" " -a: local apic is in xAPIC mode (deprecated)\n" " -C: include guest memory in core file\n" " -c: number of CPUs and/or topology specification\n" " -D: destroy on power-off\n" " -e: exit on unhandled I/O access\n" " -G: start a debug server\n" " -H: vmexit from the guest on HLT\n" " -h: help\n" " -k: key=value flat config file\n" " -K: PS2 keyboard layout\n" " -l: LPC device configuration\n" " -m: memory size\n" " -o: set config 'var' to 'value'\n" " -P: vmexit from the guest on pause\n" " -p: pin 'vcpu' to 'hostcpu'\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" #endif " -S: guest memory cannot be swapped\n" " -s: PCI slot config\n" " -U: UUID\n" " -u: RTC keeps UTC time\n" " -W: force virtio to use single-vector MSI\n" " -w: ignore unimplemented MSRs\n" " -x: local APIC is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. 
/*
 * Convert the string 'value' of the config variable 'key' to an int in the
 * inclusive range [minval, maxval].  The numeric base is auto-detected by
 * strtol(3) (base 0).  Empty input, trailing characters, a conversion
 * error, or an out-of-range result terminate the process through errx(3)
 * with exit status 4.
 */
static int
parse_int_value(const char *key, const char *value, int minval, int maxval)
{
	char *end;
	long parsed;
	int well_formed, in_range;

	errno = 0;
	parsed = strtol(value, &end, 0);

	/* The whole string must have been consumed and must not be empty. */
	well_formed = (errno == 0 && end != value && *end == '\0');
	in_range = (parsed >= minval && parsed <= maxval);

	if (!well_formed || !in_range)
		errx(4, "Invalid value for %s: '%s'", key, value);

	return (parsed);
}
*/ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. */ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } static int pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", value != NULL ? 
"," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { return 
(get_config_bool_default("virtio_msix", true)); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct vcpu_info *vi = param; int error; snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); pthread_set_name_np(pthread_self(), tname); if (vcpumap[vi->vcpuid] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vi->vcpuid]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vi->vcpuid); #endif gdb_cpu_add(vi->vcpu); vm_loop(vi->ctx, vi->vcpu); /* not reached */ exit(1); return (NULL); } static void fbsdrun_addcpu(struct vcpu_info *vi) { pthread_t thr; int error; error = vm_activate_cpu(vi->vcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); CPU_SET_ATOMIC(vi->vcpuid, &cpumask); vm_suspend_cpu(vi->vcpu); error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); assert(error == 0); } static void fbsdrun_deletecpu(int vcpu) { static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; pthread_mutex_lock(&resetcpu_mtx); if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR(vcpu, &cpumask); if (vcpu != BSP) { pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); /* NOTREACHED */ } while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); } static int vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; int error; int bytes, port, in; vme = vmrun->vm_exit; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; error = emulate_inout(ctx, vcpu, vme); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port, vme->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; uint64_t val; uint32_t eax, edx; int error; vme = vmrun->vm_exit; val = 0; error = emulate_rdmsr(vcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; int error; vme = vmrun->vm_exit; error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); 
fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason, vmexit_vmx_desc(vme->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vme->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(vcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { /* * Just continue execution with the next instruction. 
/*
 * Exit handler registered for VM_EXITCODE_MTRAP.
 *
 * Hands the vCPU to gdb_cpu_mtrap().  When built with BHYVE_SNAPSHOT the
 * call is bracketed by checkpoint_cpu_suspend()/checkpoint_cpu_resume()
 * for this vCPU.  Always returns VMEXIT_CONTINUE.
 */
static int
vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu,
    struct vm_run *vmrun)
{
	/* This exit is expected to report a zero instruction length. */
	assert(vmrun->vm_exit->inst_length == 0);

#ifdef BHYVE_SNAPSHOT
	checkpoint_cpu_suspend(vcpu_id(vcpu));
#endif
	gdb_cpu_mtrap(vcpu);
#ifdef BHYVE_SNAPSHOT
	checkpoint_cpu_resume(vcpu_id(vcpu));
#endif

	return (VMEXIT_CONTINUE);
}
*/ vie_restart(vie); mode = vme->u.inst_emul.paging.cpu_mode; cs_d = vme->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; if (vm_set_register(vcpu, VM_REG_GUEST_RIP, vme->rip + vie->num_processed) != 0) goto fail; } err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vme->rip); return (VMEXIT_ABORT); } static int vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; enum vm_suspend_how how; int vcpuid = vcpu_id(vcpu); vme = vmrun->vm_exit; how = vme->u.suspended.how; fbsdrun_deletecpu(vcpuid); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun __unused) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_suspend(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif /* * XXX-MJ sleep for a short period to avoid chewing up the CPU in the * window between activation of the vCPU thread and the STARTUP IPI. 
/*
 * Exit handler for VM_EXITCODE_IPI.
 *
 * vmrun->cpuset holds the destination vCPUs and vme->u.ipi.mode the
 * delivery mode:
 *  - APIC_DELMODE_INIT: call vm_suspend_cpu() on each destination vCPU,
 *    stopping at the first failure.
 *  - APIC_DELMODE_STARTUP: call spinup_ap() on each destination vCPU,
 *    passing vector << PAGE_SHIFT.
 *
 * Returns 0 on success, the vm_suspend_cpu() error on failure, and -1 for
 * any other delivery mode or for an INIT whose destination set is empty.
 */
static int
vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
    struct vm_run *vmrun)
{
	struct vm_exit *vme;
	cpuset_t *dmask;
	int error = -1;
	int i;

	dmask = vmrun->cpuset;
	vme = vmrun->vm_exit;

	switch (vme->u.ipi.mode) {
	case APIC_DELMODE_INIT:
		CPU_FOREACH_ISSET(i, dmask) {
			error = vm_suspend_cpu(vcpu_info[i].vcpu);
			if (error) {
				/* warnx() appends the newline itself. */
				warnx("%s: failed to suspend cpu %d",
				    __func__, i);
				break;
			}
		}
		break;
	case APIC_DELMODE_STARTUP:
		CPU_FOREACH_ISSET(i, dmask) {
			spinup_ap(vcpu_info[i].vcpu,
			    vme->u.ipi.vector << PAGE_SHIFT);
		}
		error = 0;
		break;
	default:
		break;
	}

	return (error);
}
fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } rc = (*handler[exitcode])(ctx, vcpu, &vmrun); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static void fbsdrun_set_capabilities(struct vcpu *vcpu) { int err, tmp; if (get_config_bool_default("x86.vmexit_on_hlt", false)) { err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1); } if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1); } if (get_config_bool_default("x86.x2apic", false)) err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1); err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1); assert(err == 0); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual 
machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } static void spinup_vcpu(struct vcpu_info *vi, bool bsp) { int error; if (!bsp) { fbsdrun_set_capabilities(vi->vcpu); /* * Enable the 'unrestricted guest' mode for APs. * * APs startup in power-on 16-bit mode. */ error = vm_set_capability(vi->vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); } fbsdrun_addcpu(vi); } static bool parse_config_option(const char *option) { const char *value; char *path; value = strchr(option, '='); if (value == NULL || value[1] == '\0') return (false); path = strndup(option, value - option); if (path == NULL) err(4, "Failed to allocate memory"); set_config_value(path, value + 1); return (true); } static void parse_simple_config_file(const char *path) { FILE *fp; char *line, *cp; size_t linecap; unsigned int lineno; fp = fopen(path, "r"); if (fp == NULL) err(4, "Failed to open configuration file %s", path); line = NULL; linecap = 0; lineno = 1; for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { if (*line == '#' || *line == '\n') continue; cp = strchr(line, '\n'); if (cp != NULL) *cp = '\0'; if (!parse_config_option(line)) errx(4, "%s line %u: invalid config option '%s'", path, lineno, line); } free(line); fclose(fp); } static void parse_gdb_options(const char *opt) { const char *sport; char *colon; if (opt[0] == 'w') { set_config_bool("gdb.wait", true); 
opt++; } colon = strrchr(opt, ':'); if (colon == NULL) { sport = opt; } else { *colon = '\0'; colon++; sport = colon; set_config_value("gdb.address", opt); } set_config_value("gdb.port", sport); } static void set_defaults(void) { set_config_bool("acpi_tables", false); + set_config_bool("acpi_tables_in_memory", true); set_config_value("memory.size", "256M"); set_config_bool("x86.strictmsr", true); set_config_value("lpc.fwcfg", "bhyve"); } int main(int argc, char *argv[]) { int c, error; int max_vcpus, memflags; struct vcpu *bsp; struct vmctx *ctx; struct qemu_fwcfg_item *e820_fwcfg_item; size_t memsize; const char *optstr, *value, *vmname; #ifdef BHYVE_SNAPSHOT char *restore_file; struct restore_state rstate; restore_file = NULL; #endif init_config(); set_defaults(); progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:"; #else optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': set_config_bool("x86.x2apic", false); break; case 'A': set_config_bool("acpi_tables", true); break; case 'D': set_config_bool("destroy_on_poweroff", true); break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': set_config_bool("memory.guest_in_core", true); break; case 'f': if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) { errx(EX_USAGE, "invalid fwcfg item '%s'", optarg); } break; case 'G': parse_gdb_options(optarg); break; case 'k': parse_simple_config_file(optarg); break; case 'K': set_config_value("keyboard.layout", optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT 
case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': set_config_bool("memory.wired", true); break; case 'm': set_config_value("memory.size", optarg); break; case 'o': if (!parse_config_option(optarg)) errx(EX_USAGE, "invalid configuration option '%s'", optarg); break; case 'H': set_config_bool("x86.vmexit_on_hlt", true); break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': set_config_bool("x86.vmexit_on_pause", true); break; case 'e': set_config_bool("x86.strictio", true); break; case 'u': set_config_bool("rtc.use_localtime", false); break; case 'U': set_config_value("uuid", optarg); break; case 'w': set_config_bool("x86.strictmsr", false); break; case 'W': set_config_bool("virtio_msix", false); break; case 'x': set_config_bool("x86.x2apic", true); break; case 'Y': set_config_bool("x86.mptable", false); break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc > 1) usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != 
NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif bsp = vm_vcpu_open(ctx, BSP); max_vcpus = num_vcpus_allowed(ctx, bsp); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(bsp); /* Allocate per-VCPU resources. */ vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { vcpu_info[vcpuid].ctx = ctx; vcpu_info[vcpuid].vcpuid = vcpuid; if (vcpuid == BSP) vcpu_info[vcpuid].vcpu = bsp; else vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); } memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(guest_ncpus); init_inout(); kernemu_dev_init(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx); sci_init(ctx); if (qemu_fwcfg_init(ctx) != 0) { fprintf(stderr, "qemu fwcfg initialization error"); exit(4); } if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), &guest_ncpus) != 0) { fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu"); exit(4); } if (e820_init(ctx) != 0) { fprintf(stderr, "Unable to setup E820"); exit(4); } /* * Exit if a device emulation finds an error in its initialization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } if (init_tpm(ctx) != 0) { fprintf(stderr, "Failed to init TPM device"); exit(4); } /* * Initialize after PCI, to allow a bootrom file 
to reserve the high * region. */ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); init_gdb(ctx); if (lpc_bootrom()) { if (vm_set_capability(bsp, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(bsp); assert(error == 0); } /* * Add all vCPUs. */ for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) spinup_vcpu(&vcpu_info[vcpuid], vcpuid == BSP); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { fprintf(stdout, "Pausing pci devs...\r\n"); if (vm_pause_devices() != 0) { fprintf(stderr, "Failed to pause PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring vm mem...\r\n"); if (restore_vm_mem(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore VM memory.\n"); exit(1); } fprintf(stdout, "Restoring pci devs...\r\n"); if (vm_restore_devices(&rstate) != 0) { fprintf(stderr, "Failed to restore PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring kernel structs...\r\n"); if (vm_restore_kern_structs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore kernel structs.\n"); exit(1); } fprintf(stdout, "Resuming pci devs...\r\n"); if (vm_resume_devices() != 0) { fprintf(stderr, "Failed to resume PCI device state.\n"); exit(1); } } #endif /* * build the guest tables, MP etc. 
*/ if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); if (error != 0) exit(4); if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } e820_fwcfg_item = e820_get_fwcfg_item(); if (e820_fwcfg_item == NULL) { fprintf(stderr, "invalid e820 table"); exit(4); } if (qemu_fwcfg_add_file("etc/e820", e820_fwcfg_item->size, e820_fwcfg_item->data) != 0) { fprintf(stderr, "could not add qemu fwcfg etc/e820"); exit(4); } free(e820_fwcfg_item); if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) { fwctl_init(); } /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifdef BHYVE_SNAPSHOT /* initialize mutex/cond variables */ init_snapshot(); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) != 0) errx(EX_OSERR, "Failed to start checkpoint thread"); #endif #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { destroy_restore_state(&rstate); if (vm_restore_time(ctx) < 0) err(EX_OSERR, "Unable to restore time"); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) vm_resume_cpu(vcpu_info[vcpuid].vcpu); } else #endif vm_resume_cpu(bsp); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); }