diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c index 85864da57af2..6ff8dd8e273b 100644 --- a/usr.sbin/bhyve/acpi.c +++ b/usr.sbin/bhyve/acpi.c @@ -1,780 +1,904 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * bhyve ACPI table generator. * * Create the minimal set of ACPI tables required to boot FreeBSD (and * hopefully other o/s's). * * The tables are placed in the guest's ROM area just below 1MB physical, * above the MPTable. */ #include +#include +#include #include #include #include +#include #include #include #include #include #include #include #include #include +#include #include +#include #include #include "bhyverun.h" #include "acpi.h" #include "basl.h" #include "pci_emul.h" #include "vmgenc.h" #define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX" #define BHYVE_ASL_SUFFIX ".aml" #define BHYVE_ASL_COMPILER "/usr/sbin/iasl" #define BHYVE_ADDRESS_IOAPIC 0xFEC00000 #define BHYVE_ADDRESS_HPET 0xFED00000 #define BHYVE_ADDRESS_LAPIC 0xFEE00000 static int basl_keep_temps; static int basl_verbose_iasl; static int basl_ncpu; /* * Contains the full pathname of the template to be passed * to mkstemp/mktemps(3) */ static char basl_template[MAXPATHLEN]; static char basl_stemplate[MAXPATHLEN]; +/* + * SRAT vCPU affinity info. + */ +struct acpi_vcpu_affinity_entry { + RB_ENTRY(acpi_vcpu_affinity_entry) entry; + int vcpuid; + int domain; +}; + +static int vcpu_affinity_cmp(struct acpi_vcpu_affinity_entry *const a1, + struct acpi_vcpu_affinity_entry *const a2); +static RB_HEAD(vcpu_affinities, + acpi_vcpu_affinity_entry) aff_head = RB_INITIALIZER(&aff_head); +RB_GENERATE_STATIC(vcpu_affinities, acpi_vcpu_affinity_entry, entry, + vcpu_affinity_cmp); + /* * State for dsdt_line(), dsdt_indent(), and dsdt_unindent(). */ static FILE *dsdt_fp; static int dsdt_indent_level; static int dsdt_error; struct basl_fio { int fd; FILE *fp; char f_name[MAXPATHLEN]; }; #define EFPRINTF(...) \ if (fprintf(__VA_ARGS__) < 0) goto err_exit #define EFFLUSH(x) \ if (fflush(x) != 0) goto err_exit /* * A list for additional ACPI devices like a TPM. 
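* Registered devices are visited when the DSDT is generated and again * when their device-specific tables are built in acpi_build().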
*/ struct acpi_device_list_entry { SLIST_ENTRY(acpi_device_list_entry) chain; const struct acpi_device *dev; }; static SLIST_HEAD(acpi_device_list, acpi_device_list_entry) acpi_devices = SLIST_HEAD_INITIALIZER(acpi_devices); int acpi_tables_add_device(const struct acpi_device *const dev) { struct acpi_device_list_entry *const entry = calloc(1, sizeof(*entry)); if (entry == NULL) { return (ENOMEM); } entry->dev = dev; SLIST_INSERT_HEAD(&acpi_devices, entry, chain); return (0); } +static int +vcpu_affinity_cmp(struct acpi_vcpu_affinity_entry *a1, + struct acpi_vcpu_affinity_entry *a2) +{ + return (a1->vcpuid < a2->vcpuid ? -1 : a1->vcpuid > a2->vcpuid); +} + +int +acpi_add_vcpu_affinity(int vcpuid, int domain) +{ + struct acpi_vcpu_affinity_entry *entry = calloc(1, sizeof(*entry)); + if (entry == NULL) { + return (ENOMEM); + } + + entry->vcpuid = vcpuid; + entry->domain = domain; + if (RB_INSERT(vcpu_affinities, &aff_head, entry) != NULL) { + free(entry); + return (EEXIST); + } + + return (0); +} + /* * Helper routines for writing to the DSDT from other modules. */ void dsdt_line(const char *fmt, ...) { va_list ap; if (dsdt_error != 0) return; if (strcmp(fmt, "") != 0) { if (dsdt_indent_level != 0) EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' '); va_start(ap, fmt); if (vfprintf(dsdt_fp, fmt, ap) < 0) { va_end(ap); goto err_exit; } va_end(ap); } EFPRINTF(dsdt_fp, "\n"); return; err_exit: dsdt_error = errno; } void dsdt_indent(int levels) { dsdt_indent_level += levels; assert(dsdt_indent_level >= 0); } void dsdt_unindent(int levels) { assert(dsdt_indent_level >= levels); dsdt_indent_level -= levels; } void dsdt_fixed_ioport(uint16_t iobase, uint16_t length) { dsdt_line("IO (Decode16,"); dsdt_line(" 0x%04X, // Range Minimum", iobase); dsdt_line(" 0x%04X, // Range Maximum", iobase); dsdt_line(" 0x01, // Alignment"); dsdt_line(" 0x%02X, // Length", length); dsdt_line(" )"); } void dsdt_fixed_irq(uint8_t irq) { dsdt_line("IRQNoFlags ()"); dsdt_line(" {%d}", irq); } void dsdt_fixed_mem32(uint32_t base, uint32_t length) { dsdt_line("Memory32Fixed (ReadWrite,"); dsdt_line(" 0x%08X, // Address Base", base); dsdt_line(" 0x%08X, // Address Length", length); dsdt_line(" )"); } static int basl_fwrite_dsdt(FILE *fp) { dsdt_fp = fp; dsdt_error = 0; dsdt_indent_level = 0; dsdt_line("/*"); dsdt_line(" * bhyve DSDT template"); dsdt_line(" */"); dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2," "\"BHYVE \", \"BVDSDT \", 0x00000001)"); dsdt_line("{"); dsdt_line(" Name (_S5, Package ()"); dsdt_line(" {"); dsdt_line(" 0x05,"); dsdt_line(" Zero,"); dsdt_line(" })"); pci_write_dsdt(); #ifdef __amd64__ dsdt_line(""); dsdt_line(" Scope (_SB.PC00)"); dsdt_line(" {"); dsdt_line(" Device (HPET)"); dsdt_line(" {"); dsdt_line(" Name (_HID, EISAID(\"PNP0103\"))"); dsdt_line(" Name (_UID, 0)"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(4); dsdt_fixed_mem32(0xFED00000, 0x400); dsdt_unindent(4); dsdt_line(" })"); dsdt_line(" }"); dsdt_line(" }"); #endif vmgenc_write_dsdt(); const struct acpi_device_list_entry *entry; SLIST_FOREACH(entry, &acpi_devices, chain) { BASL_EXEC(acpi_device_write_dsdt(entry->dev)); } dsdt_line("}"); if (dsdt_error != 0) return (dsdt_error); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_open(struct basl_fio *bf, int suffix) { int err; err = 0; if (suffix) { strlcpy(bf->f_name, basl_stemplate, MAXPATHLEN); bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX)); } else { strlcpy(bf->f_name, basl_template, MAXPATHLEN); bf->fd = 
mkstemp(bf->f_name); } if (bf->fd > 0) { bf->fp = fdopen(bf->fd, "w+"); if (bf->fp == NULL) { unlink(bf->f_name); close(bf->fd); } } else { err = 1; } return (err); } static void basl_close(struct basl_fio *bf) { if (!basl_keep_temps) unlink(bf->f_name); fclose(bf->fp); } static int basl_start(struct basl_fio *in, struct basl_fio *out) { int err; err = basl_open(in, 0); if (!err) { err = basl_open(out, 1); if (err) { basl_close(in); } } return (err); } static void basl_end(struct basl_fio *in, struct basl_fio *out) { basl_close(in); basl_close(out); } static int basl_load(struct vmctx *ctx, int fd) { struct stat sb; void *addr; if (fstat(fd, &sb) < 0) return (errno); addr = calloc(1, sb.st_size); if (addr == NULL) return (ENOMEM); if (read(fd, addr, sb.st_size) < 0) return (errno); struct basl_table *table; uint8_t name[ACPI_NAMESEG_SIZE + 1] = { 0 }; memcpy(name, addr, sizeof(name) - 1 /* last char is '\0' */); BASL_EXEC(basl_table_create(&table, ctx, name, BASL_TABLE_ALIGNMENT)); BASL_EXEC(basl_table_append_bytes(table, addr, sb.st_size)); free(addr); return (0); } static int basl_compile(struct vmctx *ctx, int (*fwrite_section)(FILE *)) { struct basl_fio io[2]; static char iaslbuf[3*MAXPATHLEN + 10]; const char *fmt; int err; err = basl_start(&io[0], &io[1]); if (!err) { err = (*fwrite_section)(io[0].fp); if (!err) { /* * iasl sends the results of the compilation to * stdout. Shut this down by using the shell to * redirect stdout to /dev/null, unless the user * has requested verbose output for debugging * purposes. */ fmt = basl_verbose_iasl ? "%s -p %s %s" : "/bin/sh -c \"%s -p %s %s\" 1> /dev/null"; snprintf(iaslbuf, sizeof(iaslbuf), fmt, BHYVE_ASL_COMPILER, io[1].f_name, io[0].f_name); err = system(iaslbuf); if (!err) { /* * Copy the aml output file into guest * memory at the specified location. */ err = basl_load(ctx, io[1].fd); } } basl_end(&io[0], &io[1]); } return (err); } static int basl_make_templates(void) { const char *tmpdir; int err; int len; err = 0; /* * Locate a writable temporary directory: prefer BHYVE_TMPDIR, then * TMPDIR, and finally fall back to _PATH_TMP. */ if ((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0') { if ((tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0') tmpdir = _PATH_TMP; } len = strlen(tmpdir); if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) { strcpy(basl_template, tmpdir); while (len > 0 && basl_template[len - 1] == '/') len--; basl_template[len] = '/'; strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE); } else err = E2BIG; if (!err) { /* * len has been initialized (and maybe adjusted) above */ if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 + sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) { strcpy(basl_stemplate, tmpdir); basl_stemplate[len] = '/'; strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE); len = strlen(basl_stemplate); strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX); } else err = E2BIG; } return (err); } static int build_dsdt(struct vmctx *const ctx) { BASL_EXEC(basl_compile(ctx, basl_fwrite_dsdt)); return (0); } static int build_facs(struct vmctx *const ctx) { ACPI_TABLE_FACS facs; struct basl_table *table; BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_FACS, BASL_TABLE_ALIGNMENT_FACS)); memset(&facs, 0, sizeof(facs)); memcpy(facs.Signature, ACPI_SIG_FACS, ACPI_NAMESEG_SIZE); facs.Length = sizeof(facs); facs.Version = htole32(2); BASL_EXEC(basl_table_append_bytes(table, &facs, sizeof(facs))); return (0); } static int build_fadt(struct vmctx *const ctx) { ACPI_TABLE_FADT fadt; struct basl_table *table; BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_FADT, BASL_TABLE_ALIGNMENT)); memset(&fadt, 0, sizeof(fadt));
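/* * Revision 5 FADT (MinorRevision 1, i.e. ACPI 5.1). The FACS and DSDT * address fields are left zero here and patched by basl once those * tables have been placed in guest memory. */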
BASL_EXEC(basl_table_append_header(table, ACPI_SIG_FADT, 5, 1)); fadt.Facs = htole32(0); /* patched by basl */ fadt.Dsdt = htole32(0); /* patched by basl */ fadt.SciInterrupt = htole16(SCI_INT); fadt.SmiCommand = htole32(SMI_CMD); fadt.AcpiEnable = BHYVE_ACPI_ENABLE; fadt.AcpiDisable = BHYVE_ACPI_DISABLE; fadt.Pm1aEventBlock = htole32(PM1A_EVT_ADDR); fadt.Pm1aControlBlock = htole32(PM1A_CNT_ADDR); fadt.PmTimerBlock = htole32(IO_PMTMR); fadt.Gpe0Block = htole32(IO_GPE0_BLK); fadt.Pm1EventLength = 4; fadt.Pm1ControlLength = 2; fadt.PmTimerLength = 4; fadt.Gpe0BlockLength = IO_GPE0_LEN; fadt.Century = 0x32; fadt.BootFlags = htole16(ACPI_FADT_NO_VGA | ACPI_FADT_NO_ASPM); fadt.Flags = htole32(ACPI_FADT_WBINVD | ACPI_FADT_C1_SUPPORTED | ACPI_FADT_SLEEP_BUTTON | ACPI_FADT_32BIT_TIMER | ACPI_FADT_RESET_REGISTER | ACPI_FADT_HEADLESS | ACPI_FADT_APIC_PHYSICAL); basl_fill_gas(&fadt.ResetRegister, ACPI_ADR_SPACE_SYSTEM_IO, 8, 0, ACPI_GAS_ACCESS_WIDTH_BYTE, 0xCF9); fadt.ResetValue = 6; fadt.MinorRevision = 1; fadt.XFacs = htole64(0); /* patched by basl */ fadt.XDsdt = htole64(0); /* patched by basl */ basl_fill_gas(&fadt.XPm1aEventBlock, ACPI_ADR_SPACE_SYSTEM_IO, 0x20, 0, ACPI_GAS_ACCESS_WIDTH_WORD, PM1A_EVT_ADDR); basl_fill_gas(&fadt.XPm1bEventBlock, ACPI_ADR_SPACE_SYSTEM_IO, 0, 0, ACPI_GAS_ACCESS_WIDTH_UNDEFINED, 0); basl_fill_gas(&fadt.XPm1aControlBlock, ACPI_ADR_SPACE_SYSTEM_IO, 0x10, 0, ACPI_GAS_ACCESS_WIDTH_WORD, PM1A_CNT_ADDR); basl_fill_gas(&fadt.XPm1bControlBlock, ACPI_ADR_SPACE_SYSTEM_IO, 0, 0, ACPI_GAS_ACCESS_WIDTH_UNDEFINED, 0); basl_fill_gas(&fadt.XPm2ControlBlock, ACPI_ADR_SPACE_SYSTEM_IO, 8, 0, ACPI_GAS_ACCESS_WIDTH_UNDEFINED, 0); basl_fill_gas(&fadt.XPmTimerBlock, ACPI_ADR_SPACE_SYSTEM_IO, 0x20, 0, ACPI_GAS_ACCESS_WIDTH_DWORD, IO_PMTMR); basl_fill_gas(&fadt.XGpe0Block, ACPI_ADR_SPACE_SYSTEM_IO, IO_GPE0_LEN * 8, 0, ACPI_GAS_ACCESS_WIDTH_BYTE, IO_GPE0_BLK); basl_fill_gas(&fadt.XGpe1Block, ACPI_ADR_SPACE_SYSTEM_IO, 0, 0, ACPI_GAS_ACCESS_WIDTH_UNDEFINED, 0); basl_fill_gas(&fadt.SleepControl, ACPI_ADR_SPACE_SYSTEM_IO, 8, 0, ACPI_GAS_ACCESS_WIDTH_BYTE, 0); basl_fill_gas(&fadt.SleepStatus, ACPI_ADR_SPACE_SYSTEM_IO, 8, 0, ACPI_GAS_ACCESS_WIDTH_BYTE, 0); BASL_EXEC(basl_table_append_content(table, &fadt, sizeof(fadt))); BASL_EXEC(basl_table_add_pointer(table, ACPI_SIG_FACS, offsetof(ACPI_TABLE_FADT, Facs), sizeof(fadt.Facs))); BASL_EXEC(basl_table_add_pointer(table, ACPI_SIG_DSDT, offsetof(ACPI_TABLE_FADT, Dsdt), sizeof(fadt.Dsdt))); BASL_EXEC(basl_table_add_pointer(table, ACPI_SIG_FACS, offsetof(ACPI_TABLE_FADT, XFacs), sizeof(fadt.XFacs))); BASL_EXEC(basl_table_add_pointer(table, ACPI_SIG_DSDT, offsetof(ACPI_TABLE_FADT, XDsdt), sizeof(fadt.XDsdt))); BASL_EXEC(basl_table_register_to_rsdt(table)); return (0); } #ifdef __amd64__ static int build_hpet(struct vmctx *const ctx) { ACPI_TABLE_HPET hpet; struct basl_table *table; uint32_t hpet_capabilities; int err; err = vm_get_hpet_capabilities(ctx, &hpet_capabilities); if (err != 0) return (err); BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_HPET, BASL_TABLE_ALIGNMENT)); memset(&hpet, 0, sizeof(hpet)); BASL_EXEC(basl_table_append_header(table, ACPI_SIG_HPET, 1, 1)); hpet.Id = htole32(hpet_capabilities); basl_fill_gas(&hpet.Address, ACPI_ADR_SPACE_SYSTEM_MEMORY, 0, 0, ACPI_GAS_ACCESS_WIDTH_LEGACY, BHYVE_ADDRESS_HPET); hpet.Flags = ACPI_HPET_PAGE_PROTECT4; BASL_EXEC(basl_table_append_content(table, &hpet, sizeof(hpet))); BASL_EXEC(basl_table_register_to_rsdt(table)); return (0); } #endif static int build_madt(struct vmctx *const ctx) { 
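/* * The MADT provides one enabled Local APIC entry per vCPU, a single * I/O APIC, interrupt overrides for IRQ0 (routed to GSI 2) and the * SCI, and a Local APIC NMI entry that applies to all CPUs. */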
ACPI_TABLE_MADT madt; ACPI_MADT_LOCAL_APIC madt_lapic; ACPI_MADT_IO_APIC madt_ioapic; ACPI_MADT_INTERRUPT_OVERRIDE madt_irq_override; ACPI_MADT_LOCAL_APIC_NMI madt_lapic_nmi; struct basl_table *table; BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_MADT, BASL_TABLE_ALIGNMENT)); memset(&madt, 0, sizeof(madt)); BASL_EXEC(basl_table_append_header(table, ACPI_SIG_MADT, 1, 1)); madt.Address = htole32(BHYVE_ADDRESS_LAPIC); madt.Flags = htole32(ACPI_MADT_PCAT_COMPAT); BASL_EXEC(basl_table_append_content(table, &madt, sizeof(madt))); /* Local APIC for each CPU */ for (int i = 0; i < basl_ncpu; ++i) { memset(&madt_lapic, 0, sizeof(madt_lapic)); madt_lapic.Header.Type = ACPI_MADT_TYPE_LOCAL_APIC; madt_lapic.Header.Length = sizeof(madt_lapic); madt_lapic.ProcessorId = i; madt_lapic.Id = i; madt_lapic.LapicFlags = htole32(ACPI_MADT_ENABLED); BASL_EXEC(basl_table_append_bytes(table, &madt_lapic, sizeof(madt_lapic))); } /* I/O APIC */ memset(&madt_ioapic, 0, sizeof(madt_ioapic)); madt_ioapic.Header.Type = ACPI_MADT_TYPE_IO_APIC; madt_ioapic.Header.Length = sizeof(madt_ioapic); madt_ioapic.Address = htole32(BHYVE_ADDRESS_IOAPIC); BASL_EXEC( basl_table_append_bytes(table, &madt_ioapic, sizeof(madt_ioapic))); /* Legacy IRQ0 is connected to pin 2 of the I/O APIC */ memset(&madt_irq_override, 0, sizeof(madt_irq_override)); madt_irq_override.Header.Type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE; madt_irq_override.Header.Length = sizeof(madt_irq_override); madt_irq_override.GlobalIrq = htole32(2); madt_irq_override.IntiFlags = htole16( ACPI_MADT_POLARITY_ACTIVE_HIGH | ACPI_MADT_TRIGGER_EDGE); BASL_EXEC(basl_table_append_bytes(table, &madt_irq_override, sizeof(madt_irq_override))); memset(&madt_irq_override, 0, sizeof(madt_irq_override)); madt_irq_override.Header.Type = ACPI_MADT_TYPE_INTERRUPT_OVERRIDE; madt_irq_override.Header.Length = sizeof(madt_irq_override); madt_irq_override.SourceIrq = SCI_INT; madt_irq_override.GlobalIrq = htole32(SCI_INT); madt_irq_override.IntiFlags = htole16( ACPI_MADT_POLARITY_ACTIVE_LOW | ACPI_MADT_TRIGGER_LEVEL); BASL_EXEC(basl_table_append_bytes(table, &madt_irq_override, sizeof(madt_irq_override))); /* Local APIC NMI is connected to LINT 1 on all CPUs */ memset(&madt_lapic_nmi, 0, sizeof(madt_lapic_nmi)); madt_lapic_nmi.Header.Type = ACPI_MADT_TYPE_LOCAL_APIC_NMI; madt_lapic_nmi.Header.Length = sizeof(madt_lapic_nmi); madt_lapic_nmi.ProcessorId = 0xFF; madt_lapic_nmi.IntiFlags = htole16( ACPI_MADT_POLARITY_ACTIVE_HIGH | ACPI_MADT_TRIGGER_EDGE); madt_lapic_nmi.Lint = 1; BASL_EXEC(basl_table_append_bytes(table, &madt_lapic_nmi, sizeof(madt_lapic_nmi))); BASL_EXEC(basl_table_register_to_rsdt(table)); return (0); } static int build_mcfg(struct vmctx *const ctx) { ACPI_TABLE_MCFG mcfg; ACPI_MCFG_ALLOCATION mcfg_allocation; struct basl_table *table; BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_MCFG, BASL_TABLE_ALIGNMENT)); memset(&mcfg, 0, sizeof(mcfg)); BASL_EXEC(basl_table_append_header(table, ACPI_SIG_MCFG, 1, 1)); BASL_EXEC(basl_table_append_content(table, &mcfg, sizeof(mcfg))); memset(&mcfg_allocation, 0, sizeof(mcfg_allocation)); mcfg_allocation.Address = htole64(pci_ecfg_base()); mcfg_allocation.EndBusNumber = 0xFF; BASL_EXEC(basl_table_append_bytes(table, &mcfg_allocation, sizeof(mcfg_allocation))); BASL_EXEC(basl_table_register_to_rsdt(table)); return (0); } static int build_rsdp(struct vmctx *const ctx) { ACPI_TABLE_RSDP rsdp; struct basl_table *table; BASL_EXEC(basl_table_create(&table, ctx, ACPI_RSDP_NAME, BASL_TABLE_ALIGNMENT)); memset(&rsdp, 0, sizeof(rsdp));
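/* * The RSDT/XSDT addresses and both checksums are patched by basl; the * legacy checksum covers only the first 20 bytes of the table. */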
memcpy(rsdp.Signature, ACPI_SIG_RSDP, 8); rsdp.Checksum = 0; /* patched by basl */ memcpy(rsdp.OemId, "BHYVE ", ACPI_OEM_ID_SIZE); rsdp.Revision = 2; rsdp.RsdtPhysicalAddress = htole32(0); /* patched by basl */ rsdp.Length = htole32(0); /* patched by basl */ rsdp.XsdtPhysicalAddress = htole64(0); /* patched by basl */ rsdp.ExtendedChecksum = 0; /* patched by basl */ BASL_EXEC(basl_table_append_bytes(table, &rsdp, sizeof(rsdp))); BASL_EXEC(basl_table_add_checksum(table, offsetof(ACPI_TABLE_RSDP, Checksum), 0, 20)); BASL_EXEC(basl_table_add_pointer(table, ACPI_SIG_RSDT, offsetof(ACPI_TABLE_RSDP, RsdtPhysicalAddress), sizeof(rsdp.RsdtPhysicalAddress))); BASL_EXEC(basl_table_add_length(table, offsetof(ACPI_TABLE_RSDP, Length), sizeof(rsdp.Length))); BASL_EXEC(basl_table_add_pointer(table, ACPI_SIG_XSDT, offsetof(ACPI_TABLE_RSDP, XsdtPhysicalAddress), sizeof(rsdp.XsdtPhysicalAddress))); BASL_EXEC(basl_table_add_checksum(table, offsetof(ACPI_TABLE_RSDP, ExtendedChecksum), 0, BASL_TABLE_CHECKSUM_LEN_FULL_TABLE)); return (0); } static int build_spcr(struct vmctx *const ctx) { ACPI_TABLE_SPCR spcr; struct basl_table *table; BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_SPCR, BASL_TABLE_ALIGNMENT)); memset(&spcr, 0, sizeof(spcr)); BASL_EXEC(basl_table_append_header(table, ACPI_SIG_SPCR, 1, 1)); spcr.InterfaceType = ACPI_DBG2_16550_COMPATIBLE; basl_fill_gas(&spcr.SerialPort, ACPI_ADR_SPACE_SYSTEM_IO, 8, 0, ACPI_GAS_ACCESS_WIDTH_LEGACY, 0x3F8); spcr.InterruptType = ACPI_SPCR_INTERRUPT_TYPE_8259; spcr.PcInterrupt = 4; spcr.BaudRate = ACPI_SPCR_BAUD_RATE_115200; spcr.Parity = ACPI_SPCR_PARITY_NO_PARITY; spcr.StopBits = ACPI_SPCR_STOP_BITS_1; spcr.FlowControl = 3; /* RTS/CTS | DCD */ spcr.TerminalType = ACPI_SPCR_TERMINAL_TYPE_VT_UTF8; BASL_EXEC(basl_table_append_content(table, &spcr, sizeof(spcr))); BASL_EXEC(basl_table_register_to_rsdt(table)); return (0); } +static int +build_srat(struct vmctx *const ctx) +{ + ACPI_TABLE_SRAT srat; + ACPI_SRAT_MEM_AFFINITY srat_mem_affinity; + ACPI_SRAT_CPU_AFFINITY srat_cpu_affinity; + + struct acpi_vcpu_affinity_entry *ep; + struct basl_table *table; + int segid, domain; + int _flags, _prot; + vm_ooffset_t _off; + size_t maplen; + uint64_t gpa; + int ret; + + if (RB_EMPTY(&aff_head)) + return (0); + + memset(&srat, 0, sizeof(srat)); + BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_SRAT, + BASL_TABLE_ALIGNMENT)); + BASL_EXEC(basl_table_append_header(table, ACPI_SIG_SRAT, 1, 1)); + srat.TableRevision = 1; + BASL_EXEC(basl_table_append_content(table, &srat, sizeof(srat))); + + /* + * Iterate over the VM's memory maps and add + * a 'Memory Affinity Structure' for each mapping. + */ + gpa = 0; + while (1) { + ret = vm_mmap_getnext(ctx, &gpa, &segid, &_off, &maplen, &_prot, + &_flags); + if (ret) { + break; + } + + if (segid >= VM_SYSMEM && segid < VM_BOOTROM) { + domain = segid - VM_SYSMEM; + } else { + /* Treat devmem segs as domain 0. 
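+ * Such segments have no guest NUMA locality of their own.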
*/ + domain = 0; + } + memset(&srat_mem_affinity, 0, sizeof(srat_mem_affinity)); + srat_mem_affinity.Header.Type = ACPI_SRAT_TYPE_MEMORY_AFFINITY; + srat_mem_affinity.Header.Length = sizeof(srat_mem_affinity); + srat_mem_affinity.ProximityDomain = htole32(domain); + srat_mem_affinity.BaseAddress = htole64(gpa); + srat_mem_affinity.Length = htole64(maplen); + srat_mem_affinity.Flags = htole32(ACPI_SRAT_MEM_ENABLED); + BASL_EXEC(basl_table_append_bytes(table, &srat_mem_affinity, + sizeof(srat_mem_affinity))); + gpa += maplen; + } + + /* + * Iterate over each "vCPUid to domain id" mapping and emit a + * 'Processor Local APIC/SAPIC Affinity Structure' for each entry. + */ + RB_FOREACH(ep, vcpu_affinities, &aff_head) { + memset(&srat_cpu_affinity, 0, sizeof(srat_cpu_affinity)); + srat_cpu_affinity.Header.Type = ACPI_SRAT_TYPE_CPU_AFFINITY; + srat_cpu_affinity.Header.Length = sizeof(srat_cpu_affinity); + srat_cpu_affinity.ProximityDomainLo = (uint8_t)ep->domain; + srat_cpu_affinity.ApicId = (uint8_t)ep->vcpuid; + srat_cpu_affinity.Flags = htole32(ACPI_SRAT_CPU_USE_AFFINITY); + BASL_EXEC(basl_table_append_bytes(table, &srat_cpu_affinity, + sizeof(srat_cpu_affinity))); + } + + BASL_EXEC(basl_table_register_to_rsdt(table)); + + return (0); +} + int acpi_build(struct vmctx *ctx, int ncpu) { basl_ncpu = ncpu; /* * For debug, allow the user to have iasl compiler output sent * to stdout rather than /dev/null */ if (getenv("BHYVE_ACPI_VERBOSE_IASL")) basl_verbose_iasl = 1; /* * Allow the user to keep the generated ASL files for debugging * instead of deleting them following use */ if (getenv("BHYVE_ACPI_KEEPTMPS")) basl_keep_temps = 1; BASL_EXEC(basl_init(ctx)); BASL_EXEC(basl_make_templates()); /* * Generate ACPI tables and copy them into guest memory. * * According to UEFI Specification v6.3 chapter 5.1 the FADT should be * the first table pointed to by XSDT. For that reason, build it as the * first table after XSDT. */ BASL_EXEC(build_rsdp(ctx)); BASL_EXEC(build_fadt(ctx)); BASL_EXEC(build_madt(ctx)); #ifdef __amd64__ BASL_EXEC(build_hpet(ctx)); #endif BASL_EXEC(build_mcfg(ctx)); BASL_EXEC(build_facs(ctx)); BASL_EXEC(build_spcr(ctx)); + BASL_EXEC(build_srat(ctx)); /* Build ACPI device-specific tables such as a TPM2 table. */ const struct acpi_device_list_entry *entry; SLIST_FOREACH(entry, &acpi_devices, chain) { BASL_EXEC(acpi_device_build_table(entry->dev)); } BASL_EXEC(build_dsdt(ctx)); BASL_EXEC(basl_finish()); return (0); } diff --git a/usr.sbin/bhyve/acpi.h b/usr.sbin/bhyve/acpi.h index 4b557993d67f..824dce776306 100644 --- a/usr.sbin/bhyve/acpi.h +++ b/usr.sbin/bhyve/acpi.h @@ -1,67 +1,68 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _ACPI_H_ #define _ACPI_H_ #include "acpi_device.h" #define SCI_INT 9 #define SMI_CMD 0xb2 #define BHYVE_ACPI_ENABLE 0xa0 #define BHYVE_ACPI_DISABLE 0xa1 #define PM1A_EVT_ADDR 0x400 #define PM1A_CNT_ADDR 0x404 #define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */ #define IO_GPE0_BLK 0x40c /* 2x 1-byte IO port for GPE0_STS/EN */ #define IO_GPE0_LEN 0x2 #define IO_GPE0_STS IO_GPE0_BLK #define IO_GPE0_EN (IO_GPE0_BLK + (IO_GPE0_LEN / 2)) /* Allocated GPE bits. */ #define GPE_VMGENC 0 struct vmctx; int acpi_build(struct vmctx *ctx, int ncpu); void acpi_raise_gpe(struct vmctx *ctx, unsigned bit); int acpi_tables_add_device(const struct acpi_device *const dev); +int acpi_add_vcpu_affinity(int vcpuid, int domain); void dsdt_line(const char *fmt, ...); void dsdt_fixed_ioport(uint16_t iobase, uint16_t length); void dsdt_fixed_irq(uint8_t irq); void dsdt_fixed_mem32(uint32_t base, uint32_t length); void dsdt_indent(int levels); void dsdt_unindent(int levels); void sci_init(struct vmctx *ctx); #endif /* _ACPI_H_ */ diff --git a/usr.sbin/bhyve/amd64/bhyverun_machdep.c b/usr.sbin/bhyve/amd64/bhyverun_machdep.c index 85af124b5536..dad8f1e52e4e 100644 --- a/usr.sbin/bhyve/amd64/bhyverun_machdep.c +++ b/usr.sbin/bhyve/amd64/bhyverun_machdep.c @@ -1,384 +1,394 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include "acpi.h" #include "atkbdc.h" #include "bhyverun.h" #include "bootrom.h" #include "config.h" #include "debug.h" #include "e820.h" #include "fwctl.h" #include "ioapic.h" #include "inout.h" #include "kernemu_dev.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "rtc.h" #include "smbiostbl.h" #include "xmsr.h" void bhyve_init_config(void) { init_config(); /* Set default values prior to option parsing. */ set_config_bool("acpi_tables", true); set_config_bool("acpi_tables_in_memory", true); set_config_value("memory.size", "256M"); set_config_bool("x86.strictmsr", true); set_config_bool("x86.verbosemsr", false); set_config_value("lpc.fwcfg", "bhyve"); } void bhyve_usage(int code) { const char *progname; progname = getprogname(); fprintf(stderr, "Usage: %s [-aCDeHhPSuWwxY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" " -a: local apic is in xAPIC mode (deprecated)\n" " -C: include guest memory in core file\n" " -c: number of CPUs and/or topology specification\n" " -D: destroy on power-off\n" " -e: exit on unhandled I/O access\n" " -G: start a debug server\n" " -H: vmexit from the guest on HLT\n" " -h: help\n" " -k: key=value flat config file\n" " -K: PS2 keyboard layout\n" " -l: LPC device configuration\n" " -m: memory size\n" + " -n: NUMA domain specification\n" " -o: set config 'var' to 'value'\n" " -P: vmexit from the guest on pause\n" " -p: pin 'vcpu' to 'hostcpu'\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" #endif " -S: guest memory cannot be swapped\n" " -s: PCI slot config\n" " -U: UUID\n" " -u: RTC keeps UTC time\n" " -W: force virtio to use single-vector MSI\n" " -w: ignore unimplemented MSRs\n" " -x: local APIC is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } void bhyve_optparse(int argc, char **argv) { const char *optstr; int c; #ifdef BHYVE_SNAPSHOT - optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:"; + optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:n:l:K:U:r:"; #else - optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:"; + optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:n:l:K:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': set_config_bool("x86.x2apic", false); break; case 'A': /* * NOP. For backward compatibility. Most systems don't * work properly without sane ACPI tables. Therefore, * we're always generating them. 
*/ break; case 'D': set_config_bool("destroy_on_poweroff", true); break; case 'p': if (bhyve_pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (bhyve_topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': set_config_bool("memory.guest_in_core", true); break; case 'f': if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) { errx(EX_USAGE, "invalid fwcfg item '%s'", optarg); } break; case 'G': bhyve_parse_gdb_options(optarg); break; case 'k': bhyve_parse_simple_config_file(optarg); break; case 'K': set_config_value("keyboard.layout", optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': set_config_bool("memory.wired", true); break; case 'm': set_config_value("memory.size", optarg); break; + case 'n': + if (bhyve_numa_parse(optarg) != 0) + errx(EX_USAGE, + "invalid NUMA configuration " + "'%s'", + optarg); + if (!get_config_bool("acpi_tables")) + errx(EX_USAGE, "NUMA emulation requires ACPI"); + break; case 'o': if (!bhyve_parse_config_option(optarg)) { errx(EX_USAGE, "invalid configuration option '%s'", optarg); } break; case 'H': set_config_bool("x86.vmexit_on_hlt", true); break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': set_config_bool("x86.vmexit_on_pause", true); break; case 'e': set_config_bool("x86.strictio", true); break; case 'u': set_config_bool("rtc.use_localtime", false); break; case 'U': set_config_value("uuid", optarg); break; case 'w': set_config_bool("x86.strictmsr", false); break; case 'W': set_config_bool("virtio_msix", false); break; case 'x': set_config_bool("x86.x2apic", true); break; case 'Y': set_config_bool("x86.mptable", false); break; case 'h': bhyve_usage(0); default: bhyve_usage(1); } } /* Handle backwards compatibility aliases in config options. 
*/ if (get_config_value("lpc.bootrom") != NULL && get_config_value("bootrom") == NULL) { warnx("lpc.bootrom is deprecated, use '-o bootrom' instead"); set_config_value("bootrom", get_config_value("lpc.bootrom")); } if (get_config_value("lpc.bootvars") != NULL && get_config_value("bootvars") == NULL) { warnx("lpc.bootvars is deprecated, use '-o bootvars' instead"); set_config_value("bootvars", get_config_value("lpc.bootvars")); } } void bhyve_init_vcpu(struct vcpu *vcpu) { int err, tmp; if (get_config_bool_default("x86.vmexit_on_hlt", false)) { err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { EPRINTLN("VM exit on HLT not supported"); exit(4); } vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1); } if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { EPRINTLN("SMP mux requested, no pause support"); exit(4); } vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1); } if (get_config_bool_default("x86.x2apic", false)) err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED); if (err) { EPRINTLN("Unable to set x2apic state (%d)", err); exit(4); } vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1); err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1); assert(err == 0); } void bhyve_start_vcpu(struct vcpu *vcpu, bool bsp) { int error; if (bsp) { if (bootrom_boot()) { error = vm_set_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); if (error != 0) { err(4, "ROM boot failed: unrestricted guest " "capability not available"); } error = vcpu_reset(vcpu); assert(error == 0); } } else { bhyve_init_vcpu(vcpu); /* * Enable the 'unrestricted guest' mode for APs. * * APs startup in power-on 16-bit mode. */ error = vm_set_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); } fbsdrun_addcpu(vcpu_id(vcpu)); } int bhyve_init_platform(struct vmctx *ctx, struct vcpu *bsp __unused) { int error; error = init_msr(); if (error != 0) return (error); init_inout(); kernemu_dev_init(); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx); sci_init(ctx); error = e820_init(ctx); if (error != 0) return (error); error = bootrom_loadrom(ctx); if (error != 0) return (error); return (0); } int bhyve_init_platform_late(struct vmctx *ctx, struct vcpu *bsp __unused) { int error; if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error != 0) return (error); } error = smbios_build(ctx); if (error != 0) return (error); error = e820_finalize(); if (error != 0) return (error); if (bootrom_boot() && strcmp(lpc_fwcfg(), "bhyve") == 0) fwctl_init(); if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } return (0); } diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 62e567fd359d..89c0b23961a8 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -1,1227 +1,1303 @@ .\" Copyright (c) 2013 Peter Grehan .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. 
Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .Dd August 21, 2024 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl aCDeHhPSuWwxY .Oo .Sm off .Fl c\~ .Oo .Op Cm cpus= .Ar numcpus .Oc .Op Cm ,sockets= Ar n .Op Cm ,cores= Ar n .Op Cm ,threads= Ar n .Oc .Sm on .Oo Fl f .Sm off .Ar name Cm \&, .Oo .Cm string No | Cm file .Oc .Cm \&= Ar data .Sm on .Oc .Oo .Sm off .Fl G\~ .Oo Ar w Oc .Oo Ar bind_address Cm \&: Oc .Ar port .Sm on .Oc .Op Fl k Ar config_file .Op Fl K Ar layout .Oo Fl l .Sm off .Ar lpcdev Op Cm \&, Ar conf .Sm on .Oc .Sm off .Oo Fl m\~ .Ar memsize .Oo .Cm K | Cm k | Cm M | Cm m | Cm G | Cm g | Cm T | Cm t .Oc .Sm on .Oc .Op Fl o Ar var Ns Cm = Ns Ar value .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu .Op Fl r Ar file .Sm off .Oo Fl s\~ .Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf .Sm on .Oc .Op Fl U Ar uuid .Ar vmname .Nm .Fl l Cm help .Nm .Fl s Cm help .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. It can run guests on amd64 and arm64 platforms with suitable hardware support. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp .Nm is typically used with a boot ROM that can load the guest operating system. On arm64 platforms, this is currently required. If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise. On amd64, the .Pa edk2-bhyve package provides a UEFI firmware that can be used to boot the guest; on arm64 the .Pa u-boot-bhyve-arm64 package provides a U-Boot image that can be used to boot the guest. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl a The guest's local APIC is configured in xAPIC mode. This option only applies to the amd64 platform. xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl C Include guest memory in core files. .It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. If .Ar numcpus is not specified then it will be calculated from the other arguments. The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. 
.Pp The maximum number of virtual CPUs defaults to the number of active physical CPUs in the system available via the .Va hw.vmm.maxcpu .Xr sysctl 8 variable. The limit can be adjusted via the .Va hw.vmm.maxcpu loader tunable. .It Fl D Destroy the VM on guest initiated power-off. .It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes and only applies to the amd64 platform. .It Fl f Ar name Ns Cm \&, Ns Oo Cm string Ns No | Ns Cm file Ns Oc Ns Cm \&= Ns Ar data Add a fw_cfg file .Ar name to the fw_cfg interface. If a .Cm string is specified, the fw_cfg file contains the string as data. If a .Cm file is specified, bhyve reads the file and adds the file content as fw_cfg data. .It Fl G Xo .Sm off .Oo Ar w Oc .Oo Ar bind_address Cm \&: Oc .Ar port .Sm on .Xc Start a debug server that uses the GDB protocol to export guest state to a debugger. An IPv4 TCP socket will be bound to the supplied .Ar bind_address and .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If the option begins with .Sq w , .Nm will pause execution at the first instruction waiting for a debugger to attach. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. This option applies only to the amd64 platform. .It Fl h Print help message and exit. .It Fl k Ar config_file Set configuration variables from a simple, key-value config file. Each line of the config file is expected to consist of a config variable name, an equals sign .Pq Sq = , and a value. No spaces are permitted between the variable name, equals sign, or value. Blank lines and lines starting with .Sq # are ignored. See .Xr bhyve_config 5 for more details. .It Fl K Ar layout Specify the keyboard layout. The value names a file in .Pa /usr/share/bhyve/kbdlayout . This setting takes effect only when the guest is booted in UEFI mode and accessed over VNC. It is not needed with a VNC client that supports the QEMU Extended Key Event Message (e.g. TigerVNC). With a VNC client that does not support the QEMU Extended Key Event Message (e.g. TightVNC), the layout defaults to the US keyboard unless specified otherwise. .It Fl l Cm help Print a list of supported LPC devices. .It Fl l Ar lpcdev Ns Op Cm \&, Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Cm com1 , com2 , com3 , and .Cm com4 , the TPM module .Cm tpm , the boot ROM device .Cm bootrom , the .Cm fwcfg type and the debug/test device .Cm pc-testdev . .Pp The possible values for the .Ar conf argument are listed in the .Fl s flag description. .Pp This option applies only to the amd64 platform. On arm64, the console and boot ROM devices are configured using the more generic .Fl o option. .It Xo .Fl m Ar memsize Ns Oo .Sm off .Cm K | k | M | m | G | g | T | t .Sm on .Oc .Xc Set the guest physical memory size. This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of .Cm K , M , G or .Cm T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. -.Pp The default is 256M. +.Pp +.It Fl n Ar id Ns Cm \&, Ns Ar size Ns Cm \&, Ns Ar cpus Ns Op Cm \&, Ns Ar domain_policy +Configure guest NUMA domains.
+This option applies only to the amd64 platform. +.Pp +The +.Fl n +option allows the guest physical address space to be partitioned into domains. +The layout of each domain is encoded in an ACPI SRAT table +visible to the guest operating system. +The +.Fl n +option also allows the specification of a +.Xr domainset 9 +memory allocation policy for the host memory backing a given NUMA domain. +A guest can have up to 8 NUMA domains. +This feature requires that the guest use a boot ROM, and in +particular cannot be used if the guest was initialized using +.Xr bhyveload 8 . +.Pp +Each domain is identified by a numerical +.Ar id . +The domain memory +.Ar size +is specified using the same format as the +.Fl m +flag. +The sum of all +.Ar size +parameters overrides the total VM memory size specified by the +.Fl m +flag. +However, if at least one domain omits its memory +.Ar size , +the total VM memory size is distributed equally across +all emulated domains. +The +.Ar cpus +parameter specifies the set of CPUs that are part of the domain. +The +.Ar domain_policy +parameter may optionally be used to configure the +.Xr domainset 9 +host NUMA memory allocation policy for an emulated +domain. +See the +.Fl n +flag of +.Xr cpuset 1 +for a list of valid NUMA memory allocation policies and their formats. .It Fl o Ar var Ns Cm = Ns Ar value Set the configuration variable .Ar var to .Ar value . See .Xr bhyve_config 5 for configuration options. .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. This option applies only to the amd64 platform. .It Fl p Ar vcpu Ns Cm \& : Ns Ar hostcpu Pin guest's virtual CPU .Em vcpu to .Em hostcpu . Host CPUs and guest virtual CPUs are numbered starting from 0. A .Fl p option is required for every guest vCPU to be pinned. To map a 4 vCPU guest to host CPUs 12-15: .Bd -literal -p 0:12 -p 1:13 -p 2:14 -p 3:15 .Ed .It Fl r Ar file Resume a guest from a snapshot. The guest memory contents are restored from .Ar file , and the guest device and vCPU state are restored from the file .Dq Ar file Ns .kern . .Pp Note that the current snapshot file format requires that the configuration of devices in the new VM match the VM from which the snapshot was taken by specifying the same .Fl s and .Fl l options. The count of vCPUs and memory configuration are read from the snapshot. .It Fl S Wire guest memory. .It Fl s Cm help Print a list of supported PCI devices. .It Fl s Ar slot Ns Cm \&, Ns Ar emulation Ns Op Cm \&, Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. .Pp The .Ar slot can be specified in one of the following formats: .Pp .Bl -bullet -compact .It .Ar pcislot .It .Sm off .Ar pcislot Cm \&: Ar function .Sm on .It .Sm off .Ar bus Cm \&: Ar pcislot Cm \&: Ar function .Sm on .El .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. .Pp See .Sx "PCI EMULATION" for available options for the .Ar emulation argument. .It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl u RTC keeps UTC time.
.It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl x The guest's local APIC is configured in x2APIC mode. This option applies only to the amd64 platform. .It Fl Y Disable MPtable generation. This option applies only to the amd64 platform. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El .Sh PCI EMULATION .Nm provides emulation for various PCI devices. They are specified by the .Fl s .Ar slot,emulation,conf configuration's .Ar emulation argument, which can be one of the following: .Bl -tag -width "amd_hostbridge" .It Cm hostbridge A simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. .It Cm amd_hostbridge Emulation identical to .Cm hostbridge using a PCI vendor ID of AMD. .It Cm passthru PCI pass-through device. .It Cm virtio-net Virtio network interface. .It Cm virtio-blk Virtio block storage interface. .It Cm virtio-scsi Virtio SCSI interface. .It Cm virtio-9p Virtio 9p (VirtFS) interface. .It Cm virtio-rnd Virtio RNG interface. .It Cm virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. .It Cm virtio-input Virtio input interface. .It Cm ahci AHCI controller attached to arbitrary devices. .It Cm ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Cm ahci-hd AHCI controller attached to a SATA hard drive. .It Cm e1000 Intel e82545 network interface. .It Cm uart PCI 16550 serial device. .It Cm lpc LPC PCI-ISA bridge with COM1, COM2, COM3, and COM4 16550 serial ports, a boot ROM, and, optionally, a TPM module, a fwcfg type, and the debug/test device. The LPC bridge emulation can only be configured on bus 0. .It Cm fbuf Raw framebuffer device attached to VNC server. .It Cm xhci eXtensible Host Controller Interface (xHCI) USB controller. .It Cm nvme NVM Express (NVMe) controller. .It Cm hda High Definition Audio Controller. .El .Pp The optional parameter .Ar conf describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Ss Network device backends .Sm off .Bl -bullet .It .Xo .Cm tap Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm vmnet Ar N .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm netgraph,path= Ar ADDRESS Cm \&,peerhook= Ar HOOK .Op Cm \&,socket= Ar NAME .Op Cm \&,hook= Ar HOOK .Op Cm \&,mac= Ar xx:xx:xx:xx:xx:xx .Op Cm \&,mtu= Ar N .Xc .It .Xo .Cm slirp,hostfwd= Ar proto : Ar hostaddr : Ar hostport - Ar guestaddr : Ar guestport .Xc .El .Sm on .Pp If .Cm mac is not specified, the MAC address is derived from a fixed OUI, and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .Pp With .Cm virtio-net devices, the .Cm mtu parameter can be specified to inform the guest about the largest MTU that should be allowed, expressed in bytes. .Pp With .Cm netgraph backend, the .Cm path and .Cm peerhook parameters must be specified to set the destination node and corresponding hook. The optional parameters .Cm socket and .Cm hook may be used to set the .Xr ng_socket 4 node name and source hook. 
The .Ar ADDRESS , .Ar HOOK , and .Ar NAME must comply with .Xr netgraph 4 addressing rules. .Pp The slirp backend can be used to provide a NATed network to the guest. This backend has poor performance but does not require any network configuration on the host system. It depends on the .Pa net/libslirp port. The .Cm hostfwd option takes a 5-tuple describing how connections from the host are to be forwarded to the guest. Multiple rules can be specified, separated by semicolons. Note that semicolons must be escaped or quoted to prevent the shell from interpreting them. .Ss Block storage device backends .Bl -bullet .Sm off .It .Ar /filename Op Cm \&, Ar block-device-options .It .Ar /dev/xxx Op Cm \&, Ar block-device-options .Sm on .El .Pp The .Ar block-device-options are: .Bl -tag -width 10n .It Cm nocache Open the file with .Dv O_DIRECT . .It Cm direct Open the file using .Dv O_SYNC . .It Cm ro Force the file to be opened read-only. .It Cm sectorsize= Ns Ar logical Ns Oo Cm \&/ Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. .It Cm nodelete Disable emulation of guest trim requests via .Dv DIOCGDELETE requests. .It Li bootindex= Ns Ar index Add the device to the bootorder at .Ar index . A fwcfg file is used to specify the bootorder. The guest firmware may not support this fwcfg file or may ignore it; in that case, this feature will not work as expected. .El .Ss SCSI device backends .Bl -bullet .Sm off .It .Pa /dev/cam/ctl Oo Ar pp Cm \&. Ar vp Oc Oo Cm \&, Ar scsi-device-options Oc .Sm on .El .Pp The .Ar scsi-device-options are: .Bl -tag -width 10n .It Cm iid= Ns Ar IID Initiator ID to use when sending requests to specified CTL port. The default value is 0. .It Li bootindex= Ns Ar index Add the device to the bootorder at .Ar index . A fwcfg file is used to specify the bootorder. The guest firmware may not support this fwcfg file or may ignore it; in that case, this feature will not work as expected. .El .Ss 9P device backends .Bl -bullet .Sm off .It .Ar sharename Cm = Ar /path/to/share Op Cm \&, Ar 9p-device-options .Sm on .El .Pp The .Ar 9p-device-options are: .Bl -tag -width 10n .It Cm ro Expose the share in read-only mode. .El .Ss TTY device backends .Bl -tag -width 10n .It Cm stdio Connect the serial port to the standard input and output of the .Nm process. .It Ar /dev/xxx Use the host TTY device for serial port I/O. .It Ar tcp=ip:port Use a TCP server for serial port I/O. Configuring this option starts a TCP server that waits for connections. Only one connection is allowed at a time; any further connection attempts are rejected immediately. Note that this feature allows unprivileged users to access the guest console, so ensure that access is appropriately restricted. .El .Ss TPM device backends .Bl -bullet .Sm off .It .Ar type Ns \&, Ns Ar path Ns Op Cm \&, Ns Ar tpm-device-options .Sm on .El .Pp Emulate a TPM device. Supported options for .Ar type : .Bl -tag -width 10n .It Cm passthru Use a physical TPM device. The argument .Ar path needs to point to a valid TPM device path, e.g. .Pa /dev/tpm0 . .It Cm swtpm Connect to a running .Cm swtpm instance. The argument .Ar path needs to point to a UNIX domain socket that a .Cm swtpm process is listening on. .El .Pp The .Ar tpm-device-options are: .Bl -tag -width 10n .It Cm version= Ns Ar version Version of the TPM device according to the TCG specification.
Defaults to .Cm 2.0 , which is the only version currently supported. .El .Ss Boot ROM device backends .Sm off .Bl -bullet .It .Ar romfile Ns Op Cm \&, Ns Ar varfile .El .Sm on .Pp Map .Ar romfile in the guest address space reserved for boot firmware. .Pp If .Ar varfile is provided, that file is also mapped in the boot firmware guest address space, and any modifications the guest makes will be saved to that file. .Pp Fwcfg types: .Bl -tag -width 10n .It Ar fwcfg The fwcfg interface is used to pass information such as the CPU count or ACPI tables to the guest firmware. Supported values are .Ql bhyve and .Ql qemu . For backward compatibility, .Ql bhyve is the default option. When .Ql bhyve is used, bhyve's fwctl interface is used. It currently reports only the CPU count to the guest firmware. The .Ql qemu option uses QEMU's fwcfg interface. This interface is widely used and allows user-defined information to be passed to the guest. It is used to pass the CPU count, ACPI tables, the boot order, and other items to the guest. Some operating systems, such as Fedora CoreOS, can be configured via QEMU's fwcfg interface as well. .Ss Pass-through device backends .Sm off .Bl -bullet .It .Cm ppt Ar N Oo , Ar passthru-device-options Oc .It .Ns Ar bus Cm \&/ Ar slot Cm \&/ Ar function .Op , Ar passthru-device-options .It .Cm pci Ar bus Cm : Ar slot Cm : Ns Ar function .Op , Ar passthru-device-options .El .Sm on .Pp Connect to a PCI device on the host either named ppt .Ns Ar N or at the selector described by the .Ar bus , .Ar slot , and .Ar function numbers. .Pp The .Ar passthru-device-options are: .Bl -tag -width 10n .It Cm rom= Ns Ar romfile Add .Ar romfile as option ROM to the PCI device. The ROM will be loaded by firmware and should be capable of initializing the device. .It Li bootindex= Ns Ar index Add the device to the bootorder at .Ar index . A fwcfg file is used to specify the bootorder. The guest firmware may not support this fwcfg file or may ignore it; in that case, this feature will not work as expected. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdevs loader variable as described in .Xr vmm 4 . .Ss Virtio console device backends .Bl -bullet .Sm off .It .Cm port1= Ns Ar /path/to/port1.sock Ns Op Cm ,port Ns Ar N Cm \&= Ns Ar /path/to/port2.sock No \~ Ar ... .Sm on .El .Pp A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet .It Due to the lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It There is currently no way to use the .Dq console port feature, nor console port resizing. .It Emergency write is advertised, but is a no-op at present. .El .Ss Virtio input device backends .Bl -bullet .Sm off .It .Ar /dev/input/eventX .Sm on .El .Pp Send input events of .Ar /dev/input/eventX to the guest via the VirtIO input interface. .Ss Framebuffer device backends .Bl -bullet .Sm off .It .Op Cm rfb= Ar ip-and-port .Op Cm ,w= Ar width .Op Cm ,h= Ar height .Op Cm ,vga= Ar vgaconf .Op Cm ,wait .Op Cm ,password= Ar password .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm rfb= Ns Ar ip-and-port Pq or Cm tcp= Ns Ar ip-and-port An IP address and a port VNC should listen on.
There are two formats: .Pp .Bl -bullet -compact .It .Sm off .Op Ar IPv4 Cm \&: .Ar port .Sm on .It .Sm off .Cm \&[ Ar IPv6%zone Cm \&] Cm \&: Ar port .Sm on .El .Pp The default is to listen on the localhost IPv4 address using the default VNC port 5900. An IPv6 address must be enclosed in square brackets and may contain an optional zone identifier. .It Cm w= Ns Ar width No and Cm h= Ns Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. The minimum supported resolution is 640x480 pixels and the maximum is 3840x2160 pixels. .It Cm vga= Ns Ar vgaconf Possible values for this option are .Cm io (default), .Cm on , and .Cm off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 . The default .Cm io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Cm on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Cm off option should be used for UEFI guests that assume a VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. .It Cm wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It Cm password= Ns Ar password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .Ss xHCI USB device backends .Bl -bullet .Sm off .It .Ar tablet .Sm on .El .Pp A USB tablet device that provides precise cursor synchronization when using VNC. .Ss NVMe device backends .Bl -bullet .Sm off .It .Ar devpath .Op Cm ,maxq= Ar # .Op Cm ,qsz= Ar # .Op Cm ,ioslots= Ar # .Op Cm ,sectsz= Ar # .Op Cm ,ser= Ar # .Op Cm ,eui64= Ar # .Op Cm ,dsm= Ar opt .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Ar devpath Accepted device paths are: .Ar /dev/blockdev or .Ar /path/to/image or .Cm ram= Ns Ar size_in_MiB . .It Cm maxq Max number of queues. .It Cm qsz Max elements in each queue. .It Cm ioslots Max number of concurrent I/O requests. .It Cm sectsz Sector size (defaults to blockif sector size). .It Cm ser Serial number with maximum 20 characters. .It Cm eui64 IEEE Extended Unique Identifier (8 byte value). .It Cm dsm DataSet Management support. Supported values are: .Cm auto , enable , and .Cm disable . .El .Ss AHCI device backends .Bl -bullet .It .Sm off .Op Oo Cm hd\&: | cd\&: Oc Ar path .Op Cm ,nmrr= Ar nmrr .Op Cm ,ser= Ar # .Op Cm ,rev= Ar # .Op Cm ,model= Ar # .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm nmrr Nominal Media Rotation Rate, also known as RPM. A value of 1 identifies the device as a Solid State Disk. The default value is 0, meaning the rate is not reported. .It Cm ser Serial Number with maximum 20 characters. .It Cm rev Revision Number with maximum 8 characters.
.It Cm model Model Number with maximum 40 characters. .El .Ss HD Audio device backends .Bl -bullet .It .Sm off .Op Cm play= Ar playback .Op Cm ,rec= Ar recording .Sm on .El .Pp Configuration options are defined as follows: .Bl -tag -width 10n .It Cm play Playback device, typically .Ar /dev/dsp0 . .It Cm rec Recording device, typically .Ar /dev/dsp0 . .El .Sh CONFIGURATION VARIABLES .Nm uses an internal tree of configuration variables to describe global and per-device settings. When .Nm starts, it parses command line options (including config files) in the order given on the command line. Each command line option sets one or more configuration variables. For example, the .Fl s option creates a new tree node for a PCI device and sets one or more variables under that node, including the device model and device model-specific variables. Variables may be set multiple times during this parsing stage, with the final value overriding previous values. .Pp Once all of the command line options have been processed, the configuration values are frozen. .Nm then uses the values of these configuration variables to initialize device models and global settings. .Pp More details on configuration variables can be found in .Xr bhyve_config 5 . .Sh CONFIGURATION FILE CREATION The .Fl k flag allows one to provide the path to a configuration file holding all settings which would otherwise need to be given as a long list of program arguments to .Nm . .Pp There is a very simple way to translate a complex set of program arguments into an equivalent configuration file in .Xr bhyve_config 5 format. .Pp Use .Fl o .Ar config.dump=1 to make .Nm dump a configuration file representing the used flags and arguments to stdout. You can pipe the output into a file to persist the generated settings. .Pp Make sure to remove the .Ar config.dump line from the resulting configuration file before using it to start .Nm . .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. .Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read and written by the debugger. Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp Breakpoints are supported on Intel CPUs that support single stepping. Note that continuing from a breakpoint while interrupts are enabled in the guest may not work as expected due to timer interrupts firing while single stepping over the breakpoint. .Sh SIGNAL HANDLING .Nm handles the following signals: .Pp .Bl -tag -width SIGTERM -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 triple fault .It 4 exited due to an error .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run; when a boot ROM is used, no separate boot loader is needed.
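.Pp For example, a guest using the .Pa /my/image disk image from the first example below could be prepared with: .Bd -literal -offset indent bhyveload -m 1G -d /my/image vm1 .Ed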
.Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -H -P -m 1G vm1 .Ed .Pp To do the same on arm64: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,virtio-blk,/my/image -o console=stdio \\ -o bootrom=/usr/local/share/u-boot/u-boot-bhyve-arm64/u-boot.bin -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device. .Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at 0.0.0.0:5900, or via serial console over TCP at 127.0.0.1:1234 (unsafe if the serial console is exposed without protection). .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,tcp=127.0.0.1:1234 \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VNC display that is bound to all IPv6 addresses on port 5900 and a serial I/O port bound to TCP port 1234 of the loopback address (unsafe if the serial console is exposed without protection). .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,tcp=[::1]:1234 \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VARS file to save EFI variables. Note that .Nm will write guest modifications to the given VARS file. Be sure to create a per-guest copy of the template VARS file from .Pa /usr . .Bd -literal -offset indent bhyve -c 2 -m 4g -w -H \\ -s 0,hostbridge \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI_CODE.fd,BHYVE_UEFI_VARS.fd uefivm .Ed .Pp To create a configuration file .Pa configfile for a virtual machine, use .Fl o .Ar config.dump=1 : .Bd -literal -offset indent /usr/sbin/bhyve -c 2 -m 256 -H -P \\ -s 0:0,hostbridge -s 1:0,virtio-net,tap0 \\ -s 2:0,ahci-hd,./vm0.img \\ -s 31,lpc -l com1,stdio \\ -o config.dump=1 vm0 > configfile .Ed .Pp Then use an editor of your choice to remove the line "config.dump=1" from the newly generated .Pa configfile .
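.Pp The generated .Pa configfile consists of one key=value assignment per line; lines starting with .Ql # are treated as comments. For the command above it will contain assignments along these lines (abridged and illustrative; see .Xr bhyve_config 5 for the authoritative variable names):
.Bd -literal -offset indent
name=vm0
cpus=2
memory.size=256
config.dump=1
pci.0.0.0.device=hostbridge
pci.0.1.0.device=virtio-net
pci.0.1.0.backend=tap0
.Ed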
.Pp To start .Nm using this configuration file, use flag .Fl k : .Bd -literal -offset indent /usr/sbin/bhyve -k configfile vm0 .Ed +.Pp +Run a UEFI virtual machine with four CPUs and two emulated NUMA domains: +.Bd -literal -offset indent +bhyve -c 4 -w -H \\ + -s 0,hostbridge \\ + -s 4,ahci-hd,disk.img \\ + -s 31,lpc -l com1,stdio \\ + -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ + -n id=0,size=4G,cpus=0-1 \\ + -n id=1,size=4G,cpus=2-3 \\ + numavm +.Ed +.Pp +Assuming a host machine with two NUMA domains, +run a UEFI virtual machine with two CPUs using a +.Ar prefer +.Xr domainset 9 +policy to allocate guest memory from the first host NUMA domain only. +.Bd -literal -offset indent +bhyve -c 2 -w -H \\ + -s 0,hostbridge \\ + -s 4,ahci-hd,disk.img \\ + -s 31,lpc -l com1,stdio \\ + -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ + -n id=0,size=4G,cpus=0-1,domain_policy=prefer:0 \\ + numavm +.Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr netgraph 4 , .Xr ng_socket 4 , .Xr nmdm 4 , .Xr vmm 4 , .Xr bhyve_config 5 , .Xr ethers 5 , .Xr bhyvectl 8 , -.Xr bhyveload 8 +.Xr bhyveload 8 , +.Xr domainset 9 .Pp .Rs .%A Intel .%B 64 and IA-32 Architectures Software Developer’s Manual .%V Volume 3 .Re .Sh HISTORY .Nm first appeared in .Fx 10.0 . .Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index be9cd1611700..9ead49582a7d 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,852 +1,1024 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include #ifndef WITHOUT_CAPSICUM #include #endif +#include +#include #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include +#include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif +#include #include #include "acpi.h" #include "bhyverun.h" #include "bootrom.h" #include "config.h" #include "debug.h" #ifdef BHYVE_GDB #include "gdb.h" #endif #include "mem.h" #include "mevent.h" #include "pci_emul.h" #ifdef __amd64__ #include "amd64/pci_lpc.h" #endif #include "qemu_fwcfg.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "tpm_device.h" #include "vmgenc.h" #include "vmexit.h" #define MB (1024UL * 1024) #define GB (1024UL * MB) int guest_ncpus; uint16_t cpu_cores, cpu_sockets, cpu_threads; int raw_stdio = 0; #ifdef BHYVE_SNAPSHOT char *restore_file; #endif static const int BSP = 0; static cpuset_t cpumask; +static struct vm_mem_domain guest_domains[VM_MAXMEMDOM]; +static int guest_ndomains = 0; + static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); static struct vcpu_info { struct vmctx *ctx; struct vcpu *vcpu; int vcpuid; } *vcpu_info; static cpuset_t **vcpumap; /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. */ int bhyve_topology_parse(const char *opt) { char *cp, *str, *tofree; if (*opt == '\0') { set_config_value("sockets", "1"); set_config_value("cores", "1"); set_config_value("threads", "1"); set_config_value("cpus", "1"); return (0); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) set_config_value("cpus", cp + strlen("cpus=")); else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) set_config_value("sockets", cp + strlen("sockets=")); else if (strncmp(cp, "cores=", strlen("cores=")) == 0) set_config_value("cores", cp + strlen("cores=")); else if (strncmp(cp, "threads=", strlen("threads=")) == 0) set_config_value("threads", cp + strlen("threads=")); else if (strchr(cp, '=') != NULL) goto out; else set_config_value("cpus", cp); } free(tofree); return (0); out: free(tofree); return (-1); } static int parse_int_value(const char *key, const char *value, int minval, int maxval) { char *cp; long lval; errno = 0; lval = strtol(value, &cp, 0); if (errno != 0 || *cp != '\0' || cp == value || lval < minval || lval > maxval) errx(4, "Invalid value for %s: '%s'", key, value); return (lval); } +int +bhyve_numa_parse(const char *opt) +{ + int id = -1; + nvlist_t *nvl; + char *cp, *str, *tofree; + char pathbuf[64] = { 0 }; + char *size = NULL, *cpus = NULL, *domain_policy = NULL; + + if (*opt == '\0') { + return (-1); + } + + tofree = str = strdup(opt); + if (str == NULL) + errx(4, "Failed to allocate memory"); + + while ((cp = strsep(&str, ",")) != NULL) { + if (strncmp(cp, "id=", strlen("id=")) == 0) + id = parse_int_value("id", cp + strlen("id="), 0, + UINT8_MAX); + else if (strncmp(cp, "size=", strlen("size=")) == 0) + size = cp + strlen("size="); + else if (strncmp(cp, + "domain_policy=", strlen("domain_policy=")) == 0) + domain_policy = cp + 
strlen("domain_policy="); + else if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) + cpus = cp + strlen("cpus="); + } + + if (id == -1) { + EPRINTLN("Missing NUMA domain ID in '%s'", opt); + goto out; + } + + snprintf(pathbuf, sizeof(pathbuf), "domains.%d", id); + nvl = find_config_node(pathbuf); + if (nvl == NULL) + nvl = create_config_node(pathbuf); + if (size != NULL) + set_config_value_node(nvl, "size", size); + if (domain_policy != NULL) + set_config_value_node(nvl, "domain_policy", domain_policy); + if (cpus != NULL) + set_config_value_node(nvl, "cpus", cpus); + + free(tofree); + return (0); + +out: + free(tofree); + return (-1); +} + +static void +calc_mem_affinity(size_t vm_memsize) +{ + int i; + nvlist_t *nvl; + bool need_recalc; + const char *value; + struct vm_mem_domain *dom; + char pathbuf[64] = { 0 }; + + need_recalc = false; + for (i = 0; i < VM_MAXMEMDOM; i++) { + dom = &guest_domains[i]; + snprintf(pathbuf, sizeof(pathbuf), "domains.%d", i); + nvl = find_config_node(pathbuf); + if (nvl == NULL) { + break; + } + + value = get_config_value_node(nvl, "size"); + need_recalc |= value == NULL; + if (value != NULL && vm_parse_memsize(value, &dom->size)) { + errx(EX_USAGE, "invalid memsize for domain %d: '%s'", i, + value); + } + + dom->ds_mask = calloc(1, sizeof(domainset_t)); + if (dom->ds_mask == NULL) { + errx(EX_OSERR, "Failed to allocate domainset mask"); + } + dom->ds_size = sizeof(domainset_t); + value = get_config_value_node(nvl, "domain_policy"); + if (value == NULL) { + dom->ds_policy = DOMAINSET_POLICY_INVALID; + DOMAINSET_ZERO(dom->ds_mask); + } else if (domainset_parselist(value, dom->ds_mask, &dom->ds_policy) != + CPUSET_PARSE_OK) { + errx(EX_USAGE, "failed to parse domain policy '%s'", value); + } + } + + guest_ndomains = i; + if (guest_ndomains == 0) { + /* + * No domains were specified - create domain + * 0 holding all CPUs and memory. + */ + guest_ndomains = 1; + guest_domains[0].size = vm_memsize; + } else if (need_recalc) { + warnx("At least one domain memory size was not specified, distributing" + " total VM memory size across all domains"); + for (i = 0; i < guest_ndomains; i++) { + guest_domains[i].size = vm_memsize / guest_ndomains; + } + } +} + /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. * * The limits of UINT16_MAX are due to the types passed to * vm_set_topology(). vmm.ko may enforce tighter limits. */ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. 
*/ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } int bhyve_pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", value != NULL ? "," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } +static void +set_vcpu_affinities(void) +{ + int cpu, error; + nvlist_t *nvl = NULL; + cpuset_t cpus; + const char *value; + char pathbuf[64] = { 0 }; + + for (int dom = 0; dom < guest_ndomains; dom++) { + snprintf(pathbuf, sizeof(pathbuf), "domains.%d", dom); + nvl = find_config_node(pathbuf); + if (nvl == NULL) + break; + + value = get_config_value_node(nvl, "cpus"); + if (value == NULL) { + EPRINTLN("Missing CPU set for domain %d", dom); + exit(4); + } + + parse_cpuset(dom, value, &cpus); + CPU_FOREACH_ISSET(cpu, &cpus) { + error = acpi_add_vcpu_affinity(cpu, dom); + if (error) { + EPRINTLN( + "Unable to set vCPU %d affinity for domain %d: %s", + cpu, dom, strerror(errno)); + exit(4); + } + } + } + if (guest_ndomains > 1 || nvl != NULL) + return; + + /* + * If we're dealing with one domain and no cpuset was provided, create a + * default one holding all cpus. 
+ */ + for (cpu = 0; cpu < guest_ncpus; cpu++) { + error = acpi_add_vcpu_affinity(cpu, 0); + if (error) { + EPRINTLN( + "Unable to set vCPU %d affinity for domain %d: %s", + cpu, 0, strerror(errno)); + exit(4); + } + } +} + void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { return (get_config_bool_default("virtio_msix", true)); } struct vcpu * fbsdrun_vcpu(int vcpuid) { return (vcpu_info[vcpuid].vcpu); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct vcpu_info *vi = param; int error; snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); pthread_set_name_np(pthread_self(), tname); if (vcpumap[vi->vcpuid] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vi->vcpuid]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vi->vcpuid); #endif #ifdef BHYVE_GDB gdb_cpu_add(vi->vcpu); #endif vm_loop(vi->ctx, vi->vcpu); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(int vcpuid) { struct vcpu_info *vi; pthread_t thr; int error; vi = &vcpu_info[vcpuid]; error = vm_activate_cpu(vi->vcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); CPU_SET_ATOMIC(vcpuid, &cpumask); error = vm_suspend_cpu(vi->vcpu); assert(error == 0); error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); assert(error == 0); } void fbsdrun_deletecpu(int vcpu) { static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; pthread_mutex_lock(&resetcpu_mtx); if (!CPU_ISSET(vcpu, &cpumask)) { EPRINTLN("Attempting to delete unknown cpu %d", vcpu); exit(4); } CPU_CLR(vcpu, &cpumask); if (vcpu != BSP) { pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); /* NOTREACHED */ } while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); } int fbsdrun_suspendcpu(int vcpuid) { return (vm_suspend_cpu(vcpu_info[vcpuid].vcpu)); } static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu) { struct vm_exit vme; struct vm_run vmrun; int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus, dmask; error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); vmrun.vm_exit = &vme; vmrun.cpuset = &dmask; vmrun.cpusetsize = sizeof(dmask); while (1) { error = vm_run(vcpu, &vmrun); if (error != 0) break; exitcode = vme.exitcode; if (exitcode >= VM_EXITCODE_MAX || vmexit_handlers[exitcode] == NULL) { warnx("vm_loop: unexpected exitcode 0x%x", exitcode); exit(4); } rc = (*vmexit_handlers[exitcode])(ctx, vcpu, &vmrun); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } EPRINTLN("vm_run error %d, errno %d", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. 
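* (Starting application processors requires real-mode execution, which is only possible with unrestricted-guest support; without it, the VM is limited to a single vCPU.)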
*/ error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool romboot; romboot = bootrom_boot(); /* * If we don't have a boot ROM, the guest context must have been * initialized by bhyveload(8) or equivalent. */ ctx = vm_openf(vmname, romboot ? VMMAPI_OPEN_REINIT : 0); if (ctx == NULL) { if (errno != ENOENT) err(4, "vm_openf"); if (!romboot) errx(4, "no bootrom was configured"); ctx = vm_openf(vmname, VMMAPI_OPEN_CREATE); if (ctx == NULL) err(4, "vm_openf"); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } bool bhyve_parse_config_option(const char *option) { const char *value; char *path; value = strchr(option, '='); if (value == NULL || value[1] == '\0') return (false); path = strndup(option, value - option); if (path == NULL) err(4, "Failed to allocate memory"); set_config_value(path, value + 1); free(path); return (true); } void bhyve_parse_simple_config_file(const char *path) { FILE *fp; char *line, *cp; size_t linecap; unsigned int lineno; fp = fopen(path, "r"); if (fp == NULL) err(4, "Failed to open configuration file %s", path); line = NULL; linecap = 0; lineno = 1; for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { if (*line == '#' || *line == '\n') continue; cp = strchr(line, '\n'); if (cp != NULL) *cp = '\0'; if (!bhyve_parse_config_option(line)) errx(4, "%s line %u: invalid config option '%s'", path, lineno, line); } free(line); fclose(fp); } #ifdef BHYVE_GDB void bhyve_parse_gdb_options(const char *opt) { const char *sport; char *colon; if (opt[0] == 'w') { set_config_bool("gdb.wait", true); opt++; } colon = strrchr(opt, ':'); if (colon == NULL) { sport = opt; } else { *colon = '\0'; colon++; sport = colon; set_config_value("gdb.address", opt); } set_config_value("gdb.port", sport); } #endif int main(int argc, char *argv[]) { int error; int max_vcpus, memflags; struct vcpu *bsp; struct vmctx *ctx; size_t memsize; const char *value, *vmname; #ifdef BHYVE_SNAPSHOT struct restore_state rstate; #endif bhyve_init_config(); bhyve_optparse(argc, argv); argc -= optind; argv += optind; if (argc > 1) bhyve_usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) bhyve_usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif bsp = vm_vcpu_open(ctx, BSP); max_vcpus = num_vcpus_allowed(ctx, bsp); if (guest_ncpus > 
max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } bhyve_init_vcpu(bsp); /* Allocate per-VCPU resources. */ vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { vcpu_info[vcpuid].ctx = ctx; vcpu_info[vcpuid].vcpuid = vcpuid; if (vcpuid == BSP) vcpu_info[vcpuid].vcpu = bsp; else vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); } + calc_mem_affinity(memsize); memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); - error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + error = vm_setup_memory_domains(ctx, VM_MMAP_ALL, guest_domains, + guest_ndomains); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } + set_vcpu_affinities(); init_mem(guest_ncpus); init_bootrom(ctx); if (bhyve_init_platform(ctx, bsp) != 0) exit(4); if (qemu_fwcfg_init(ctx) != 0) { fprintf(stderr, "qemu fwcfg initialization error\n"); exit(4); } if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), &guest_ncpus) != 0) { fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu\n"); exit(4); } /* * Exit if a device emulation finds an error in its initialization */ if (init_pci(ctx) != 0) { EPRINTLN("Device emulation initialization error: %s", strerror(errno)); exit(4); } if (init_tpm(ctx) != 0) { EPRINTLN("Failed to init TPM device"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); #ifdef BHYVE_GDB init_gdb(ctx); #endif /* * Add all vCPUs. */ for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) bhyve_start_vcpu(vcpu_info[vcpuid].vcpu, vcpuid == BSP); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { FPRINTLN(stdout, "Pausing pci devs..."); if (vm_pause_devices() != 0) { EPRINTLN("Failed to pause PCI device state."); exit(1); } FPRINTLN(stdout, "Restoring vm mem..."); if (restore_vm_mem(ctx, &rstate) != 0) { EPRINTLN("Failed to restore VM memory."); exit(1); } FPRINTLN(stdout, "Restoring pci devs..."); if (vm_restore_devices(&rstate) != 0) { EPRINTLN("Failed to restore PCI device state."); exit(1); } FPRINTLN(stdout, "Restoring kernel structs..."); if (vm_restore_kern_structs(ctx, &rstate) != 0) { EPRINTLN("Failed to restore kernel structs."); exit(1); } FPRINTLN(stdout, "Resuming pci devs..."); if (vm_resume_devices() != 0) { EPRINTLN("Failed to resume PCI device state."); exit(1); } } #endif if (bhyve_init_platform_late(ctx, bsp) != 0) exit(4); /* * Change the proc title to include the VM name. 
*/ setproctitle("%s", vmname); #ifdef BHYVE_SNAPSHOT /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) != 0) errx(EX_OSERR, "Failed to start checkpoint thread"); #endif #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { destroy_restore_state(&rstate); if (vm_restore_time(ctx) < 0) err(EX_OSERR, "Unable to restore time"); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) vm_resume_cpu(vcpu_info[vcpuid].vcpu); } else #endif vm_resume_cpu(bsp); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h index 005de6dc5410..0a7bbd72a19c 100644 --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -1,82 +1,83 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _BHYVERUN_H_ #define _BHYVERUN_H_ #include #define VMEXIT_CONTINUE (0) #define VMEXIT_ABORT (-1) extern int guest_ncpus; extern uint16_t cpu_cores, cpu_sockets, cpu_threads; #ifdef BHYVE_SNAPSHOT extern char *restore_file; #endif struct vcpu; struct vmctx; struct vm_run; void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr); #endif struct vcpu; struct vcpu *fbsdrun_vcpu(int vcpuid); void fbsdrun_addcpu(int vcpuid); void fbsdrun_deletecpu(int vcpuid); int fbsdrun_suspendcpu(int vcpuid); int fbsdrun_virtio_msix(void); typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_run *); /* Interfaces implemented by machine-dependent code. */ void bhyve_init_config(void); void bhyve_optparse(int argc, char **argv); void bhyve_usage(int code); /* Interfaces used by command-line option-parsing code. 
*/ bool bhyve_parse_config_option(const char *option); void bhyve_parse_simple_config_file(const char *path); #ifdef BHYVE_GDB void bhyve_parse_gdb_options(const char *opt); #endif int bhyve_pincpu_parse(const char *opt); int bhyve_topology_parse(const char *opt); +int bhyve_numa_parse(const char *opt); void bhyve_init_vcpu(struct vcpu *vcpu); void bhyve_start_vcpu(struct vcpu *vcpu, bool bsp); int bhyve_init_platform(struct vmctx *ctx, struct vcpu *bsp); int bhyve_init_platform_late(struct vmctx *ctx, struct vcpu *bsp); #endif diff --git a/usr.sbin/bhyve/bootrom.c b/usr.sbin/bhyve/bootrom.c index e4adaca55947..339974cb2017 100644 --- a/usr.sbin/bhyve/bootrom.c +++ b/usr.sbin/bhyve/bootrom.c @@ -1,325 +1,326 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "bootrom.h" #include "debug.h" #include "mem.h" #define BOOTROM_SIZE (16 * 1024 * 1024) /* 16 MB */ /* * ROM region is 16 MB at the top of 4GB ("low") memory. * * The size is limited so it doesn't encroach into reserved MMIO space (e.g., * APIC, HPET, MSI). * * It is allocated in page-multiple blocks on a first-come first-serve basis, * from high to low, during initialization, and does not change at runtime. */ static char *romptr; /* Pointer to userspace-mapped bootrom region. */ static vm_paddr_t gpa_base; /* GPA of low end of region. */ static vm_paddr_t gpa_allocbot; /* Low GPA of free region. */ static vm_paddr_t gpa_alloctop; /* High GPA, minus 1, of free region. */ #define CFI_BCS_WRITE_BYTE 0x10 #define CFI_BCS_CLEAR_STATUS 0x50 #define CFI_BCS_READ_STATUS 0x70 #define CFI_BCS_READ_ARRAY 0xff static struct bootrom_var_state { uint8_t *mmap; uint64_t gpa; off_t size; uint8_t cmd; } var = { NULL, 0, 0, CFI_BCS_READ_ARRAY }; /* * Emulate just those CFI basic commands that will convince EDK II * that the Firmware Volume area is writable and persistent. 
*/ static int bootrom_var_mem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int size, uint64_t *val, void *arg1 __unused, long arg2 __unused) { off_t offset; offset = addr - var.gpa; if (offset + size > var.size || offset < 0 || offset + size <= offset) return (EINVAL); if (dir == MEM_F_WRITE) { switch (var.cmd) { case CFI_BCS_WRITE_BYTE: memcpy(var.mmap + offset, val, size); var.cmd = CFI_BCS_READ_ARRAY; break; default: var.cmd = *(uint8_t *)val; } } else { switch (var.cmd) { case CFI_BCS_CLEAR_STATUS: case CFI_BCS_READ_STATUS: memset(val, 0, size); var.cmd = CFI_BCS_READ_ARRAY; break; default: memcpy(val, var.mmap + offset, size); break; } } return (0); } void init_bootrom(struct vmctx *ctx) { vm_paddr_t highmem; romptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", BOOTROM_SIZE); if (romptr == MAP_FAILED) err(4, "%s: vm_create_devmem", __func__); highmem = vm_get_highmem_base(ctx); gpa_base = highmem - BOOTROM_SIZE; gpa_allocbot = gpa_base; gpa_alloctop = highmem - 1; } int bootrom_alloc(struct vmctx *ctx, size_t len, int prot, int flags, char **region_out, uint64_t *gpa_out) { static const int bootrom_valid_flags = BOOTROM_ALLOC_TOP; vm_paddr_t gpa; vm_ooffset_t segoff; if (flags & ~bootrom_valid_flags) { warnx("%s: Invalid flags: %x", __func__, flags & ~bootrom_valid_flags); return (EINVAL); } if (prot & ~_PROT_ALL) { warnx("%s: Invalid protection: %x", __func__, prot & ~_PROT_ALL); return (EINVAL); } if (len == 0 || len > BOOTROM_SIZE) { warnx("ROM size %zu is invalid", len); return (EINVAL); } if (len & PAGE_MASK) { warnx("ROM size %zu is not a multiple of the page size", len); return (EINVAL); } if (flags & BOOTROM_ALLOC_TOP) { gpa = (gpa_alloctop - len) + 1; if (gpa < gpa_allocbot) { warnx("No room for %zu ROM in bootrom region", len); return (ENOMEM); } } else { gpa = gpa_allocbot; if (gpa > (gpa_alloctop - len) + 1) { warnx("No room for %zu ROM in bootrom region", len); return (ENOMEM); } } segoff = gpa - gpa_base; if (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, segoff, len, prot) != 0) { int serrno = errno; warn("%s: vm_mmap_mapseg", __func__); return (serrno); } if (flags & BOOTROM_ALLOC_TOP) gpa_alloctop = gpa - 1; else gpa_allocbot = gpa + len; *region_out = romptr + segoff; if (gpa_out != NULL) *gpa_out = gpa; return (0); } int bootrom_loadrom(struct vmctx *ctx) { struct stat sbuf; ssize_t rlen; off_t rom_size, var_size, total_size; char *ptr, *romfile; int fd, varfd, i, rv; const char *bootrom, *varfile; rv = -1; varfd = -1; bootrom = get_config_value("bootrom"); if (bootrom == NULL) { return (0); } /* * get_config_value_node may use a thread local buffer to return * variables. So, when we query the second variable, the first variable * might get overwritten. For that reason, the bootrom should be * duplicated. 
*/ romfile = strdup(bootrom); if (romfile == NULL) { return (-1); } fd = open(romfile, O_RDONLY); if (fd < 0) { EPRINTLN("Error opening bootrom \"%s\": %s", romfile, strerror(errno)); goto done; } if (fstat(fd, &sbuf) < 0) { EPRINTLN("Could not fstat bootrom file \"%s\": %s", romfile, strerror(errno)); goto done; } rom_size = sbuf.st_size; varfile = get_config_value("bootvars"); var_size = 0; if (varfile != NULL) { varfd = open(varfile, O_RDWR); if (varfd < 0) { EPRINTLN("Error opening bootrom variable file " "\"%s\": %s", varfile, strerror(errno)); goto done; } if (fstat(varfd, &sbuf) < 0) { EPRINTLN( "Could not fstat bootrom variable file \"%s\": %s", varfile, strerror(errno)); goto done; } var_size = sbuf.st_size; } if (var_size > BOOTROM_SIZE || (var_size != 0 && var_size < PAGE_SIZE)) { EPRINTLN("Invalid bootrom variable size %ld", var_size); goto done; } total_size = rom_size + var_size; if (total_size > BOOTROM_SIZE) { EPRINTLN("Invalid bootrom and variable aggregate size %ld", total_size); goto done; } /* Map the bootrom into the guest address space */ if (bootrom_alloc(ctx, rom_size, PROT_READ | PROT_EXEC, BOOTROM_ALLOC_TOP, &ptr, NULL) != 0) { goto done; } /* Read 'romfile' into the guest address space */ for (i = 0; i < rom_size / PAGE_SIZE; i++) { rlen = read(fd, ptr + i * PAGE_SIZE, PAGE_SIZE); if (rlen != PAGE_SIZE) { EPRINTLN("Incomplete read of page %d of bootrom " "file %s: %ld bytes", i, romfile, rlen); goto done; } } if (varfd >= 0) { var.mmap = mmap(NULL, var_size, PROT_READ | PROT_WRITE, MAP_SHARED, varfd, 0); if (var.mmap == MAP_FAILED) goto done; var.size = var_size; var.gpa = (gpa_alloctop - var_size) + 1; gpa_alloctop = var.gpa - 1; rv = register_mem(&(struct mem_range){ .name = "bootrom variable", .flags = MEM_F_RW, .handler = bootrom_var_mem_handler, .base = var.gpa, .size = var.size, }); if (rv != 0) goto done; } rv = 0; done: if (varfd >= 0) close(varfd); if (fd >= 0) close(fd); free(romfile); return (rv); } /* * Are we relying on a bootrom to initialize the guest's CPU context? */ bool bootrom_boot(void) { return (get_config_value("bootrom") != NULL); } diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 2f04a488d9c1..9d6060e3e254 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -1,2804 +1,2805 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include "acpi.h" #include "bhyverun.h" #include "bootrom.h" #include "config.h" #include "debug.h" #ifdef __amd64__ #include "amd64/inout.h" #endif #include "mem.h" #include "pci_emul.h" #ifdef __amd64__ #include "amd64/pci_lpc.h" #include "pci_passthru.h" #endif #include "qemu_fwcfg.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) #define GB (1024 * 1024 * 1024UL) struct funcinfo { nvlist_t *fi_config; struct pci_devemu *fi_pde; struct pci_devinst *fi_devi; }; struct intxinfo { int ii_count; struct pci_irq ii_irq; }; struct slotinfo { struct intxinfo si_intpins[4]; struct funcinfo si_funcs[MAXFUNCS]; }; struct businfo { uint16_t iobase, iolimit; /* I/O window */ uint32_t membase32, memlimit32; /* mmio window below 4GB */ uint64_t membase64, memlimit64; /* mmio window above 4GB */ struct slotinfo slotinfo[MAXSLOTS]; }; static struct businfo *pci_businfo[MAXBUSES]; SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; static uint8_t *pci_emul_rombase; static uint64_t pci_emul_romoffset; static uint8_t *pci_emul_romlim; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; static uint64_t pci_emul_memlim64; struct pci_bar_allocation { TAILQ_ENTRY(pci_bar_allocation) chain; struct pci_devinst *pdi; int idx; enum pcibar_type type; uint64_t size; }; static TAILQ_HEAD(pci_bar_list, pci_bar_allocation) pci_bars = TAILQ_HEAD_INITIALIZER(pci_bars); struct boot_device { TAILQ_ENTRY(boot_device) boot_device_chain; struct pci_devinst *pdi; int bootindex; }; static TAILQ_HEAD(boot_list, boot_device) boot_devices = TAILQ_HEAD_INITIALIZER( boot_devices); #if defined(__amd64__) #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 #define PCI_EMUL_IOMASK 0xffff /* * OVMF always uses 0xc0000000 as base address for 32 bit PCI MMIO. Don't * change this address without changing it in OVMF. 
*/ #define PCI_EMUL_MEMBASE32 0xc0000000 #elif defined(__aarch64__) || defined(__riscv) #define PCI_EMUL_IOBASE 0xdf000000UL #define PCI_EMUL_IOLIMIT 0xe0000000UL #define PCI_EMUL_MEMBASE32 0xa0000000UL #else #error Unsupported platform #endif #define PCI_EMUL_ROMSIZE 0x10000000 #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ #ifdef __amd64__ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); #endif #define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMSIZE64 (32*GB) static void pci_lintr_route(struct pci_devinst *pi); static void pci_lintr_update(struct pci_devinst *pi); static struct pci_devemu *pci_emul_finddev(const char *name); static void pci_cfgrw(int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); static __inline void CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) { if (bytes == 1) pci_set_cfgdata8(pi, coff, val); else if (bytes == 2) pci_set_cfgdata16(pi, coff, val); else pci_set_cfgdata32(pi, coff, val); } static __inline uint32_t CFGREAD(struct pci_devinst *pi, int coff, int bytes) { if (bytes == 1) return (pci_get_cfgdata8(pi, coff)); else if (bytes == 2) return (pci_get_cfgdata16(pi, coff)); else return (pci_get_cfgdata32(pi, coff)); } static int is_pcir_bar(int coff) { return (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)); } static int is_pcir_bios(int coff) { return (coff >= PCIR_BIOS && coff < PCIR_BIOS + 4); } /* * I/O access */ /* * Slot options are in the form: * * <bus>:<slot>:<func>,<emul>[,<config>] * <slot>[:<func>],<emul>[,<config>] * * slot is 0..31 * func is 0..7 * emul is a string describing the type of PCI device e.g. virtio-net * config is an optional string, depending on the device, that can be * used for configuration. * Examples are: * 1,virtio-net,tap0 * 3:0,dummy */ static void pci_parse_slot_usage(char *aopt) { EPRINTLN("Invalid PCI slot info field \"%s\"", aopt); } /* * Helper function to parse a list of comma-separated options where * each option is formatted as "name[=value]". If no value is * provided, the option is treated as a boolean and is given a value * of true. */ int pci_parse_legacy_config(nvlist_t *nvl, const char *opt) { char *config, *name, *tofree, *value; if (opt == NULL) return (0); config = tofree = strdup(opt); while ((name = strsep(&config, ",")) != NULL) { value = strchr(name, '='); if (value != NULL) { *value = '\0'; value++; set_config_value_node(nvl, name, value); } else set_config_bool_node(nvl, name, true); } free(tofree); return (0); } /* * PCI device configuration is stored in MIBs that encode the device's * location: * * pci.<bus>.<slot>.<func> * * Where "bus", "slot", and "func" are all decimal values without * leading zeroes. Each valid device must have a "device" node which * identifies the driver model of the device. * * Device backends can provide a parser for the "config" string. If * a custom parser is not provided, pci_parse_legacy_config() is used * to parse the string.
*/ int pci_parse_slot(char *opt) { char node_name[sizeof("pci.XXX.XX.X")]; struct pci_devemu *pde; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; nvlist_t *nvl; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* <bus>:<slot>:<func> */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* <slot>:<func> */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* <slot> */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } pde = pci_emul_finddev(emul); if (pde == NULL) { EPRINTLN("pci slot %d:%d:%d: unknown device \"%s\"", bnum, snum, fnum, emul); goto done; } snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bnum, snum, fnum); nvl = find_config_node(node_name); if (nvl != NULL) { EPRINTLN("pci slot %d:%d:%d already occupied!", bnum, snum, fnum); goto done; } nvl = create_config_node(node_name); if (pde->pe_alias != NULL) set_config_value_node(nvl, "device", pde->pe_alias); else set_config_value_node(nvl, "device", pde->pe_emu); if (pde->pe_legacy_config != NULL) error = pde->pe_legacy_config(nvl, config); else error = pci_parse_legacy_config(nvl, config); done: free(str); return (error); } void pci_print_supported_devices(void) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; printf("%s\n", pdp->pe_emu); } } uint32_t pci_config_read_reg(const struct pcisel *const host_sel, nvlist_t *nvl, const uint32_t reg, const uint8_t size, const uint32_t def) { const char *config; const nvlist_t *pci_regs; assert(size == 1 || size == 2 || size == 4); pci_regs = find_relative_config_node(nvl, "pcireg"); if (pci_regs == NULL) { return def; } switch (reg) { case PCIR_DEVICE: config = get_config_value_node(pci_regs, "device"); break; case PCIR_VENDOR: config = get_config_value_node(pci_regs, "vendor"); break; case PCIR_REVID: config = get_config_value_node(pci_regs, "revid"); break; case PCIR_SUBVEND_0: config = get_config_value_node(pci_regs, "subvendor"); break; case PCIR_SUBDEV_0: config = get_config_value_node(pci_regs, "subdevice"); break; default: return (-1); } if (config == NULL) { return def; } else if (host_sel != NULL && strcmp(config, "host") == 0) { #ifdef __amd64__ return pci_host_read_config(host_sel, reg, size); #else errx(1, "cannot fetch host PCI configuration"); #endif } else { return strtol(config, NULL, 16); } } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t
pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accommodate reads from * ddb. */ if (size != 1 && size != 4 && size != 8) return (retval); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned reads */ if ((msix_entry_offset % size) != 0) { return (retval); } tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index < pi->pi_msix.table_count) { /* valid MSI-X Table access */ dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 1) retval = *((uint8_t *)dest); else if (size == 4) retval = *((uint32_t *)dest); else retval = *((uint64_t *)dest); } else if (pci_valid_pba_offset(pi, offset)) { /* return 0 for PBA access */ retval = 0; } return (retval); } int pci_msix_table_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.table_bar); else return (-1); } int pci_msix_pba_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.pba_bar); else return (-1); } #ifdef __amd64__ static int pci_emul_io_handler(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int i; assert(port >= 0); for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && (uint64_t)port >= pdi->pi_bar[i].addr && (uint64_t)port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) *eax = (*pe->pe_barread)(pdi, i, offset, bytes); else (*pe->pe_barwrite)(pdi, i, offset, bytes, *eax); return (0); } } return (-1); } #else static int pci_emul_iomem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int)arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_IO); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); assert(size == 1 || size == 2 || size == 4); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_READ) *val = (*pe->pe_barread)(pdi, bidx, offset, size); else (*pe->pe_barwrite)(pdi, bidx, offset, size, *val); return (0); } #endif /* !__amd64__ */ static int pci_emul_mem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int)arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || pdi->pi_bar[bidx].type == PCIBAR_MEM64); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_WRITE) { if (size == 8) { (*pe->pe_barwrite)(pdi, bidx, offset, 4, *val & 0xffffffff); (*pe->pe_barwrite)(pdi, bidx, offset + 4, 4, *val >> 32); } else { (*pe->pe_barwrite)(pdi, bidx, offset, size, *val); } } else { if (size == 8) { *val = (*pe->pe_barread)(pdi, bidx, offset, 4); *val |= (*pe->pe_barread)(pdi, bidx, offset + 4, 4) << 32; } else { *val = (*pe->pe_barread)(pdi, bidx, offset, size); } } return (0); } static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { uint64_t base; assert((size & (size - 
1)) == 0); /* must be a power of 2 */ base = roundup2(*baseptr, size); if (base + size <= limit) { *addr = base; *baseptr = base + size; return (0); } else return (-1); } /* * Register (or unregister) the MMIO or I/O region associated with the BAR * register 'idx' of an emulated pci device. */ static void modify_bar_registration(struct pci_devinst *pi, int idx, int registration) { struct pci_devemu *pe; int error; enum pcibar_type type; pe = pi->pi_d; type = pi->pi_bar[idx].type; switch (type) { case PCIBAR_IO: { #ifdef __amd64__ struct inout_port iop; bzero(&iop, sizeof(struct inout_port)); iop.name = pi->pi_name; iop.port = pi->pi_bar[idx].addr; iop.size = pi->pi_bar[idx].size; if (registration) { iop.flags = IOPORT_F_INOUT; iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); } else error = unregister_inout(&iop); #else struct mem_range mr; bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_iomem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); #endif break; } case PCIBAR_MEM32: case PCIBAR_MEM64: { struct mem_range mr; bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_mem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); break; } case PCIBAR_ROM: error = 0; break; default: error = EINVAL; break; } assert(error == 0); if (pe->pe_baraddr != NULL) (*pe->pe_baraddr)(pi, idx, registration, pi->pi_bar[idx].addr); } static void unregister_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 0); } static void register_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 1); } /* Is the ROM enabled for the emulated pci device? */ static int romen(struct pci_devinst *pi) { return (pi->pi_bar[PCI_ROM_IDX].lobits & PCIM_BIOS_ENABLE) == PCIM_BIOS_ENABLE; } /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_PORTEN); } /* Are we decoding memory accesses for the emulated pci device? */ static int memen(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_MEMEN); } /* * Update the MMIO or I/O address that is decoded by the BAR register. * * If the pci device has enabled the address space decoding then intercept * the address range decoded by the BAR register. 
 */
static void
update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type)
{
	int decode;

	if (pi->pi_bar[idx].type == PCIBAR_IO)
		decode = porten(pi);
	else
		decode = memen(pi);

	if (decode)
		unregister_bar(pi, idx);

	switch (type) {
	case PCIBAR_IO:
	case PCIBAR_MEM32:
		pi->pi_bar[idx].addr = addr;
		break;
	case PCIBAR_MEM64:
		pi->pi_bar[idx].addr &= ~0xffffffffUL;
		pi->pi_bar[idx].addr |= addr;
		break;
	case PCIBAR_MEMHI64:
		pi->pi_bar[idx].addr &= 0xffffffff;
		pi->pi_bar[idx].addr |= addr;
		break;
	default:
		assert(0);
	}

	if (decode)
		register_bar(pi, idx);
}

int
pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
    uint64_t size)
{
	assert((type == PCIBAR_ROM) || (idx >= 0 && idx <= PCI_BARMAX));
	assert((type != PCIBAR_ROM) || (idx == PCI_ROM_IDX));

	if ((size & (size - 1)) != 0)
		size = 1UL << flsl(size);	/* round up to a power of 2 */

	/* Enforce minimum BAR sizes required by the PCI standard */
	if (type == PCIBAR_IO) {
		if (size < 4)
			size = 4;
	} else if (type == PCIBAR_ROM) {
		if (size < ~PCIM_BIOS_ADDR_MASK + 1)
			size = ~PCIM_BIOS_ADDR_MASK + 1;
	} else {
		if (size < 16)
			size = 16;
	}

	/*
	 * To reduce fragmentation of the MMIO space, we allocate the BARs
	 * by size. Therefore, don't allocate the BAR yet. We create a list
	 * of all BAR allocations, sorted by BAR size. When all PCI devices
	 * are initialized, we will assign an address to the BARs.
	 */

	/* create a new list entry */
	struct pci_bar_allocation *const new_bar = malloc(sizeof(*new_bar));
	memset(new_bar, 0, sizeof(*new_bar));
	new_bar->pdi = pdi;
	new_bar->idx = idx;
	new_bar->type = type;
	new_bar->size = size;

	/*
	 * Search for a BAR whose size is smaller than the size of our
	 * newly allocated BAR.
	 */
	struct pci_bar_allocation *bar = NULL;
	TAILQ_FOREACH(bar, &pci_bars, chain) {
		if (bar->size < size) {
			break;
		}
	}

	if (bar == NULL) {
		/*
		 * Either the list is empty or the new BAR is the smallest
		 * BAR of the list. Append it to the end of our list.
		 */
		TAILQ_INSERT_TAIL(&pci_bars, new_bar, chain);
	} else {
		/*
		 * The found BAR is smaller than our new BAR. For that
		 * reason, insert our new BAR before the found BAR.
		 */
		TAILQ_INSERT_BEFORE(bar, new_bar, chain);
	}

	/*
	 * Enable PCI BARs only if we don't have a boot ROM, i.e., bhyveload
	 * was used to load the initial guest image. Otherwise, we rely on
	 * the boot ROM to handle this.
	 */
	if (!get_config_bool_default("pci.enable_bars", !bootrom_boot()))
		return (0);

	/*
	 * pci_passthru devices synchronize their physical and virtual
	 * command register on init. For that reason, the virtual cmd reg
	 * should be updated as early as possible.
	 */
	uint16_t enbit = 0;
	switch (type) {
	case PCIBAR_IO:
		enbit = PCIM_CMD_PORTEN;
		break;
	case PCIBAR_MEM64:
	case PCIBAR_MEM32:
		enbit = PCIM_CMD_MEMEN;
		break;
	default:
		enbit = 0;
		break;
	}

	const uint16_t cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND);
	pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit);

	return (0);
}

static int
pci_emul_assign_bar(struct pci_devinst *const pdi, const int idx,
    const enum pcibar_type type, const uint64_t size)
{
	int error;
	uint64_t *baseptr, limit, addr, mask, lobits, bar;

	switch (type) {
	case PCIBAR_NONE:
		baseptr = NULL;
		addr = mask = lobits = 0;
		break;
	case PCIBAR_IO:
		baseptr = &pci_emul_iobase;
		limit = PCI_EMUL_IOLIMIT;
		mask = PCIM_BAR_IO_BASE;
		lobits = PCIM_BAR_IO_SPACE;
		break;
	case PCIBAR_MEM64:
		/*
		 * XXX
		 * Some drivers do not work well if the 64-bit BAR is
		 * allocated above 4GB. Allow for this by allocating small
		 * requests under 4GB unless the allocation size is larger
		 * than some arbitrary number (128MB currently).
		 */
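		/*
		 * (Hypothetical sizes for illustration: a 64MB 64-bit BAR
		 * is still carved out of the 32-bit window below 4GB,
		 * while a 256MB one goes into the 64-bit window above the
		 * guest's high memory.)
		 */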
		if (size > 128 * 1024 * 1024) {
			baseptr = &pci_emul_membase64;
			limit = pci_emul_memlim64;
			mask = PCIM_BAR_MEM_BASE;
			lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
			    PCIM_BAR_MEM_PREFETCH;
		} else {
			baseptr = &pci_emul_membase32;
			limit = PCI_EMUL_MEMLIMIT32;
			mask = PCIM_BAR_MEM_BASE;
			lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64;
		}
		break;
	case PCIBAR_MEM32:
		baseptr = &pci_emul_membase32;
		limit = PCI_EMUL_MEMLIMIT32;
		mask = PCIM_BAR_MEM_BASE;
		lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
		break;
	case PCIBAR_ROM:
		/* do not claim memory for ROM. OVMF will do it for us. */
		baseptr = NULL;
		limit = 0;
		mask = PCIM_BIOS_ADDR_MASK;
		lobits = 0;
		break;
	default:
		printf("pci_emul_assign_bar: invalid bar type %d\n", type);
		assert(0);
	}

	if (baseptr != NULL) {
		error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
		if (error != 0)
			return (error);
	} else {
		addr = 0;
	}

	pdi->pi_bar[idx].type = type;
	pdi->pi_bar[idx].addr = addr;
	pdi->pi_bar[idx].size = size;

	/*
	 * Passthru devices use the same lobits as the physical device;
	 * they set this property during init.
	 */
	if (pdi->pi_bar[idx].lobits != 0) {
		lobits = pdi->pi_bar[idx].lobits;
	} else {
		pdi->pi_bar[idx].lobits = lobits;
	}

	/* Initialize the BAR register in config space */
	bar = (addr & mask) | lobits;
	pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);

	if (type == PCIBAR_MEM64) {
		assert(idx + 1 <= PCI_BARMAX);
		pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
		pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
	}

	switch (type) {
	case PCIBAR_IO:
		if (porten(pdi))
			register_bar(pdi, idx);
		break;
	case PCIBAR_MEM32:
	case PCIBAR_MEM64:
	case PCIBAR_MEMHI64:
		if (memen(pdi))
			register_bar(pdi, idx);
		break;
	default:
		break;
	}

	return (0);
}

int
pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size,
    void **const addr)
{
	/* allocate ROM space once on first call */
	if (pci_emul_rombase == 0) {
		pci_emul_rombase = vm_create_devmem(pdi->pi_vmctx, VM_PCIROM,
		    "pcirom", PCI_EMUL_ROMSIZE);
		if (pci_emul_rombase == MAP_FAILED) {
			warnx("%s: failed to create rom segment", __func__);
			return (-1);
		}
		pci_emul_romlim = pci_emul_rombase + PCI_EMUL_ROMSIZE;
		pci_emul_romoffset = 0;
	}

	/* ROM size must be a power of 2 and at least 2 KB */
	const uint64_t rom_size = MAX(1UL << flsl(size),
	    ~PCIM_BIOS_ADDR_MASK + 1);

	/* check if ROM fits into ROM space */
	if (pci_emul_romoffset + rom_size > PCI_EMUL_ROMSIZE) {
		warnx("%s: no space left in rom segment:", __func__);
		warnx("%16lu bytes left",
		    PCI_EMUL_ROMSIZE - pci_emul_romoffset);
		warnx("%16lu bytes required by %d/%d/%d", rom_size,
		    pdi->pi_bus, pdi->pi_slot, pdi->pi_func);
		return (-1);
	}

	/* allocate ROM BAR */
	const int error = pci_emul_alloc_bar(pdi, PCI_ROM_IDX, PCIBAR_ROM,
	    rom_size);
	if (error)
		return (error);

	/* return address */
	*addr = pci_emul_rombase + pci_emul_romoffset;

	/* save offset into ROM Space */
	pdi->pi_romoffset = pci_emul_romoffset;

	/* increase offset for next ROM */
	pci_emul_romoffset += rom_size;

	return (0);
}

int
pci_emul_add_boot_device(struct pci_devinst *pi, int bootindex)
{
	struct boot_device *new_device, *device;

	/* don't permit a negative bootindex */
	if (bootindex < 0) {
		errx(4, "Invalid bootindex %d for %s", bootindex, pi->pi_name);
	}

	/* alloc new boot device */
	new_device = calloc(1, sizeof(struct boot_device));
	if (new_device == NULL) {
		return (ENOMEM);
	}
	new_device->pdi = pi;
	new_device->bootindex = bootindex;

	/* search for boot device with higher boot index */
	TAILQ_FOREACH(device, &boot_devices, boot_device_chain) {
		if (device->bootindex == bootindex) {
			errx(4, "Could not set bootindex %d for %s.
Bootindex already occupied by %s", bootindex, pi->pi_name, device->pdi->pi_name); } else if (device->bootindex > bootindex) { break; } } /* add boot device to queue */ if (device == NULL) { TAILQ_INSERT_TAIL(&boot_devices, new_device, boot_device_chain); } else { TAILQ_INSERT_BEFORE(device, new_device, boot_device_chain); } return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * pci_emul_finddev(const char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pci_irq_init_irq(&pdi->pi_lintr.irq); pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, "%s@pci.%d.%d.%d", pde->pe_emu, bus, slot, func); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); if (get_config_bool_default("pci.enable_bars", !bootrom_boot())) pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(pdi, fi->fi_config); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. 
* Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } static void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off; off = offset - capoff; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } static void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. */ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; } CFGWRITE(pi, offset, val, bytes); msgctrl = pci_get_cfgdata16(pi, capoff + 2); addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 
1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } static void pciecap_cfgwrite(struct pci_devinst *pi, int capoff __unused, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; bzero(&pciecap, sizeof(pciecap)); /* * Use the integrated endpoint type for endpoints on a root complex bus. * * NB: bhyve currently only supports a single PCI bus that is the root * complex bus, so all endpoints are integrated. */ if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0)) type = PCIEM_TYPE_ROOT_INT_EP; pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | type; if (type != PCIEM_TYPE_ROOT_INT_EP) { pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ } err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the * config space. A capoff parameter of zero will force a search for the * offset and type. */ void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val, uint8_t capoff, int capid) { uint8_t nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; if (capoff == 0) { /* Find the capability that we want to update */ capoff = CAP_START_OFFSET; while (1) { nextoff = pci_get_cfgdata8(pi, capoff + 1); if (nextoff == 0) break; if (offset >= capoff && offset < nextoff) break; capoff = nextoff; } assert(offset >= capoff); capid = pci_get_cfgdata8(pi, capoff); } /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. */ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int pci_emul_fallback_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr __unused, int size __unused, uint64_t *val, void *arg1 __unused, long arg2 __unused) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. 
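	 *
	 * (For example, a 2-byte guest read of an unclaimed address is
	 * handed 0xffffffffffffffff here and reaches the guest as 0xffff.)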
*/ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } static int pci_emul_ecfg_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int bytes, uint64_t *val, void *arg1 __unused, long arg2 __unused) { int bus, slot, func, coff, in; coff = addr & 0xfff; func = (addr >> 12) & 0x7; slot = (addr >> 15) & 0x1f; bus = (addr >> 20) & 0xff; in = (dir == MEM_F_READ); if (in) *val = ~0UL; pci_cfgrw(in, bus, slot, func, coff, bytes, (uint32_t *)val); return (0); } uint64_t pci_ecfg_base(void) { return (PCI_EMUL_ECFG_BASE); } static int init_bootorder(void) { struct boot_device *device; FILE *fp; char *bootorder; size_t bootorder_len; if (TAILQ_EMPTY(&boot_devices)) return (0); fp = open_memstream(&bootorder, &bootorder_len); TAILQ_FOREACH(device, &boot_devices, boot_device_chain) { fprintf(fp, "/pci@i0cf8/pci@%d,%d\n", device->pdi->pi_slot, device->pdi->pi_func); } fclose(fp); return (qemu_fwcfg_add_file("bootorder", bootorder_len, bootorder)); } #define BUSIO_ROUNDUP 32 #define BUSMEM32_ROUNDUP (1024 * 1024) #define BUSMEM64_ROUNDUP (512 * 1024 * 1024) int init_pci(struct vmctx *ctx) { char node_name[sizeof("pci.XXX.XX.X")]; struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; nvlist_t *nvl; const char *emul; size_t lowmem; int bus, slot, func; int error; if (vm_get_lowmem_limit(ctx) > PCI_EMUL_MEMBASE32) errx(EX_OSERR, "Invalid lowmem limit"); pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = PCI_EMUL_MEMBASE32; pci_emul_membase64 = vm_get_highmem_base(ctx) + vm_get_highmem_size(ctx); pci_emul_membase64 = roundup2(pci_emul_membase64, PCI_EMUL_MEMSIZE64); pci_emul_memlim64 = pci_emul_membase64 + PCI_EMUL_MEMSIZE64; TAILQ_INIT(&boot_devices); for (bus = 0; bus < MAXBUSES; bus++) { snprintf(node_name, sizeof(node_name), "pci.%d", bus); nvl = find_config_node(node_name); if (nvl == NULL) continue; pci_businfo[bus] = calloc(1, sizeof(struct businfo)); bi = pci_businfo[bus]; /* * Keep track of the i/o and memory resources allocated to * this bus. */ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; /* first run: init devices */ for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bus, slot, func); nvl = find_config_node(node_name); if (nvl == NULL) continue; fi->fi_config = nvl; emul = get_config_value_node(nvl, "device"); if (emul == NULL) { EPRINTLN("pci slot %d:%d:%d: missing " "\"device\" value", bus, slot, func); return (EINVAL); } pde = pci_emul_finddev(emul); if (pde == NULL) { EPRINTLN("pci slot %d:%d:%d: unknown " "device \"%s\"", bus, slot, func, emul); return (EINVAL); } if (pde->pe_alias != NULL) { EPRINTLN("pci slot %d:%d:%d: legacy " "device \"%s\", use \"%s\" instead", bus, slot, func, emul, pde->pe_alias); return (EINVAL); } fi->fi_pde = pde; error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* second run: assign BARs and free list */ struct pci_bar_allocation *bar; struct pci_bar_allocation *bar_tmp; TAILQ_FOREACH_SAFE(bar, &pci_bars, chain, bar_tmp) { pci_emul_assign_bar(bar->pdi, bar->idx, bar->type, bar->size); free(bar); } TAILQ_INIT(&pci_bars); /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs. 
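	 *
	 * (Worked example with hypothetical numbers: if the devices on a
	 * bus consumed I/O ports up to 0x2014, adding the 32-port slop and
	 * rounding up to a 32-byte boundary sets this bus's iolimit to
	 * 0x2040.)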
*/ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM32_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM32_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM64_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM64_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. */ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } #ifdef __amd64__ lpc_pirq_routed(); #endif if ((error = init_bootorder()) != 0) { warnx("%s: Unable to init bootorder", __func__); return (error); } /* * The guest physical memory map looks like the following on amd64: * [0, lowmem) guest system memory * [lowmem, 0xC0000000) memory hole (may be absent) * [0xC0000000, 0xE0000000) PCI hole (32-bit BAR allocation) * [0xE0000000, 0xF0000000) PCI extended config window * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) guest system memory * [roundup(4GB + highmem, 32GB), ...) PCI 64-bit BAR allocation * * On arm64 the guest physical memory map looks like this: * [0x0DF00000, 0x10000000) PCI I/O memory * [0xA0000000, 0xE0000000) PCI 32-bit BAR allocation * [0xE0000000, 0xF0000000) PCI extended config window * [4GB, 4GB + highmem) guest system memory * [roundup(4GB + highmem, 32GB), ...) PCI 64-bit BAR allocation * * "lowmem" is guest memory below 0xC0000000. amd64 guests provisioned * with less than 3GB of RAM will have no memory above the 4GB boundary. * System memory for arm64 guests is all above the 4GB boundary. */ /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. */ lowmem = vm_get_lowmem_size(ctx); bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI hole"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = lowmem; mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; mr.handler = pci_emul_fallback_handler; error = register_mem_fallback(&mr); assert(error == 0); /* PCI extended config space */ bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI ECFG"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = PCI_EMUL_ECFG_BASE; mr.size = PCI_EMUL_ECFG_SIZE; mr.handler = pci_emul_ecfg_handler; error = register_mem(&mr); assert(error == 0); return (0); } #ifdef __amd64__ static void pci_apic_prt_entry(int bus __unused, int slot, int pin, struct pci_irq *irq, void *arg __unused) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", irq->ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus __unused, int slot, int pin, struct pci_irq *irq, void *arg __unused) { char *name; name = lpc_pirq_name(irq->pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } #endif /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. 
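 *
 * For each populated bus this emits an ASL scope along these lines
 * (bus 0 shown, abridged):
 *
 *	Device (PC00)
 *	{
 *	    Name (_HID, EisaId ("PNP0A03"))
 *	    Method (_BBN, 0, NotSerialized) { Return (0x00000000) }
 *	    Name (_CRS, ResourceTemplate () { ... })
 *	}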
*/ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. */ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); #ifdef __amd64__ if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } #endif assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit - bi->iobase); dsdt_line(" ,, , TypeStatic)"); /* mmio window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); #ifdef __amd64__ if (pci_count_lintr(bus) != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If 
(PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } #endif dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); #ifdef __amd64__ done: #endif dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_raise_msi(pi->pi_vmctx, mte->addr, mte->msg_data, pi->pi_bus, pi->pi_slot, pi->pi_func); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_raise_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index, pi->pi_bus, pi->pi_slot, pi->pi_func); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. 
*/ si = &bi->slotinfo[pi->pi_slot]; bestpin = 0; bestcount = si->si_intpins[0].ii_count; for (pin = 1; pin < 4; pin++) { if (si->si_intpins[pin].ii_count < bestcount) { bestpin = pin; bestcount = si->si_intpins[pin].ii_count; } } si->si_intpins[bestpin].ii_count++; pi->pi_lintr.pin = bestpin + 1; pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); } static void pci_lintr_route(struct pci_devinst *pi) { struct businfo *bi; struct intxinfo *ii; struct pci_irq *irq; if (pi->pi_lintr.pin == 0) return; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; irq = &ii->ii_irq; pci_irq_route(pi, irq); pi->pi_lintr.irq = *irq; pci_set_cfgdata8(pi, PCIR_INTLINE, pci_irq_intline(irq)); } void pci_lintr_assert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == IDLE) { if (pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } else pi->pi_lintr.state = PENDING; } pthread_mutex_unlock(&pi->pi_lintr.lock); } void pci_lintr_deassert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED) { pi->pi_lintr.state = IDLE; pci_irq_deassert(pi); } else if (pi->pi_lintr.state == PENDING) pi->pi_lintr.state = IDLE; pthread_mutex_unlock(&pi->pi_lintr.lock); } static void pci_lintr_update(struct pci_devinst *pi) { pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { pci_irq_deassert(pi); pi->pi_lintr.state = PENDING; } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); } int pci_count_lintr(int bus) { int count, slot, pin; struct slotinfo *slotinfo; count = 0; if (pci_businfo[bus] != NULL) { for (slot = 0; slot < MAXSLOTS; slot++) { slotinfo = &pci_businfo[bus]->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { if (slotinfo->si_intpins[pin].ii_count != 0) count++; } } } return (count); } void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) { struct businfo *bi; struct slotinfo *si; struct intxinfo *ii; int slot, pin; if ((bi = pci_businfo[bus]) == NULL) return; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { ii = &si->si_intpins[pin]; if (ii->ii_count != 0) cb(bus, slot, pin + 1, &ii->ii_irq, arg); } } } /* * Return 1 if the emulated device in 'slot' is a multi-function device. * Return 0 otherwise. */ static int pci_emul_is_mfdev(int bus, int slot) { struct businfo *bi; struct slotinfo *si; int f, numfuncs; numfuncs = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; for (f = 0; f < MAXFUNCS; f++) { if (si->si_funcs[f].fi_devi != NULL) { numfuncs++; } } } return (numfuncs > 1); } /* * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on * whether or not is a multi-function being emulated in the pci 'slot'. */ static void pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) { int mfdev; if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { mfdev = pci_emul_is_mfdev(bus, slot); switch (bytes) { case 1: case 2: *rv &= ~PCIM_MFDEV; if (mfdev) { *rv |= PCIM_MFDEV; } break; case 4: *rv &= ~(PCIM_MFDEV << 16); if (mfdev) { *rv |= (PCIM_MFDEV << 16); } break; } } } /* * Update device state in response to changes to the PCI command * register. 
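 *
 * A minimal sketch (hypothetical, not compiled) of how this function is
 * reached from pci_emul_cmdsts_write() when the guest writes
 * PCIR_COMMAND:
 */
#if 0
	old = pci_get_cfgdata16(pi, PCIR_COMMAND);	/* stash old value */
	CFGWRITE(pi, coff, new, bytes);			/* update config */
	pci_emul_cmd_changed(pi, old);			/* react to changes */
#endif
/*
 * If address-space decoding was toggled, each affected BAR is registered
 * or unregistered below.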
*/ void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old) { int i; uint16_t changed, new; new = pci_get_cfgdata16(pi, PCIR_COMMAND); changed = old ^ new; /* * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. */ for (i = 0; i <= PCI_BARMAX_WITH_ROM; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: break; case PCIBAR_IO: /* I/O address space decoding changed? */ if (changed & PCIM_CMD_PORTEN) { if (new & PCIM_CMD_PORTEN) register_bar(pi, i); else unregister_bar(pi, i); } break; case PCIBAR_ROM: /* skip (un-)register of ROM if it disabled */ if (!romen(pi)) break; /* fallthrough */ case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ if (changed & PCIM_CMD_MEMEN) { if (new & PCIM_CMD_MEMEN) register_bar(pi, i); else unregister_bar(pi, i); } break; default: assert(0); } } /* * If INTx has been unmasked and is pending, assert the * interrupt. */ pci_lintr_update(pi); } static void pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { int rshift; uint32_t cmd, old, readonly; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. * * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are * 'write 1 to clear'. However these bits are not set to '1' by * any device emulation so it is simpler to treat them as readonly. */ rshift = (coff & 0x3) * 8; readonly = 0xFFFFF880 >> rshift; old = CFGREAD(pi, coff, bytes); new &= ~readonly; new |= (old & readonly); CFGWRITE(pi, coff, new, bytes); /* update config */ pci_emul_cmd_changed(pi, cmd); } static void pci_cfgrw(int in, int bus, int slot, int func, int coff, int bytes, uint32_t *valp) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; uint64_t addr, bar, mask; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; pi = si->si_funcs[func].fi_devi; } else pi = NULL; /* * Just return if there is no device at this slot:func or if the * guest is doing an un-aligned access. */ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || (coff & (bytes - 1)) != 0) { if (in) *valp = 0xffffffff; return; } /* * Ignore all writes beyond the standard config space and return all * ones on reads. */ if (coff >= PCI_REGMAX + 1) { if (in) { *valp = 0xffffffff; /* * Extended capabilities begin at offset 256 in config * space. Absence of extended capabilities is signaled * with all 0s in the extended capability header at * offset 256. */ if (coff <= PCI_REGMAX + 4) *valp = 0x00000000; } return; } pe = pi->pi_d; /* * Config read */ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { needcfg = pe->pe_cfgread(pi, coff, bytes, valp); } else { needcfg = 1; } if (needcfg) *valp = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, valp); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(pi, coff, bytes, *valp) == 0) return; /* * Special handling for write to BAR and ROM registers */ if (is_pcir_bar(coff) || is_pcir_bios(coff)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. 
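			 *
			 * (This path also implements the standard BAR sizing
			 * probe: a guest write of 0xffffffff to a 4 KB memory
			 * BAR reads back as 0xfffff000 | lobits, from which
			 * the guest infers the BAR size.)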
*/ if (bytes != 4 || (coff & 0x3) != 0) return; if (is_pcir_bar(coff)) { idx = (coff - PCIR_BAR(0)) / 4; } else if (is_pcir_bios(coff)) { idx = PCI_ROM_IDX; } else { errx(4, "%s: invalid BAR offset %d", __func__, coff); } mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *valp & mask; #if defined(PCI_EMUL_IOMASK) addr &= PCI_EMUL_IOMASK; #endif bar = addr | pi->pi_bar[idx].lobits; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *valp & mask; bar |= pi->pi_bar[idx].lobits; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *valp & mask; bar |= pi->pi_bar[idx].lobits; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = ((uint64_t)*valp << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; case PCIBAR_ROM: addr = bar = *valp & mask; if (memen(pi) && romen(pi)) { unregister_bar(pi, idx); } pi->pi_bar[idx].addr = addr; pi->pi_bar[idx].lobits = *valp & PCIM_BIOS_ENABLE; /* romen could have changed it value */ if (memen(pi) && romen(pi)) { register_bar(pi, idx); } bar |= pi->pi_bar[idx].lobits; break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *valp, 0, 0); } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { pci_emul_cmdsts_write(pi, coff, *valp, bytes); } else { CFGWRITE(pi, coff, *valp, bytes); } } } #ifdef __amd64__ static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; static int pci_emul_cfgaddr(struct vmctx *ctx __unused, int in, int port __unused, int bytes, uint32_t *eax, void *arg __unused) { uint32_t x; if (bytes != 4) { if (in) *eax = (bytes == 2) ? 0xffff : 0xff; return (0); } if (in) { x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; if (cfgenable) x |= CONF1_ENABLE; *eax = x; } else { x = *eax; cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; cfgoff = (x & PCI_REGMAX) & ~0x03; cfgfunc = (x >> 8) & PCI_FUNCMAX; cfgslot = (x >> 11) & PCI_SLOTMAX; cfgbus = (x >> 16) & PCI_BUSMAX; } return (0); } INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); static int pci_emul_cfgdata(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg __unused) { int coff; assert(bytes == 1 || bytes == 2 || bytes == 4); coff = cfgoff + (port - CONF1_DATA_PORT); if (cfgenable) { pci_cfgrw(in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax); } else { /* Ignore accesses to cfgdata if not enabled by cfgaddr */ if (in) *eax = 0xffffffff; } return (0); } INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #endif #ifdef BHYVE_SNAPSHOT /* * Saves/restores PCI device emulated state. Returns 0 on success. 
*/ static int pci_snapshot_pci_dev(struct vm_snapshot_meta *meta) { struct pci_devinst *pi; int i; int ret; pi = meta->dev_data; SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata), meta, ret, done); for (i = 0; i < (int)nitems(pi->pi_bar); i++) { SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done); } /* Restore MSI-X table. */ for (i = 0; i < pi->pi_msix.table_count; i++) { SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control, meta, ret, done); } done: return (ret); } int pci_snapshot(struct vm_snapshot_meta *meta) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(meta->dev_name != NULL); pdi = meta->dev_data; pde = pdi->pi_d; if (pde->pe_snapshot == NULL) return (ENOTSUP); ret = pci_snapshot_pci_dev(meta); if (ret == 0) ret = (*pde->pe_snapshot)(meta); return (ret); } int pci_pause(struct pci_devinst *pdi) { struct pci_devemu *pde = pdi->pi_d; if (pde->pe_pause == NULL) { /* The pause/resume functionality is optional. */ return (0); } return (*pde->pe_pause)(pdi); } int pci_resume(struct pci_devinst *pdi) { struct pci_devemu *pde = pdi->pi_d; if (pde->pe_resume == NULL) { /* The pause/resume functionality is optional. 
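		 *
		 * Device models that need it set pe_pause/pe_resume in
		 * their pci_devemu definition; everyone else falls through
		 * to success here.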
*/ return (0); } return (*pde->pe_resume)(pdi); } #endif #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { uint8_t ioregs[DIOSZ]; uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int pci_emul_dinit(struct pci_devinst *pi, nvlist_t *nvl __unused) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void pci_emul_diow(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } i = baridx - 1; /* 'memregs' index */ if (size == 1) { sc->memregs[i][offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } /* * magic interrupt ?? 
*/ } if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t pci_emul_dior(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; int i; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } i = baridx - 1; /* 'memregs' index */ if (size == 1) { value = sc->memregs[i][offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } #ifdef BHYVE_SNAPSHOT struct pci_devinst * pci_next(const struct pci_devinst *cursor) { unsigned bus = 0, slot = 0, func = 0; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; bus = cursor ? cursor->pi_bus : 0; slot = cursor ? cursor->pi_slot : 0; func = cursor ? (cursor->pi_func + 1) : 0; for (; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; if (slot >= MAXSLOTS) slot = 0; for (; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; if (func >= MAXFUNCS) func = 0; for (; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; return (fi->fi_devi); } } } return (NULL); } static int pci_emul_snapshot(struct vm_snapshot_meta *meta __unused) { return (0); } #endif static const struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, .pe_barread = pci_emul_dior, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_emul_snapshot, #endif }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/pci_fbuf.c b/usr.sbin/bhyve/pci_fbuf.c index 125428e0b772..1e3ec77c15b0 100644 --- a/usr.sbin/bhyve/pci_fbuf.c +++ b/usr.sbin/bhyve/pci_fbuf.c @@ -1,466 +1,467 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015 Nahanni Systems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include +#include #include #include #include #include #include #include #include #include #include "bhyvegc.h" #include "bhyverun.h" #include "config.h" #include "debug.h" #include "console.h" #include "pci_emul.h" #include "rfb.h" #ifdef __amd64__ #include "amd64/vga.h" #endif /* * bhyve Framebuffer device emulation. * BAR0 points to the current mode information. * BAR1 is the 32-bit framebuffer address. * * -s ,fbuf,wait,vga=on|io|off,rfb=:port,w=width,h=height */ static int fbuf_debug = 1; #define DEBUG_INFO 1 #define DEBUG_VERBOSE 4 #define DPRINTF(level, params) if (level <= fbuf_debug) PRINTLN params #define KB (1024UL) #define MB (1024 * 1024UL) #define DMEMSZ 128 #define FB_SIZE (32*MB) #define COLS_MAX 3840 #define ROWS_MAX 2160 #define COLS_DEFAULT 1024 #define ROWS_DEFAULT 768 #define COLS_MIN 640 #define ROWS_MIN 480 struct pci_fbuf_softc { struct pci_devinst *fsc_pi; struct { uint32_t fbsize; uint16_t width; uint16_t height; uint16_t depth; uint16_t refreshrate; uint8_t reserved[116]; } __packed memregs; /* rfb server */ char *rfb_host; char *rfb_password; int rfb_port; int rfb_wait; int vga_enabled; int vga_full; uint32_t fbaddr; char *fb_base; uint16_t gc_width; uint16_t gc_height; void *vgasc; struct bhyvegc_image *gc_image; }; static struct pci_fbuf_softc *fbuf_sc; #define PCI_FBUF_MSI_MSGS 4 static void pci_fbuf_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_fbuf_softc *sc; uint8_t *p; assert(baridx == 0); sc = pi->pi_arg; DPRINTF(DEBUG_VERBOSE, ("fbuf wr: offset 0x%lx, size: %d, value: 0x%lx", offset, size, value)); if (offset + size > DMEMSZ) { printf("fbuf: write too large, offset %ld size %d\n", offset, size); return; } p = (uint8_t *)&sc->memregs + offset; switch (size) { case 1: *p = value; break; case 2: *(uint16_t *)p = value; break; case 4: *(uint32_t *)p = value; break; case 8: *(uint64_t *)p = value; break; default: printf("fbuf: write unknown size %d\n", size); break; } if (!sc->gc_image->vgamode && sc->memregs.width == 0 && sc->memregs.height == 0) { DPRINTF(DEBUG_INFO, ("switching to VGA mode")); sc->gc_image->vgamode = 1; sc->gc_width = 0; sc->gc_height = 0; } else if (sc->gc_image->vgamode && sc->memregs.width != 0 && sc->memregs.height != 0) { DPRINTF(DEBUG_INFO, ("switching to VESA mode")); sc->gc_image->vgamode = 0; } } static uint64_t pci_fbuf_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_fbuf_softc *sc; uint8_t *p; uint64_t value; assert(baridx == 0); sc = pi->pi_arg; if (offset + size > DMEMSZ) { printf("fbuf: read too large, offset %ld size %d\n", offset, size); return (0); } p = (uint8_t *)&sc->memregs + offset; value = 0; switch (size) { case 1: value = *p; break; case 2: value = *(uint16_t *)p; break; case 4: value = *(uint32_t *)p; break; case 8: value = *(uint64_t *)p; break; default: printf("fbuf: read unknown size %d\n", size); break; } DPRINTF(DEBUG_VERBOSE, ("fbuf rd: offset 0x%lx, size: %d, value: 0x%lx", offset, size, value)); 
return (value); } static void pci_fbuf_baraddr(struct pci_devinst *pi, int baridx, int enabled, uint64_t address) { struct pci_fbuf_softc *sc; int prot; if (baridx != 1) return; sc = pi->pi_arg; if (!enabled) { if (vm_munmap_memseg(pi->pi_vmctx, sc->fbaddr, FB_SIZE) != 0) EPRINTLN("pci_fbuf: munmap_memseg failed"); sc->fbaddr = 0; } else { prot = PROT_READ | PROT_WRITE; if (vm_mmap_memseg(pi->pi_vmctx, address, VM_FRAMEBUFFER, 0, FB_SIZE, prot) != 0) EPRINTLN("pci_fbuf: mmap_memseg failed"); else sc->fbaddr = address; } } static int pci_fbuf_parse_config(struct pci_fbuf_softc *sc, nvlist_t *nvl) { const char *value; char *cp; sc->rfb_wait = get_config_bool_node_default(nvl, "wait", false); /* Prefer "rfb" to "tcp". */ value = get_config_value_node(nvl, "rfb"); if (value == NULL) value = get_config_value_node(nvl, "tcp"); if (value != NULL) { /* * IPv4 -- host-ip:port * IPv6 -- [host-ip%zone]:port * XXX for now port is mandatory for IPv4. */ if (value[0] == '[') { cp = strchr(value + 1, ']'); if (cp == NULL || cp == value + 1) { EPRINTLN("fbuf: Invalid IPv6 address: \"%s\"", value); return (-1); } sc->rfb_host = strndup(value + 1, cp - (value + 1)); cp++; if (*cp == ':') { cp++; if (*cp == '\0') { EPRINTLN( "fbuf: Missing port number: \"%s\"", value); return (-1); } sc->rfb_port = atoi(cp); } else if (*cp != '\0') { EPRINTLN("fbuf: Invalid IPv6 address: \"%s\"", value); return (-1); } } else { cp = strchr(value, ':'); if (cp == NULL) { sc->rfb_port = atoi(value); } else { sc->rfb_host = strndup(value, cp - value); cp++; if (*cp == '\0') { EPRINTLN( "fbuf: Missing port number: \"%s\"", value); return (-1); } sc->rfb_port = atoi(cp); } } } value = get_config_value_node(nvl, "vga"); if (value != NULL) { if (strcmp(value, "off") == 0) { sc->vga_enabled = 0; } else if (strcmp(value, "io") == 0) { sc->vga_enabled = 1; sc->vga_full = 0; } else if (strcmp(value, "on") == 0) { sc->vga_enabled = 1; sc->vga_full = 1; } else { EPRINTLN("fbuf: Invalid vga setting: \"%s\"", value); return (-1); } } value = get_config_value_node(nvl, "w"); if (value != NULL) sc->memregs.width = strtol(value, NULL, 10); value = get_config_value_node(nvl, "h"); if (value != NULL) sc->memregs.height = strtol(value, NULL, 10); if (sc->memregs.width > COLS_MAX || sc->memregs.height > ROWS_MAX) { EPRINTLN("fbuf: max resolution is %ux%u", COLS_MAX, ROWS_MAX); return (-1); } if (sc->memregs.width < COLS_MIN || sc->memregs.height < ROWS_MIN) { EPRINTLN("fbuf: minimum resolution is %ux%u", COLS_MIN, ROWS_MIN); return (-1); } value = get_config_value_node(nvl, "password"); if (value != NULL) sc->rfb_password = strdup(value); return (0); } static void pci_fbuf_render(struct bhyvegc *gc, void *arg) { struct pci_fbuf_softc *sc; sc = arg; if (sc->vga_full && sc->gc_image->vgamode) { /* TODO: mode switching to vga and vesa should use the special * EFI-bhyve protocol port. 
*/ vga_render(gc, sc->vgasc); return; } if (sc->gc_width != sc->memregs.width || sc->gc_height != sc->memregs.height) { bhyvegc_resize(gc, sc->memregs.width, sc->memregs.height); sc->gc_width = sc->memregs.width; sc->gc_height = sc->memregs.height; } } static int pci_fbuf_init(struct pci_devinst *pi, nvlist_t *nvl) { int error; struct pci_fbuf_softc *sc; if (fbuf_sc != NULL) { EPRINTLN("Only one frame buffer device is allowed."); return (-1); } sc = calloc(1, sizeof(struct pci_fbuf_softc)); pi->pi_arg = sc; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); sc->fb_base = vm_create_devmem(pi->pi_vmctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE); if (sc->fb_base == MAP_FAILED) { error = -1; goto done; } error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE); assert(error == 0); error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS); assert(error == 0); sc->memregs.fbsize = FB_SIZE; sc->memregs.width = COLS_DEFAULT; sc->memregs.height = ROWS_DEFAULT; sc->memregs.depth = 32; sc->vga_enabled = 1; sc->vga_full = 0; sc->fsc_pi = pi; error = pci_fbuf_parse_config(sc, nvl); if (error != 0) goto done; /* XXX until VGA rendering is enabled */ if (sc->vga_full != 0) { EPRINTLN("pci_fbuf: VGA rendering not enabled"); goto done; } DPRINTF(DEBUG_INFO, ("fbuf frame buffer base: %p [sz %lu]", sc->fb_base, FB_SIZE)); console_init(sc->memregs.width, sc->memregs.height, sc->fb_base); console_fb_register(pci_fbuf_render, sc); if (sc->vga_enabled) sc->vgasc = vga_init(!sc->vga_full); sc->gc_image = console_get_image(); fbuf_sc = sc; memset((void *)sc->fb_base, 0, FB_SIZE); error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, sc->rfb_password); done: if (error) free(sc); return (error); } #ifdef BHYVE_SNAPSHOT static int pci_fbuf_snapshot(struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_BUF_OR_LEAVE(fbuf_sc->fb_base, FB_SIZE, meta, ret, err); err: return (ret); } #endif static const struct pci_devemu pci_fbuf = { .pe_emu = "fbuf", .pe_init = pci_fbuf_init, .pe_barwrite = pci_fbuf_write, .pe_barread = pci_fbuf_read, .pe_baraddr = pci_fbuf_baraddr, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_fbuf_snapshot, #endif }; PCI_EMUL_SET(pci_fbuf); diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c index 9d38ae9168a1..a82078f6e036 100644 --- a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -1,1352 +1,1353 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include +#include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include "debug.h" #include "mem.h" #include "pci_passthru.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" #endif #define LEGACY_SUPPORT 1 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 #define PASSTHRU_MMIO_MAX 3 static int pcifd = -1; SET_DECLARE(passthru_dev_set, struct passthru_dev); struct passthru_softc { struct pci_devinst *psc_pi; /* ROM is handled like a BAR */ struct pcibar psc_bar[PCI_BARMAX_WITH_ROM + 1]; struct { int capoff; int msgctrl; int emulated; } psc_msi; struct { int capoff; } psc_msix; struct pcisel psc_sel; struct passthru_mmio_mapping psc_mmio_map[PASSTHRU_MMIO_MAX]; cfgread_handler psc_pcir_rhandler[PCI_REGMAX + 1]; cfgwrite_handler psc_pcir_whandler[PCI_REGMAX + 1]; }; static int msi_caplen(int msgctrl) { int len; len = 10; /* minimum length of msi capability */ if (msgctrl & PCIM_MSICTRL_64BIT) len += 4; #if 0 /* * Ignore the 'mask' and 'pending' bits in the MSI capability. * We'll let the guest manipulate them directly. 
#ifdef LEGACY_SUPPORT
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	int capoff;
	struct msicap msicap;
	u_char *capdata;

	pci_populate_msicap(&msicap, msgnum, nextptr);

	/*
	 * XXX
	 * Copy the msi capability structure in the last 16 bytes of the
	 * config space.  This is wrong because it could shadow something
	 * useful to the device.
	 */
	capoff = 256 - roundup(sizeof(msicap), 4);
	capdata = (u_char *)&msicap;
	for (size_t i = 0; i < sizeof(msicap); i++)
		pci_set_cfgdata8(pi, capoff + i, capdata[i]);

	return (capoff);
}
#endif	/* LEGACY_SUPPORT */

static int
cfginitmsi(struct passthru_softc *sc)
{
	int i, ptr, capptr, cap, sts, caplen, table_size;
	uint32_t u32;
	struct pcisel sel;
	struct pci_devinst *pi;
	struct msixcap msixcap;
	char *msixcap_ptr;

	pi = sc->psc_pi;
	sel = sc->psc_sel;

	/*
	 * Parse the capabilities and cache the location of the MSI
	 * and MSI-X capabilities.
	 */
	sts = passthru_read_config(&sel, PCIR_STATUS, 2);
	if (sts & PCIM_STATUS_CAPPRESENT) {
		ptr = passthru_read_config(&sel, PCIR_CAP_PTR, 1);
		while (ptr != 0 && ptr != 0xff) {
			cap = passthru_read_config(&sel, ptr + PCICAP_ID, 1);
			if (cap == PCIY_MSI) {
				/*
				 * Copy the MSI capability into the config
				 * space of the emulated pci device
				 */
				sc->psc_msi.capoff = ptr;
				sc->psc_msi.msgctrl =
				    passthru_read_config(&sel, ptr + 2, 2);
				sc->psc_msi.emulated = 0;
				caplen = msi_caplen(sc->psc_msi.msgctrl);
				capptr = ptr;
				while (caplen > 0) {
					u32 = passthru_read_config(&sel,
					    capptr, 4);
					pci_set_cfgdata32(pi, capptr, u32);
					caplen -= 4;
					capptr += 4;
				}
			} else if (cap == PCIY_MSIX) {
				/*
				 * Copy the MSI-X capability
				 */
				sc->psc_msix.capoff = ptr;
				caplen = 12;
				msixcap_ptr = (char *)&msixcap;
				capptr = ptr;
				while (caplen > 0) {
					u32 = passthru_read_config(&sel,
					    capptr, 4);
					memcpy(msixcap_ptr, &u32, 4);
					pci_set_cfgdata32(pi, capptr, u32);
					caplen -= 4;
					capptr += 4;
					msixcap_ptr += 4;
				}
			}
			ptr = passthru_read_config(&sel,
			    ptr + PCICAP_NEXTPTR, 1);
		}
	}

	if (sc->psc_msix.capoff != 0) {
		pi->pi_msix.pba_bar = msixcap.pba_info & PCIM_MSIX_BIR_MASK;
		pi->pi_msix.pba_offset =
		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_bar =
		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_offset =
		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);

		/* Allocate the emulated MSI-X table array */
		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
		pi->pi_msix.table = calloc(1, table_size);

		/* Mask all table entries */
		for (i = 0; i < pi->pi_msix.table_count; i++) {
			pi->pi_msix.table[i].vector_control |=
			    PCIM_MSIX_VCTRL_MASK;
		}
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If the passthrough device does not support MSI, craft an MSI
	 * capability for it.  We link the new MSI capability at the
	 * head of the list of capabilities.
	 */
	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
		int origptr, msiptr;

		origptr = passthru_read_config(&sel, PCIR_CAP_PTR, 1);
		msiptr = passthru_add_msicap(pi, 1, origptr);
		sc->psc_msi.capoff = msiptr;
		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
		sc->psc_msi.emulated = 1;
		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
	}
#endif

	/* Make sure one of the capabilities is present */
	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
		return (-1);
	else
		return (0);
}
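/*
 * Example (hypothetical register value): if msixcap.table_info reads
 * 0x00002003, the low three BIR bits select BAR 3 and the remaining
 * bits give an MSI-X table offset of 0x2000 within that BAR.
 */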
static uint64_t
msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *src8;
	uint16_t *src16;
	uint32_t *src32;
	uint64_t *src64;
	uint64_t data;
	size_t entry_offset;
	uint32_t table_offset;
	int index, table_count;

	pi = sc->psc_pi;

	table_offset = pi->pi_msix.table_offset;
	table_count = pi->pi_msix.table_count;
	if (offset < table_offset ||
	    offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) {
		switch (size) {
		case 1:
			src8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src8;
			break;
		case 2:
			src16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src16;
			break;
		case 4:
			src32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src32;
			break;
		case 8:
			src64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset);
			data = *src64;
			break;
		default:
			return (-1);
		}
		return (data);
	}

	offset -= table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	assert(index < table_count);

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	switch (size) {
	case 1:
		src8 = (uint8_t *)((uint8_t *)entry + entry_offset);
		data = *src8;
		break;
	case 2:
		src16 = (uint16_t *)((uint8_t *)entry + entry_offset);
		data = *src16;
		break;
	case 4:
		src32 = (uint32_t *)((uint8_t *)entry + entry_offset);
		data = *src32;
		break;
	case 8:
		src64 = (uint64_t *)((uint8_t *)entry + entry_offset);
		data = *src64;
		break;
	default:
		return (-1);
	}

	return (data);
}
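/*
 * Illustration (hypothetical geometry): with table_offset = 0x1000 and
 * table_count = 4, accesses in [0x1000, 0x1040) are served from the
 * emulated table, while anything else in the mapped BAR is forwarded
 * directly to the device.  The write path below follows the same split.
 */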
static void
msix_table_write(struct passthru_softc *sc, uint64_t offset, int size,
    uint64_t data)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *dest8;
	uint16_t *dest16;
	uint32_t *dest32;
	uint64_t *dest64;
	size_t entry_offset;
	uint32_t table_offset, vector_control;
	int index, table_count;

	pi = sc->psc_pi;

	table_offset = pi->pi_msix.table_offset;
	table_count = pi->pi_msix.table_count;
	if (offset < table_offset ||
	    offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) {
		switch (size) {
		case 1:
			dest8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset);
			*dest8 = data;
			break;
		case 2:
			dest16 = (uint16_t *)(pi->pi_msix.mapped_addr +
			    offset);
			*dest16 = data;
			break;
		case 4:
			dest32 = (uint32_t *)(pi->pi_msix.mapped_addr +
			    offset);
			*dest32 = data;
			break;
		case 8:
			dest64 = (uint64_t *)(pi->pi_msix.mapped_addr +
			    offset);
			*dest64 = data;
			break;
		}
		return;
	}

	offset -= table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	assert(index < table_count);

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* Only 4 byte naturally-aligned writes are supported */
	assert(size == 4);
	assert(entry_offset % 4 == 0);

	vector_control = entry->vector_control;
	dest32 = (uint32_t *)((uint8_t *)entry + entry_offset);
	*dest32 = data;

	/* If MSI-X hasn't been enabled, do nothing */
	if (pi->pi_msix.enabled) {
		/* If the entry is masked, don't set it up */
		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
			(void)vm_setup_pptdev_msix(sc->psc_pi->pi_vmctx,
			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
			    sc->psc_sel.pc_func, index, entry->addr,
			    entry->msg_data, entry->vector_control);
		}
	}
}

static int
init_msix_table(struct passthru_softc *sc)
{
	struct pci_devinst *pi = sc->psc_pi;
	struct pci_bar_mmap pbm;
	int b, s, f;
	uint32_t table_size, table_offset;

	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);

	b = sc->psc_sel.pc_bus;
	s = sc->psc_sel.pc_dev;
	f = sc->psc_sel.pc_func;

	/*
	 * Map the region of the BAR containing the MSI-X table.  This is
	 * necessary for two reasons:
	 * 1. The PBA may reside in the first or last page containing the
	 *    MSI-X table.
	 * 2. While PCI devices are not supposed to use the page(s) containing
	 *    the MSI-X table for other purposes, some do in practice.
	 */
	memset(&pbm, 0, sizeof(pbm));
	pbm.pbm_sel = sc->psc_sel;
	pbm.pbm_flags = PCIIO_BAR_MMAP_RW;
	pbm.pbm_reg = PCIR_BAR(pi->pi_msix.table_bar);
	pbm.pbm_memattr = VM_MEMATTR_DEVICE;

	if (ioctl(pcifd, PCIOCBARMMAP, &pbm) != 0) {
		warn("Failed to map MSI-X table BAR on %d/%d/%d", b, s, f);
		return (-1);
	}
	assert(pbm.pbm_bar_off == 0);
	pi->pi_msix.mapped_addr = (uint8_t *)(uintptr_t)pbm.pbm_map_base;
	pi->pi_msix.mapped_size = pbm.pbm_map_length;

	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);

	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);

	/*
	 * Unmap any pages not containing the table; we do not need to
	 * emulate accesses to them.  Avoid releasing address space to help
	 * ensure that a buggy out-of-bounds access causes a crash.
	 */
	if (table_offset != 0)
		if (mprotect(pi->pi_msix.mapped_addr, table_offset,
		    PROT_NONE) != 0)
			warn("Failed to unmap MSI-X table BAR region");
	if (table_offset + table_size != pi->pi_msix.mapped_size)
		if (mprotect(
		    pi->pi_msix.mapped_addr + table_offset + table_size,
		    pi->pi_msix.mapped_size - (table_offset + table_size),
		    PROT_NONE) != 0)
			warn("Failed to unmap MSI-X table BAR region");

	return (0);
}
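/*
 * Worked example (hypothetical layout): with pi_msix.table_offset =
 * 0x3010 and table_count = 8, table_offset rounds down to 0x3000,
 * table_size becomes roundup2(0x10 + 8 * 16, 4096) = 0x1000, and every
 * mapped page outside [0x3000, 0x4000) is turned PROT_NONE.
 */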
static int
cfginitbar(struct passthru_softc *sc)
{
	int i, error;
	struct pci_devinst *pi;
	struct pci_bar_io bar;
	enum pcibar_type bartype;
	uint64_t base, size;

	pi = sc->psc_pi;

	/*
	 * Initialize BAR registers
	 */
	for (i = 0; i <= PCI_BARMAX; i++) {
		uint8_t lobits;

		bzero(&bar, sizeof(bar));
		bar.pbi_sel = sc->psc_sel;
		bar.pbi_reg = PCIR_BAR(i);

		if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
			continue;

		if (PCI_BAR_IO(bar.pbi_base)) {
			bartype = PCIBAR_IO;
			base = bar.pbi_base & PCIM_BAR_IO_BASE;
		} else {
			switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
			case PCIM_BAR_MEM_64:
				bartype = PCIBAR_MEM64;
				break;
			default:
				bartype = PCIBAR_MEM32;
				break;
			}
			base = bar.pbi_base & PCIM_BAR_MEM_BASE;
		}
		size = bar.pbi_length;

		if (bartype != PCIBAR_IO) {
			if (((base | size) & PAGE_MASK) != 0) {
				warnx("passthru device %d/%d/%d BAR %d: "
				    "base %#lx or size %#lx not page aligned",
				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
				    sc->psc_sel.pc_func, i, base, size);
				return (-1);
			}
		}

		/* Cache information about the "real" BAR */
		sc->psc_bar[i].type = bartype;
		sc->psc_bar[i].size = size;
		sc->psc_bar[i].addr = base;
		sc->psc_bar[i].lobits = 0;

		/* Allocate the BAR in the guest I/O or MMIO space */
		error = pci_emul_alloc_bar(pi, i, bartype, size);
		if (error)
			return (-1);

		/* Use same lobits as physical bar */
		lobits = (uint8_t)passthru_read_config(&sc->psc_sel,
		    PCIR_BAR(i), 0x01);
		if (bartype == PCIBAR_MEM32 || bartype == PCIBAR_MEM64) {
			lobits &= ~PCIM_BAR_MEM_BASE;
		} else {
			lobits &= ~PCIM_BAR_IO_BASE;
		}
		sc->psc_bar[i].lobits = lobits;
		pi->pi_bar[i].lobits = lobits;

		/*
		 * 64-bit BAR takes up two slots so skip the next one.
		 */
		if (bartype == PCIBAR_MEM64) {
			i++;
			assert(i <= PCI_BARMAX);
			sc->psc_bar[i].type = PCIBAR_MEMHI64;
		}
	}
	return (0);
}

static int
cfginit(struct pci_devinst *pi, int bus, int slot, int func)
{
	int error;
	struct passthru_softc *sc;
	uint16_t cmd;
	uint8_t intline, intpin;

	error = 1;
	sc = pi->pi_arg;

	bzero(&sc->psc_sel, sizeof(struct pcisel));
	sc->psc_sel.pc_bus = bus;
	sc->psc_sel.pc_dev = slot;
	sc->psc_sel.pc_func = func;

	/*
	 * Copy the physical PCI header to the virtual config space.
	 * COMMAND, INTLINE, and INTPIN need not match their physical
	 * values; they are already set by pci_emul_init().
	 */
	cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
	intline = pci_get_cfgdata8(pi, PCIR_INTLINE);
	intpin = pci_get_cfgdata8(pi, PCIR_INTPIN);
	for (int i = 0; i <= PCIR_MAXLAT; i += 4) {
		pci_set_cfgdata32(pi, i,
		    passthru_read_config(&sc->psc_sel, i, 4));
	}
	pci_set_cfgdata16(pi, PCIR_COMMAND, cmd);
	pci_set_cfgdata8(pi, PCIR_INTLINE, intline);
	pci_set_cfgdata8(pi, PCIR_INTPIN, intpin);

	if (cfginitmsi(sc) != 0) {
		warnx("failed to initialize MSI for PCI %d/%d/%d",
		    bus, slot, func);
		goto done;
	}

	if (cfginitbar(sc) != 0) {
		warnx("failed to initialize BARs for PCI %d/%d/%d",
		    bus, slot, func);
		goto done;
	}

	if (pci_msix_table_bar(pi) >= 0) {
		error = init_msix_table(sc);
		if (error != 0) {
			warnx(
			    "failed to initialize MSI-X table for PCI %d/%d/%d: %d",
			    bus, slot, func, error);
			goto done;
		}
	}

	error = 0;		/* success */
done:
	return (error);
}
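/*
 * Note on the header copy above: PCIR_MAXLAT is 0x3f, so the loop
 * issues 4-byte reads at offsets 0x00 through 0x3c, covering the full
 * 64-byte type-0 configuration header before the saved COMMAND,
 * INTLINE, and INTPIN values are restored.
 */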
struct passthru_mmio_mapping *
passthru_get_mmio(struct passthru_softc *sc, int num)
{
	assert(sc != NULL);
	assert(num < PASSTHRU_MMIO_MAX);

	return (&sc->psc_mmio_map[num]);
}

struct pcisel *
passthru_get_sel(struct passthru_softc *sc)
{
	assert(sc != NULL);

	return (&sc->psc_sel);
}

int
set_pcir_handler(struct passthru_softc *sc, int reg, int len,
    cfgread_handler rhandler, cfgwrite_handler whandler)
{
	if (reg > PCI_REGMAX || reg + len > PCI_REGMAX + 1)
		return (-1);

	for (int i = reg; i < reg + len; ++i) {
		assert(sc->psc_pcir_rhandler[i] == NULL || rhandler == NULL);
		assert(sc->psc_pcir_whandler[i] == NULL || whandler == NULL);
		sc->psc_pcir_rhandler[i] = rhandler;
		sc->psc_pcir_whandler[i] = whandler;
	}

	return (0);
}

static int
passthru_legacy_config(nvlist_t *nvl, const char *opts)
{
	const char *cp;
	char *tofree;
	char value[16];
	int bus, slot, func;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (strncmp(opts, "ppt", strlen("ppt")) == 0) {
		tofree = strndup(opts, cp - opts);
		set_config_value_node(nvl, "pptdev", tofree);
		free(tofree);
	} else if (sscanf(opts, "pci0:%d:%d:%d", &bus, &slot, &func) == 3 ||
	    sscanf(opts, "pci%d:%d:%d", &bus, &slot, &func) == 3 ||
	    sscanf(opts, "%d/%d/%d", &bus, &slot, &func) == 3) {
		snprintf(value, sizeof(value), "%d", bus);
		set_config_value_node(nvl, "bus", value);
		snprintf(value, sizeof(value), "%d", slot);
		set_config_value_node(nvl, "slot", value);
		snprintf(value, sizeof(value), "%d", func);
		set_config_value_node(nvl, "func", value);
	} else {
		EPRINTLN("passthru: invalid options \"%s\"", opts);
		return (-1);
	}

	if (cp == NULL) {
		return (0);
	}

	return (pci_parse_legacy_config(nvl, cp + 1));
}

static int
passthru_init_rom(struct passthru_softc *const sc, const char *const romfile)
{
	if (romfile == NULL) {
		return (0);
	}

	const int fd = open(romfile, O_RDONLY);
	if (fd < 0) {
		warnx("%s: can't open romfile \"%s\"", __func__, romfile);
		return (-1);
	}

	struct stat sbuf;
	if (fstat(fd, &sbuf) < 0) {
		warnx("%s: can't fstat romfile \"%s\"", __func__, romfile);
		close(fd);
		return (-1);
	}
	const uint64_t rom_size = sbuf.st_size;

	void *const rom_data = mmap(NULL, rom_size, PROT_READ, MAP_SHARED,
	    fd, 0);
	if (rom_data == MAP_FAILED) {
		warnx("%s: unable to mmap romfile \"%s\" (%d)", __func__,
		    romfile, errno);
		close(fd);
		return (-1);
	}

	void *rom_addr;
	int error = pci_emul_alloc_rom(sc->psc_pi, rom_size, &rom_addr);
	if (error) {
		warnx("%s: failed to alloc rom segment", __func__);
		munmap(rom_data, rom_size);
		close(fd);
		return (error);
	}
	memcpy(rom_addr, rom_data, rom_size);

	sc->psc_bar[PCI_ROM_IDX].type = PCIBAR_ROM;
	sc->psc_bar[PCI_ROM_IDX].addr = (uint64_t)rom_addr;
	sc->psc_bar[PCI_ROM_IDX].size = rom_size;

	munmap(rom_data, rom_size);
	close(fd);

	return (0);
}

static bool
passthru_lookup_pptdev(const char *name, int *bus, int *slot, int *func)
{
	struct pci_conf_io pc;
	struct pci_conf conf[1];
	struct pci_match_conf patterns[1];
	char *cp;

	bzero(&pc, sizeof(struct pci_conf_io));
	pc.match_buf_len = sizeof(conf);
	pc.matches = conf;

	bzero(&patterns, sizeof(patterns));

	/*
	 * The pattern structure requires the unit to be split out from
	 * the driver name.  Walk backwards from the end of the name to
	 * find the start of the unit.
	 */
	cp = strchr(name, '\0');
	assert(cp != NULL);
	while (cp != name && isdigit(cp[-1]))
		cp--;
	if (cp == name || !isdigit(*cp)) {
		EPRINTLN("Invalid passthru device name %s", name);
		return (false);
	}
	if ((size_t)(cp - name) + 1 > sizeof(patterns[0].pd_name)) {
		EPRINTLN("Passthru device name %s is too long", name);
		return (false);
	}
	memcpy(patterns[0].pd_name, name, cp - name);
	patterns[0].pd_unit = strtol(cp, &cp, 10);
	if (*cp != '\0') {
		EPRINTLN("Invalid passthru device name %s", name);
		return (false);
	}
	patterns[0].flags = PCI_GETCONF_MATCH_NAME | PCI_GETCONF_MATCH_UNIT;
	pc.num_patterns = 1;
	pc.pat_buf_len = sizeof(patterns);
	pc.patterns = patterns;

	if (ioctl(pcifd, PCIOCGETCONF, &pc) == -1) {
		EPRINTLN("ioctl(PCIOCGETCONF): %s", strerror(errno));
		return (false);
	}
	if (pc.status != PCI_GETCONF_LAST_DEVICE &&
	    pc.status != PCI_GETCONF_MORE_DEVS) {
		EPRINTLN("error returned from PCIOCGETCONF ioctl");
		return (false);
	}
	if (pc.num_matches == 0) {
		EPRINTLN("Passthru device %s not found", name);
		return (false);
	}

	if (conf[0].pc_sel.pc_domain != 0) {
		EPRINTLN("Passthru device %s on unsupported domain", name);
		return (false);
	}

	*bus = conf[0].pc_sel.pc_bus;
	*slot = conf[0].pc_sel.pc_dev;
	*func = conf[0].pc_sel.pc_func;

	return (true);
}
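/*
 * Example: "ppt12" splits into pd_name "ppt" and pd_unit 12.  A name
 * that is all digits, or one with no trailing digits at all, fails the
 * checks above and is rejected.
 */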
static int
passthru_init(struct pci_devinst *pi, nvlist_t *nvl)
{
	int bus, slot, func, error, memflags;
	struct passthru_softc *sc;
	struct passthru_dev **devpp;
	struct passthru_dev *devp, *dev = NULL;
	const char *value;

	sc = NULL;
	error = 1;

	memflags = vm_get_memflags(pi->pi_vmctx);
	if (!(memflags & VM_MEM_F_WIRED)) {
		warnx("passthru requires guest memory to be wired");
		return (error);
	}

	if (pcifd < 0 && pcifd_init()) {
		return (error);
	}

#define GET_INT_CONFIG(var, name) do {					\
	value = get_config_value_node(nvl, name);			\
	if (value == NULL) {						\
		EPRINTLN("passthru: missing required %s setting", name); \
		return (error);						\
	}								\
	var = atoi(value);						\
} while (0)

	value = get_config_value_node(nvl, "pptdev");
	if (value != NULL) {
		if (!passthru_lookup_pptdev(value, &bus, &slot, &func))
			return (error);
	} else {
		GET_INT_CONFIG(bus, "bus");
		GET_INT_CONFIG(slot, "slot");
		GET_INT_CONFIG(func, "func");
	}

	if (vm_assign_pptdev(pi->pi_vmctx, bus, slot, func) != 0) {
		warnx("PCI device at %d/%d/%d is not using the ppt(4) driver",
		    bus, slot, func);
		goto done;
	}

	sc = calloc(1, sizeof(struct passthru_softc));

	pi->pi_arg = sc;
	sc->psc_pi = pi;

	/* initialize config space */
	if ((error = cfginit(pi, bus, slot, func)) != 0)
		goto done;

	/* initialize ROM */
	if ((error = passthru_init_rom(sc,
	    get_config_value_node(nvl, "rom"))) != 0)
		goto done;

	/* Emulate most PCI header registers. */
	if ((error = set_pcir_handler(sc, 0, PCIR_MAXLAT + 1,
	    passthru_cfgread_emulate, passthru_cfgwrite_emulate)) != 0)
		goto done;

	/* Allow direct access to the physical command and status registers. */
	if ((error = set_pcir_handler(sc, PCIR_COMMAND, 0x04, NULL, NULL))
	    != 0)
		goto done;

	SET_FOREACH(devpp, passthru_dev_set) {
		devp = *devpp;
		assert(devp->probe != NULL);
		if (devp->probe(pi) == 0) {
			dev = devp;
			break;
		}
	}

	if (dev != NULL) {
		error = dev->init(pi, nvl);
		if (error != 0)
			goto done;
	}

	error = 0;		/* success */
done:
	if (error) {
		if (dev != NULL)
			dev->deinit(pi);
		free(sc);
		vm_unassign_pptdev(pi->pi_vmctx, bus, slot, func);
	}
	return (error);
}

static int
msicap_access(struct passthru_softc *sc, int coff)
{
	int caplen;

	if (sc->psc_msi.capoff == 0)
		return (0);

	caplen = msi_caplen(sc->psc_msi.msgctrl);

	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
		return (1);
	else
		return (0);
}

static int
msixcap_access(struct passthru_softc *sc, int coff)
{
	if (sc->psc_msix.capoff == 0)
		return (0);

	return (coff >= sc->psc_msix.capoff &&
	    coff < sc->psc_msix.capoff + MSIX_CAPLEN);
}
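/*
 * Example (hypothetical offsets): with psc_msi.capoff = 0x50 and a
 * 14-byte MSI capability, msicap_access() matches config offsets
 * 0x50-0x5d; msixcap_access() always tests a fixed MSIX_CAPLEN
 * (12-byte) window.
 */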
static int
passthru_cfgread_default(struct passthru_softc *sc,
    struct pci_devinst *pi __unused, int coff, int bytes, uint32_t *rv)
{
	/*
	 * MSI capability is emulated.
	 */
	if (msicap_access(sc, coff) || msixcap_access(sc, coff))
		return (-1);

	/*
	 * Emulate the command register.  If a single read reads both the
	 * command and status registers, read the status register from the
	 * device's config space.
	 */
	if (coff == PCIR_COMMAND) {
		uint32_t st;

		if (bytes <= 2)
			return (-1);

		st = passthru_read_config(&sc->psc_sel, PCIR_STATUS, 2);
		*rv = (st << 16) | pci_get_cfgdata16(pi, PCIR_COMMAND);
		return (0);
	}

	/* Everything else just read from the device's config space */
	*rv = passthru_read_config(&sc->psc_sel, coff, bytes);

	return (0);
}

int
passthru_cfgread_emulate(struct passthru_softc *sc __unused,
    struct pci_devinst *pi __unused, int coff __unused, int bytes __unused,
    uint32_t *rv __unused)
{
	return (-1);
}

static int
passthru_cfgread(struct pci_devinst *pi, int coff, int bytes, uint32_t *rv)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	if (sc->psc_pcir_rhandler[coff] != NULL)
		return (sc->psc_pcir_rhandler[coff](sc, pi, coff, bytes, rv));

	return (passthru_cfgread_default(sc, pi, coff, bytes, rv));
}

static int
passthru_cfgwrite_default(struct passthru_softc *sc, struct pci_devinst *pi,
    int coff, int bytes, uint32_t val)
{
	int error, msix_table_entries, i;
	uint16_t cmd_old;

	/*
	 * MSI capability is emulated
	 */
	if (msicap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
		    PCIY_MSI);
		error = vm_setup_pptdev_msi(pi->pi_vmctx, sc->psc_sel.pc_bus,
		    sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
		    pi->pi_msi.addr, pi->pi_msi.msg_data,
		    pi->pi_msi.maxmsgnum);
		if (error != 0)
			err(1, "vm_setup_pptdev_msi");
		return (0);
	}

	if (msixcap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
		    PCIY_MSIX);
		if (pi->pi_msix.enabled) {
			msix_table_entries = pi->pi_msix.table_count;
			for (i = 0; i < msix_table_entries; i++) {
				error = vm_setup_pptdev_msix(pi->pi_vmctx,
				    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
				    sc->psc_sel.pc_func, i,
				    pi->pi_msix.table[i].addr,
				    pi->pi_msix.table[i].msg_data,
				    pi->pi_msix.table[i].vector_control);
				if (error)
					err(1, "vm_setup_pptdev_msix");
			}
		} else {
			error = vm_disable_pptdev_msix(pi->pi_vmctx,
			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
			    sc->psc_sel.pc_func);
			if (error)
				err(1, "vm_disable_pptdev_msix");
		}
		return (0);
	}

	/*
	 * The command register is emulated, but the status register
	 * is passed through.
	 */
	if (coff == PCIR_COMMAND) {
		if (bytes <= 2)
			return (-1);

		/* Update the physical status register. */
		passthru_write_config(&sc->psc_sel, PCIR_STATUS, 2, val >> 16);

		/* Update the virtual command register. */
		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
		pci_set_cfgdata16(pi, PCIR_COMMAND, val & 0xffff);
		pci_emul_cmd_changed(pi, cmd_old);
		return (0);
	}

	passthru_write_config(&sc->psc_sel, coff, bytes, val);

	return (0);
}

int
passthru_cfgwrite_emulate(struct passthru_softc *sc __unused,
    struct pci_devinst *pi __unused, int coff __unused, int bytes __unused,
    uint32_t val __unused)
{
	return (-1);
}

static int
passthru_cfgwrite(struct pci_devinst *pi, int coff, int bytes, uint32_t val)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	if (sc->psc_pcir_whandler[coff] != NULL)
		return (sc->psc_pcir_whandler[coff](sc, pi, coff, bytes, val));

	return (passthru_cfgwrite_default(sc, pi, coff, bytes, val));
}
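/*
 * Example for the 4-byte PCIR_COMMAND read path above (hypothetical
 * values): a physical status of 0x0010 combined with a virtual command
 * value of 0x0007 yields (0x0010 << 16) | 0x0007 = 0x00100007.
 */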
static void
passthru_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
    uint64_t value)
{
	struct passthru_softc *sc;
	struct pci_bar_ioreq pio;

	sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi)) {
		msix_table_write(sc, offset, size, value);
	} else {
		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
		assert(size == 1 || size == 2 || size == 4);
		assert(offset <= UINT32_MAX && offset + size <= UINT32_MAX);

		bzero(&pio, sizeof(pio));
		pio.pbi_sel = sc->psc_sel;
		pio.pbi_op = PCIBARIO_WRITE;
		pio.pbi_bar = baridx;
		pio.pbi_offset = (uint32_t)offset;
		pio.pbi_width = size;
		pio.pbi_value = (uint32_t)value;

		(void)ioctl(pcifd, PCIOCBARIO, &pio);
	}
}

static uint64_t
passthru_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size)
{
	struct passthru_softc *sc;
	struct pci_bar_ioreq pio;
	uint64_t val;

	sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi)) {
		val = msix_table_read(sc, offset, size);
	} else {
		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
		assert(size == 1 || size == 2 || size == 4);
		assert(offset <= UINT32_MAX && offset + size <= UINT32_MAX);

		bzero(&pio, sizeof(pio));
		pio.pbi_sel = sc->psc_sel;
		pio.pbi_op = PCIBARIO_READ;
		pio.pbi_bar = baridx;
		pio.pbi_offset = (uint32_t)offset;
		pio.pbi_width = size;

		(void)ioctl(pcifd, PCIOCBARIO, &pio);

		val = pio.pbi_value;
	}

	return (val);
}

static void
passthru_msix_addr(struct pci_devinst *pi, int baridx, int enabled,
    uint64_t address)
{
	struct passthru_softc *sc;
	size_t remaining;
	uint32_t table_size, table_offset;

	sc = pi->pi_arg;
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
	if (table_offset > 0) {
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(pi->pi_vmctx,
			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
			    sc->psc_sel.pc_func, address, table_offset) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(pi->pi_vmctx,
			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
			    sc->psc_sel.pc_func, address, table_offset,
			    sc->psc_bar[baridx].addr) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);
	remaining = pi->pi_bar[baridx].size - table_offset - table_size;
	if (remaining > 0) {
		address += table_offset + table_size;
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(pi->pi_vmctx,
			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
			    sc->psc_sel.pc_func, address, remaining) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(pi->pi_vmctx,
			    sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
			    sc->psc_sel.pc_func, address, remaining,
			    sc->psc_bar[baridx].addr + table_offset +
			    table_size) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
}

static void
passthru_mmio_addr(struct pci_devinst *pi, int baridx, int enabled,
    uint64_t address)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;
	if (!enabled) {
		if (vm_unmap_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus,
		    sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address,
		    sc->psc_bar[baridx].size) != 0)
			warnx("pci_passthru: unmap_pptdev_mmio failed");
	} else {
		if (vm_map_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus,
		    sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address,
		    sc->psc_bar[baridx].size, sc->psc_bar[baridx].addr) != 0)
			warnx("pci_passthru: map_pptdev_mmio failed");
	}
}

static void
passthru_addr_rom(struct pci_devinst *const pi, const int idx,
    const int enabled)
{
	const uint64_t addr = pi->pi_bar[idx].addr;
	const uint64_t size = pi->pi_bar[idx].size;

	if (!enabled) {
		if (vm_munmap_memseg(pi->pi_vmctx, addr, size) != 0) {
			errx(4, "%s: munmap_memseg @ [%016lx - %016lx] failed",
			    __func__, addr, addr + size);
		}
	} else {
		if (vm_mmap_memseg(pi->pi_vmctx, addr, VM_PCIROM,
		    pi->pi_romoffset, size, PROT_READ | PROT_EXEC) != 0) {
			errx(4, "%s: mmap_memseg @ [%016lx - %016lx] failed",
			    __func__, addr, addr + size);
		}
	}
}

static void
passthru_addr(struct pci_devinst *pi, int baridx, int enabled,
    uint64_t address)
{
	switch (pi->pi_bar[baridx].type) {
	case PCIBAR_IO:
		/* IO BARs are emulated */
		break;
	case PCIBAR_ROM:
		passthru_addr_rom(pi, baridx, enabled);
		break;
	case PCIBAR_MEM32:
	case PCIBAR_MEM64:
		if (baridx == pci_msix_table_bar(pi))
			passthru_msix_addr(pi, baridx, enabled, address);
		else
			passthru_mmio_addr(pi, baridx, enabled, address);
		break;
	default:
		errx(4, "%s: invalid BAR type %d", __func__,
		    pi->pi_bar[baridx].type);
	}
}

static const struct pci_devemu passthru = {
	.pe_emu		= "passthru",
	.pe_init	= passthru_init,
	.pe_legacy_config = passthru_legacy_config,
	.pe_cfgwrite	= passthru_cfgwrite,
	.pe_cfgread	= passthru_cfgread,
	.pe_barwrite	= passthru_write,
	.pe_barread	= passthru_read,
	.pe_baraddr	= passthru_addr,
};
PCI_EMUL_SET(passthru);
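/*
 * Usage sketch: passthru_legacy_config() accepts either a ppt(4)
 * instance name or a bus/slot/function selector, so a host device can
 * be handed to a guest with, e.g. (hypothetical slot assignments):
 *
 *	bhyve -S -s 4:0,passthru,ppt0 ...
 *	bhyve -S -s 4:0,passthru,pci0:1:0:0 ...
 *
 * The -S flag wires guest memory, which passthru_init() requires.
 */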