diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -73,6 +73,7 @@ VM_SYSMEM, VM_BOOTROM, VM_FRAMEBUFFER, + VM_PCIROM, }; /* @@ -180,6 +181,8 @@ vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len); +int vm_get_memory_region_info(struct vmctx *const ctx, vm_paddr_t *const base, + vm_paddr_t *const size, const enum vm_memory_region_type type); int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -1012,6 +1012,25 @@ return (ioctl(ctx->fd, VM_UNMAP_PPTDEV_MMIO, &pptmmio)); } +int +vm_get_memory_region_info(struct vmctx *const ctx, vm_paddr_t *const base, + vm_paddr_t *const size, const enum vm_memory_region_type type) +{ + struct vm_memory_region_info memory_region_info; + + bzero(&memory_region_info, sizeof(memory_region_info)); + memory_region_info.type = type; + + const int error = ioctl(ctx->fd, VM_GET_MEMORY_REGION_INFO, &memory_region_info); + + if (base) + *base = memory_region_info.base; + if (size) + *size = memory_region_info.size; + + return (error); +} + int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) @@ -1687,7 +1706,7 @@ VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, VM_PPTDEV_MSIX, VM_UNMAP_PPTDEV_MMIO, VM_PPTDEV_DISABLE_MSIX, - VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, + VM_GET_MEMORY_REGION_INFO, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, VM_GLA2GPA_NOFAULT, diff --git a/sys/amd64/include/vmm.h 
b/sys/amd64/include/vmm.h --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -741,6 +741,11 @@ } u; }; +enum vm_memory_region_type { + MEMORY_REGION_INTEL_GSM, + MEMORY_REGION_INTEL_OPREGION, +}; + /* APIs to inject faults into the guest */ void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, int errcode); diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -146,6 +146,17 @@ size_t len; }; +struct vm_memory_region_info { + vm_paddr_t base; + vm_paddr_t size; + enum vm_memory_region_type type; +}; + +#ifdef _KERNEL +extern vm_paddr_t intel_graphics_stolen_base; +extern vm_paddr_t intel_graphics_stolen_size; +#endif + struct vm_pptdev_msi { int vcpu; int bus; @@ -309,6 +320,7 @@ IOCNUM_PPTDEV_MSIX = 44, IOCNUM_PPTDEV_DISABLE_MSIX = 45, IOCNUM_UNMAP_PPTDEV_MMIO = 46, + IOCNUM_GET_MEMORY_REGION_INFO = 47, /* statistics */ IOCNUM_VM_STATS = 50, @@ -427,6 +439,8 @@ _IOW('v', IOCNUM_PPTDEV_DISABLE_MSIX, struct vm_pptdev) #define VM_UNMAP_PPTDEV_MMIO \ _IOW('v', IOCNUM_UNMAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_GET_MEMORY_REGION_INFO \ + _IOWR('v', IOCNUM_GET_MEMORY_REGION_INFO, struct vm_memory_region_info) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS \ diff --git a/sys/amd64/vmm/intel/intelgpu.h b/sys/amd64/vmm/intel/intelgpu.h new file mode 100644 --- /dev/null +++ b/sys/amd64/vmm/intel/intelgpu.h @@ -0,0 +1,185 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. 
KG + * Author: Corvin Köhne + */ + +#pragma once + +/* + * See + * + */ + +#define IGD_OPREGION_HEADER_SIGN "IntelGraphicsMem" +#define IGD_OPREGION_HEADER_MBOX1 BIT0 +#define IGD_OPREGION_HEADER_MBOX2 BIT1 +#define IGD_OPREGION_HEADER_MBOX3 BIT2 +#define IGD_OPREGION_HEADER_MBOX4 BIT3 +#define IGD_OPREGION_HEADER_MBOX5 BIT4 + +#define IGD_OPREGION_VBT_SIZE_6K (6 * 1024UL) + +/** + OpRegion structures: + Sub-structures define the different parts of the OpRegion followed by the + main structure representing the entire OpRegion. + + @note These structures are packed to 1 byte offsets because the exact + data location is required by the supporting design specification due to + the fact that the data is used by ASL and Graphics driver code compiled + separately. +**/ +#pragma pack(1) +/// +/// OpRegion Mailbox 0 Header structure. The OpRegion Header is used to +/// identify a block of memory as the graphics driver OpRegion. +/// Offset 0x0, Size 0x100 +/// +struct igd_opregion_header { + int8_t sign[0x10]; ///< Offset 0x00 OpRegion Signature + uint32_t size; ///< Offset 0x10 OpRegion Size + uint32_t over; ///< Offset 0x14 OpRegion Structure Version + uint8_t sver[0x20]; ///< Offset 0x18 System BIOS Build Version + uint8_t vver[0x10]; ///< Offset 0x38 Video BIOS Build Version + uint8_t gver[0x10]; ///< Offset 0x48 Graphic Driver Build Version + uint32_t mbox; ///< Offset 0x58 Supported Mailboxes + uint32_t dmod; ///< Offset 0x5C Driver Model + uint32_t pcon; ///< Offset 0x60 Platform Configuration + int16_t dver[0x10]; ///< Offset 0x64 GOP Version + uint8_t rm01[0x7C]; ///< Offset 0x84 Reserved Must be zero +}; + +/// +/// OpRegion Mailbox 1 - Public ACPI Methods +/// Offset 0x100, Size 0x100 +/// +struct igd_opregion_mbox1 { + uint32_t drdy; ///< Offset 0x100 Driver Readiness + uint32_t csts; ///< Offset 0x104 Status + uint32_t cevt; ///< Offset 0x108 Current Event + uint8_t rm11[0x14]; ///< Offset 0x10C Reserved Must be Zero + uint32_t didl[8]; ///< Offset 0x120 
Supported Display Devices ID List + uint32_t + cpdl[8]; ///< Offset 0x140 Currently Attached Display Devices List + uint32_t + cadl[8]; ///< Offset 0x160 Currently Active Display Devices List + uint32_t nadl[8]; ///< Offset 0x180 Next Active Devices List + uint32_t aslp; ///< Offset 0x1A0 ASL Sleep Time Out + uint32_t tidx; ///< Offset 0x1A4 Toggle Table Index + uint32_t chpd; ///< Offset 0x1A8 Current Hotplug Enable Indicator + uint32_t clid; ///< Offset 0x1AC Current Lid State Indicator + uint32_t cdck; ///< Offset 0x1B0 Current Docking State Indicator + uint32_t sxsw; ///< Offset 0x1B4 Display Switch Notification on Sx + ///< StateResume + uint32_t evts; ///< Offset 0x1B8 Events supported by ASL + uint32_t cnot; ///< Offset 0x1BC Current OS Notification + uint32_t NRDY; ///< Offset 0x1C0 Driver Status + uint8_t did2[0x1C]; ///< Offset 0x1C4 Extended Supported Devices ID + ///< List(DOD) + uint8_t + cpd2[0x1C]; ///< Offset 0x1E0 Extended Attached Display Devices List + uint8_t rm12[4]; ///< Offset 0x1FC - 0x1FF Reserved Must be zero +}; + +/// +/// OpRegion Mailbox 2 - Software SCI Interface +/// Offset 0x200, Size 0x100 +/// +struct igd_opregion_mbox2 { + uint32_t scic; ///< Offset 0x200 Software SCI Command / Status / Data + uint32_t parm; ///< Offset 0x204 Software SCI Parameters + uint32_t dslp; ///< Offset 0x208 Driver Sleep Time Out + uint8_t rm21[0xF4]; ///< Offset 0x20C - 0x2FF Reserved Must be zero +}; + +/// +/// OpRegion Mailbox 3 - BIOS/Driver Notification - ASLE Support +/// Offset 0x300, Size 0x100 +/// +struct igd_opregion_mbox3 { + uint32_t ardy; ///< Offset 0x300 Driver Readiness + uint32_t aslc; ///< Offset 0x304 ASLE Interrupt Command / Status + uint32_t tche; ///< Offset 0x308 Technology Enabled Indicator + uint32_t alsi; ///< Offset 0x30C Current ALS Luminance Reading + uint32_t bclp; ///< Offset 0x310 Requested Backlight Brightness + uint32_t pfit; ///< Offset 0x314 Panel Fitting State or Request + uint32_t cblv; ///< Offset 0x318 Current 
Brightness Level + uint16_t bclm[0x14]; ///< Offset 0x31C Backlight Brightness Levels Duty + ///< Cycle Mapping Table + uint32_t cpfm; ///< Offset 0x344 Current Panel Fitting Mode + uint32_t epfm; ///< Offset 0x348 Enabled Panel Fitting Modes + uint8_t plut[0x4A]; ///< Offset 0x34C Panel Look Up Table & Identifier + uint32_t pfmb; ///< Offset 0x396 PWM Frequency and Minimum Brightness + uint32_t ccdv; ///< Offset 0x39A Color Correction Default Values + uint32_t pcft; ///< Offset 0x39E Power Conservation Features + uint32_t srot; ///< Offset 0x3A2 Supported Rotation Angles + uint32_t iuer; ///< Offset 0x3A6 Intel Ultrabook(TM) Event Register + uint64_t fdss; ///< Offset 0x3AA DSS Buffer address allocated for IFFS + ///< feature + uint32_t fdsp; ///< Offset 0x3B2 Size of DSS buffer + uint32_t stat; ///< Offset 0x3B6 State Indicator + uint64_t rvda; ///< Offset 0x3BA Absolute/Relative Address of Raw VBT + ///< Data from OpRegion Base + uint32_t rvds; ///< Offset 0x3C2 Raw VBT Data Size + uint8_t rsvd2[0x3A]; ///< Offset 0x3C6 - 0x3FF Reserved Must be zero. 
+ ///< Bug in spec 0x45(69) +}; + +/// +/// OpRegion Mailbox 4 - VBT Video BIOS Table +/// Offset 0x400, Size 0x1800 +/// +struct igd_opregion_mbox4 { + uint8_t rvbt[IGD_OPREGION_VBT_SIZE_6K]; ///< Offset 0x400 - 0x1BFF Raw + ///< VBT Data +}; + +/// +/// OpRegion Mailbox 5 - BIOS/Driver Notification - Data storage BIOS to Driver +/// data sync Offset 0x1C00, Size 0x400 +/// +struct igd_opregion_mbox5 { + uint32_t phed; ///< Offset 0x1C00 Panel Header + uint8_t bddc[0x100]; ///< Offset 0x1C04 Panel EDID (DDC data) + uint8_t rm51[0x2FC]; ///< Offset 0x1D04 - 0x1FFF Reserved Must be zero +}; + +/// +/// IGD OpRegion Structure +/// +struct igd_opregion { + struct igd_opregion_header + header; ///< OpRegion header (Offset 0x0, Size 0x100) + struct igd_opregion_mbox1 mbox1; ///< Mailbox 1: Public ACPI Methods + ///< (Offset 0x100, Size 0x100) + struct igd_opregion_mbox2 mbox2; ///< Mailbox 2: Software SCI Interface + ///< (Offset 0x200, Size 0x100) + struct igd_opregion_mbox3 + mbox3; ///< Mailbox 3: BIOS to Driver Notification (Offset 0x300, + ///< Size 0x100) + struct igd_opregion_mbox4 mbox4; ///< Mailbox 4: Video BIOS Table (VBT) + ///< (Offset 0x400, Size 0x1800) + struct igd_opregion_mbox5 + mbox5; ///< Mailbox 5: BIOS to Driver Notification Extension (Offset + ///< 0x1C00, Size 0x400) +}; + +/// +/// VBT Header Structure +/// +struct vbt_header { + uint8_t product_string[20]; + uint16_t version; + uint16_t header_size; + uint16_t table_size; + uint8_t checksum; + uint8_t reserved1; + uint32_t bios_data_offset; + uint32_t aim_data_offset[4]; +}; + +#pragma pack() + +int vm_intelgpu_get_opregion(vm_paddr_t *const base, vm_paddr_t *const size); diff --git a/sys/amd64/vmm/intel/intelgpu.c b/sys/amd64/vmm/intel/intelgpu.c new file mode 100644 --- /dev/null +++ b/sys/amd64/vmm/intel/intelgpu.c @@ -0,0 +1,55 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. 
KG + * Author: Corvin Köhne + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include + +#include +#include + +#include "intelgpu.h" + +#define KB (1024UL) + +int +vm_intelgpu_get_opregion(vm_paddr_t *const base, vm_paddr_t *const size) +{ + /* intel graphics device is always located at 0:2.0 */ + device_t dev = pci_find_bsf(0, 2, 0); + if (dev == NULL) { + return (ENOENT); + } + + if ((pci_get_vendor(dev) != PCI_VENDOR_INTEL) || + (pci_get_class(dev) != PCIC_DISPLAY) || + (pci_get_subclass(dev) != PCIS_DISPLAY_VGA)) { + return (ENODEV); + } + + const uint64_t asls = pci_read_config(dev, PCIR_ASLS_CTL, 4); + + const struct igd_opregion_header *const opregion_header = + (struct igd_opregion_header *)pmap_map(NULL, asls, + asls + sizeof(*opregion_header), VM_PROT_READ); + if (opregion_header == NULL || + memcmp(opregion_header->sign, IGD_OPREGION_HEADER_SIGN, + sizeof(opregion_header->sign))) { + return (ENODEV); + } + + *base = asls; + *size = opregion_header->size * KB; + + return (0); +} diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -134,7 +134,7 @@ bool sysmem; struct vm_object *object; }; -#define VM_MAX_MEMSEGS 3 +#define VM_MAX_MEMSEGS 4 struct mem_map { vm_paddr_t gpa; diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -60,6 +60,7 @@ #include #include +#include "intel/intelgpu.h" #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" @@ -373,6 +374,7 @@ struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; + struct vm_memory_region_info *memory_region_info; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; struct vm_nmi *vmnmi; @@ -540,6 +542,24 @@ error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len); break; + case VM_GET_MEMORY_REGION_INFO: + memory_region_info = (struct 
vm_memory_region_info *)data; + switch (memory_region_info->type) { + case MEMORY_REGION_INTEL_GSM: + memory_region_info->base = intel_graphics_stolen_base; + memory_region_info->size = intel_graphics_stolen_size; + error = 0; + break; + case MEMORY_REGION_INTEL_OPREGION: + error = + vm_intelgpu_get_opregion(&memory_region_info->base, + &memory_region_info->size); + break; + default: + error = EINVAL; + break; + } + break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, diff --git a/sys/dev/pci/pcireg.h b/sys/dev/pci/pcireg.h --- a/sys/dev/pci/pcireg.h +++ b/sys/dev/pci/pcireg.h @@ -1098,3 +1098,14 @@ #define PCIM_OSC_CTL_PCIE_PME 0x04 /* PCIe Native Power Mgt Events */ #define PCIM_OSC_CTL_PCIE_AER 0x08 /* PCIe Advanced Error Reporting */ #define PCIM_OSC_CTL_PCIE_CAP_STRUCT 0x10 /* Various Capability Structures */ + +/* + * Intel graphics device definitions + */ +#define PCIR_BDSM 0x5C /* Base of Data Stolen Memory register */ +#define PCIR_ASLS_CTL 0xFC /* Opregion start address register */ + +/* + * PCI Vendors + */ +#define PCI_VENDOR_INTEL 0x8086 diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -42,6 +42,7 @@ # intel-specific files .PATH: ${SRCTOP}/sys/amd64/vmm/intel SRCS+= ept.c \ + intelgpu.c \ vmcs.c \ vmx_msr.c \ vmx_support.S \ diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -15,6 +15,7 @@ BHYVE_SYSDIR?=${SRCTOP} SRCS= \ + acpi_device.c \ atkbdc.c \ acpi.c \ audio.c \ @@ -26,6 +27,7 @@ console.c \ ctl_util.c \ ctl_scsi_all.c \ + e820.c \ fwctl.c \ gdb.c \ hda_codec.c \ @@ -42,6 +44,7 @@ pci_emul.c \ pci_hda.c \ pci_fbuf.c \ + pci_gvt-d.c \ pci_hostbridge.c \ pci_irq.c \ pci_lpc.c \ @@ -61,6 +64,7 @@ post.c \ ps2kbd.c \ ps2mouse.c \ + qemu_fwcfg.c \ rfb.c \ rtc.c \ smbiostbl.c \ diff --git a/usr.sbin/bhyve/acpi.h 
b/usr.sbin/bhyve/acpi.h --- a/usr.sbin/bhyve/acpi.h +++ b/usr.sbin/bhyve/acpi.h @@ -31,6 +31,8 @@ #ifndef _ACPI_H_ #define _ACPI_H_ +#include "acpi_device.h" + #define SCI_INT 9 #define SMI_CMD 0xb2 @@ -55,6 +57,7 @@ int acpi_build(struct vmctx *ctx, int ncpu); void acpi_raise_gpe(struct vmctx *ctx, unsigned bit); +int acpi_tables_add_device(const struct acpi_device *const dev); void dsdt_line(const char *fmt, ...); void dsdt_fixed_ioport(uint16_t iobase, uint16_t length); void dsdt_fixed_irq(uint8_t irq); diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c --- a/usr.sbin/bhyve/acpi.c +++ b/usr.sbin/bhyve/acpi.c @@ -139,6 +139,30 @@ #define EFFLUSH(x) \ if (fflush(x) != 0) goto err_exit; +/* + * A list for additional ACPI devices like a TPM. + */ +struct acpi_device_list_entry { + SLIST_ENTRY(acpi_device_list_entry) chain; + const struct acpi_device *dev; +}; +SLIST_HEAD(acpi_device_list, + acpi_device_list_entry) acpi_devices = SLIST_HEAD_INITIALIZER(acpi_devices); + +int +acpi_tables_add_device(const struct acpi_device *const dev) +{ + struct acpi_device_list_entry *const entry = calloc(1, sizeof(*entry)); + if (entry == NULL) { + return (ENOMEM); + } + + entry->dev = dev; + SLIST_INSERT_HEAD(&acpi_devices, entry, chain); + + return (0); +} + static int basl_fwrite_rsdp(FILE *fp) { @@ -760,6 +784,11 @@ vmgenc_write_dsdt(); + const struct acpi_device_list_entry *entry; + SLIST_FOREACH(entry, &acpi_devices, chain) { + acpi_device_write_dsdt(entry->dev); + } + dsdt_line("}"); if (dsdt_error != 0) diff --git a/usr.sbin/bhyve/acpi_device.h b/usr.sbin/bhyve/acpi_device.h new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/acpi_device.h @@ -0,0 +1,42 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * Author: Corvin Köhne + */ + +#pragma once + +#include + +struct vmctx; + +struct acpi_device; + +/** + * Creates an ACPI device. 
+ * + * @param[out] new_dev Returns the newly create ACPI device. + * @param[in] vm_ctx VM context the ACPI device is created in. + * @param[in] name Name of the ACPI device. Should always be a NULL + * terminated string. + * @param[in] hid Hardware ID of the ACPI device. Should always be a NULL + * terminated string. + */ +int acpi_device_create(struct acpi_device **const new_dev, + struct vmctx *const vm_ctx, const char *const name, const char *const hid); +void acpi_device_destroy(struct acpi_device *const dev); + +/** + * @note: acpi_device_add_res_acpi_buffer doesn't ensure that no resources are + * added on an error condition. On error the caller should assume that + * the ACPI_BUFFER is partially added to the ACPI device. + */ +int acpi_device_add_res_acpi_buffer(struct acpi_device *const dev, + const ACPI_BUFFER resources); +int acpi_device_add_res_fixed_ioport(struct acpi_device *const dev, + const UINT16 port, UINT8 length); +int acpi_device_add_res_fixed_memory32(struct acpi_device *const dev, + const UINT8 write_protected, const UINT32 address, const UINT32 length); + +void acpi_device_write_dsdt(const struct acpi_device *const dev); diff --git a/usr.sbin/bhyve/acpi_device.c b/usr.sbin/bhyve/acpi_device.c new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/acpi_device.c @@ -0,0 +1,240 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * Author: Corvin Köhne + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include + +#include +#include +#include + +#include "acpi.h" +#include "acpi_device.h" + +/** + * List entry to enumerate all resources used by an ACPI device. + * + * @param chain Used to chain multiple elements together. + * @param type Type of the ACPI resource. + * @param data Data of the ACPI resource. 
+ */ +struct acpi_resource_list_entry { + SLIST_ENTRY(acpi_resource_list_entry) chain; + UINT32 type; + ACPI_RESOURCE_DATA data; +}; + +/** + * Holds information about an ACPI device. + * + * @param vm_ctx VM context the ACPI device was created in. + * @param name Name of the ACPI device. + * @param hid Hardware ID of the ACPI device. + * @param crs Current resources used by the ACPI device. + */ +struct acpi_device { + struct vmctx *vm_ctx; + const char *name; + const char *hid; + SLIST_HEAD(acpi_resource_list, acpi_resource_list_entry) crs; +}; + +int +acpi_device_create(struct acpi_device **const new_dev, + struct vmctx *const vm_ctx, const char *const name, const char *const hid) +{ + if (new_dev == NULL || vm_ctx == NULL || name == NULL || hid == NULL) { + return (EINVAL); + } + + struct acpi_device *const dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + return (ENOMEM); + } + + dev->vm_ctx = vm_ctx; + dev->name = name; + dev->hid = hid; + SLIST_INIT(&dev->crs); + + /* current resources always contain an end tag */ + struct acpi_resource_list_entry *const crs_end_tag = calloc(1, + sizeof(*crs_end_tag)); + if (crs_end_tag == NULL) { + acpi_device_destroy(dev); + return (ENOMEM); + } + crs_end_tag->type = ACPI_RESOURCE_TYPE_END_TAG; + SLIST_INSERT_HEAD(&dev->crs, crs_end_tag, chain); + + const int error = acpi_tables_add_device(dev); + if (error) { + acpi_device_destroy(dev); + return (error); + } + + *new_dev = dev; + + return (0); +} + +/** + * Releases an ACPI device: frees every entry of its resource list and then + * the device structure itself. A NULL device is ignored. + * + * NOTE(review): acpi_device_create() hands the device to + * acpi_tables_add_device() and nothing here takes it back out, so this should + * not be called on a device that is still going to be written to the DSDT - + * confirm against callers. + */ +void +acpi_device_destroy(struct acpi_device *const dev) +{ + if (dev == NULL) { + return; + } + + struct acpi_resource_list_entry *res; + while (!SLIST_EMPTY(&dev->crs)) { + res = SLIST_FIRST(&dev->crs); + SLIST_REMOVE_HEAD(&dev->crs, chain); + free(res); + } + + /* The device itself was allocated by acpi_device_create(). */ + free(dev); +} + +int +acpi_device_add_res_acpi_buffer(struct acpi_device *const dev, + const ACPI_BUFFER resources) +{ + if (dev == NULL) { + return (EINVAL); + } + + int error = 0; + size_t offset = 0; + while (offset < resources.Length) { + const ACPI_RESOURCE *const res = + 
(const ACPI_RESOURCE *)((UINT8 *)resources.Pointer + + offset); + switch (res->Type) { + case ACPI_RESOURCE_TYPE_FIXED_IO: + error = acpi_device_add_res_fixed_ioport(dev, + res->Data.FixedIo.Address, + res->Data.FixedIo.AddressLength); + break; + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: + error = acpi_device_add_res_fixed_memory32(dev, + res->Data.FixedMemory32.WriteProtect, + res->Data.FixedMemory32.Address, + res->Data.FixedMemory32.AddressLength); + break; + case ACPI_RESOURCE_TYPE_END_TAG: + break; + default: + warnx("%s: unknown resource type %d", __func__, + res->Type); + return (ENODEV); + } + if (error) { + break; + } + offset += res->Length; + } + + return (error); +} + +int +acpi_device_add_res_fixed_ioport(struct acpi_device *const dev, + const UINT16 port, const UINT8 length) +{ + if (dev == NULL) { + return (EINVAL); + } + + struct acpi_resource_list_entry *const res = calloc(1, sizeof(*res)); + if (res == NULL) { + return (ENOMEM); + } + + res->type = ACPI_RESOURCE_TYPE_FIXED_IO; + res->data.FixedIo.Address = port; + res->data.FixedIo.AddressLength = length; + + SLIST_INSERT_HEAD(&dev->crs, res, chain); + + return (0); +} + +int +acpi_device_add_res_fixed_memory32(struct acpi_device *const dev, + const UINT8 write_protected, const UINT32 address, const UINT32 length) +{ + if (dev == NULL) { + return (EINVAL); + } + + struct acpi_resource_list_entry *const res = calloc(1, sizeof(*res)); + if (res == NULL) { + return (ENOMEM); + } + + res->type = ACPI_RESOURCE_TYPE_FIXED_MEMORY32; + res->data.FixedMemory32.WriteProtect = write_protected; + res->data.FixedMemory32.Address = address; + res->data.FixedMemory32.AddressLength = length; + + SLIST_INSERT_HEAD(&dev->crs, res, chain); + + return (0); +} + +static void +acpi_device_write_dsdt_crs(const struct acpi_device *const dev) +{ + const struct acpi_resource_list_entry *res; + SLIST_FOREACH (res, &dev->crs, chain) { + switch (res->type) { + case ACPI_RESOURCE_TYPE_FIXED_IO: + 
dsdt_fixed_ioport(res->data.FixedIo.Address, + res->data.FixedIo.AddressLength); + break; + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: { + dsdt_fixed_mem32(res->data.FixedMemory32.Address, + res->data.FixedMemory32.AddressLength); + break; + } + case ACPI_RESOURCE_TYPE_END_TAG: + break; + default: + warnx("%s: unknown resource type %d", __func__, + res->type); + return; + } + } +} + +void +acpi_device_write_dsdt(const struct acpi_device *const dev) +{ + if (dev == NULL) { + return; + } + + dsdt_line(""); + dsdt_line(" Scope (\\_SB)"); + dsdt_line(" {"); + dsdt_line(" Device (%s)", dev->name); + dsdt_line(" {"); + dsdt_line(" Name (_HID, \"%s\")", dev->hid); + dsdt_line(" Name (_STA, 0x0F)"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(4); + acpi_device_write_dsdt_crs(dev); + dsdt_unindent(4); + dsdt_line(" })"); + dsdt_line(" }"); + dsdt_line(" }"); +} diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -45,6 +45,15 @@ .Op Cm ,threads= Ar n .Oc .Sm on +.Oo Fl f +.Sm off +.Ar name Cm \&, +.Oo +.Cm string No | Cm file +.Oc +.Cm \&= Ar data +.Sm on +.Oc .Oo .Sm off .Fl G\~ @@ -144,6 +153,16 @@ .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes. +.It Fl f Ar name Ns Cm \&, Ns Oo Cm string Ns No | Ns Cm file Ns Oc Ns Cm \&= Ns Ar data +Add a fw_cfg file +.Ar name +to the fw_cfg interface. +If a +.Cm string +is specified, the fw_cfg file contains the string as data. +If a +.Cm file +is specified, bhyve reads the file and adds the file content as fw_cfg data. .It Fl G Xo .Sm off .Oo Ar w Oc @@ -520,6 +539,11 @@ and .Ar function numbers. +.It Li rom= Ns Ar romfile +Add +.Ar romfile +as option ROM to the PCI device. +The ROM will be loaded by firmware and should be capable of initializing the device. 
.El .Pp Guest memory must be wired using the diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -90,6 +90,7 @@ #include "config.h" #include "inout.h" #include "debug.h" +#include "e820.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" @@ -100,6 +101,7 @@ #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" +#include "qemu_fwcfg.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" @@ -1249,9 +1251,9 @@ progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT - optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:r:"; + optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:U:r:"; #else - optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:"; + optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { @@ -1279,6 +1281,11 @@ case 'C': set_config_bool("memory.guest_in_core", true); break; + case 'f': + if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) { + exit(1); + } + break; case 'G': parse_gdb_options(optarg); break; @@ -1452,6 +1459,57 @@ rtc_init(ctx); sci_init(ctx); + const char *fwcfg = lpc_fwcfg(); + if (lpc_bootrom()) { + if (fwcfg == NULL || strcmp(fwcfg, "bhyve") == 0) { + fwctl_init(); + } else if (strcmp(fwcfg, "qemu") == 0) { + if (qemu_fwcfg_init(ctx) != 0) { + fprintf(stderr, + "qemu fwcfg initialization error"); + exit(4); + } + /* + * QEMU uses fwcfg item 0x0f (FW_CFG_MAX_CPUS) to report + * the number of cpus to the guest but states that it + * has a special meaning for x86. Don't know yet if that + * can cause unintented side-effects. Use an own fwcfg + * item to be safe. 
+ * + * QEMU comment: + * FW_CFG_MAX_CPUS is a bit confusing/problematic + * on x86: + * + * For machine types prior to 1.8, SeaBIOS needs + * FW_CFG_MAX_CPUS for building MPTable, ACPI MADT, + * ACPI CPU hotplug and ACPI SRAT table, that + * tables are based on xAPIC ID and QEMU<->SeaBIOS + * interface for CPU hotplug also uses APIC ID and + * not "CPU index". This means that FW_CFG_MAX_CPUS + * is not the "maximum number of CPUs", but the + * "limit to the APIC ID values SeaBIOS may see". + * + * So for compatibility reasons with old BIOSes we + * are stuck with "etc/max-cpus" actually being + * apic_id_limit + */ + if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", + sizeof(guest_ncpus), &guest_ncpus) != 0) { + fprintf(stderr, + "Could not add qemu fwcfg opt/bhyve/hw.ncpu"); + exit(4); + } + + if (e820_init(ctx) != 0) { + fprintf(stderr, "Unable to setup E820"); + exit(4); + } + } else { + fprintf(stderr, "Invalid fwcfg %s", fwcfg); + exit(4); + } + } + /* * Exit if a device emulation finds an error in its initilization */ @@ -1535,8 +1593,20 @@ assert(error == 0); } - if (lpc_bootrom()) - fwctl_init(); + /* + * The qemu fwcfg interface and the E820 table are only initialized + * above when a bootrom is used and fwcfg is "qemu". fwcfg may be + * NULL (the setup code above checks for that), so it must not be + * handed to strcmp() unchecked. + */ + if (lpc_bootrom() && fwcfg != NULL && strcmp(fwcfg, "qemu") == 0) { + struct qemu_fwcfg_item *const e820_fwcfg_item = + e820_get_fwcfg_item(); + if (e820_fwcfg_item == NULL) { + fprintf(stderr, "invalid e820 table"); + exit(4); + } + if (qemu_fwcfg_add_file("etc/e820", e820_fwcfg_item->size, + e820_fwcfg_item->data) != 0) { + fprintf(stderr, "could not add qemu fwcfg etc/e820"); + exit(4); + } + free(e820_fwcfg_item); + } /* * Change the proc title to include the VM name. diff --git a/usr.sbin/bhyve/e820.h b/usr.sbin/bhyve/e820.h new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/e820.h @@ -0,0 +1,49 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. 
KG + * Author: Corvin Köhne + */ + +#pragma once + +#include + +#include "qemu_fwcfg.h" + +#pragma pack(push, 1) + +enum e820_memory_type { + E820_TYPE_MEMORY = 1, + E820_TYPE_RESERVED = 2, + E820_TYPE_ACPI = 3, + E820_TYPE_NVS = 4 +}; + +enum e820_allocation_strategy { + /* allocate any address */ + E820_ALLOCATE_ANY, + /* allocate lowest address larger than address */ + E820_ALLOCATE_LOWEST, + /* allocate highest address lower than address */ + E820_ALLOCATE_HIGHEST, + /* allocate a specific address */ + E820_ALLOCATE_SPECIFIC +}; + +struct e820_entry { + uint64_t base; + uint64_t length; + enum e820_memory_type type; +}; + +#pragma pack(pop) + +#define E820_ALIGNMENT_NONE 1 + +uint64_t e820_alloc(const uint64_t address, const uint64_t length, + const uint64_t alignment, const enum e820_memory_type type, + const enum e820_allocation_strategy strategy); +/* Prints every element of the E820 table to stderr. */ +void e820_dump_table(void); +/* + * Returns a newly allocated fwcfg item holding the table as an array of + * struct e820_entry, or NULL if the table is empty or an allocation fails. + * The item and its data buffer are separate malloc() allocations. + */ +struct qemu_fwcfg_item *e820_get_fwcfg_item(void); +int e820_init(struct vmctx *const ctx); diff --git a/usr.sbin/bhyve/e820.c b/usr.sbin/bhyve/e820.c new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/e820.c @@ -0,0 +1,452 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * Author: Corvin Köhne + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "e820.h" +#include "qemu_fwcfg.h" + +/* + * E820 always uses 64 bit entries. Emulation code will use vm_paddr_t since it + * works on physical addresses. If vm_paddr_t is larger than uint64_t E820 can't + * hold all possible physical addresses and we can get into trouble. 
+ */ +static_assert(sizeof(vm_paddr_t) <= sizeof(uint64_t), + "Unable to represent physical memory by E820 table"); + +#define E820_FWCFG_FILE_NAME "etc/e820" + +#define KB (1024UL) +#define MB (1024 * KB) +#define GB (1024 * MB) + +/* + * Fix E820 memory holes: + * [ A0000, C0000) VGA + * [ C0000, 100000) ROM + */ +#define E820_VGA_MEM_BASE 0xA0000 +#define E820_VGA_MEM_END 0xC0000 +#define E820_ROM_MEM_BASE 0xC0000 +#define E820_ROM_MEM_END 0x100000 + +struct e820_element { + TAILQ_ENTRY(e820_element) chain; + uint64_t base; + uint64_t end; + enum e820_memory_type type; +}; +TAILQ_HEAD(e820_table, e820_element) e820_table = TAILQ_HEAD_INITIALIZER( + e820_table); + +static const char * +e820_get_type_name(const enum e820_memory_type type) +{ + switch (type) { + case E820_TYPE_MEMORY: + return "RAM "; + case E820_TYPE_RESERVED: + return "Reserved"; + case E820_TYPE_ACPI: + return "ACPI "; + case E820_TYPE_NVS: + return "NVS "; + default: + return "Unknown "; + } +} + +void +e820_dump_table() +{ + fprintf(stderr, "E820 map:\n\r"); + uint64_t i = 0; + struct e820_element *element; + TAILQ_FOREACH (element, &e820_table, chain) { + fprintf(stderr, " (%4lu) [ %16lx, %16lx] %s\n\r", i, + element->base, element->end, + e820_get_type_name(element->type)); + ++i; + } +} + +struct qemu_fwcfg_item * +e820_get_fwcfg_item() +{ + uint64_t count = 0; + struct e820_element *element; + TAILQ_FOREACH (element, &e820_table, chain) { + ++count; + } + if (count == 0) { + warnx("%s: E820 table empty", __func__); + return (NULL); + } + + struct qemu_fwcfg_item *const fwcfg_item = malloc( + sizeof(struct qemu_fwcfg_item)); + if (fwcfg_item == NULL) { + return (NULL); + } + fwcfg_item->size = count * sizeof(struct e820_entry); + fwcfg_item->data = malloc(fwcfg_item->size); + if (fwcfg_item->data == NULL) { + free(fwcfg_item); + return (NULL); + } + uint64_t i = 0; + struct e820_entry *entries = (struct e820_entry *)fwcfg_item->data; + TAILQ_FOREACH (element, &e820_table, chain) { + struct 
e820_entry *entry = &entries[i]; + entry->base = element->base; + entry->length = element->end - element->base; + entry->type = element->type; + ++i; + } + + return fwcfg_item; +} + +/* + * Adds the range from base up to end (length end - base) with the given type + * to the sorted E820 table. System memory must not overlap the element found + * at or after base; every other type is carved out of an existing system + * memory element, which is shrunk or split accordingly. + * + * Returns 0 on success, -1 if the range is invalid or does not fit and + * -ENOMEM if an allocation fails. Nothing is inserted on failure and the + * element allocated for the new range is released again. + */ +int +e820_add_entry(const uint64_t base, const uint64_t end, + const enum e820_memory_type type) +{ + if (end < base) { + return (-1); + } + + struct e820_element *const new_element = malloc( + sizeof(struct e820_element)); + if (new_element == NULL) { + return (-ENOMEM); + } + + new_element->base = base; + new_element->end = end; + new_element->type = type; + + /* + * E820 table should be always sorted in ascending order. Therefore, + * search for an element whose end is larger than the base parameter. + */ + struct e820_element *element; + TAILQ_FOREACH (element, &e820_table, chain) { + if (element->end > base) { + break; + } + } + + /* + * System memory requires special handling. + */ + if (type == E820_TYPE_MEMORY) { + /* + * base is not below the end of any existing element. Add new + * system memory at the end of the table. + */ + if (element == NULL) { + TAILQ_INSERT_TAIL(&e820_table, new_element, chain); + return (0); + } + + /* + * System memory shouldn't overlap with any existing element. + */ + if (end > element->base) { + free(new_element); + return (-1); + } + TAILQ_INSERT_BEFORE(element, new_element, chain); + return (0); + } + + if (element == NULL) { + /* No suitable element found */ + free(new_element); + return (-1); + } + + /* + * Non system memory should be allocated inside system memory. + */ + if (element->type != E820_TYPE_MEMORY) { + free(new_element); + return (-1); + } + /* + * New element should fit into existing system memory element. + */ + if (base < element->base || end > element->end) { + free(new_element); + return (-1); + } + + if (base == element->base) { + /* + * New element at system memory base boundary. Add new + * element before current and adjust the base of the old + * element. 
+ * + * Old table: + * [ 0x1000, 0x4000] RAM <-- element + * New table: + * [ 0x1000, 0x2000] Reserved + * [ 0x2000, 0x4000] RAM <-- element + */ + TAILQ_INSERT_BEFORE(element, new_element, chain); + element->base = end; + } else if (end == element->end) { + /* + * New element at system memory end boundary. Add new + * element after current and adjust the end of the + * current element. + * + * Old table: + * [ 0x1000, 0x4000] RAM <-- element + * New table: + * [ 0x1000, 0x3000] RAM <-- element + * [ 0x3000, 0x4000] Reserved + */ + TAILQ_INSERT_AFTER(&e820_table, element, new_element, chain); + element->end = base; + } else { + /* + * New element inside system memory entry. Split it by + * adding a system memory element and the new element + * before current. + * + * Old table: + * [ 0x1000, 0x4000] RAM <-- element + * New table: + * [ 0x1000, 0x2000] RAM + * [ 0x2000, 0x3000] Reserved + * [ 0x3000, 0x4000] RAM <-- element + */ + struct e820_element *ram_element = malloc( + sizeof(struct e820_element)); + if (ram_element == NULL) { + /* Nothing was inserted yet; do not leak the new element. */ + free(new_element); + return (-ENOMEM); + } + ram_element->base = element->base; + ram_element->end = base; + ram_element->type = E820_TYPE_MEMORY; + TAILQ_INSERT_BEFORE(element, ram_element, chain); + TAILQ_INSERT_BEFORE(element, new_element, chain); + element->base = end; + } + + return (0); +} + +static int +e820_add_memory_hole(const uint64_t base, const uint64_t end) +{ + if (end < base) { + return (-1); + } + + /* + * E820 table should be always sorted in ascending order. Therefore, + * search for an element which end is larger than the base parameter. + */ + struct e820_element *element; + TAILQ_FOREACH (element, &e820_table, chain) { + if (element->end > base) { + break; + } + } + + if (element == NULL || end <= element->base) { + /* Nothing to do. 
Hole already exists */ + return (0); + } + + if (element->type != E820_TYPE_MEMORY) { + /* Memory holes are only allowed in system memory */ + return (-1); + } + + if (base == element->base) { + /* + * New hole at system memory base boundary. + * + * Old table: + * [ 0x1000, 0x4000] RAM + * New table: + * [ 0x2000, 0x4000] RAM + */ + element->base = end; + + } else if (end == element->end) { + /* + * New hole at system memory end boundary. + * + * Old table: + * [ 0x1000, 0x4000] RAM + * New table: + * [ 0x1000, 0x3000] RAM + */ + element->end = base; + + } else { + /* + * New hole inside system memory entry. Split the system memory. + * + * Old table: + * [ 0x1000, 0x4000] RAM <-- element + * New table: + * [ 0x1000, 0x2000] RAM + * [ 0x3000, 0x4000] RAM <-- element + */ + struct e820_element *const ram_element = malloc( + sizeof(struct e820_element)); + if (ram_element == NULL) { + return (-ENOMEM); + } + ram_element->base = element->base; + ram_element->end = base; + ram_element->type = E820_TYPE_MEMORY; + TAILQ_INSERT_BEFORE(element, ram_element, chain); + element->base = end; + } + + return (0); +} + +/* + * Allocate length bytes of the given type at the highest suitably aligned + * address inside a system memory element, not extending past max_address. + * The table is searched from its last element backwards. + * + * Returns the allocated address, or 0 if no element can hold the request. + * Because 0 is the failure value, an allocation that would start at + * address 0 is never made. + */ +static uint64_t +e820_alloc_highest(const uint64_t max_address, const uint64_t length, + const uint64_t alignment, const enum e820_memory_type type) +{ + struct e820_element *element; + TAILQ_FOREACH_REVERSE (element, &e820_table, e820_table, chain) { + const uint64_t end = MIN(max_address, element->end); + const uint64_t base = roundup2(element->base, alignment); + + /* + * Test the aligned address that is really used below: rounding + * down may reach 0 even though end - length is non-zero. + */ + if (element->type != E820_TYPE_MEMORY || end < base || + end - base < length || + rounddown2(end - length, alignment) == 0) { + continue; + } + + const uint64_t address = rounddown2(end - length, alignment); + + if (e820_add_entry(address, address + length, type) != 0) { + return 0; + } + + return address; + } + + return 0; +} + +static uint64_t +e820_alloc_lowest(const uint64_t min_address, const uint64_t length, + const uint64_t alignment, const enum e820_memory_type type) +{ + struct e820_element *element; +
TAILQ_FOREACH (element, &e820_table, chain) { + const uint64_t end = element->end; + const uint64_t base = MAX(min_address, + roundup2(element->base, alignment)); + + if (element->type != E820_TYPE_MEMORY || end < base || + end - base < length || base == 0) { + continue; + } + + if (e820_add_entry(base, base + length, type) != 0) { + return 0; + } + + return base; + } + + return 0; +} + +uint64_t +e820_alloc(const uint64_t address, const uint64_t length, + const uint64_t alignment, const enum e820_memory_type type, + const enum e820_allocation_strategy strategy) +{ + /* address should be aligned */ + if (!powerof2(alignment) || (address & (alignment - 1)) != 0) { + return 0; + } + + switch (strategy) { + case E820_ALLOCATE_ANY: + /* + * Allocate any address. Therefore, ignore the address parameter + * and reuse the code path for allocating the lowest address. + */ + return e820_alloc_lowest(0, length, alignment, type); + case E820_ALLOCATE_LOWEST: + return e820_alloc_lowest(address, length, alignment, type); + case E820_ALLOCATE_HIGHEST: + return e820_alloc_highest(address, length, alignment, type); + case E820_ALLOCATE_SPECIFIC: + if (e820_add_entry(address, address + length, type) != 0) { + return 0; + } + + return address; + } + + return 0; +} + +int +e820_init(struct vmctx *const ctx) +{ + int error; + + TAILQ_INIT(&e820_table); + + /* add memory below 4 GB to E820 table */ + const uint64_t lowmem_length = vm_get_lowmem_size(ctx); + error = e820_add_entry(0, lowmem_length, E820_TYPE_MEMORY); + if (error) { + warnx("%s: Could not add lowmem", __func__); + return (error); + } + + /* add memory above 4 GB to E820 table */ + const uint64_t highmem_length = vm_get_highmem_size(ctx); + if (highmem_length != 0) { + error = e820_add_entry(4 * GB, 4 * GB + highmem_length, + E820_TYPE_MEMORY); + if (error) { + warnx("%s: Could not add highmem", __func__); + return (error); + } + } + + /* add memory holes to E820 table */ + error = e820_add_memory_hole(E820_VGA_MEM_BASE, 
E820_VGA_MEM_END); + if (error) { + warnx("%s: Could not add VGA memory", __func__); + return (error); + } + + error = e820_add_memory_hole(E820_ROM_MEM_BASE, E820_ROM_MEM_END); + if (error) { + warnx("%s: Could not add ROM area", __func__); + return (error); + } + + return (0); +} diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -42,6 +42,8 @@ #include #define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ +#define PCI_BARMAX_WITH_ROM (PCI_BARMAX + 1) +#define PCI_ROM_IDX (PCI_BARMAX + 1) struct vmctx; struct pci_devinst; @@ -92,7 +94,8 @@ PCIBAR_IO, PCIBAR_MEM32, PCIBAR_MEM64, - PCIBAR_MEMHI64 + PCIBAR_MEMHI64, + PCIBAR_ROM, }; struct pcibar { @@ -165,7 +168,9 @@ void *pi_arg; /* devemu-private data */ u_char pi_cfgdata[PCI_REGMAX + 1]; - struct pcibar pi_bar[PCI_BARMAX + 1]; + /* ROM is handled like a BAR */ + struct pcibar pi_bar[PCI_BARMAX_WITH_ROM + 1]; + uint64_t pi_romoffset; }; struct msicap { @@ -229,6 +234,8 @@ void pci_callback(void); int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size); +int pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size, + void **const addr); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -33,10 +33,12 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -101,6 +103,9 @@ SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; +static uint8_t *pci_emul_rombase; +static uint64_t pci_emul_romoffset; +static uint8_t *pci_emul_romlim; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; static uint64_t 
pci_emul_memlim64; @@ -118,6 +123,8 @@ #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 +#define PCI_EMUL_ROMSIZE 0x10000000 + #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); @@ -562,6 +569,12 @@ (*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration, pi->pi_bar[idx].addr); break; + case PCIBAR_ROM: + error = 0; + if (pe->pe_baraddr != NULL) + (*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration, + pi->pi_bar[idx].addr); + break; default: error = EINVAL; break; @@ -583,6 +596,14 @@ modify_bar_registration(pi, idx, 1); } +/* Is the ROM enabled for the emulated pci device? */ +static int +romen(struct pci_devinst *pi) +{ + return (pi->pi_bar[PCI_ROM_IDX].lobits & PCIM_BIOS_ENABLE) == + PCIM_BIOS_ENABLE; +} + /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) @@ -649,7 +670,11 @@ pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { - assert(idx >= 0 && idx <= PCI_BARMAX); + if ((type != PCIBAR_ROM) && (idx < 0 || idx > PCI_BARMAX)) { + errx(4, "Illegal BAR idx"); + } else if ((type == PCIBAR_ROM) && (idx != PCI_ROM_IDX)) { + errx(4, "Illegal ROM idx"); + } if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ @@ -658,6 +683,9 @@ if (type == PCIBAR_IO) { if (size < 4) size = 4; + } else if (type == PCIBAR_ROM) { + if (size < ~PCIM_BIOS_ADDR_MASK + 1) + size = ~PCIM_BIOS_ADDR_MASK + 1; } else { if (size < 16) size = 16; @@ -715,6 +743,7 @@ break; case PCIBAR_MEM64: case PCIBAR_MEM32: + case PCIBAR_ROM: enbit = PCIM_CMD_MEMEN; break; default: @@ -773,6 +802,13 @@ mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; break; + case PCIBAR_ROM: + /* do not claim memory for ROM. OVMF will do it for us. 
*/ + baseptr = NULL; + limit = 0; + mask = PCIM_BIOS_ADDR_MASK; + lobits = 0; + break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); @@ -807,7 +843,57 @@ pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } - register_bar(pdi, idx); + if (type != PCIBAR_ROM) { + register_bar(pdi, idx); + } + + return (0); +} + +int +pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size, + void **const addr) +{ + /* allocate ROM space once on first call */ + if (pci_emul_rombase == 0) { + pci_emul_rombase = vm_create_devmem(pdi->pi_vmctx, VM_PCIROM, + "pcirom", PCI_EMUL_ROMSIZE); + if (pci_emul_rombase == MAP_FAILED) { + warnx("%s: failed to create rom segment", __func__); + return (-1); + } + pci_emul_romlim = pci_emul_rombase + PCI_EMUL_ROMSIZE; + pci_emul_romoffset = 0; + } + + /* ROM size should be a power of 2 and greater than 2 KB */ + const uint64_t rom_size = MAX(1UL << flsl(size), + ~PCIM_BIOS_ADDR_MASK + 1); + + /* check if ROM fits into ROM space */ + if (pci_emul_romoffset + rom_size > PCI_EMUL_ROMSIZE) { + warnx("%s: no space left in rom segment:", __func__); + warnx("%16lu bytes left", + PCI_EMUL_ROMSIZE - pci_emul_romoffset); + warnx("%16lu bytes required by %d/%d/%d", rom_size, pdi->pi_bus, + pdi->pi_slot, pdi->pi_func); + return (-1); + } + + /* allocate ROM BAR */ + const int error = pci_emul_alloc_bar(pdi, PCI_ROM_IDX, PCIBAR_ROM, + rom_size); + if (error) + return error; + + /* return address */ + *addr = pci_emul_rombase + pci_emul_romoffset; + + /* save offset into ROM Space */ + pdi->pi_romoffset = pci_emul_romoffset; + + /* increase offset for next ROM */ + pci_emul_romoffset += rom_size; return (0); } @@ -1885,7 +1971,7 @@ * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. 
*/ - for (i = 0; i <= PCI_BARMAX; i++) { + for (i = 0; i <= PCI_BARMAX_WITH_ROM; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: @@ -1899,6 +1985,11 @@ unregister_bar(pi, i); } break; + case PCIBAR_ROM: + /* skip (un-)registering the ROM if it is disabled */ + if (!romen(pi)) + break; + /* fallthrough */ case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ @@ -2019,16 +2110,21 @@ return; /* - * Special handling for write to BAR registers + * Special handling for write to BAR and ROM registers */ - if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { + if ((coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) || + (coff >= PCIR_BIOS && coff < PCIR_BIOS + 4)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) return; - idx = (coff - PCIR_BAR(0)) / 4; + if (coff != PCIR_BIOS) { + idx = (coff - PCIR_BAR(0)) / 4; + } else { + idx = PCI_ROM_IDX; + } mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: @@ -2071,6 +2167,20 @@ PCIBAR_MEMHI64); } break; + case PCIBAR_ROM: + addr = bar = *eax & mask; + if (memen(pi) && romen(pi)) { + unregister_bar(pi, idx); + } + pi->pi_bar[idx].addr = addr; + pi->pi_bar[idx].lobits = *eax & + PCIM_BIOS_ENABLE; + /* this write may have changed the ROM enable bit, so check romen() again */ + if (memen(pi) && romen(pi)) { + register_bar(pi, idx); + } + bar |= pi->pi_bar[idx].lobits; + break; default: assert(0); } diff --git a/usr.sbin/bhyve/pci_gvt-d.c b/usr.sbin/bhyve/pci_gvt-d.c new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/pci_gvt-d.c @@ -0,0 +1,262 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Beckhoff Automation GmbH & Co.
KG + * Author: Corvin Köhne + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "e820.h" +#include "inout.h" +#include "pci_passthru.h" + +#define MB (1024 * 1024UL) +#define GB (1024 * MB) + +#ifndef _PATH_MEM +#define _PATH_MEM "/dev/mem" +#endif + +/* + * PCI definitions + */ +#define PCIM_BDSM_GSM_ALIGNMENT \ + 0x00100000 /* Graphics Stolen Memory is 1 MB aligned */ + +/* GVT-d definitions */ +#define GVT_D_MAP_OPREGION 0 +#define GVT_D_MAP_GSM 1 + +static int +gvt_d_aslswrite(struct vmctx *const ctx, const int vcpu, + struct pci_devinst *const pi, const int coff, const int bytes, + const uint32_t val) +{ + struct passthru_softc *const sc = pi->pi_arg; + + struct passthru_mmio_mapping *const opregion = + &sc->psc_mmio_map[GVT_D_MAP_OPREGION]; + + /* write new value to cfg space */ + if (bytes == 1) { + pci_set_cfgdata8(pi, coff, val); + } else if (bytes == 2) { + pci_set_cfgdata16(pi, coff, val); + } else { + pci_set_cfgdata32(pi, coff, val); + } + + /* get new address of opregion */ + opregion->gpa = pci_get_cfgdata32(pi, PCIR_ASLS_CTL); + + /* copy opregion into guest mem */ + opregion->gva = vm_map_gpa(ctx, opregion->gpa, opregion->len); + if (opregion->gva == 0) { + warnx("%s: Unable to map opregion (0x%016lx)", __func__, + opregion->gpa); + /* return 0 to avoid emulation of ASLS register */ + return (0); + } + memcpy(opregion->gva, opregion->hva, opregion->len); + + return (0); +} + +static vm_paddr_t +gvt_d_alloc_mmio_memory(const vm_paddr_t host_address, const vm_paddr_t length, + const vm_paddr_t alignment, const enum e820_memory_type type) +{ + /* try to use host address */ + const vm_paddr_t address = e820_alloc(host_address, length, + E820_ALIGNMENT_NONE, type, E820_ALLOCATE_SPECIFIC); + if (address != 0) { + return address; + } + + /* try to use highest address below 4 GB */ + return e820_alloc(4 * GB, length, 
alignment, type, + E820_ALLOCATE_HIGHEST); +} + +static int +gvt_d_setup_gsm(struct vmctx *const ctx, struct pci_devinst *const pi) +{ + struct passthru_softc *const sc = pi->pi_arg; + + struct passthru_mmio_mapping *const gsm = + &sc->psc_mmio_map[GVT_D_MAP_GSM]; + + const int error = vm_get_memory_region_info(ctx, &gsm->hpa, &gsm->len, + MEMORY_REGION_INTEL_GSM); + if (error) { + warnx( + "%s: Unable to get Graphics Stolen Memory base and length", + __func__); + return (error); + } + gsm->hva = NULL; /* unused */ + gsm->gva = NULL; /* unused */ + gsm->gpa = gvt_d_alloc_mmio_memory(gsm->hpa, gsm->len, + PCIM_BDSM_GSM_ALIGNMENT, E820_TYPE_RESERVED); + if (gsm->gpa == 0) { + warnx( + "%s: Unable to add Graphics Stolen Memory to E820 table (hpa 0x%lx len 0x%lx)", + __func__, gsm->hpa, gsm->len); + e820_dump_table(); + return (-1); + } + if (gsm->gpa != gsm->hpa) { + /* + * ACRN source code implies that graphics driver for newer Intel + * platforms like Tiger Lake will read the Graphics Stolen + * Memory address from an MMIO register. We have three options + * to solve this issue: + * 1. Patch the value in the MMIO register + * This could have unintended side effects. Without + * any documentation how this register is used by + * the GPU, don't do it. + * 2. Trap the MMIO register + * It's not possible to trap a single MMIO + * register. We need to trap a whole page. Trapping + * a bunch of MMIO register could degrade the + * performance noticeably. + * 3. Use an 1:1 host to guest mapping + * Maybe not always possible. + * As far as we know, no supported platform requires a 1:1 + * mapping. For that reason, just log a warning. + */ + warnx( + "Warning: Unable to reuse host address of Graphics Stolen Memory. 
GPU passthrough might not work properly."); + } + + const uint64_t bdsm = read_config(&sc->psc_sel, PCIR_BDSM, 4); + pci_set_cfgdata32(pi, PCIR_BDSM, + gsm->gpa | (bdsm & (PCIM_BDSM_GSM_ALIGNMENT - 1))); + + return (0); +} + +static int +gvt_d_setup_opregion(struct vmctx *const ctx, struct pci_devinst *const pi, + const int memfd) +{ + struct passthru_softc *const sc = pi->pi_arg; + + struct passthru_mmio_mapping *const opregion = + &sc->psc_mmio_map[GVT_D_MAP_OPREGION]; + + const int error = vm_get_memory_region_info(ctx, &opregion->hpa, + &opregion->len, MEMORY_REGION_INTEL_OPREGION); + if (error) { + warnx("%s: Unable to get OpRegion base and length", __func__); + return (error); + } + opregion->hva = mmap(NULL, opregion->len, PROT_READ, MAP_SHARED, memfd, + opregion->hpa); + if (opregion->hva == MAP_FAILED) { + warnx("%s: Unable to map host OpRegion", __func__); + return (-1); + } + opregion->gpa = gvt_d_alloc_mmio_memory(opregion->hpa, opregion->len, + E820_ALIGNMENT_NONE, E820_TYPE_NVS); + if (opregion->gpa == 0) { + warnx( + "%s: Unable to add OpRegion to E820 table (hpa 0x%lx len 0x%lx)", + __func__, opregion->hpa, opregion->len); + e820_dump_table(); + return (-1); + } + opregion->gva = vm_map_gpa(ctx, opregion->gpa, opregion->len); + if (opregion->gva == NULL) { + warnx("%s: Unable to map guest OpRegion", __func__); + return (-1); + } + if (opregion->gpa != opregion->hpa) { + /* + * A 1:1 host to guest mapping is not required but this could + * change in the future. + */ + warnx( + "Warning: Unable to reuse host address of OpRegion. 
GPU passthrough might not work properly."); + } + + memcpy(opregion->gva, opregion->hva, opregion->len); + + pci_set_cfgdata32(pi, PCIR_ASLS_CTL, opregion->gpa); + + return (0); +} + +int +gvt_d_init(struct vmctx *const ctx, struct pci_devinst *const pi, + nvlist_t *const nvl) +{ + int error; + + struct passthru_softc *const sc = pi->pi_arg; + + /* get memory descriptor */ + const int memfd = open(_PATH_MEM, O_RDWR, 0); + if (memfd < 0) { + warn("%s: Failed to open %s", __func__, _PATH_MEM); + return (-1); + } + + if ((error = gvt_d_setup_gsm(ctx, pi)) != 0) { + warnx("%s: Unable to setup Graphics Stolen Memory", __func__); + goto done; + } + + if ((error = gvt_d_setup_opregion(ctx, pi, memfd)) != 0) { + warnx("%s: Unable to setup OpRegion", __func__); + goto done; + } + + /* protect Graphics Stolen Memory register */ + if ((error = set_pcir_handler(sc, PCIR_BDSM, 4, + passthru_cfgread_emulate, passthru_cfgwrite_emulate)) != 0) { + warnx("%s: Unable to protect opregion", __func__); + goto done; + } + /* protect opregion register */ + if ((error = set_pcir_handler(sc, PCIR_ASLS_CTL, 4, + passthru_cfgread_emulate, gvt_d_aslswrite)) != 0) { + warnx("%s: Unable to protect opregion", __func__); + goto done; + } + +done: + return (error); +} + +void +gvt_d_deinit(struct vmctx *const ctx, struct pci_devinst *const pi) +{ + struct passthru_softc *const sc = pi->pi_arg; + + struct passthru_mmio_mapping *const opregion = + &sc->psc_mmio_map[GVT_D_MAP_OPREGION]; + + /* HVA is only set, if it's initialized */ + if (opregion->hva) + munmap((void *)opregion->hva, opregion->len); +} diff --git a/usr.sbin/bhyve/pci_lpc.h b/usr.sbin/bhyve/pci_lpc.h --- a/usr.sbin/bhyve/pci_lpc.h +++ b/usr.sbin/bhyve/pci_lpc.h @@ -72,5 +72,6 @@ char *lpc_pirq_name(int pin); void lpc_pirq_routed(void); const char *lpc_bootrom(void); +const char *lpc_fwcfg(void); #endif diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c --- a/usr.sbin/bhyve/pci_lpc.c +++ b/usr.sbin/bhyve/pci_lpc.c @@ 
-32,13 +32,24 @@ #include __FBSDID("$FreeBSD$"); +#ifndef WITHOUT_CAPSICUM +#include +#endif #include +#include #include #include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include #include #include #include +#include #include @@ -85,6 +96,29 @@ "COM1", "COM2", "COM3", "COM4" }; +#ifndef _PATH_DEVPCI +#define _PATH_DEVPCI "/dev/pci" +#endif + +static int pcifd = -1; + +static uint32_t +read_config(const struct pcisel *const sel, const long reg, const int width) +{ + struct pci_io pi; + pi.pi_sel.pc_domain = sel->pc_domain; + pi.pi_sel.pc_bus = sel->pc_bus; + pi.pi_sel.pc_dev = sel->pc_dev; + pi.pi_sel.pc_func = sel->pc_func; + pi.pi_reg = reg; + pi.pi_width = width; + + if (ioctl(pcifd, PCIOCREAD, &pi) < 0) + return (0); + + return (pi.pi_data); +} + /* * LPC device configuration is in the following form: * [,] @@ -101,7 +135,13 @@ lpcdev = strsep(&str, ","); if (lpcdev != NULL) { if (strcasecmp(lpcdev, "bootrom") == 0) { - set_config_value("lpc.bootrom", str); + nvlist_t *const nvl = create_config_node("lpc.bootrom"); + /* use qemu as default fwcfg */ + set_config_value_node(nvl, "fwcfg", "qemu"); + + const char *const code = strsep(&str, ","); + set_config_value_node(nvl, "code", code); + pci_parse_legacy_config(nvl, str); error = 0; goto done; } @@ -145,7 +185,13 @@ lpc_bootrom(void) { - return (get_config_value("lpc.bootrom")); + return (get_config_value("lpc.bootrom.code")); +} + +const char * +lpc_fwcfg(void) +{ + return (get_config_value("lpc.bootrom.fwcfg")); } static void @@ -208,7 +254,7 @@ char *node_name; int unit, error; - romfile = get_config_value("lpc.bootrom"); + romfile = get_config_value("lpc.bootrom.code"); if (romfile != NULL) { error = bootrom_loadrom(ctx, romfile); if (error) @@ -452,6 +498,48 @@ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); + pcifd = open(_PATH_DEVPCI, O_RDWR, 0); + if (pcifd < 0) { + warn("failed to open %s", _PATH_DEVPCI); + return (-1); + } + 
+#ifndef WITHOUT_CAPSICUM + cap_rights_t pcifd_rights; + cap_rights_init(&pcifd_rights, CAP_IOCTL, CAP_READ); + + const cap_ioctl_t pcifd_ioctls[] = { PCIOCREAD }; + + if (caph_rights_limit(pcifd, &pcifd_rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(pcifd, pcifd_ioctls, nitems(pcifd_ioctls)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* on Intel systems lpc is always connected to 0:1f.0 */ + const struct pcisel sel = { .pc_dev = 0x1f }; + + if (read_config(&sel, PCIR_VENDOR, 2) == PCI_VENDOR_INTEL) { + /* + * The VID, DID, REVID, SUBVID and SUBDID of igd-lpc need to be + * aligned with the physical ones. Without these physical + * values, GVT-d GOP driver couldn't work. + */ + pci_set_cfgdata16(pi, PCIR_DEVICE, + read_config(&sel, PCIR_DEVICE, 2)); + pci_set_cfgdata16(pi, PCIR_VENDOR, + read_config(&sel, PCIR_VENDOR, 2)); + pci_set_cfgdata8(pi, PCIR_REVID, + read_config(&sel, PCIR_REVID, 1)); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, + read_config(&sel, PCIR_SUBVEND_0, 2)); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, + read_config(&sel, PCIR_SUBDEV_0, 2)); + } + + close(pcifd); + pcifd = -1; + lpc_bridge = pi; return (0); diff --git a/usr.sbin/bhyve/pci_passthru.h b/usr.sbin/bhyve/pci_passthru.h new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/pci_passthru.h @@ -0,0 +1,69 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Beckhoff Automation GmbH & Co. 
KG + * Author: Corvin Köhne + */ + +#pragma once + +#include + +#include + +#include "pci_emul.h" + +struct passthru_mmio_mapping { + vm_paddr_t gpa; /* guest physical address */ + void *gva; /* guest virtual address */ + vm_paddr_t hpa; /* host physical address */ + void *hva; /* host virtual address */ + vm_paddr_t len; /* length of the mapping in bytes */ +}; + +typedef int (*cfgread_handler)(struct vmctx *const ctx, const int vcpu, + struct pci_devinst *const pi, const int coff, const int bytes, + uint32_t *const rv); +typedef int (*cfgwrite_handler)(struct vmctx *const ctx, const int vcpu, + struct pci_devinst *const pi, const int coff, const int bytes, + const uint32_t val); + +struct passthru_softc { + struct pci_devinst *psc_pi; + /* ROM is handled like a BAR */ + struct pcibar psc_bar[PCI_BARMAX_WITH_ROM + 1]; + struct { + int capoff; + int msgctrl; + int emulated; + } psc_msi; + struct { + int capoff; + } psc_msix; + struct pcisel psc_sel; + + struct passthru_mmio_mapping psc_mmio_map[2]; /* OpRegion and GSM mappings used by pci_gvt-d.c */ + cfgread_handler psc_pcir_rhandler[PCI_REGMAX + 1]; /* config read handler per register offset */ + cfgwrite_handler psc_pcir_whandler[PCI_REGMAX + 1]; /* config write handler per register offset */ +}; + +uint32_t read_config(const struct pcisel *sel, long reg, int width); +void write_config(const struct pcisel *sel, long reg, int width, uint32_t data); +int passthru_cfgread_default(struct vmctx *const ctx, const int vcpu, + struct pci_devinst *const pi, const int coff, const int bytes, + uint32_t *const rv); +int passthru_cfgread_emulate(struct vmctx *const ctx, const int vcpu, + struct pci_devinst *const pi, const int coff, const int bytes, + uint32_t *const rv); +int passthru_cfgwrite_default(struct vmctx *const ctx, const int vcpu, + struct pci_devinst *const pi, const int coff, const int bytes, + const uint32_t val); +int passthru_cfgwrite_emulate(struct vmctx *const ctx, const int vcpu, + struct pci_devinst *const pi, const int coff, const int bytes, + const uint32_t val); +int set_pcir_handler(struct passthru_softc *const sc, const uint32_t reg, + const uint32_t len, const cfgread_handler
rhandler, + const cfgwrite_handler whandler); +int gvt_d_init(struct vmctx *const ctx, struct pci_devinst *const pi, + nvlist_t *const nvl); +void gvt_d_deinit(struct vmctx *const ctx, struct pci_devinst *const pi); diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c --- a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -37,8 +37,8 @@ #endif #include #include -#include #include +#include #include #include @@ -61,12 +61,11 @@ #include #include -#include #include "config.h" #include "debug.h" -#include "pci_emul.h" #include "mem.h" +#include "pci_passthru.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" @@ -77,21 +76,9 @@ #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 -static int pcifd = -1; +#define PCI_CAP_START_OFFSET 0x40 -struct passthru_softc { - struct pci_devinst *psc_pi; - struct pcibar psc_bar[PCI_BARMAX + 1]; - struct { - int capoff; - int msgctrl; - int emulated; - } psc_msi; - struct { - int capoff; - } psc_msix; - struct pcisel psc_sel; -}; +static int pcifd = -1; static int msi_caplen(int msgctrl) @@ -115,7 +102,7 @@ return (len); } -static uint32_t +uint32_t read_config(const struct pcisel *sel, long reg, int width) { struct pci_io pi; @@ -131,7 +118,7 @@ return (pi.pi_data); } -static void +void write_config(const struct pcisel *sel, long reg, int width, uint32_t data) { struct pci_io pi; @@ -444,7 +431,7 @@ memset(&pbm, 0, sizeof(pbm)); pbm.pbm_sel = sc->psc_sel; pbm.pbm_flags = PCIIO_BAR_MMAP_RW; - pbm.pbm_reg = PCIR_BAR(pi->pi_msix.pba_bar); + pbm.pbm_reg = PCIR_BAR(pi->pi_msix.table_bar); pbm.pbm_memattr = VM_MEMATTR_DEVICE; if (ioctl(pcifd, PCIOCBARMMAP, &pbm) != 0) { @@ -462,7 +449,7 @@ table_size = roundup2(table_size, 4096); /* - * Unmap any pages not covered by the table, we do not need to emulate + * Unmap any pages not containing the table, we do not need to emulate * accesses to them. 
Avoid releasing address space to help ensure that
 		 * a buggy out-of-bounds access causes a crash.
 		 */
@@ -471,7 +458,8 @@
 			    PROT_NONE) != 0)
 				warn("Failed to unmap MSI-X table BAR region");
 		if (table_offset + table_size != pi->pi_msix.mapped_size)
-			if (mprotect(pi->pi_msix.mapped_addr,
+			if (mprotect(
+				pi->pi_msix.mapped_addr + table_offset + table_size,
 			    pi->pi_msix.mapped_size - (table_offset + table_size),
 			    PROT_NONE) != 0)
 				warn("Failed to unmap MSI-X table BAR region");
@@ -574,6 +562,17 @@
 	sc->psc_sel.pc_dev = slot;
 	sc->psc_sel.pc_func = func;
 
+	/* copy physical PCI header to virtual cfgspace */
+	for (uint32_t i = 0; i < PCI_CAP_START_OFFSET; ++i) {
+		/*
+		 * INTLINE and INTPIN must not mirror the physical device's
+		 * values. They are already set by pci_emul_init.
+		 */
+		if (i == PCIR_INTLINE || i == PCIR_INTPIN)
+			continue;
+		pci_set_cfgdata8(pi, i, read_config(&sc->psc_sel, i, 1));
+	}
+
 	if (cfginitmsi(sc) != 0) {
 		warnx("failed to initialize MSI for PCI %d/%d/%d",
 		    bus, slot, func);
@@ -608,6 +607,28 @@
 	return (error);
 }
 
+/*
+ * Install the config space read and write handlers for the registers
+ * [reg, reg + len). Returns 0 on success or -1 if the range does not fit
+ * into the config space [0, PCI_REGMAX].
+ */
+int
+set_pcir_handler(struct passthru_softc *const sc, const uint32_t reg,
+    const uint32_t len, const cfgread_handler rhandler,
+    const cfgwrite_handler whandler)
+{
+	/* written so that reg + len cannot wrap around */
+	if (reg > PCI_REGMAX || len > PCI_REGMAX + 1 - reg)
+		return (-1);
+
+	for (uint32_t i = reg; i < reg + len; ++i) {
+		sc->psc_pcir_rhandler[i] = rhandler;
+		sc->psc_pcir_whandler[i] = whandler;
+	}
+
+	return (0);
+}
+
 static int
 passthru_legacy_config(nvlist_t *nvl, const char *opts)
 {
@@ -628,9 +649,111 @@
 	set_config_value_node(nvl, "slot", value);
 	snprintf(value, sizeof(value), "%d", func);
 	set_config_value_node(nvl, "func", value);
+
+	return (pci_parse_legacy_config(nvl, strchr(opts, ',')));
+}
+
+/*
+ * Copy the contents of romfile into a ROM segment allocated by
+ * pci_emul_alloc_rom and record that segment as the ROM BAR of the
+ * device. A NULL romfile is not an error; nothing is set up in that case.
+ */
+static int
+passthru_init_rom(struct vmctx *const ctx, struct passthru_softc *const sc,
+    const char *const romfile)
+{
+	if (romfile == NULL) {
+		return (0);
+	}
+
+	const int fd = open(romfile, O_RDONLY);
+	if (fd < 0) {
+		warnx("%s: can't open romfile \"%s\"", __func__, romfile);
+		return (-1);
+	}
+
+	struct stat sbuf;
+	if (fstat(fd, &sbuf) < 0) {
+		warnx("%s: can't fstat romfile \"%s\"", __func__, romfile);
+		close(fd);
+		return (-1);
+	}
+	const uint64_t rom_size = sbuf.st_size;
+
+	void *const rom_data = mmap(NULL, rom_size, PROT_READ, MAP_SHARED, fd,
+	    0);
+	if (rom_data == MAP_FAILED) {
+		warnx("%s: unable to mmap romfile \"%s\" (%d)", __func__,
+		    romfile, errno);
+		close(fd);
+		return (-1);
+	}
+
+	void *rom_addr;
+	int error = pci_emul_alloc_rom(sc->psc_pi, rom_size, &rom_addr);
+	if (error) {
+		warnx("%s: failed to alloc rom segment", __func__);
+		munmap(rom_data, rom_size);
+		close(fd);
+		return (error);
+	}
+	memcpy(rom_addr, rom_data, rom_size);
+
+	sc->psc_bar[PCI_ROM_IDX].type = PCIBAR_ROM;
+	sc->psc_bar[PCI_ROM_IDX].addr = (uint64_t)rom_addr;
+	sc->psc_bar[PCI_ROM_IDX].size = rom_size;
+
+	munmap(rom_data, rom_size);
+	close(fd);
+
 	return (0);
 }
 
+/*
+ * Apply device specific quirks. Only Intel display devices have quirks;
+ * they are set up by gvt_d_init. Returns 0 if no quirk applies.
+ */
+static int
+passthru_init_quirks(struct vmctx *const ctx, struct pci_devinst *const pi,
+    nvlist_t *const nvl)
+{
+	struct passthru_softc *const sc = pi->pi_arg;
+
+	const uint16_t vendor = read_config(&sc->psc_sel, PCIR_VENDOR, 0x02);
+	const uint8_t class = read_config(&sc->psc_sel, PCIR_CLASS, 0x01);
+
+	/* currently only display devices have quirks */
+	if (class != PCIC_DISPLAY)
+		return (0);
+
+	if (vendor == PCI_VENDOR_INTEL)
+		return (gvt_d_init(ctx, pi, nvl));
+
+	return (0);
+}
+
+/*
+ * Undo passthru_init_quirks. Does nothing if no softc is attached to pi.
+ */
+static void
+passthru_deinit_quirks(struct vmctx *const ctx, struct pci_devinst *const pi)
+{
+	struct passthru_softc *const sc = pi->pi_arg;
+
+	if (sc == NULL)
+		return;
+
+	const uint16_t vendor = read_config(&sc->psc_sel, PCIR_VENDOR, 0x02);
+	const uint8_t class = read_config(&sc->psc_sel, PCIR_CLASS, 0x01);
+
+	/* currently only display devices have quirks */
+	if (class != PCIC_DISPLAY)
+		return;
+
+	if (vendor == PCI_VENDOR_INTEL)
+		gvt_d_deinit(ctx, pi);
+}
+
 static int
 passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
 {
@@ -696,9 +819,34 @@
 	sc->psc_pi = pi;
 
 	/* initialize config space */
-	error = cfginit(ctx, pi, bus, slot, func);
+	if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
+		goto done;
+
+	/* set default handler for all PCI registers */
+	if ((error = set_pcir_handler(sc, 0, PCI_REGMAX + 1,
+		 passthru_cfgread_default, passthru_cfgwrite_default)) != 0)
+		goto done;
+	/* protect PCI header */
+	if ((error = set_pcir_handler(sc, 0, PCI_CAP_START_OFFSET,
+		 passthru_cfgread_emulate, passthru_cfgwrite_emulate)) != 0)
+		goto done;
+	/* allow access to command and status register */
+	if ((error = set_pcir_handler(sc, PCIR_COMMAND, 0x04,
+		 passthru_cfgread_default, passthru_cfgwrite_default)) != 0)
+		goto done;
+
+	if ((error = passthru_init_quirks(ctx, pi, nvl)) != 0)
+		goto done;
+
+	/* initialize ROM */
+	if ((error = passthru_init_rom(ctx, sc,
+		 get_config_value_node(nvl, "rom"))) != 0)
+		goto done;
+
+	error = 0;		/* success */
 done:
 	if (error) {
+		passthru_deinit_quirks(ctx, pi);
 		free(sc);
 		vm_unassign_pptdev(ctx, bus, slot, func);
 	}
@@ -708,7 +856,8 @@
 static int
 bar_access(int coff)
 {
-	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
+	if ((coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) ||
+	    coff == PCIR_BIOS)
 		return (1);
 	else
 		return (0);
@@ -743,29 +892,27 @@
 static int
 passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
     int coff, int bytes, uint32_t *rv)
+{
+	struct passthru_softc *const sc = pi->pi_arg;
+
+	return sc->psc_pcir_rhandler[coff](ctx, vcpu, pi, coff, bytes, rv);
+}
+
+int
+passthru_cfgread_default(struct vmctx *const ctx, const int vcpu,
+    struct pci_devinst *const pi, const int coff, const int bytes,
+    uint32_t *const rv)
 {
 	struct passthru_softc *sc;
 
 	sc = pi->pi_arg;
 
 	/*
-	 * PCI BARs and MSI capability is emulated.
+	 * MSI capability is emulated.
 	 */
-	if (bar_access(coff) || msicap_access(sc, coff) ||
-	    msixcap_access(sc, coff))
+	if (msicap_access(sc, coff) || msixcap_access(sc, coff))
 		return (-1);
 
-#ifdef LEGACY_SUPPORT
-	/*
-	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
-	 * natively.
-	 */
-	if (sc->psc_msi.emulated) {
-		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
-			return (-1);
-	}
-#endif
-
 	/*
 	 * Emulate the command register. If a single read reads both the
 	 * command and status registers, read the status register from the
@@ -785,9 +932,27 @@
 	return (0);
 }
 
+int
+passthru_cfgread_emulate(struct vmctx *const ctx, const int vcpu,
+    struct pci_devinst *const pi, const int coff, const int bytes,
+    uint32_t *const rv)
+{
+	return (-1);
+}
+
 static int
 passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
     int coff, int bytes, uint32_t val)
+{
+	struct passthru_softc *const sc = pi->pi_arg;
+
+	return sc->psc_pcir_whandler[coff](ctx, vcpu, pi, coff, bytes, val);
+}
+
+int
+passthru_cfgwrite_default(struct vmctx *const ctx, const int vcpu,
+    struct pci_devinst *const pi, const int coff, const int bytes,
+    const uint32_t val)
 {
 	int error, msix_table_entries, i;
 	struct passthru_softc *sc;
@@ -795,12 +960,6 @@
 
 	sc = pi->pi_arg;
 
-	/*
-	 * PCI BARs are emulated
-	 */
-	if (bar_access(coff))
-		return (-1);
-
 	/*
 	 * MSI capability is emulated
 	 */
@@ -841,6 +1000,7 @@
 		return (0);
 	}
 
+	uint32_t write_val = val;
 #ifdef LEGACY_SUPPORT
 	/*
 	 * If this device does not support MSI natively then we cannot let
@@ -849,23 +1009,31 @@
 	 */
 	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
 		if (coff == PCIR_COMMAND && bytes == 2)
-			val &= ~PCIM_CMD_INTxDIS;
+			write_val &= ~PCIM_CMD_INTxDIS;
 	}
 #endif
 
-	write_config(&sc->psc_sel, coff, bytes, val);
+	write_config(&sc->psc_sel, coff, bytes, write_val);
 	if (coff == PCIR_COMMAND) {
 		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
 		if (bytes == 1)
-			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
+			pci_set_cfgdata8(pi, PCIR_COMMAND, write_val);
 		else if (bytes == 2)
-			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
+			pci_set_cfgdata16(pi, PCIR_COMMAND, write_val);
 		pci_emul_cmd_changed(pi, cmd_old);
 	}
 
 	return (0);
 }
 
+int
+passthru_cfgwrite_emulate(struct vmctx *const ctx, const int vcpu,
+    struct pci_devinst *const pi, const int coff, const int bytes,
+    const uint32_t val)
+{
+	return (-1);
+}
+
 static void
 passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
     uint64_t offset, int size, uint64_t value)
@@ -1000,16 +1168,56 @@
 }
 
+/*
+ * Map the VM_PCIROM memory segment at the address of the ROM BAR when the
+ * BAR is enabled and unmap it when the BAR is disabled.
+ */
 static void
-passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
-    int enabled, uint64_t address)
+passthru_addr_rom(struct pci_devinst *const pi, const int idx,
+    const int enabled)
 {
+	const uint64_t addr = pi->pi_bar[idx].addr;
+	const uint64_t size = pi->pi_bar[idx].size;
 
-	if (pi->pi_bar[baridx].type == PCIBAR_IO)
-		return;
-	if (baridx == pci_msix_table_bar(pi))
-		passthru_msix_addr(ctx, pi, baridx, enabled, address);
-	else
-		passthru_mmio_addr(ctx, pi, baridx, enabled, address);
+	if (!enabled) {
+		if (vm_munmap_memseg(pi->pi_vmctx, addr, size) != 0) {
+			warnx("%s: munmap_memseg @ [%016lx - %016lx] failed",
+			    __func__, addr, addr + size);
+		}
+
+	} else {
+		if (vm_mmap_memseg(pi->pi_vmctx, addr, VM_PCIROM,
+			pi->pi_romoffset, size, PROT_READ | PROT_EXEC) != 0) {
+			warnx("%s: mmap_memseg @ [%016lx - %016lx] failed",
+			    __func__, addr, addr + size);
+		}
+	}
+}
+
+/*
+ * Dispatch a BAR address change to the handler matching the BAR type.
+ */
+static void
+passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
+    int enabled, uint64_t address)
+{
+	switch (pi->pi_bar[baridx].type) {
+	case PCIBAR_IO:
+		/* IO BARs are emulated */
+		break;
+	case PCIBAR_ROM:
+		passthru_addr_rom(pi, baridx, enabled);
+		break;
+	case PCIBAR_MEM32:
+	case PCIBAR_MEM64:
+		if (baridx == pci_msix_table_bar(pi))
+			passthru_msix_addr(ctx, pi, baridx, enabled, address);
+		else
+			passthru_mmio_addr(ctx, pi, baridx, enabled, address);
+		break;
+	default:
+		errx(4, "%s: invalid BAR type %d", __func__,
+		    pi->pi_bar[baridx].type);
+	}
 }
 
 struct pci_devemu
passthru = { diff --git a/usr.sbin/bhyve/qemu_fwcfg.h b/usr.sbin/bhyve/qemu_fwcfg.h new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/qemu_fwcfg.h @@ -0,0 +1,24 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * Author: Corvin Köhne + */ + +#pragma once + +#include + +#define QEMU_FWCFG_MAX_ARCHS 0x2 +#define QEMU_FWCFG_MAX_ENTRIES 0x3FFF +#define QEMU_FWCFG_MAX_NAME 56 + +struct qemu_fwcfg_item { + uint32_t size; + uint8_t *data; +}; + +int qemu_fwcfg_add_file(const uint8_t name[QEMU_FWCFG_MAX_NAME], + const uint32_t size, void *const data); +int qemu_fwcfg_init(struct vmctx *const ctx); +int qemu_fwcfg_parse_cmdline_arg(const char *opt); diff --git a/usr.sbin/bhyve/qemu_fwcfg.c b/usr.sbin/bhyve/qemu_fwcfg.c new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/qemu_fwcfg.c @@ -0,0 +1,541 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * Author: Corvin Köhne + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "acpi_device.h" +#include "inout.h" +#include "qemu_fwcfg.h" + +#define QEMU_FWCFG_ACPI_DEVICE_NAME "FWCF" +#define QEMU_FWCFG_ACPI_HARDWARE_ID "QEMU0002" + +#define QEMU_FWCFG_SELECTOR_PORT_NUMBER 0x510 +#define QEMU_FWCFG_SELECTOR_PORT_SIZE 1 +#define QEMU_FWCFG_SELECTOR_PORT_FLAGS IOPORT_F_INOUT +#define QEMU_FWCFG_DATA_PORT_NUMBER 0x511 +#define QEMU_FWCFG_DATA_PORT_SIZE 1 +#define QEMU_FWCFG_DATA_PORT_FLAGS \ + IOPORT_F_INOUT /* QEMU v2.4+ ignores writes */ + +#define QEMU_FWCFG_ARCHITECTURE_MASK 0x0001 +#define QEMU_FWCFG_INDEX_MASK 0x3FFF + +#define QEMU_FWCFG_SELECT_READ 0 +#define QEMU_FWCFG_SELECT_WRITE 1 + +#define QEMU_FWCFG_ARCHITECTURE_GENERIC 0 +#define QEMU_FWCFG_ARCHITECTURE_SPECIFIC 1 + +#define QEMU_FWCFG_INDEX_SIGNATURE 0x00 +#define QEMU_FWCFG_INDEX_ID 0x01 
+#define QEMU_FWCFG_INDEX_FILE_DIR 0x19
+
+#define QEMU_FWCFG_FIRST_FILE_INDEX 0x20
+
+#define QEMU_FWCFG_MIN_FILES 10
+
+#pragma pack(1)
+
+union qemu_fwcfg_selector {
+	struct {
+		uint16_t index : 14;
+		uint16_t writeable : 1;
+		/*
+		 * 0 = generic  | for all architectures
+		 * 1 = specific | only for current architecture
+		 */
+		uint16_t architecture : 1;
+	};
+	uint16_t bits;
+};
+
+struct qemu_fwcfg_signature {
+	uint8_t signature[4];
+};
+
+struct qemu_fwcfg_id {
+	uint32_t interface : 1; /* always set */
+	uint32_t DMA : 1;
+	uint32_t reserved : 30;
+};
+
+/* One entry of the file directory; be_* members are stored big endian. */
+struct qemu_fwcfg_file {
+	uint32_t be_size;
+	uint16_t be_selector;
+	uint16_t reserved;
+	uint8_t name[QEMU_FWCFG_MAX_NAME];
+};
+
+/* File directory: a big endian entry count followed by the entries. */
+struct qemu_fwcfg_directory {
+	uint32_t be_count;
+	struct qemu_fwcfg_file files[];
+};
+
+struct qemu_fwcfg_softc {
+	struct acpi_device *acpi_dev;
+
+	/* read position inside the currently selected item */
+	uint32_t data_offset;
+	union qemu_fwcfg_selector selector;
+	struct qemu_fwcfg_item items[QEMU_FWCFG_MAX_ARCHS]
+				    [QEMU_FWCFG_MAX_ENTRIES];
+	struct qemu_fwcfg_directory *directory;
+};
+
+#pragma pack()
+
+static struct qemu_fwcfg_softc sc;
+
+/* A file requested on the command line; added to fwcfg by qemu_fwcfg_init. */
+struct qemu_fwcfg_user_file {
+	STAILQ_ENTRY(qemu_fwcfg_user_file) chain;
+	uint8_t name[QEMU_FWCFG_MAX_NAME];
+	uint32_t size;
+	void *data;
+};
+STAILQ_HEAD(qemu_fwcfg_user_file_list,
+    qemu_fwcfg_user_file) user_files = STAILQ_HEAD_INITIALIZER(user_files);
+
+/*
+ * Handler of the selector port. A read returns the current selector. A
+ * write selects a new item and rewinds the data offset to its start.
+ */
+static int
+qemu_fwcfg_selector_port_handler(struct vmctx *const ctx, const int vcpu,
+    const int in, const int port, const int bytes, uint32_t *const eax,
+    void *const arg)
+{
+	if (in) {
+		*eax = sc.selector.bits;
+		return (0);
+	}
+
+	sc.data_offset = 0;
+	sc.selector.bits = *eax;
+
+	return (0);
+}
+
+/*
+ * Handler of the data port. A read returns the next byte of the selected
+ * item and advances the data offset. Reading a missing item or reading past
+ * the end of an item returns 0 and logs a warning. Writes are rejected.
+ */
+static int
+qemu_fwcfg_data_port_handler(struct vmctx *const ctx, const int vcpu,
+    const int in, const int port, const int bytes, uint32_t *const eax,
+    void *const arg)
+{
+	if (!in) {
+		warnx("%s: Writes to qemu fwcfg data port aren't allowed",
+		    __func__);
+		return (-1);
+	}
+
+	/*
+	 * The guest controls the selector and its 14 bit index can be as
+	 * large as QEMU_FWCFG_INDEX_MASK, which may lie outside of the items
+	 * table. Treat such an index like a missing item.
+	 */
+	const struct qemu_fwcfg_item *const item =
+	    sc.selector.index < QEMU_FWCFG_MAX_ENTRIES ?
+	    &sc.items[sc.selector.architecture][sc.selector.index] :
+	    NULL;
+	if (item == NULL || item->data == NULL) {
+		warnx(
+		    "%s: qemu fwcfg item doesn't exist (architecture %s index 0x%x)",
+		    __func__, sc.selector.architecture ? "specific" : "generic",
+		    sc.selector.index);
+		*eax = 0x00;
+		return (0);
+	}
+	if (sc.data_offset >= item->size) {
+		warnx(
+		    "%s: qemu fwcfg item read exceeds size (architecture %s index 0x%x size 0x%x offset 0x%x)",
+		    __func__, sc.selector.architecture ? "specific" : "generic",
+		    sc.selector.index, item->size, sc.data_offset);
+		*eax = 0x00;
+		return (0);
+	}
+
+	/* return item data */
+	*eax = item->data[sc.data_offset];
+	sc.data_offset++;
+
+	return (0);
+}
+
+/*
+ * Registers data as the fwcfg item selected by architecture and index. The
+ * data is not copied; only the pointer and the size are stored. Returns -1
+ * if the index lies outside of the items table or the item already exists.
+ */
+static int
+qemu_fwcfg_add_item(const uint16_t architecture, const uint16_t index,
+    const uint32_t size, void *const data)
+{
+	/* truncate architecture and index to their desired size */
+	const uint16_t arch = architecture & QEMU_FWCFG_ARCHITECTURE_MASK;
+	const uint16_t idx = index & QEMU_FWCFG_INDEX_MASK;
+
+	/* the masked index can still lie outside of the items table */
+	if (idx >= QEMU_FWCFG_MAX_ENTRIES) {
+		warnx(
+		    "%s: qemu fwcfg index out of range (architecture %s index 0x%x)",
+		    __func__, arch ? "specific" : "generic", idx);
+		return (-1);
+	}
+
+	/* get pointer to item specified by selector */
+	struct qemu_fwcfg_item *const fwcfg_item = &sc.items[arch][idx];
+
+	/* check if item is already used */
+	if (fwcfg_item->data != NULL) {
+		warnx("%s: qemu fwcfg item exists (architecture %s index 0x%x)",
+		    __func__, arch ? "specific" : "generic", idx);
+		return (-1);
+	}
+
+	/* save data of the item */
+	fwcfg_item->size = size;
+	fwcfg_item->data = data;
+
+	return (0);
+}
+
+/*
+ * Allocates the file directory with room for QEMU_FWCFG_MIN_FILES entries
+ * and registers it as an item. The directory starts out empty, so the item
+ * size covers the directory header only.
+ */
+static int
+qemu_fwcfg_add_item_file_dir(void)
+{
+	/* alloc directory */
+	const size_t size = sizeof(struct qemu_fwcfg_directory) +
+	    QEMU_FWCFG_MIN_FILES * sizeof(struct qemu_fwcfg_file);
+	struct qemu_fwcfg_directory *const fwcfg_directory = calloc(1, size);
+	if (fwcfg_directory == NULL) {
+		return (-ENOMEM);
+	}
+
+	/* add directory */
+	const int error = qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC,
+	    QEMU_FWCFG_INDEX_FILE_DIR, sizeof(struct qemu_fwcfg_directory),
+	    (uint8_t *)fwcfg_directory);
+	if (error != 0) {
+		free(fwcfg_directory);
+		return (error);
+	}
+
+	/* init directory */
+	sc.directory = fwcfg_directory;
+
+	return (0);
+}
+
+/*
+ * Registers the ID item: the interface bit is set and DMA is cleared.
+ */
+static int
+qemu_fwcfg_add_item_id(void)
+{
+	/* alloc id */
+	struct qemu_fwcfg_id *const fwcfg_id = calloc(1, sizeof(*fwcfg_id));
+	if (fwcfg_id == NULL) {
+		return (-ENOMEM);
+	}
+
+	/* init id */
+	fwcfg_id->interface = 1;
+	fwcfg_id->DMA = 0;
+
+	/*
+	 * QEMU specifies ID as little endian.
+	 * Convert fwcfg_id to little endian.
+	 */
+	uint32_t le_fwcfg_id;
+	memcpy(&le_fwcfg_id, fwcfg_id, sizeof(le_fwcfg_id));
+	le_fwcfg_id = htole32(le_fwcfg_id);
+	memcpy(fwcfg_id, &le_fwcfg_id, sizeof(le_fwcfg_id));
+
+	/* add id */
+	const int error = qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC,
+	    QEMU_FWCFG_INDEX_ID, sizeof(struct qemu_fwcfg_id),
+	    (uint8_t *)fwcfg_id);
+	if (error != 0) {
+		free(fwcfg_id);
+	}
+
+	return (error);
+}
+
+/*
+ * Registers the signature item, which reads as "QEMU".
+ */
+static int
+qemu_fwcfg_add_item_signature(void)
+{
+	/* alloc signature */
+	struct qemu_fwcfg_signature *const fwcfg_signature = calloc(1,
+	    sizeof(*fwcfg_signature));
+	if (fwcfg_signature == NULL) {
+		return (-ENOMEM);
+	}
+
+	/* init signature */
+	fwcfg_signature->signature[0] = 'Q';
+	fwcfg_signature->signature[1] = 'E';
+	fwcfg_signature->signature[2] = 'M';
+	fwcfg_signature->signature[3] = 'U';
+
+	/* add signature */
+	const int error = qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC,
+	    QEMU_FWCFG_INDEX_SIGNATURE, sizeof(struct qemu_fwcfg_signature),
+	    (uint8_t *)fwcfg_signature);
+	if (error != 0) {
+		free(fwcfg_signature);
+	}
+
+	return (error);
+}
+
+/*
+ * Registers handler for the IO port range [port, port + size).
+ */
+static int
+qemu_fwcfg_register_port(const char *const name, const int port, const int size,
+    const int flags, const inout_func_t handler)
+{
+	struct inout_port iop;
+
+	bzero(&iop, sizeof(iop));
+	iop.name = name;
+	iop.port = port;
+	iop.size = size;
+	iop.flags = flags;
+	iop.handler = handler;
+
+	return (register_inout(&iop));
+}
+
+/*
+ * Adds a file to fwcfg: data becomes a new item and a directory entry with
+ * the given name is inserted so that the directory stays sorted by name.
+ * The data is not copied. The name has to be NUL terminated and has to fit
+ * into QEMU_FWCFG_MAX_NAME bytes including its NUL byte. Returns 0 on
+ * success; on failure neither the items nor the directory are changed.
+ */
+int
+qemu_fwcfg_add_file(const uint8_t name[QEMU_FWCFG_MAX_NAME], const uint32_t size,
+    void *const data)
+{
+	const char *const file_name = (const char *)name;
+
+	/* the name is copied into a fixed size directory entry */
+	if (strnlen(file_name, QEMU_FWCFG_MAX_NAME) >= QEMU_FWCFG_MAX_NAME) {
+		warnx("%s: qemu fwcfg file name is too long", __func__);
+		return (-1);
+	}
+
+	/*
+	 * QEMU specifies count as big endian.
+	 * Convert it to host endian to work with it.
+	 */
+	const uint32_t old_count = be32toh(sc.directory->be_count);
+	const uint32_t count = old_count + 1;
+
+	/* every file needs an item index of its own */
+	if (old_count >= QEMU_FWCFG_MAX_ENTRIES - QEMU_FWCFG_FIRST_FILE_INDEX) {
+		warnx("%s: qemu fwcfg files directory is full", __func__);
+		return (-1);
+	}
+	const uint32_t index = QEMU_FWCFG_FIRST_FILE_INDEX + old_count;
+
+	/*
+	 * The initial directory has room for QEMU_FWCFG_MIN_FILES entries.
+	 * Beyond that a new directory is required for every file. Allocate
+	 * it up front so that a failure leaves everything untouched.
+	 */
+	struct qemu_fwcfg_directory *new_directory = NULL;
+	if (count > QEMU_FWCFG_MIN_FILES) {
+		const size_t new_size = sizeof(struct qemu_fwcfg_directory) +
+		    count * sizeof(struct qemu_fwcfg_file);
+		new_directory = calloc(1, new_size);
+		if (new_directory == NULL) {
+			warnx(
+			    "%s: Unable to allocate a new qemu fwcfg files directory (count %u)",
+			    __func__, count);
+			return (-ENOMEM);
+		}
+	}
+
+	/* add file to items list */
+	const int error = qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC,
+	    index, size, data);
+	if (error != 0) {
+		free(new_directory);
+		return (error);
+	}
+
+	/*
+	 * files should be sorted alphabetically, get index for new file among
+	 * the old_count files that already exist
+	 */
+	uint32_t file_index;
+	for (file_index = 0; file_index < old_count; ++file_index) {
+		if (strcmp(file_name,
+		    (const char *)sc.directory->files[file_index].name) < 0)
+			break;
+	}
+
+	if (new_directory != NULL) {
+		/* copy files below file_index to new directory */
+		memcpy(new_directory->files, sc.directory->files,
+		    file_index * sizeof(struct qemu_fwcfg_file));
+
+		/* copy files behind file_index to new directory */
+		memcpy(&new_directory->files[file_index + 1],
+		    &sc.directory->files[file_index],
+		    (old_count - file_index) * sizeof(struct qemu_fwcfg_file));
+
+		/* free old directory */
+		free(sc.directory);
+
+		/* set directory pointer to new directory */
+		sc.directory = new_directory;
+
+		/* adjust directory pointer */
+		sc.items[QEMU_FWCFG_ARCHITECTURE_GENERIC]
+			[QEMU_FWCFG_INDEX_FILE_DIR].data = (uint8_t *)sc.directory;
+	} else {
+		/* shift files behind file_index */
+		for (uint32_t i = old_count; i > file_index; --i) {
+			sc.directory->files[i] = sc.directory->files[i - 1];
+		}
+	}
+
+	/*
+	 * QEMU specifies count, size and index as big endian.
+	 * Save these values in big endian to simplify guest reads of these
+	 * values.
+	 */
+	struct qemu_fwcfg_file *const file = &sc.directory->files[file_index];
+	/* clear what is left of the entry that has been shifted away */
+	bzero(file, sizeof(*file));
+	file->be_size = htobe32(size);
+	file->be_selector = htobe16(index);
+	strcpy((char *)file->name, file_name);
+	sc.directory->be_count = htobe32(count);
+
+	/* set new size for the fwcfg_file_directory */
+	sc.items[QEMU_FWCFG_ARCHITECTURE_GENERIC][QEMU_FWCFG_INDEX_FILE_DIR]
+	    .size = sizeof(struct qemu_fwcfg_directory) +
+	    count * sizeof(struct qemu_fwcfg_file);
+
+	return (0);
+}
+
+/*
+ * Adds all files requested on the command line. Stops at the first file
+ * that can't be added and returns its error.
+ */
+static int
+qemu_fwcfg_add_user_files(void)
+{
+	const struct qemu_fwcfg_user_file *fwcfg_file;
+	STAILQ_FOREACH (fwcfg_file, &user_files, chain) {
+		const int error = qemu_fwcfg_add_file(fwcfg_file->name,
+		    fwcfg_file->size, fwcfg_file->data);
+		if (error)
+			return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Creates the ACPI device, registers the common items (signature, id, file
+ * directory) and the port handlers and adds the user defined files. The
+ * ACPI device is destroyed again if any step fails.
+ */
+int
+qemu_fwcfg_init(struct vmctx *const ctx)
+{
+	int error;
+
+	error = acpi_device_create(&sc.acpi_dev, ctx,
+	    QEMU_FWCFG_ACPI_DEVICE_NAME, QEMU_FWCFG_ACPI_HARDWARE_ID);
+	if (error) {
+		warnx("%s: failed to create ACPI device for QEMU FwCfg",
+		    __func__);
+		goto done;
+	}
+
+	error = acpi_device_add_res_fixed_ioport(sc.acpi_dev,
+	    QEMU_FWCFG_SELECTOR_PORT_NUMBER, 2);
+	if (error) {
+		warnx("%s: failed to add fixed IO port for QEMU FwCfg",
+		    __func__);
+		goto done;
+	}
+
+	/* add common fwcfg items */
+	if ((error = qemu_fwcfg_add_item_signature()) != 0) {
+		warnx("%s: Unable to add signature item", __func__);
+		goto done;
+	}
+	if ((error = qemu_fwcfg_add_item_id()) != 0) {
+		warnx("%s: Unable to add id item", __func__);
+		goto done;
+	}
+	if ((error = qemu_fwcfg_add_item_file_dir()) != 0) {
+		warnx("%s: Unable to add file_dir item", __func__);
+		goto done;
+	}
+
+	/* add handlers for fwcfg ports */
+	if ((error = qemu_fwcfg_register_port("qemu_fwcfg_selector",
+		 QEMU_FWCFG_SELECTOR_PORT_NUMBER, QEMU_FWCFG_SELECTOR_PORT_SIZE,
+		 QEMU_FWCFG_SELECTOR_PORT_FLAGS,
+		 qemu_fwcfg_selector_port_handler)) != 0) {
+		warnx("%s: Unable to register qemu fwcfg selector port 0x%x",
+		    __func__, QEMU_FWCFG_SELECTOR_PORT_NUMBER);
+		goto done;
+	}
+	if ((error = qemu_fwcfg_register_port("qemu_fwcfg_data",
+		 QEMU_FWCFG_DATA_PORT_NUMBER, QEMU_FWCFG_DATA_PORT_SIZE,
+		 QEMU_FWCFG_DATA_PORT_FLAGS, qemu_fwcfg_data_port_handler)) !=
+	    0) {
+		warnx("%s: Unable to register qemu fwcfg data port 0x%x",
+		    __func__, QEMU_FWCFG_DATA_PORT_NUMBER);
+		goto done;
+	}
+
+	if ((error = qemu_fwcfg_add_user_files()) != 0) {
+		warnx("%s: Unable to add user files", __func__);
+		goto done;
+	}
+
+done:
+	if (error) {
+		acpi_device_destroy(sc.acpi_dev);
+	}
+
+	return (error);
+}
+
+static void
+qemu_fwcfg_usage(const char *opt)
+{
+	warnx("Invalid fw_cfg option \"%s\"", opt);
+	warnx("-f [name=]<name>,(string|file)=<value>");
+}
+
+/*
+ * Reads the file at path into a newly allocated buffer and stores the
+ * buffer and the number of bytes read in fwcfg_file. Returns 0 on success;
+ * on failure fwcfg_file isn't modified.
+ */
+static int
+qemu_fwcfg_load_user_file(const char *const path,
+    struct qemu_fwcfg_user_file *const fwcfg_file)
+{
+	/* open file */
+	const int fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		warnx("Can't open fw_cfg_user_file file \"%s\"", path);
+		return (-1);
+	}
+
+	/* get file size; the size of a fwcfg item is 32 bit wide */
+	const off_t size = lseek(fd, 0, SEEK_END);
+	if (size < 0 || size > UINT32_MAX || lseek(fd, 0, SEEK_SET) < 0) {
+		warnx("Can't get size of fw_cfg_user_file file \"%s\"", path);
+		close(fd);
+		return (-1);
+	}
+
+	/* read file */
+	uint8_t *const data = malloc(size);
+	if (data == NULL) {
+		warnx(
+		    "Can't allocate fw_cfg_user_file file \"%s\" (size: 0x%16lx)",
+		    path, (unsigned long)size);
+		close(fd);
+		return (-ENOMEM);
+	}
+	size_t len = 0;
+	while (len < (size_t)size) {
+		/* read may return less than requested, so keep going */
+		const ssize_t bytes = read(fd, data + len, (size_t)size - len);
+		if (bytes < 0) {
+			warnx("Can't read fw_cfg_user_file file \"%s\"", path);
+			free(data);
+			close(fd);
+			return (-1);
+		}
+		if (bytes == 0) {
+			/* end of file came earlier than lseek reported */
+			break;
+		}
+		len += bytes;
+	}
+	close(fd);
+
+	fwcfg_file->data = data;
+	fwcfg_file->size = len;
+
+	return (0);
+}
+
+/*
+ * Parses the cmdline argument for user defined fw_cfg items. The cmdline
+ * argument has the format:
+ * "-f [name=]<name>,(string|file)=<value>"
+ *
+ * E.g.: "-f opt/com.page/example,string=Hello"
+ *
+ * The parsed file is appended to user_files. Returns 0 on success and a
+ * negative value if the argument is malformed or a resource is missing.
+ */
+int
+qemu_fwcfg_parse_cmdline_arg(const char *opt)
+{
+	/* get pointer to <name> */
+	const char *opt_ptr = opt;
+	/* If [name=] is specified, skip it */
+	if (strncmp(opt_ptr, "name=", sizeof("name=") - 1) == 0) {
+		opt_ptr += sizeof("name=") - 1;
+	}
+
+	/* get the end of <name> */
+	const char *const opt_end = strchr(opt_ptr, ',');
+	if (opt_end == NULL) {
+		qemu_fwcfg_usage(opt);
+		return (-1);
+	}
+
+	/* check if <name> is too long; one byte is needed for the NUL */
+	const size_t name_len = opt_end - opt_ptr;
+	if (name_len >= QEMU_FWCFG_MAX_NAME) {
+		warnx("fw_cfg name too long: \"%s\"", opt);
+		return (-1);
+	}
+
+	/* zeroed, so that the saved <name> is always NUL terminated */
+	struct qemu_fwcfg_user_file *const fwcfg_file = calloc(1,
+	    sizeof(*fwcfg_file));
+	if (fwcfg_file == NULL) {
+		warnx("Unable to allocate fw_cfg_user_file");
+		return (-ENOMEM);
+	}
+
+	/* save <name> */
+	memcpy(fwcfg_file->name, opt_ptr, name_len);
+
+	/* set opt_ptr to (string|file)=<value> */
+	opt_ptr = opt_end + 1;
+
+	int error = 0;
+	if (strncmp(opt_ptr, "string=", sizeof("string=") - 1) == 0) {
+		opt_ptr += sizeof("string=") - 1;
+		fwcfg_file->data = strdup(opt_ptr);
+		if (fwcfg_file->data == NULL) {
+			warnx("Can't duplicate fw_cfg_user_file string \"%s\"",
+			    opt_ptr);
+			error = -ENOMEM;
+		} else {
+			/* the guest gets the string including its NUL byte */
+			fwcfg_file->size = strlen(opt_ptr) + 1;
+		}
+	} else if (strncmp(opt_ptr, "file=", sizeof("file=") - 1) == 0) {
+		opt_ptr += sizeof("file=") - 1;
+		error = qemu_fwcfg_load_user_file(opt_ptr, fwcfg_file);
+	} else {
+		qemu_fwcfg_usage(opt);
+		error = -1;
+	}
+
+	if (error != 0) {
+		free(fwcfg_file);
+		return (error);
+	}
+
+	STAILQ_INSERT_TAIL(&user_files, fwcfg_file, chain);
+
+	return (0);
+}