diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -73,6 +73,7 @@ VM_SYSMEM, VM_BOOTROM, VM_FRAMEBUFFER, + VM_PCIROM, }; /* @@ -180,6 +181,8 @@ vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len); +int vm_get_memory_region_info(struct vmctx *ctx, vm_paddr_t *base, + vm_paddr_t *size, enum vm_memory_region_type type); int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -1009,6 +1009,25 @@ return (ioctl(ctx->fd, VM_UNMAP_PPTDEV_MMIO, &pptmmio)); } +int +vm_get_memory_region_info(struct vmctx *ctx, vm_paddr_t *base, vm_paddr_t *size, + enum vm_memory_region_type type) +{ + struct vm_memory_region_info info; + int rc; + + /* Only the region type is an input; everything else starts zeroed. */ + bzero(&info, sizeof(info)); + info.type = type; + rc = ioctl(ctx->fd, VM_GET_MEMORY_REGION_INFO, &info); + /* Each output is optional and is stored even if the ioctl failed. */ + if (base != NULL) + *base = info.base; + if (size != NULL) + *size = info.size; + return (rc); +} + int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) @@ -1684,7 +1703,7 @@ VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, VM_PPTDEV_MSIX, VM_UNMAP_PPTDEV_MMIO, VM_PPTDEV_DISABLE_MSIX, - VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, + VM_GET_MEMORY_REGION_INFO, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, VM_GLA2GPA_NOFAULT, diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h --- a/sys/amd64/include/vmm.h +++ 
b/sys/amd64/include/vmm.h @@ -741,6 +741,11 @@ } u; }; +enum vm_memory_region_type { + MEMORY_REGION_INTEL_GSM, /* served from intel_graphics_stolen_base/size */ + MEMORY_REGION_INTEL_OPREGION /* served by vm_intelgpu_get_opregion() */ +}; + /* APIs to inject faults into the guest */ void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, int errcode); diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -146,6 +146,17 @@ size_t len; }; +struct vm_memory_region_info { + vm_paddr_t base; /* out: start address of the region */ + vm_paddr_t size; /* out: size of the region */ + enum vm_memory_region_type type; /* in: region to look up */ +}; + +#ifdef _KERNEL +extern vm_paddr_t intel_graphics_stolen_base; +extern vm_paddr_t intel_graphics_stolen_size; +#endif + struct vm_pptdev_msi { int vcpu; int bus; @@ -309,6 +320,7 @@ IOCNUM_PPTDEV_MSIX = 44, IOCNUM_PPTDEV_DISABLE_MSIX = 45, IOCNUM_UNMAP_PPTDEV_MMIO = 46, + IOCNUM_GET_MEMORY_REGION_INFO = 47, /* statistics */ IOCNUM_VM_STATS = 50, @@ -427,6 +439,8 @@ _IOW('v', IOCNUM_PPTDEV_DISABLE_MSIX, struct vm_pptdev) #define VM_UNMAP_PPTDEV_MMIO \ _IOW('v', IOCNUM_UNMAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_GET_MEMORY_REGION_INFO \ + _IOWR('v', IOCNUM_GET_MEMORY_REGION_INFO, struct vm_memory_region_info) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS \ diff --git a/sys/amd64/vmm/intel/intelgpu.h b/sys/amd64/vmm/intel/intelgpu.h new file mode 100644 --- /dev/null +++ b/sys/amd64/vmm/intel/intelgpu.h @@ -0,0 +1,206 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR OR CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include +#include + +#define IGD_OPREGION_HEADER_SIGN "IntelGraphicsMem" +#define IGD_OPREGION_HEADER_MBOX1 BIT0 +#define IGD_OPREGION_HEADER_MBOX2 BIT1 +#define IGD_OPREGION_HEADER_MBOX3 BIT2 +#define IGD_OPREGION_HEADER_MBOX4 BIT3 +#define IGD_OPREGION_HEADER_MBOX5 BIT4 + +#define IGD_OPREGION_VBT_SIZE_6K (6 * 1024UL) + +/** + OpRegion structures: + Sub-structures define the different parts of the OpRegion followed by the + main structure representing the entire OpRegion. + + @note These structures are packed to 1 byte offsets because the exact + data location is required by the supporting design specification due to + the fact that the data is used by ASL and Graphics driver code compiled + separately. +**/ +#pragma pack(push, 1) +/// +/// OpRegion Mailbox 0 Header structure. The OpRegion Header is used to +/// identify a block of memory as the graphics driver OpRegion. 
+/// Offset 0x0, Size 0x100 +/// +struct igd_opregion_header { + int8_t sign[0x10]; ///< Offset 0x00 OpRegion Signature + uint32_t size; ///< Offset 0x10 OpRegion Size + uint32_t over; ///< Offset 0x14 OpRegion Structure Version + uint8_t sver[0x20]; ///< Offset 0x18 System BIOS Build Version + uint8_t vver[0x10]; ///< Offset 0x38 Video BIOS Build Version + uint8_t gver[0x10]; ///< Offset 0x48 Graphic Driver Build Version + uint32_t mbox; ///< Offset 0x58 Supported Mailboxes + uint32_t dmod; ///< Offset 0x5C Driver Model + uint32_t pcon; ///< Offset 0x60 Platform Configuration + int16_t dver[0x10]; ///< Offset 0x64 GOP Version + uint8_t rm01[0x7C]; ///< Offset 0x84 Reserved Must be zero +}; + +/// +/// OpRegion Mailbox 1 - Public ACPI Methods +/// Offset 0x100, Size 0x100 +/// +struct igd_opregion_mbox1 { + uint32_t drdy; ///< Offset 0x100 Driver Readiness + uint32_t csts; ///< Offset 0x104 Status + uint32_t cevt; ///< Offset 0x108 Current Event + uint8_t rm11[0x14]; ///< Offset 0x10C Reserved Must be Zero + uint32_t didl[8]; ///< Offset 0x120 Supported Display Devices ID List + uint32_t + cpdl[8]; ///< Offset 0x140 Currently Attached Display Devices List + uint32_t + cadl[8]; ///< Offset 0x160 Currently Active Display Devices List + uint32_t nadl[8]; ///< Offset 0x180 Next Active Devices List + uint32_t aslp; ///< Offset 0x1A0 ASL Sleep Time Out + uint32_t tidx; ///< Offset 0x1A4 Toggle Table Index + uint32_t chpd; ///< Offset 0x1A8 Current Hotplug Enable Indicator + uint32_t clid; ///< Offset 0x1AC Current Lid State Indicator + uint32_t cdck; ///< Offset 0x1B0 Current Docking State Indicator + uint32_t sxsw; ///< Offset 0x1B4 Display Switch Notification on Sx + ///< StateResume + uint32_t evts; ///< Offset 0x1B8 Events supported by ASL + uint32_t cnot; ///< Offset 0x1BC Current OS Notification + uint32_t NRDY; ///< Offset 0x1C0 Driver Status + uint8_t did2[0x1C]; ///< Offset 0x1C4 Extended Supported Devices ID + ///< List(DOD) + uint8_t + cpd2[0x1C]; ///< Offset 
0x1E0 Extended Attached Display Devices List + uint8_t rm12[4]; ///< Offset 0x1FC - 0x1FF Reserved Must be zero +}; + +/// +/// OpRegion Mailbox 2 - Software SCI Interface +/// Offset 0x200, Size 0x100 +/// +struct igd_opregion_mbox2 { + uint32_t scic; ///< Offset 0x200 Software SCI Command / Status / Data + uint32_t parm; ///< Offset 0x204 Software SCI Parameters + uint32_t dslp; ///< Offset 0x208 Driver Sleep Time Out + uint8_t rm21[0xF4]; ///< Offset 0x20C - 0x2FF Reserved Must be zero +}; + +/// +/// OpRegion Mailbox 3 - BIOS/Driver Notification - ASLE Support +/// Offset 0x300, Size 0x100 +/// +struct igd_opregion_mbox3 { + uint32_t ardy; ///< Offset 0x300 Driver Readiness + uint32_t aslc; ///< Offset 0x304 ASLE Interrupt Command / Status + uint32_t tche; ///< Offset 0x308 Technology Enabled Indicator + uint32_t alsi; ///< Offset 0x30C Current ALS Luminance Reading + uint32_t bclp; ///< Offset 0x310 Requested Backlight Brightness + uint32_t pfit; ///< Offset 0x314 Panel Fitting State or Request + uint32_t cblv; ///< Offset 0x318 Current Brightness Level + uint16_t bclm[0x14]; ///< Offset 0x31C Backlight Brightness Levels Duty + ///< Cycle Mapping Table + uint32_t cpfm; ///< Offset 0x344 Current Panel Fitting Mode + uint32_t epfm; ///< Offset 0x348 Enabled Panel Fitting Modes + uint8_t plut[0x4A]; ///< Offset 0x34C Panel Look Up Table & Identifier + uint32_t pfmb; ///< Offset 0x396 PWM Frequency and Minimum Brightness + uint32_t ccdv; ///< Offset 0x39A Color Correction Default Values + uint32_t pcft; ///< Offset 0x39E Power Conservation Features + uint32_t srot; ///< Offset 0x3A2 Supported Rotation Angles + uint32_t iuer; ///< Offset 0x3A6 Intel Ultrabook(TM) Event Register + uint64_t fdss; ///< Offset 0x3AA DSS Buffer address allocated for IFFS + ///< feature + uint32_t fdsp; ///< Offset 0x3B2 Size of DSS buffer + uint32_t stat; ///< Offset 0x3B6 State Indicator + uint64_t rvda; ///< Offset 0x3BA Absolute/Relative Address of Raw VBT + ///< Data from OpRegion 
Base + uint32_t rvds; ///< Offset 0x3C2 Raw VBT Data Size + uint8_t rsvd2[0x3A]; ///< Offset 0x3C6 - 0x3FF Reserved Must be zero. + ///< Bug in spec 0x45(69) +}; + +/// +/// OpRegion Mailbox 4 - VBT Video BIOS Table +/// Offset 0x400, Size 0x1800 +/// +struct igd_opregion_mbox4 { + uint8_t rvbt[IGD_OPREGION_VBT_SIZE_6K]; ///< Offset 0x400 - 0x1BFF Raw + ///< VBT Data +}; + +/// +/// OpRegion Mailbox 5 - BIOS/Driver Notification - Data storage BIOS to Driver +/// data sync Offset 0x1C00, Size 0x400 +/// +struct igd_opregion_mbox5 { + uint32_t phed; ///< Offset 0x1C00 Panel Header + uint8_t bddc[0x100]; ///< Offset 0x1C04 Panel EDID (DDC data) + uint8_t rm51[0x2FC]; ///< Offset 0x1D04 - 0x1FFF Reserved Must be zero +}; + +/// +/// IGD OpRegion Structure +/// +struct igd_opregion { + struct igd_opregion_header + header; ///< OpRegion header (Offset 0x0, Size 0x100) + struct igd_opregion_mbox1 mbox1; ///< Mailbox 1: Public ACPI Methods + ///< (Offset 0x100, Size 0x100) + struct igd_opregion_mbox2 mbox2; ///< Mailbox 2: Software SCI Interface + ///< (Offset 0x200, Size 0x100) + struct igd_opregion_mbox3 + mbox3; ///< Mailbox 3: BIOS to Driver Notification (Offset 0x300, + ///< Size 0x100) + struct igd_opregion_mbox4 mbox4; ///< Mailbox 4: Video BIOS Table (VBT) + ///< (Offset 0x400, Size 0x1800) + struct igd_opregion_mbox5 + mbox5; ///< Mailbox 5: BIOS to Driver Notification Extension (Offset + ///< 0x1C00, Size 0x400) +}; + +/// +/// VBT Header Structure +/// +struct vbt_header { + uint8_t product_string[20]; + uint16_t version; + uint16_t header_size; + uint16_t table_size; + uint8_t checksum; + uint8_t reserved1; + uint32_t bios_data_offset; + uint32_t aim_data_offset[4]; +}; + +#pragma pack(pop) + +int vm_intelgpu_get_opregion(struct vm *vm, vm_paddr_t *base, vm_paddr_t *size); diff --git a/sys/amd64/vmm/intel/intelgpu.c b/sys/amd64/vmm/intel/intelgpu.c new file mode 100644 --- /dev/null +++ b/sys/amd64/vmm/intel/intelgpu.c @@ -0,0 +1,78 @@ +/*- + * 
SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR OR CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include + +#include +#include + +#include "intelgpu.h" + +#define KB (1024UL) + +int +vm_intelgpu_get_opregion(struct vm *vm, vm_paddr_t *base, vm_paddr_t *size) +{ + /* intel graphics device is always located at 0:2.0 */ + device_t dev = pci_find_bsf(0, 2, 0); + if (dev == NULL) { + return (ENOENT); + } + + if ((pci_get_vendor(dev) != PCI_VENDOR_INTEL) || + (pci_get_class(dev) != PCIC_DISPLAY) || + (pci_get_subclass(dev) != PCIS_DISPLAY_VGA)) { + return (ENODEV); + } + + uint64_t asls = pci_read_config(dev, PCIR_ASLS_CTL, 4); + + struct igd_opregion_header *opregion_header = + (struct igd_opregion_header *)pmap_map(NULL, asls, + asls + sizeof(*opregion_header), VM_PROT_READ); + if (opregion_header == NULL || + memcmp(opregion_header->sign, IGD_OPREGION_HEADER_SIGN, + sizeof(opregion_header->sign))) { + return (ENODEV); + } + + *base = asls; + *size = opregion_header->size * KB; + + return (0); +} diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -134,7 +134,7 @@ bool sysmem; struct vm_object *object; }; -#define VM_MAX_MEMSEGS 3 +#define VM_MAX_MEMSEGS 4 struct mem_map { vm_paddr_t gpa; diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -60,6 +60,7 @@ #include #include +#include "intel/intelgpu.h" #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" @@ -366,6 +367,7 @@ struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; + struct vm_memory_region_info *memory_region_info; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; struct vm_nmi *vmnmi; @@ -533,6 +535,24 @@ error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len); break; + case VM_GET_MEMORY_REGION_INFO: + memory_region_info = (struct vm_memory_region_info *)data; + 
switch (memory_region_info->type) { + case MEMORY_REGION_INTEL_GSM: + memory_region_info->base = intel_graphics_stolen_base; + memory_region_info->size = intel_graphics_stolen_size; + error = 0; + break; + case MEMORY_REGION_INTEL_OPREGION: + error = vm_intelgpu_get_opregion(sc->vm, + &memory_region_info->base, + &memory_region_info->size); + break; + default: + error = EINVAL; + break; + } + break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, diff --git a/sys/dev/pci/pcireg.h b/sys/dev/pci/pcireg.h --- a/sys/dev/pci/pcireg.h +++ b/sys/dev/pci/pcireg.h @@ -1098,3 +1098,14 @@ #define PCIM_OSC_CTL_PCIE_PME 0x04 /* PCIe Native Power Mgt Events */ #define PCIM_OSC_CTL_PCIE_AER 0x08 /* PCIe Advanced Error Reporting */ #define PCIM_OSC_CTL_PCIE_CAP_STRUCT 0x10 /* Various Capability Structures */ + +/* + * Intel graphics device definitions + */ +#define PCIR_BDSM 0x5C /* Base of Data Stolen Memory register */ +#define PCIR_ASLS_CTL 0xFC /* Opregion start address register */ + +/* + * PCI Vendors + */ +#define PCI_VENDOR_INTEL 0x8086 diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -42,6 +42,7 @@ # intel-specific files .PATH: ${SRCTOP}/sys/amd64/vmm/intel SRCS+= ept.c \ + intelgpu.c \ vmcs.c \ vmx_msr.c \ vmx_support.S \ diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -15,6 +15,7 @@ BHYVE_SYSDIR?=${SRCTOP} SRCS= \ + acpi_device.c \ atkbdc.c \ acpi.c \ audio.c \ @@ -25,7 +26,7 @@ console.c \ ctl_util.c \ ctl_scsi_all.c \ - fwctl.c \ + e820.c \ gdb.c \ hda_codec.c \ inout.c \ @@ -41,6 +42,7 @@ pci_emul.c \ pci_hda.c \ pci_fbuf.c \ + pci_gvt-d.c \ pci_hostbridge.c \ pci_irq.c \ pci_lpc.c \ @@ -59,6 +61,7 @@ post.c \ ps2kbd.c \ ps2mouse.c \ + qemu_fwcfg.c \ rfb.c \ rtc.c \ smbiostbl.c \ diff --git a/usr.sbin/bhyve/acpi.h b/usr.sbin/bhyve/acpi.h --- 
a/usr.sbin/bhyve/acpi.h +++ b/usr.sbin/bhyve/acpi.h @@ -31,6 +31,8 @@ #ifndef _ACPI_H_ #define _ACPI_H_ +#include "acpi_device.h" + #define SCI_INT 9 #define SMI_CMD 0xb2 @@ -55,6 +57,7 @@ int acpi_build(struct vmctx *ctx, int ncpu); void acpi_raise_gpe(struct vmctx *ctx, unsigned bit); +int acpi_tables_add_device(const struct acpi_device *const dev); void dsdt_line(const char *fmt, ...); void dsdt_fixed_ioport(uint16_t iobase, uint16_t length); void dsdt_fixed_irq(uint8_t irq); diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c --- a/usr.sbin/bhyve/acpi.c +++ b/usr.sbin/bhyve/acpi.c @@ -139,6 +139,30 @@ #define EFFLUSH(x) \ if (fflush(x) != 0) goto err_exit; +/* + * Extra ACPI devices (a TPM, for example) that were registered for the DSDT. + */ +struct acpi_device_list_entry { + SLIST_ENTRY(acpi_device_list_entry) chain; + const struct acpi_device *dev; +}; +SLIST_HEAD(acpi_device_list, + acpi_device_list_entry) acpi_devices = SLIST_HEAD_INITIALIZER(acpi_devices); + +int +acpi_tables_add_device(const struct acpi_device *const dev) +{ + struct acpi_device_list_entry *node; + + node = calloc(1, sizeof(*node)); + if (node == NULL) + return (ENOMEM); + node->dev = dev; + /* Head insertion: the list is later walked newest-first. */ + SLIST_INSERT_HEAD(&acpi_devices, node, chain); + return (0); +} + static int basl_fwrite_rsdp(FILE *fp) { @@ -760,6 +784,11 @@ vmgenc_write_dsdt(); + const struct acpi_device_list_entry *entry; + SLIST_FOREACH(entry, &acpi_devices, chain) { + acpi_device_write_dsdt(entry->dev); + } + dsdt_line("}"); if (dsdt_error != 0) diff --git a/usr.sbin/bhyve/acpi_device.h b/usr.sbin/bhyve/acpi_device.h new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/acpi_device.h @@ -0,0 +1,42 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * Author: Corvin Köhne + */ + +#pragma once + +#include + +struct vmctx; + +struct acpi_device; + +/** + * Creates an ACPI device. + * + * @param[out] new_dev Returns the newly created ACPI device. 
+ * @param[in] vm_ctx VM context the ACPI device is created in. + * @param[in] name Name of the ACPI device. Should always be a NULL + * terminated string. + * @param[in] hid Hardware ID of the ACPI device. Should always be a NULL + * terminated string. + */ +int acpi_device_create(struct acpi_device **const new_dev, + struct vmctx *const vm_ctx, const char *const name, const char *const hid); +void acpi_device_destroy(struct acpi_device *const dev); + +/** + * @note: acpi_device_add_res_acpi_buffer doesn't ensure that no resources are + * added on an error condition. On error the caller should assume that + * the ACPI_BUFFER is partially added to the ACPI device. + */ +int acpi_device_add_res_acpi_buffer(struct acpi_device *const dev, + const ACPI_BUFFER resources); +int acpi_device_add_res_fixed_ioport(struct acpi_device *const dev, + const UINT16 port, UINT8 length); +int acpi_device_add_res_fixed_memory32(struct acpi_device *const dev, + const UINT8 write_protected, const UINT32 address, const UINT32 length); + +void acpi_device_write_dsdt(const struct acpi_device *const dev); diff --git a/usr.sbin/bhyve/acpi_device.c b/usr.sbin/bhyve/acpi_device.c new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/acpi_device.c @@ -0,0 +1,260 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * Author: Corvin Köhne + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include + +#include +#include +#include + +#include "acpi.h" +#include "acpi_device.h" + +/** + * List entry to enumerate all resources used by an ACPI device. + * + * @param chain Used to chain multiple elements together. + * @param type Type of the ACPI resource. + * @param data Data of the ACPI resource. + */ +struct acpi_resource_list_entry { + SLIST_ENTRY(acpi_resource_list_entry) chain; + UINT32 type; + ACPI_RESOURCE_DATA data; +}; + +/** + * Holds information about an ACPI device. 
+ * + * @param vm_ctx VM context the ACPI device was created in. + * @param name Name of the ACPI device. + * @param hid Hardware ID of the ACPI device. + * @param crs Current resources used by the ACPI device. + */ +struct acpi_device { + struct vmctx *vm_ctx; + const char *name; + const char *hid; + SLIST_HEAD(acpi_resource_list, acpi_resource_list_entry) crs; +}; + +int +acpi_device_create(struct acpi_device **const new_dev, + struct vmctx *const vm_ctx, const char *const name, const char *const hid) +{ + if (new_dev == NULL || vm_ctx == NULL || name == NULL || hid == NULL) { + return (EINVAL); + } + + struct acpi_device *const dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + return (ENOMEM); + } + + dev->vm_ctx = vm_ctx; + dev->name = name; + dev->hid = hid; + SLIST_INIT(&dev->crs); + + /* current resources always contain an end tag */ + struct acpi_resource_list_entry *const crs_end_tag = calloc(1, + sizeof(*crs_end_tag)); + if (crs_end_tag == NULL) { + acpi_device_destroy(dev); + return (ENOMEM); + } + crs_end_tag->type = ACPI_RESOURCE_TYPE_END_TAG; + SLIST_INSERT_HEAD(&dev->crs, crs_end_tag, chain); + + const int error = acpi_tables_add_device(dev); + if (error) { + acpi_device_destroy(dev); + return (error); + } + + *new_dev = dev; + + return (0); +} + +/* + * Frees all resource entries of dev and dev itself; NULL is ignored. + * NOTE(review): dev is not unlinked from the list acpi_tables_add_device() + * put it on, so a successfully created device should presumably stay alive + * until the DSDT has been written - confirm against callers. + */ +void +acpi_device_destroy(struct acpi_device *const dev) +{ + if (dev == NULL) { + return; + } + + struct acpi_resource_list_entry *res; + while (!SLIST_EMPTY(&dev->crs)) { + res = SLIST_FIRST(&dev->crs); + SLIST_REMOVE_HEAD(&dev->crs, chain); + free(res); + } + + /* The device was calloc()ed by acpi_device_create(). */ + free(dev); +} + +int +acpi_device_add_res_acpi_buffer(struct acpi_device *const dev, + const ACPI_BUFFER resources) +{ + if (dev == NULL) { + return (EINVAL); + } + + int error = 0; + size_t offset = 0; + while (offset < resources.Length) { + const ACPI_RESOURCE *const res = + (const ACPI_RESOURCE *)((UINT8 *)resources.Pointer + + offset); + /* A zero-length entry would never advance offset. */ + if (res->Length == 0) { + warnx("%s: resource with zero length", __func__); + return (EINVAL); + } + switch (res->Type) { + case ACPI_RESOURCE_TYPE_FIXED_IO: + error = acpi_device_add_res_fixed_ioport(dev, + 
res->Data.FixedIo.Address, + res->Data.FixedIo.AddressLength); + break; + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: + error = acpi_device_add_res_fixed_memory32(dev, + res->Data.FixedMemory32.WriteProtect, + res->Data.FixedMemory32.Address, + res->Data.FixedMemory32.AddressLength); + break; + case ACPI_RESOURCE_TYPE_END_TAG: + break; + default: + warnx("%s: unknown resource type %d", __func__, + res->Type); + return (ENODEV); + } + if (error) { + break; + } + offset += res->Length; + } + + return (error); +} + +/* + * Prepends a fixed I/O port resource to the device's resource list. + */ +int +acpi_device_add_res_fixed_ioport(struct acpi_device *const dev, + const UINT16 port, const UINT8 length) +{ + if (dev == NULL) { + return (EINVAL); + } + + struct acpi_resource_list_entry *const res = calloc(1, sizeof(*res)); + if (res == NULL) { + return (ENOMEM); + } + + res->type = ACPI_RESOURCE_TYPE_FIXED_IO; + res->data.FixedIo.Address = port; + res->data.FixedIo.AddressLength = length; + + SLIST_INSERT_HEAD(&dev->crs, res, chain); + + return (0); +} + +/* + * Prepends a 32 bit fixed memory resource to the device's resource list. + */ +int +acpi_device_add_res_fixed_memory32(struct acpi_device *const dev, + const UINT8 write_protected, const UINT32 address, const UINT32 length) +{ + if (dev == NULL) { + return (EINVAL); + } + + struct acpi_resource_list_entry *const res = calloc(1, sizeof(*res)); + if (res == NULL) { + return (ENOMEM); + } + + res->type = ACPI_RESOURCE_TYPE_FIXED_MEMORY32; + res->data.FixedMemory32.WriteProtect = write_protected; + res->data.FixedMemory32.Address = address; + res->data.FixedMemory32.AddressLength = length; + + SLIST_INSERT_HEAD(&dev->crs, res, chain); + + return (0); +} + +static void +acpi_device_write_dsdt_crs(const struct acpi_device *const dev) +{ + const struct acpi_resource_list_entry *res; + SLIST_FOREACH (res, &dev->crs, chain) { + switch (res->type) { + case ACPI_RESOURCE_TYPE_FIXED_IO: + dsdt_fixed_ioport(res->data.FixedIo.Address, + res->data.FixedIo.AddressLength); + break; + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: { + dsdt_fixed_mem32(res->data.FixedMemory32.Address, + 
res->data.FixedMemory32.AddressLength); + break; + } + case ACPI_RESOURCE_TYPE_END_TAG: + break; + default: + warnx("%s: unknown resource type %d", __func__, + res->type); + return; + } + } +} + +void +acpi_device_write_dsdt(const struct acpi_device *const dev) +{ + if (dev == NULL) { + return; + } + + dsdt_line(""); + dsdt_line(" Scope (\\_SB)"); + dsdt_line(" {"); + dsdt_line(" Device (%s)", dev->name); + dsdt_line(" {"); + dsdt_line(" Name (_HID, \"%s\")", dev->hid); + dsdt_line(" Name (_STA, 0x0F)"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(4); + acpi_device_write_dsdt_crs(dev); + dsdt_unindent(4); + dsdt_line(" })"); + dsdt_line(" }"); + dsdt_line(" }"); +} diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -409,6 +409,11 @@ and .Ar function numbers. +.It Li rom= Ns Ar romfile +Add +.Ar romfile +as option ROM to the PCI device. +The ROM will be loaded by firmware and should be capable of initializing the device. .El .Pp Guest memory must be wired using the diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -89,7 +89,7 @@ #include "bootrom.h" #include "inout.h" #include "debug.h" -#include "fwctl.h" +#include "e820.h" #include "gdb.h" #include "ioapic.h" #include "kernemu_dev.h" @@ -99,6 +99,7 @@ #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" +#include "qemu_fwcfg.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" @@ -1296,6 +1297,41 @@ rtc_init(ctx, rtc_localtime); sci_init(ctx); + if (qemu_fwcfg_init(ctx) != 0) { + fprintf(stderr, "qemu fwcfg initialization error"); + exit(4); + } + + /* + * QEMU uses fwcfg item 0x0f (FW_CFG_MAX_CPUS) to report the number of + * cpus to the guest but states that it has a special meaning for x86. + * Don't know yet if that can cause unintented side-effects. Use an own + * fwcfg item to be safe. 
+ * + * QEMU comment: + * FW_CFG_MAX_CPUS is a bit confusing/problematic on x86: + * + * For machine types prior to 1.8, SeaBIOS needs FW_CFG_MAX_CPUS + * for building MPTable, ACPI MADT, ACPI CPU hotplug and ACPI SRAT + * table, that tables are based on xAPIC ID and QEMU<->SeaBIOS + * interface for CPU hotplug also uses APIC ID and not "CPU index". + * This means that FW_CFG_MAX_CPUS is not the "maximum number of + * CPUs", but the "limit to the APIC ID values SeaBIOS may see". + * + * So for compatibility reasons with old BIOSes we are stuck with + * "etc/max-cpus" actually being apic_id_limit + */ + if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), + &guest_ncpus) != 0) { + fprintf(stderr, "could not add qemu fwcfg opt/bhyve/hw.ncpu"); + exit(4); + } + + if (e820_init(ctx) != 0) { + fprintf(stderr, "Unable to setup E820"); + exit(4); + } + /* * Exit if a device emulation finds an error in its initilization */ @@ -1380,8 +1416,17 @@ assert(error == 0); } - if (lpc_bootrom()) - fwctl_init(); + struct qemu_fwcfg_item *fwcfg_item = e820_get_fwcfg_item(); + if (fwcfg_item == NULL) { + fprintf(stderr, "invalid e820 table"); + exit(4); + } + if (qemu_fwcfg_add_file("etc/e820", fwcfg_item->size, + fwcfg_item->data) != 0) { + fprintf(stderr, "could not add qemu fwcfg etc/e820"); + exit(4); + } + free(fwcfg_item); /* * Change the proc title to include the VM name. diff --git a/usr.sbin/bhyve/e820.h b/usr.sbin/bhyve/e820.h new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/e820.h @@ -0,0 +1,71 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR OR CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#pragma once + +#include + +#include "qemu_fwcfg.h" + +#pragma pack(push, 1) + +enum e820_memory_type { + E820_TYPE_MEMORY = 1, + E820_TYPE_RESERVED = 2, + E820_TYPE_ACPI = 3, + E820_TYPE_NVS = 4 +}; + +enum e820_allocation_strategy { + /* allocate any address */ + E820_ALLOCATE_ANY, + /* allocate lowest address larger than address */ + E820_ALLOCATE_LOWEST, + /* allocate highest address lower than address */ + E820_ALLOCATE_HIGHEST, + /* allocate a specific address */ + E820_ALLOCATE_SPECIFIC +}; + +struct e820_entry { + uint64_t base; /* first address of the range */ + uint64_t length; /* filled with end - base by e820_get_fwcfg_item() */ + enum e820_memory_type type; /* NOTE(review): the enum's size fixes the entry size - confirm 4 bytes */ +}; + +#pragma pack(pop) + +#define E820_ALIGNMENT_NONE 1 + +uint64_t e820_alloc(uint64_t address, uint64_t length, uint64_t alignment, + enum e820_memory_type type, enum e820_allocation_strategy strategy); +void e820_dump_table(void); +struct qemu_fwcfg_item *e820_get_fwcfg_item(void); +int e820_init(struct vmctx *ctx); diff --git a/usr.sbin/bhyve/e820.c b/usr.sbin/bhyve/e820.c new file 
mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/e820.c @@ -0,0 +1,471 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR OR CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "e820.h" +#include "qemu_fwcfg.h" + +/* + * E820 always uses 64 bit entries. Emulation code will use vm_paddr_t since it + * works on physical addresses. If vm_paddr_t is larger than uint64_t E820 can't + * hold all possible physical addresses and we can get into trouble. 
+ */ +static_assert(sizeof(vm_paddr_t) <= sizeof(uint64_t), + "Unable to represent physical memory by E820 table"); + +#define E820_FWCFG_FILE_NAME "etc/e820" + +#define KB (1024UL) +#define MB (1024 * KB) +#define GB (1024 * MB) + +/* + * Fix E820 memory holes: + * [ A0000, C0000) VGA + * [ C0000, 100000) ROM + */ +#define E820_VGA_MEM_BASE 0xA0000 +#define E820_VGA_MEM_END 0xC0000 +#define E820_ROM_MEM_BASE 0xC0000 +#define E820_ROM_MEM_END 0x100000 + +struct e820_element { + TAILQ_ENTRY(e820_element) chain; + uint64_t base; + uint64_t end; + enum e820_memory_type type; +}; +TAILQ_HEAD(e820_table, e820_element) e820_table = TAILQ_HEAD_INITIALIZER( + e820_table); + +/* + * Maps an E820 memory type to the label printed by e820_dump_table(). + */ +static const char * +e820_get_type_name(enum e820_memory_type type) +{ + switch (type) { + case E820_TYPE_MEMORY: + return "RAM "; + case E820_TYPE_RESERVED: + return "Reserved"; + case E820_TYPE_ACPI: + return "ACPI "; + case E820_TYPE_NVS: + return "NVS "; + default: + return "Unknown "; + } +} + +/* + * Prints every table element (index, base, end, type label) to stderr. + */ +void +e820_dump_table(void) +{ + fprintf(stderr, "E820 map:\n\r"); + uint64_t i = 0; + struct e820_element *element; + TAILQ_FOREACH (element, &e820_table, chain) { + fprintf(stderr, " (%4lu) [ %16lx, %16lx] %s\n\r", i, + element->base, element->end, + e820_get_type_name(element->type)); + ++i; + } +} + +/* + * Builds a newly allocated qemu_fwcfg_item whose data holds one struct + * e820_entry per table element. Returns NULL if an allocation fails; nothing + * here keeps a reference to the item or its data buffer. + */ +struct qemu_fwcfg_item * +e820_get_fwcfg_item(void) +{ + uint64_t count = 0; + struct e820_element *element; + TAILQ_FOREACH (element, &e820_table, chain) { + ++count; + } + + struct qemu_fwcfg_item *fwcfg_item = malloc( + sizeof(struct qemu_fwcfg_item)); + if (fwcfg_item == NULL) { + return (NULL); + } + fwcfg_item->size = count * sizeof(struct e820_entry); + fwcfg_item->data = malloc(fwcfg_item->size); + if (fwcfg_item->data == NULL) { + free(fwcfg_item); + return (NULL); + } + uint64_t i = 0; + struct e820_entry *entries = (struct e820_entry *)fwcfg_item->data; + TAILQ_FOREACH (element, &e820_table, chain) { + struct e820_entry *entry = &entries[i]; + entry->base = element->base; + entry->length = element->end - 
element->base; + entry->type = element->type; + ++i; + } + + return (fwcfg_item); +} + +int +e820_add_entry(uint64_t base, uint64_t end, enum e820_memory_type type) +{ + if (end < base) { + return (-1); + } + + struct e820_element *new_element = malloc(sizeof(struct e820_element)); + if (new_element == NULL) { + return (-ENOMEM); + } + + new_element->base = base; + new_element->end = end; + new_element->type = type; + + /* + * E820 table should be always sorted in ascending order. Therefore, + * search for an element which end is larger than the base parameter. + */ + struct e820_element *element; + TAILQ_FOREACH (element, &e820_table, chain) { + if (element->end > base) { + break; + } + } + + /* + * System memory requires special handling. + */ + if (type == E820_TYPE_MEMORY) { + /* + * base is larger than of any existing element. Add new system + * memory at the end of the table. + */ + if (element == NULL) { + TAILQ_INSERT_TAIL(&e820_table, new_element, chain); + return (0); + } + + /* + * System memory shouldn't overlap with any existing element. + */ + if (end > element->base) { + return (-1); + } + TAILQ_INSERT_BEFORE(element, new_element, chain); + return (0); + } + + if (element == NULL) { + /* No suitable element found */ + return (-1); + } + + /* + * Non system memory should be allocated inside system memory. + */ + if (element->type != E820_TYPE_MEMORY) { + return (-1); + } + /* + * New element should fit into existing system memory element. + */ + if (base < element->base || end > element->end) { + return (-1); + } + + if (base == element->base) { + /* + * New element at system memory base boundary. Add new + * element before current and adjust the base of the old + * element.
+ * + * Old table: + * [ 0x1000, 0x4000] RAM <-- element + * New table: + * [ 0x1000, 0x2000] Reserved + * [ 0x2000, 0x4000] RAM <-- element + */ + TAILQ_INSERT_BEFORE(element, new_element, chain); + element->base = end; + } else if (end == element->end) { + /* + * New element at system memory end boundary. Add new + * element after current and adjust the end of the + * current element. + * + * Old table: + * [ 0x1000, 0x4000] RAM <-- element + * New table: + * [ 0x1000, 0x3000] RAM <-- element + * [ 0x3000, 0x4000] Reserved + */ + TAILQ_INSERT_AFTER(&e820_table, element, new_element, chain); + element->end = base; + } else { + /* + * New element inside system memory entry. Split it by + * adding a system memory element and the new element + * before current. + * + * Old table: + * [ 0x1000, 0x4000] RAM <-- element + * New table: + * [ 0x1000, 0x2000] RAM + * [ 0x2000, 0x3000] Reserved + * [ 0x3000, 0x4000] RAM <-- element + */ + struct e820_element *ram_element = malloc( + sizeof(struct e820_element)); + if (ram_element == NULL) { + return (-ENOMEM); + } + ram_element->base = element->base; + ram_element->end = base; + ram_element->type = E820_TYPE_MEMORY; + TAILQ_INSERT_BEFORE(element, ram_element, chain); + TAILQ_INSERT_BEFORE(element, new_element, chain); + element->base = end; + } + + return (0); +} + +int +e820_add_memory_hole(uint64_t base, uint64_t end) +{ + if (end < base) { + return (-1); + } + + /* + * E820 table should be always sorted in ascending order. Therefore, + * search for an element which end is larger than the base parameter. + */ + struct e820_element *element; + TAILQ_FOREACH (element, &e820_table, chain) { + if (element->end > base) { + break; + } + } + + if (element == NULL || end <= element->base) { + /* Nothing to do. 
Hole already exists */ + return (0); + } + + if (element->type != E820_TYPE_MEMORY) { + /* Memory holes are only allowed in system memory */ + return (-1); + } + + if (base == element->base) { + /* + * New hole at system memory base boundary. + * + * Old table: + * [ 0x1000, 0x4000] RAM + * New table: + * [ 0x2000, 0x4000] RAM + */ + element->base = end; + + } else if (end == element->end) { + /* + * New hole at system memory end boundary. + * + * Old table: + * [ 0x1000, 0x4000] RAM + * New table: + * [ 0x1000, 0x3000] RAM + */ + element->end = base; + + } else { + /* + * New hole inside system memory entry. Split the system memory. + * + * Old table: + * [ 0x1000, 0x4000] RAM <-- element + * New table: + * [ 0x1000, 0x2000] RAM + * [ 0x3000, 0x4000] RAM <-- element + */ + struct e820_element *ram_element = malloc( + sizeof(struct e820_element)); + if (ram_element == NULL) { + return (-ENOMEM); + } + ram_element->base = element->base; + ram_element->end = base; + ram_element->type = E820_TYPE_MEMORY; + TAILQ_INSERT_BEFORE(element, ram_element, chain); + element->base = end; + } + + return (0); +} + +uint64_t +e820_alloc(uint64_t address, uint64_t length, uint64_t alignment, + enum e820_memory_type type, enum e820_allocation_strategy strategy) +{ + /* address should be aligned */ + if (!powerof2(alignment) || (address & (alignment - 1)) != 0) { + return 0; + } + + struct e820_element *element; + uint64_t end; + uint64_t base; + switch (strategy) { + case E820_ALLOCATE_ANY: + /* + * Allocate any address. Therefore, ignore the address parameter + * and reuse the code path for allocating the lowest address. 
+ */ + address = 0; + /* fallthrough */ + case E820_ALLOCATE_LOWEST: + TAILQ_FOREACH (element, &e820_table, chain) { + end = element->end; + base = roundup2(element->base, alignment); + if (address != 0) { + base = MAX(base, address); + } + + if (element->type != E820_TYPE_MEMORY || end < base || + end - base < length || base == 0) { + continue; + } + + if (e820_add_entry(base, base + length, type) != 0) { + return 0; + } + + return base; + } + break; + case E820_ALLOCATE_HIGHEST: + TAILQ_FOREACH_REVERSE (element, &e820_table, e820_table, + chain) { + end = element->end; + base = roundup2(element->base, alignment); + if (address != 0) { + end = MIN(end, address); + } + + if (element->type != E820_TYPE_MEMORY || end < base || + end - base < length || end - length == 0) { + continue; + } + base = rounddown2(end - length, alignment); + + if (e820_add_entry(base, base + length, type) != 0) { + return 0; + } + + return base; + } + break; + case E820_ALLOCATE_SPECIFIC: + base = address; + if (e820_add_entry(base, base + length, type) != 0) { + return 0; + } + + return address; + } + + return 0; +} + +int +e820_init(struct vmctx *ctx) +{ + int error; + + TAILQ_INIT(&e820_table); + + /* add memory below 4 GB to E820 table */ + const uint64_t lowmem_length = vm_get_lowmem_size(ctx); + error = e820_add_entry(0, lowmem_length, E820_TYPE_MEMORY); + if (error) { + warnx("%s: Could not add lowmem", __func__); + return (error); + } + + /* add memory above 4 GB to E820 table */ + const uint64_t highmem_length = vm_get_highmem_size(ctx); + if (highmem_length != 0) { + error = e820_add_entry(4 * GB, 4 * GB + highmem_length, + E820_TYPE_MEMORY); + if (error) { + warnx("%s: Could not add highmem", __func__); + return (error); + } + } + + /* add memory holes to E820 table */ + error = e820_add_memory_hole(E820_VGA_MEM_BASE, E820_VGA_MEM_END); + if (error) { + warnx("%s: Could not add VGA memory", __func__); + return (error); + } + + error = e820_add_memory_hole(E820_ROM_MEM_BASE, 
E820_ROM_MEM_END); + if (error) { + warnx("%s: Could not add ROM area", __func__); + return (error); + } + + return (0); +} diff --git a/usr.sbin/bhyve/fwctl.c b/usr.sbin/bhyve/fwctl.c deleted file mode 100644 --- a/usr.sbin/bhyve/fwctl.c +++ /dev/null @@ -1,552 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2015 Peter Grehan - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -/* - * Guest firmware interface. Uses i/o ports x510/x511 as Qemu does, - * but with a request/response messaging protocol. 
- */ -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "bhyverun.h" -#include "inout.h" -#include "fwctl.h" - -/* - * Messaging protocol base operations - */ -#define OP_NULL 1 -#define OP_ECHO 2 -#define OP_GET 3 -#define OP_GET_LEN 4 -#define OP_SET 5 -#define OP_MAX OP_SET - -/* I/O ports */ -#define FWCTL_OUT 0x510 -#define FWCTL_IN 0x511 - -/* - * Back-end state-machine - */ -enum state { - DORMANT, - IDENT_WAIT, - IDENT_SEND, - REQ, - RESP -} be_state = DORMANT; - -static uint8_t sig[] = { 'B', 'H', 'Y', 'V' }; -static u_int ident_idx; - -struct op_info { - int op; - int (*op_start)(uint32_t len); - void (*op_data)(uint32_t data, uint32_t len); - int (*op_result)(struct iovec **data); - void (*op_done)(struct iovec *data); -}; -static struct op_info *ops[OP_MAX+1]; - -/* Return 0-padded uint32_t */ -static uint32_t -fwctl_send_rest(uint32_t *data, size_t len) -{ - union { - uint8_t c[4]; - uint32_t w; - } u; - uint8_t *cdata; - int i; - - cdata = (uint8_t *) data; - u.w = 0; - - for (i = 0, u.w = 0; i < len; i++) - u.c[i] = *cdata++; - - return (u.w); -} - -/* - * error op dummy proto - drop all data sent and return an error -*/ -static int errop_code; - -static void -errop_set(int err) -{ - - errop_code = err; -} - -static int -errop_start(uint32_t len) -{ - errop_code = ENOENT; - - /* accept any length */ - return (errop_code); -} - -static void -errop_data(uint32_t data, uint32_t len) -{ - - /* ignore */ -} - -static int -errop_result(struct iovec **data) -{ - - /* no data to send back; always successful */ - *data = NULL; - return (errop_code); -} - -static void -errop_done(struct iovec *data) -{ - - /* assert data is NULL */ -} - -static struct op_info errop_info = { - .op_start = errop_start, - .op_data = errop_data, - .op_result = errop_result, - .op_done = errop_done -}; - -/* OID search */ -SET_DECLARE(ctl_set, struct ctl); - -CTL_NODE("hw.ncpu", &guest_ncpus, 
sizeof(guest_ncpus)); - -static struct ctl * -ctl_locate(const char *str, int maxlen) -{ - struct ctl *cp, **cpp; - - SET_FOREACH(cpp, ctl_set) { - cp = *cpp; - if (!strncmp(str, cp->c_oid, maxlen)) - return (cp); - } - - return (NULL); -} - -/* uefi-sysctl get-len */ -#define FGET_STRSZ 80 -static struct iovec fget_biov[2]; -static char fget_str[FGET_STRSZ]; -static struct { - size_t f_sz; - uint32_t f_data[1024]; -} fget_buf; -static int fget_cnt; -static size_t fget_size; - -static int -fget_start(uint32_t len) -{ - - if (len > FGET_STRSZ) - return(E2BIG); - - fget_cnt = 0; - - return (0); -} - -static void -fget_data(uint32_t data, uint32_t len) -{ - - *((uint32_t *) &fget_str[fget_cnt]) = data; - fget_cnt += sizeof(uint32_t); -} - -static int -fget_result(struct iovec **data, int val) -{ - struct ctl *cp; - int err; - - err = 0; - - /* Locate the OID */ - cp = ctl_locate(fget_str, fget_cnt); - if (cp == NULL) { - *data = NULL; - err = ENOENT; - } else { - if (val) { - /* For now, copy the len/data into a buffer */ - memset(&fget_buf, 0, sizeof(fget_buf)); - fget_buf.f_sz = cp->c_len; - memcpy(fget_buf.f_data, cp->c_data, cp->c_len); - fget_biov[0].iov_base = (char *)&fget_buf; - fget_biov[0].iov_len = sizeof(fget_buf.f_sz) + - cp->c_len; - } else { - fget_size = cp->c_len; - fget_biov[0].iov_base = (char *)&fget_size; - fget_biov[0].iov_len = sizeof(fget_size); - } - - fget_biov[1].iov_base = NULL; - fget_biov[1].iov_len = 0; - *data = fget_biov; - } - - return (err); -} - -static void -fget_done(struct iovec *data) -{ - - /* nothing needs to be freed */ -} - -static int -fget_len_result(struct iovec **data) -{ - return (fget_result(data, 0)); -} - -static int -fget_val_result(struct iovec **data) -{ - return (fget_result(data, 1)); -} - -static struct op_info fgetlen_info = { - .op_start = fget_start, - .op_data = fget_data, - .op_result = fget_len_result, - .op_done = fget_done -}; - -static struct op_info fgetval_info = { - .op_start = fget_start, - 
.op_data = fget_data, - .op_result = fget_val_result, - .op_done = fget_done -}; - -static struct req_info { - int req_error; - u_int req_count; - uint32_t req_size; - uint32_t req_type; - uint32_t req_txid; - struct op_info *req_op; - int resp_error; - int resp_count; - size_t resp_size; - size_t resp_off; - struct iovec *resp_biov; -} rinfo; - -static void -fwctl_response_done(void) -{ - - (*rinfo.req_op->op_done)(rinfo.resp_biov); - - /* reinit the req data struct */ - memset(&rinfo, 0, sizeof(rinfo)); -} - -static void -fwctl_request_done(void) -{ - - rinfo.resp_error = (*rinfo.req_op->op_result)(&rinfo.resp_biov); - - /* XXX only a single vector supported at the moment */ - rinfo.resp_off = 0; - if (rinfo.resp_biov == NULL) { - rinfo.resp_size = 0; - } else { - rinfo.resp_size = rinfo.resp_biov[0].iov_len; - } -} - -static int -fwctl_request_start(void) -{ - int err; - - /* Data size doesn't include header */ - rinfo.req_size -= 12; - - rinfo.req_op = &errop_info; - if (rinfo.req_type <= OP_MAX && ops[rinfo.req_type] != NULL) - rinfo.req_op = ops[rinfo.req_type]; - - err = (*rinfo.req_op->op_start)(rinfo.req_size); - - if (err) { - errop_set(err); - rinfo.req_op = &errop_info; - } - - /* Catch case of zero-length message here */ - if (rinfo.req_size == 0) { - fwctl_request_done(); - return (1); - } - - return (0); -} - -static int -fwctl_request_data(uint32_t value) -{ - - /* Make sure remaining size is >= 0 */ - if (rinfo.req_size <= sizeof(uint32_t)) - rinfo.req_size = 0; - else - rinfo.req_size -= sizeof(uint32_t); - - (*rinfo.req_op->op_data)(value, rinfo.req_size); - - if (rinfo.req_size < sizeof(uint32_t)) { - fwctl_request_done(); - return (1); - } - - return (0); -} - -static int -fwctl_request(uint32_t value) -{ - - int ret; - - ret = 0; - - switch (rinfo.req_count) { - case 0: - /* Verify size */ - if (value < 12) { - printf("msg size error"); - exit(4); - } - rinfo.req_size = value; - rinfo.req_count = 1; - break; - case 1: - rinfo.req_type = value; 
- rinfo.req_count++; - break; - case 2: - rinfo.req_txid = value; - rinfo.req_count++; - ret = fwctl_request_start(); - break; - default: - ret = fwctl_request_data(value); - break; - } - - return (ret); -} - -static int -fwctl_response(uint32_t *retval) -{ - uint32_t *dp; - ssize_t remlen; - - switch(rinfo.resp_count) { - case 0: - /* 4 x u32 header len + data */ - *retval = 4*sizeof(uint32_t) + - roundup(rinfo.resp_size, sizeof(uint32_t)); - rinfo.resp_count++; - break; - case 1: - *retval = rinfo.req_type; - rinfo.resp_count++; - break; - case 2: - *retval = rinfo.req_txid; - rinfo.resp_count++; - break; - case 3: - *retval = rinfo.resp_error; - rinfo.resp_count++; - break; - default: - remlen = rinfo.resp_size - rinfo.resp_off; - dp = (uint32_t *) - ((uint8_t *)rinfo.resp_biov->iov_base + rinfo.resp_off); - if (remlen >= sizeof(uint32_t)) { - *retval = *dp; - } else if (remlen > 0) { - *retval = fwctl_send_rest(dp, remlen); - } - rinfo.resp_off += sizeof(uint32_t); - break; - } - - if (rinfo.resp_count > 3 && - rinfo.resp_off >= rinfo.resp_size) { - fwctl_response_done(); - return (1); - } - - return (0); -} - - -/* - * i/o port handling. 
- */ -static uint8_t -fwctl_inb(void) -{ - uint8_t retval; - - retval = 0xff; - - switch (be_state) { - case IDENT_SEND: - retval = sig[ident_idx++]; - if (ident_idx >= sizeof(sig)) - be_state = REQ; - break; - default: - break; - } - - return (retval); -} - -static void -fwctl_outw(uint16_t val) -{ - switch (be_state) { - case IDENT_WAIT: - if (val == 0) { - be_state = IDENT_SEND; - ident_idx = 0; - } - break; - default: - /* ignore */ - break; - } -} - -static uint32_t -fwctl_inl(void) -{ - uint32_t retval; - - switch (be_state) { - case RESP: - if (fwctl_response(&retval)) - be_state = REQ; - break; - default: - retval = 0xffffffff; - break; - } - - return (retval); -} - -static void -fwctl_outl(uint32_t val) -{ - - switch (be_state) { - case REQ: - if (fwctl_request(val)) - be_state = RESP; - default: - break; - } - -} - -static int -fwctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - - if (in) { - if (bytes == 1) - *eax = fwctl_inb(); - else if (bytes == 4) - *eax = fwctl_inl(); - else - *eax = 0xffff; - } else { - if (bytes == 2) - fwctl_outw(*eax); - else if (bytes == 4) - fwctl_outl(*eax); - } - - return (0); -} -INOUT_PORT(fwctl_wreg, FWCTL_OUT, IOPORT_F_INOUT, fwctl_handler); -INOUT_PORT(fwctl_rreg, FWCTL_IN, IOPORT_F_IN, fwctl_handler); - -void -fwctl_init(void) -{ - - ops[OP_GET_LEN] = &fgetlen_info; - ops[OP_GET] = &fgetval_info; - - be_state = IDENT_WAIT; -} diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -41,6 +41,8 @@ #include #define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ +#define PCI_BARMAX_WITH_ROM (PCI_BARMAX + 1) +#define PCI_ROM_IDX (PCI_BARMAX + 1) struct vmctx; struct pci_devinst; @@ -88,13 +90,15 @@ PCIBAR_IO, PCIBAR_MEM32, PCIBAR_MEM64, - PCIBAR_MEMHI64 + PCIBAR_MEMHI64, + PCIBAR_ROM, }; struct pcibar { enum pcibar_type type; /* io or memory */ uint64_t size; uint64_t addr; + 
uint8_t lobits; }; #define PI_NAMESZ 40 @@ -160,7 +164,9 @@ void *pi_arg; /* devemu-private data */ u_char pi_cfgdata[PCI_REGMAX + 1]; - struct pcibar pi_bar[PCI_BARMAX + 1]; + /* ROM is handled like a BAR */ + struct pcibar pi_bar[PCI_BARMAX_WITH_ROM + 1]; + uint64_t pi_romoffset; }; struct msicap { @@ -224,6 +230,7 @@ void pci_callback(void); int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size); +int pci_emul_alloc_rom(struct pci_devinst *pdi, uint64_t size, uint64_t *addr); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -33,12 +33,15 @@ #include #include +#include + #include #include #include #include #include +#include #include #include #include @@ -72,6 +75,8 @@ #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) +#define GB (1024 * 1024 * 1024UL) + struct funcinfo { char *fi_name; char *fi_param; @@ -101,18 +106,36 @@ SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; +static uint64_t pci_emul_iolim; +static uint64_t pci_emul_rombase; +static uint64_t pci_emul_romoffset; +static uint64_t pci_emul_romlim; static uint64_t pci_emul_membase32; +static uint64_t pci_emul_memlim32; static uint64_t pci_emul_membase64; static uint64_t pci_emul_memlim64; +struct pci_bar_allocation { + TAILQ_ENTRY(pci_bar_allocation) pci_bar_chain; + struct pci_devinst *pdi; + int idx; + enum pcibar_type type; + uint64_t size; +}; +TAILQ_HEAD(pci_bar_list, pci_bar_allocation) pci_bars = TAILQ_HEAD_INITIALIZER( + pci_bars); + #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 +#define PCI_EMUL_ROMSIZE 0x10000000 + #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 
1024 * 1024) /* 1MB per bus */ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); #define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE +#define PCI_EMUL_MEMSIZE64 (32 * GB) static struct pci_devemu *pci_emul_finddev(char *name); static void pci_lintr_route(struct pci_devinst *pi); @@ -502,6 +525,12 @@ (*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration, pi->pi_bar[idx].addr); break; + case PCIBAR_ROM: + error = 0; + if (pe->pe_baraddr != NULL) + (*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration, + pi->pi_bar[idx].addr); + break; default: error = EINVAL; break; @@ -523,6 +552,13 @@ modify_bar_registration(pi, idx, 1); } +/* Is the ROM enabled for the emulated pci device? */ +static int +romen(struct pci_devinst *pi) +{ + return (pi->pi_bar[PCI_ROM_IDX].lobits & PCIM_BIOS_ENABLE) == PCIM_BIOS_ENABLE; +} + /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) @@ -589,11 +625,11 @@ pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { - int error; - uint64_t *baseptr, limit, addr, mask, lobits, bar; - uint16_t cmd, enbit; - - assert(idx >= 0 && idx <= PCI_BARMAX); + if ((type != PCIBAR_ROM) && (idx < 0 || idx > PCI_BARMAX)) { + errx(4, "Illegal BAR idx"); + } else if ((type == PCIBAR_ROM) && (idx != PCI_ROM_IDX)) { + errx(4, "Illegal ROM idx"); + } if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ @@ -602,11 +638,89 @@ if (type == PCIBAR_IO) { if (size < 4) size = 4; + } else if (type == PCIBAR_ROM) { + if (size < ~PCIM_BIOS_ADDR_MASK + 1) + size = ~PCIM_BIOS_ADDR_MASK + 1; } else { if (size < 16) size = 16; } + /* allocate new bar */ + struct pci_bar_allocation *new_bar = malloc(sizeof(struct pci_bar_allocation)); + memset(new_bar, 0, sizeof(struct pci_bar_allocation)); + new_bar->pdi = pdi; + new_bar->idx = idx; + new_bar->type = type; + new_bar->size = size; + + /* get bar position */ + struct pci_bar_allocation *bar = NULL; + 
TAILQ_FOREACH (bar, &pci_bars, pci_bar_chain) { + if (bar->size < size) { + break; + } + } + + /* insert bar into queue */ + if (bar == NULL) { + TAILQ_INSERT_TAIL(&pci_bars, new_bar, pci_bar_chain); + } else { + TAILQ_INSERT_BEFORE(bar, new_bar, pci_bar_chain); + } + + return (0); +} + +int +pci_emul_alloc_rom(struct pci_devinst *pdi, uint64_t size, uint64_t *addr) +{ + /* allocate ROM-Space once */ + if (pci_emul_rombase == 0) { + pci_emul_rombase = (uint64_t)vm_create_devmem(pdi->pi_vmctx, VM_PCIROM, + "pcirom", PCI_EMUL_ROMSIZE); + if ((void *)pci_emul_rombase == MAP_FAILED) + return -ENOMEM; + pci_emul_romlim = pci_emul_rombase + PCI_EMUL_ROMSIZE; + pci_emul_romoffset = 0; + } + + /* round up to a power of 2 */ + uint64_t rom_size = 1UL << flsl(size); + /* ROM size should be greater than 2 KB */ + rom_size = MAX(rom_size, ~PCIM_BIOS_ADDR_MASK + 1); + + /* check if ROM fits into ROM-Space */ + if (pci_emul_romoffset + rom_size > PCI_EMUL_ROMSIZE) + return -E2BIG; + + /* allocate ROM BAR */ + const int error = pci_emul_alloc_bar(pdi, PCI_ROM_IDX, PCIBAR_ROM, rom_size); + if (error) + return error; + + /* return address */ + *addr = pci_emul_rombase + pci_emul_romoffset; + /* save offset into ROM Space */ + pdi->pi_romoffset = pci_emul_romoffset; + /* increase offset for next ROM */ + pci_emul_romoffset += rom_size; + + return (0); +} + +static int +pci_emul_assign_bar(struct pci_bar_allocation *pci_bar) +{ + struct pci_devinst *pdi = pci_bar->pdi; + int idx = pci_bar->idx; + enum pcibar_type type = pci_bar->type; + uint64_t size = pci_bar->size; + + int error; + uint64_t *baseptr, limit, addr, mask, lobits, bar; + uint16_t cmd, enbit; + switch (type) { case PCIBAR_NONE: baseptr = NULL; @@ -614,7 +728,7 @@ break; case PCIBAR_IO: baseptr = &pci_emul_iobase; - limit = PCI_EMUL_IOLIMIT; + limit = pci_emul_iolim; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; enbit = PCIM_CMD_PORTEN; @@ -633,21 +747,33 @@ mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | 
PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; - } else { - baseptr = &pci_emul_membase32; - limit = PCI_EMUL_MEMLIMIT32; - mask = PCIM_BAR_MEM_BASE; - lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; + enbit = PCIM_CMD_MEMEN; + break; } - enbit = PCIM_CMD_MEMEN; - break; + /* + * Use 32 bit BARs for small requests: + * Fallthrough into MEM32 case + */ + type = PCIBAR_MEM32; + pdi->pi_bar[idx + 1].type = PCIBAR_NONE; + /* clear 64-bit flag */ + pdi->pi_bar[idx].lobits &= ~PCIM_BAR_MEM_64; + /* [fallthrough] */ case PCIBAR_MEM32: baseptr = &pci_emul_membase32; - limit = PCI_EMUL_MEMLIMIT32; + limit = pci_emul_memlim32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; enbit = PCIM_CMD_MEMEN; break; + case PCIBAR_ROM: + /* do not claim memory for ROM. OVMF will do it for us. */ + baseptr = NULL; + limit = 0; + mask = PCIM_BIOS_ADDR_MASK; + lobits = 0; + enbit = PCIM_CMD_MEMEN; + break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); @@ -662,6 +788,13 @@ pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; + /* passthru devices are using same lobits as physical device + * they set this property + */ + if (pdi->pi_bar[idx].lobits != 0) + lobits = pdi->pi_bar[idx].lobits; + else + pdi->pi_bar[idx].lobits = lobits; /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; @@ -676,7 +809,9 @@ cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND); if ((cmd & enbit) != enbit) pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit); - register_bar(pdi, idx); + if (type != PCIBAR_ROM) { + register_bar(pdi, idx); + } return (0); } @@ -1098,25 +1233,17 @@ struct slotinfo *si; struct funcinfo *fi; size_t lowmem; - uint64_t cpu_maxphysaddr, pci_emul_memresv64; - u_int regs[4]; int bus, slot, func, error; pci_emul_iobase = PCI_EMUL_IOBASE; + pci_emul_iolim = PCI_EMUL_IOLIMIT; + pci_emul_membase32 = vm_get_lowmem_limit(ctx); + pci_emul_memlim32 = PCI_EMUL_MEMLIMIT32; - do_cpuid(0x80000008, 
regs); - cpu_maxphysaddr = 1ULL << (regs[0] & 0xff); - if (cpu_maxphysaddr > VM_MAXUSER_ADDRESS_LA48) - cpu_maxphysaddr = VM_MAXUSER_ADDRESS_LA48; - pci_emul_memresv64 = cpu_maxphysaddr / 4; - /* - * Max power of 2 that is less then - * cpu_maxphysaddr - pci_emul_memresv64. - */ - pci_emul_membase64 = 1ULL << (flsl(cpu_maxphysaddr - - pci_emul_memresv64) - 1); - pci_emul_memlim64 = cpu_maxphysaddr; + pci_emul_membase64 = 4 * GB + vm_get_highmem_size(ctx); + pci_emul_membase64 = roundup2(pci_emul_membase64, PCI_EMUL_MEMSIZE64); + pci_emul_memlim64 = pci_emul_membase64 + PCI_EMUL_MEMSIZE64; for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) @@ -1129,6 +1256,7 @@ bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; + /* first run: init devices */ for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { @@ -1144,6 +1272,18 @@ } } + /* second run: assign BARs */ + struct pci_bar_allocation *bar; + TAILQ_FOREACH (bar, &pci_bars, pci_bar_chain) { + pci_emul_assign_bar(bar); + } + /* free BARs */ + while (!TAILQ_EMPTY(&pci_bars)) { + bar = TAILQ_FIRST(&pci_bars); + TAILQ_REMOVE(&pci_bars, bar, pci_bar_chain); + free(bar); + } + /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to @@ -1717,7 +1857,7 @@ * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. */ - for (i = 0; i <= PCI_BARMAX; i++) { + for (i = 0; i <= PCI_BARMAX_WITH_ROM; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: @@ -1731,6 +1871,11 @@ unregister_bar(pi, i); } break; + case PCIBAR_ROM: + /* skip (un-)register of ROM if it disabled */ + if (pi->pi_bar[i].lobits == 0) + break; + /* fallthrough */ case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? 
*/ @@ -1851,16 +1996,21 @@ return; /* - * Special handling for write to BAR registers + * Special handling for write to BAR and ROM registers */ - if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { + if ((coff >= PCIR_BAR(0) && coff <= PCIR_BAR(PCI_BARMAX)) || + (coff >= PCIR_BIOS && coff < PCIR_BIOS + 4)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) return; - idx = (coff - PCIR_BAR(0)) / 4; + if (coff != PCIR_BIOS) { + idx = (coff - PCIR_BAR(0)) / 4; + } else { + idx = PCI_ROM_IDX; + } mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: @@ -1869,7 +2019,7 @@ case PCIBAR_IO: addr = *eax & mask; addr &= 0xffff; - bar = addr | PCIM_BAR_IO_SPACE; + bar = addr | pi->pi_bar[idx].lobits; /* * Register the new BAR value for interception */ @@ -1880,7 +2030,7 @@ break; case PCIBAR_MEM32: addr = bar = *eax & mask; - bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + bar |= pi->pi_bar[idx].lobits; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); @@ -1888,8 +2038,7 @@ break; case PCIBAR_MEM64: addr = bar = *eax & mask; - bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | - PCIM_BAR_MEM_PREFETCH; + bar |= pi->pi_bar[idx].lobits; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); @@ -1904,6 +2053,20 @@ PCIBAR_MEMHI64); } break; + case PCIBAR_ROM: + addr = bar = *eax & mask; + if (memen(pi) && romen(pi)) { + unregister_bar(pi, idx); + } + pi->pi_bar[idx].addr = addr; + pi->pi_bar[idx].lobits = *eax & + PCIM_BIOS_ENABLE; + /* romen could have changed it value */ + if (memen(pi) && romen(pi)) { + register_bar(pi, idx); + } + bar |= pi->pi_bar[idx].lobits; + break; default: assert(0); } @@ -1941,7 +2104,7 @@ } else { x = *eax; cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; - cfgoff = x & PCI_REGMAX; + cfgoff = (x & PCI_REGMAX) & ~0x03; cfgfunc = (x >> 8) & PCI_FUNCMAX; cfgslot = (x >> 11) & PCI_SLOTMAX; 
cfgbus = (x >> 16) & PCI_BUSMAX; diff --git a/usr.sbin/bhyve/pci_gvt-d.c b/usr.sbin/bhyve/pci_gvt-d.c new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/pci_gvt-d.c @@ -0,0 +1,288 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Beckhoff Automation GmbH & Co. KG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR OR CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "e820.h" +#include "inout.h" +#include "pci_passthru.h" + +#define MB (1024 * 1024UL) +#define GB (1024 * MB) + +#ifndef _PATH_MEM +#define _PATH_MEM "/dev/mem" +#endif + +/* + * PCI definitions + */ +#define PCIM_BDSM_GSM_ALIGNMENT \ + 0x00100000 /* Graphics Stolen Memory is 1 MB aligned */ + +/* GVT-d definitions */ +#define GVT_D_MAP_OPREGION 0 +#define GVT_D_MAP_GSM 1 + +static int +gvt_d_aslswrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, + int bytes, uint32_t val) +{ + struct passthru_softc *sc; + + sc = pi->pi_arg; + + struct passthru_mmio_mapping *opregion = + &sc->psc_mmio_map[GVT_D_MAP_OPREGION]; + + /* write new value to cfg space */ + if (bytes == 1) { + pci_set_cfgdata8(pi, coff, val); + } else if (bytes == 2) { + pci_set_cfgdata16(pi, coff, val); + } else { + pci_set_cfgdata32(pi, coff, val); + } + + /* get new address of opregion */ + opregion->gpa = pci_get_cfgdata32(pi, PCIR_ASLS_CTL); + + /* copy opregion into guest mem */ + opregion->gva = vm_map_gpa(ctx, opregion->gpa, opregion->len); + if (opregion->gva == 0) { + warnx("%s: Unable to map opregion (0x%016lx)", __func__, + opregion->gpa); + /* return 0 to avoid emulation of ASLS register */ + return (0); + } + memcpy(opregion->gva, opregion->hva, opregion->len); + + return (0); +} + +static vm_paddr_t +gvt_d_alloc_mmio_memory(vm_paddr_t host_address, vm_paddr_t length, vm_paddr_t alignment, + enum e820_memory_type type) +{ + /* try to use host address */ + vm_paddr_t address = e820_alloc(host_address, length, E820_ALIGNMENT_NONE, + type, E820_ALLOCATE_SPECIFIC); + if (address != 0) { + return address; + } + + /* try to use highest address below 4 GB */ + return e820_alloc(4 * GB, length, alignment, type, + E820_ALLOCATE_HIGHEST); +} + +static int +gvt_d_setup_gsm(struct 
vmctx *ctx, struct pci_devinst *pi) +{ + struct passthru_softc *sc = pi->pi_arg; + + struct passthru_mmio_mapping *gsm = &sc->psc_mmio_map[GVT_D_MAP_GSM]; + + const int error = vm_get_memory_region_info(ctx, &gsm->hpa, &gsm->len, + MEMORY_REGION_INTEL_GSM); + if (error) { + warnx( + "%s: Unable to get Graphics Stolen Memory base and length", + __func__); + return (error); + } + gsm->hva = NULL; /* unused */ + gsm->gva = NULL; /* unused */ + gsm->gpa = gvt_d_alloc_mmio_memory(gsm->hpa, gsm->len, + PCIM_BDSM_GSM_ALIGNMENT, E820_TYPE_RESERVED); + if (gsm->gpa == 0) { + warnx( + "%s: Unable to add Graphics Stolen Memory to E820 table (hpa 0x%lx len 0x%lx)", + __func__, gsm->hpa, gsm->len); + e820_dump_table(); + return (-1); + } + if (gsm->gpa != gsm->hpa) { + /* + * ACRN source code implies that graphics driver for newer Intel + * platforms like Tiger Lake will read the Graphics Stolen + * Memory address from an MMIO register. We have three options + * to solve this issue: + * 1. Patch the value in the MMIO register + * This could have unintended side effects. Without + * any documentation how this register is used by + * the GPU, don't do it. + * 2. Trap the MMIO register + * It's not possible to trap a single MMIO + * register. We need to trap a whole page. Trapping + * a bunch of MMIO register could degrade the + * performance noticeably. + * 3. Use an 1:1 host to guest mapping + * Maybe not always possible. + * As far as we know, no supported platform requires a 1:1 + * mapping. For that reason, just log a warning. + */ + warnx( + "Warning: Unable to reuse host address of Graphics Stolen Memory. 
GPU passthrough might not work properly."); + } + + const uint64_t bdsm = read_config(&sc->psc_sel, PCIR_BDSM, 4); + pci_set_cfgdata32(pi, PCIR_BDSM, + gsm->gpa | (bdsm & (PCIM_BDSM_GSM_ALIGNMENT - 1))); + + return (0); +} + +static int +gvt_d_setup_opregion(struct vmctx *ctx, struct pci_devinst *pi, const int memfd) +{ + struct passthru_softc *sc = pi->pi_arg; + + struct passthru_mmio_mapping *opregion = + &sc->psc_mmio_map[GVT_D_MAP_OPREGION]; + + const int error = vm_get_memory_region_info(ctx, &opregion->hpa, + &opregion->len, MEMORY_REGION_INTEL_OPREGION); + if (error) { + warnx( + "%s: Unable to get OpRegion base and length", + __func__); + return (error); + } + opregion->hva = mmap(NULL, opregion->len, PROT_READ, MAP_SHARED, memfd, + opregion->hpa); + if (opregion->hva == MAP_FAILED) { + warnx("%s: Unable to map host OpRegion", __func__); + return (-1); + } + opregion->gpa = gvt_d_alloc_mmio_memory(opregion->hpa, opregion->len, + E820_ALIGNMENT_NONE, E820_TYPE_NVS); + if (opregion->gpa == 0) { + warnx( + "%s: Unable to add OpRegion to E820 table (hpa 0x%lx len 0x%lx)", + __func__, opregion->hpa, opregion->len); + e820_dump_table(); + return (-1); + } + opregion->gva = vm_map_gpa(ctx, opregion->gpa, opregion->len); + if (opregion->gva == NULL) { + warnx("%s: Unable to map guest OpRegion", __func__); + return (-1); + } + if (opregion->gpa != opregion->hpa) { + /* + * A 1:1 host to guest mapping is not required but this could + * change in the future. + */ + warnx( + "Warning: Unable to reuse host address of OpRegion. 
GPU passthrough might not work properly."); + } + + memcpy(opregion->gva, opregion->hva, opregion->len); + + pci_set_cfgdata32(pi, PCIR_ASLS_CTL, opregion->gpa); + + return (0); +} + +int +gvt_d_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error; + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* get memory descriptor */ + const int memfd = open(_PATH_MEM, O_RDWR, 0); + if (memfd < 0) { + warn("%s: Failed to open %s", __func__, _PATH_MEM); + return (-1); + } + + if ((error = gvt_d_setup_gsm(ctx, pi)) != 0) { + warnx("%s: Unable to setup Graphics Stolen Memory", __func__); + goto done; + } + + if ((error = gvt_d_setup_opregion(ctx, pi, memfd)) != 0) { + warnx("%s: Unable to setup OpRegion", __func__); + goto done; + } + + /* protect Graphics Stolen Memory register */ + if ((error = set_pcir_handler(sc, PCIR_BDSM, 4, + passthru_cfgread_emulate, passthru_cfgwrite_emulate)) != 0) { + warnx("%s: Unable to protect opregion", __func__); + goto done; + } + /* protect opregion register */ + if ((error = set_pcir_handler(sc, PCIR_ASLS_CTL, 4, + passthru_cfgread_emulate, gvt_d_aslswrite)) != 0) { + warnx("%s: Unable to protect opregion", __func__); + goto done; + } + +done: + return (error); +} + +void +gvt_d_deinit(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct passthru_softc *sc; + + sc = pi->pi_arg; + + struct passthru_mmio_mapping *opregion = + &sc->psc_mmio_map[GVT_D_MAP_OPREGION]; + + /* HVA is only set, if it's initialized */ + if (opregion->hva) + munmap((void *)opregion->hva, opregion->len); +} diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c --- a/usr.sbin/bhyve/pci_lpc.c +++ b/usr.sbin/bhyve/pci_lpc.c @@ -33,9 +33,13 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include +#include +#include +#include #include #include #include @@ -83,6 +87,29 @@ static bool pctestdev_present; +#ifndef _PATH_DEVPCI +#define _PATH_DEVPCI "/dev/pci" +#endif + +static int pcifd = -1; + +static uint32_t +read_config(struct 
pcisel *sel, long reg, int width) +{ + struct pci_io pi; + pi.pi_sel.pc_domain = sel->pc_domain; + pi.pi_sel.pc_bus = sel->pc_bus; + pi.pi_sel.pc_dev = sel->pc_dev; + pi.pi_sel.pc_func = sel->pc_func; + pi.pi_reg = reg; + pi.pi_width = width; + + if (ioctl(pcifd, PCIOCREAD, &pi) < 0) + return (0); + + return (pi.pi_data); +} + /* * LPC device configuration is in the following form: * [,] @@ -446,6 +473,40 @@ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); + /* open host device */ + if (pcifd < 0) { + pcifd = open(_PATH_DEVPCI, O_RDWR, 0); + if (pcifd < 0) { + warn("failed to open %s", _PATH_DEVPCI); + return (-1); + } + } + + /* on Intel systems lpc is always connected to 0:1f.0 */ + struct pcisel sel; + sel.pc_domain = 0; + sel.pc_bus = 0; + sel.pc_dev = 0x1f; + sel.pc_func = 0; + + if (read_config(&sel, PCIR_VENDOR, 2) == PCI_VENDOR_INTEL) { + /* + * The VID, DID, REVID, SUBVID and SUBDID of igd-lpc need to be + * aligned with the physical ones. Without these physical + * values, GVT-d GOP driver couldn't work. + */ + pci_set_cfgdata16( + pi, PCIR_DEVICE, read_config(&sel, PCIR_DEVICE, 2)); + pci_set_cfgdata16( + pi, PCIR_VENDOR, read_config(&sel, PCIR_VENDOR, 2)); + pci_set_cfgdata8( + pi, PCIR_REVID, read_config(&sel, PCIR_REVID, 1)); + pci_set_cfgdata16( + pi, PCIR_SUBVEND_0, read_config(&sel, PCIR_SUBVEND_0, 2)); + pci_set_cfgdata16( + pi, PCIR_SUBDEV_0, read_config(&sel, PCIR_SUBDEV_0, 2)); + } + lpc_bridge = pi; return (0); diff --git a/usr.sbin/bhyve/pci_passthru.h b/usr.sbin/bhyve/pci_passthru.h new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/pci_passthru.h @@ -0,0 +1,84 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Beckhoff Automation GmbH & Co. KG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR OR CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#pragma once + +#include + +#include + +#include "pci_emul.h" + +struct passthru_mmio_mapping { + vm_paddr_t gpa; /* guest physical address */ + void *gva; /* guest virtual address */ + vm_paddr_t hpa; /* host physical address */ + void *hva; /* host virtual address */ + vm_paddr_t len; /* length of the mapping in bytes */ +}; + +typedef int (*cfgread_handler)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int coff, int bytes, uint32_t *rv); +typedef int (*cfgwrite_handler)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int coff, int bytes, uint32_t val); + +struct passthru_softc { + struct pci_devinst *psc_pi; + /* ROM is handled like a BAR */ + struct pcibar psc_bar[PCI_BARMAX_WITH_ROM + 1]; + struct { + int capoff; + int msgctrl; + int emulated; + } psc_msi; + struct { + int capoff; + } psc_msix; + struct pcisel psc_sel; + + struct passthru_mmio_mapping psc_mmio_map[2]; + cfgread_handler psc_pcir_rhandler[PCI_REGMAX + 1]; + cfgwrite_handler psc_pcir_whandler[PCI_REGMAX + 1]; +}; + +uint32_t read_config(const struct pcisel *sel, long reg, int width); +void write_config(const struct pcisel *sel, long reg, int width, uint32_t data); +int passthru_cfgread_default(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int coff, int bytes, uint32_t *rv); +int passthru_cfgread_emulate(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int coff, int bytes, uint32_t *rv); +int passthru_cfgwrite_default(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int coff, int bytes, uint32_t val); +int passthru_cfgwrite_emulate(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int coff, int bytes, uint32_t val); +int set_pcir_handler(struct passthru_softc *sc, uint32_t reg, uint32_t len, + cfgread_handler rhandler, cfgwrite_handler whandler); +int gvt_d_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts); +void gvt_d_deinit(struct vmctx *ctx, struct pci_devinst *pi); diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c --- 
a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -48,19 +48,19 @@ #ifndef WITHOUT_CAPSICUM #include #endif -#include -#include -#include +#include + #include #include #include +#include +#include +#include #include #include -#include -#include -#include "pci_emul.h" #include "mem.h" +#include "pci_passthru.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" @@ -79,24 +79,12 @@ #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 +#define PCI_CAP_START_OFFSET 0x40 + static int pcifd = -1; static int iofd = -1; static int memfd = -1; -struct passthru_softc { - struct pci_devinst *psc_pi; - struct pcibar psc_bar[PCI_BARMAX + 1]; - struct { - int capoff; - int msgctrl; - int emulated; - } psc_msi; - struct { - int capoff; - } psc_msix; - struct pcisel psc_sel; -}; - static int msi_caplen(int msgctrl) { @@ -119,7 +107,7 @@ return (len); } -static uint32_t +uint32_t read_config(const struct pcisel *sel, long reg, int width) { struct pci_io pi; @@ -135,7 +123,7 @@ return (pi.pi_data); } -static void +void write_config(const struct pcisel *sel, long reg, int width, uint32_t data) { struct pci_io pi; @@ -556,12 +544,23 @@ sc->psc_bar[i].type = bartype; sc->psc_bar[i].size = size; sc->psc_bar[i].addr = base; + sc->psc_bar[i].lobits = 0; /* Allocate the BAR in the guest I/O or MMIO space */ error = pci_emul_alloc_bar(pi, i, bartype, size); if (error) return (-1); + /* Use same lobits as physical bar */ + uint8_t lobits = read_config(&sc->psc_sel, PCIR_BAR(i), 0x01); + if (bartype == PCIBAR_MEM32 || bartype == PCIBAR_MEM64) { + lobits &= ~PCIM_BAR_MEM_BASE; + } else { + lobits &= ~PCIM_BAR_IO_BASE; + } + sc->psc_bar[i].lobits = lobits; + pi->pi_bar[i].lobits = lobits; + /* The MSI-X table needs special handling */ if (i == pci_msix_table_bar(pi)) { error = init_msix_table(ctx, sc, base); @@ -595,6 +594,17 @@ sc->psc_sel.pc_dev = slot; sc->psc_sel.pc_func = func; + /* copy physical PCI header to virtual 
cfgspace */ + for (uint32_t i = 0; i < PCI_CAP_START_OFFSET; ++i) { + /* + * INTLINE and INTPIN must not be copied from the physical + * device; they are already set by pci_emul_init + */ + if (i == PCIR_INTLINE || i == PCIR_INTPIN) + continue; + pci_set_cfgdata8(pi, i, read_config(&sc->psc_sel, i, 1)); + } + if (cfginitmsi(sc) != 0) { warnx("failed to initialize MSI for PCI %d/%d/%d", bus, slot, func); @@ -607,14 +617,154 @@ goto done; } - pci_set_cfgdata16(pi, PCIR_COMMAND, read_config(&sc->psc_sel, - PCIR_COMMAND, 2)); + write_config( + &sc->psc_sel, PCIR_COMMAND, 2, pci_get_cfgdata16(pi, PCIR_COMMAND)); error = 0; /* success */ done: return (error); } +int +set_pcir_handler(struct passthru_softc *sc, uint32_t reg, uint32_t len, cfgread_handler rhandler, cfgwrite_handler whandler) +{ + if (reg > PCI_REGMAX || reg + len > PCI_REGMAX + 1) + return (-1); + + for (uint32_t i = reg; i < reg + len; ++i) { + sc->psc_pcir_rhandler[i] = rhandler; + sc->psc_pcir_whandler[i] = whandler; + } + + return 0; +} + +static int +passthru_init_quirks(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct passthru_softc *sc = pi->pi_arg; + + uint16_t vendor = read_config(&sc->psc_sel, PCIR_VENDOR, 0x02); + uint8_t class = read_config(&sc->psc_sel, PCIR_CLASS, 0x01); + + /* currently only display devices have quirks */ + if (class != PCIC_DISPLAY) + return (0); + + if (vendor == PCI_VENDOR_INTEL) + return gvt_d_init(ctx, pi, opts); + + return (0); +} + +static void +passthru_deinit_quirks(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct passthru_softc *sc = pi->pi_arg; + + if (sc == NULL) + return; + + uint16_t vendor = read_config(&sc->psc_sel, PCIR_VENDOR, 0x02); + uint8_t class = read_config(&sc->psc_sel, PCIR_CLASS, 0x01); + + /* currently only display devices have quirks */ + if (class != PCIC_DISPLAY) + return; + + if (vendor == PCI_VENDOR_INTEL) + return gvt_d_deinit(ctx, pi); + + return; +} + +static void +passthru_usage(char *opt) +{ + warnx("Invalid 
passthru option \"%s\"", opt); + warnx("passthru,//,{rom=rom_file}"); +} + +static int +passthru_parse_opts(struct passthru_softc *sc, char *opts) +{ + int error = 0; + char *uopts = strdup(opts); + char *xopt = strtok(uopts, ","); + for (xopt = strtok(NULL, ","); xopt != NULL; xopt = strtok(NULL, ",")) { + char *config = strchr(xopt, '='); + if (config == NULL) { + error = -1; + break; + } + *config = '\0'; + ++config; + if (strcmp(xopt, "rom") == 0) { + const int fd = open(config, O_RDONLY); + if (fd < 0) { + warnx("Can't open romfile \"%s\"", config); + error = -1; + break; + } + /* determine file size */ + uint64_t rom_size = lseek(fd, 0, SEEK_END); + lseek(fd, 0, SEEK_SET); + /* read bios */ + void *rom_addr = malloc(rom_size); + if (rom_addr == NULL) { + warnx("Can't malloc rom \"%s\" (size: 0x%8lx)", + config, rom_size); + error = -ENOMEM; + close(fd); + break; + } + rom_size = read(fd, rom_addr, rom_size); + close(fd); + + /* save physical values of ROM */ + sc->psc_bar[PCI_ROM_IDX].type = PCIBAR_ROM; + sc->psc_bar[PCI_ROM_IDX].addr = (uint64_t)rom_addr; + sc->psc_bar[PCI_ROM_IDX].size = rom_size; + + continue; + } + /* option wasn't processed */ + passthru_usage(xopt); + error = -1; + break; + } + + return (error); +} + +static int +passthru_init_rom(struct vmctx *ctx, struct passthru_softc *sc) +{ + /* check if this device has a rom */ + if (sc->psc_bar[PCI_ROM_IDX].size == 0) + return (0); + + /* allocate ROM */ + uint64_t rom_addr; + int error = pci_emul_alloc_rom(sc->psc_pi, + sc->psc_bar[PCI_ROM_IDX].size, &rom_addr); + if (error) { + warnx("Failed to alloc ROM"); + goto done; + } + + /* copy ROM to guest */ + memcpy((void *)rom_addr, (void *)sc->psc_bar[PCI_ROM_IDX].addr, + sc->psc_bar[PCI_ROM_IDX].size); + /* free ROM */ + free((void *)sc->psc_bar[PCI_ROM_IDX].addr); + /* save new address of ROM */ + sc->psc_bar[PCI_ROM_IDX].addr = rom_addr; + +done: + return error; +} + static int passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 
{ @@ -701,10 +851,47 @@ pi->pi_arg = sc; sc->psc_pi = pi; + /* parse opts */ + if ((error = passthru_parse_opts(sc, opts)) != 0) { + warnx("invalid passthru options"); + goto done; + } + /* initialize config space */ - error = cfginit(ctx, pi, bus, slot, func); + if ((error = cfginit(ctx, pi, bus, slot, func)) != 0) + goto done; + + /* set default handler for all PCI registers */ + if ((error = set_pcir_handler(sc, 0, PCI_REGMAX + 1, + passthru_cfgread_default, passthru_cfgwrite_default)) != 0) + goto done; + /* protect PCI header */ + if ((error = set_pcir_handler(sc, 0, PCI_CAP_START_OFFSET, + passthru_cfgread_emulate, passthru_cfgwrite_emulate)) != 0) + goto done; + /* allow access to command and status register */ + if ((error = set_pcir_handler(sc, PCIR_COMMAND, 0x04, + passthru_cfgread_default, passthru_cfgwrite_default)) != 0) + goto done; + + /* + * Keep following order!! + * Before init_quirks: + * set protection for PCI register + * After init_quirks: + * init ROM + */ + if ((error = passthru_init_quirks(ctx, pi, opts)) != 0) + goto done; + + /* initialize ROM */ + if ((error = passthru_init_rom(ctx, sc)) != 0) + goto done; + + error = 0; /* success */ done: if (error) { + passthru_deinit_quirks(ctx, pi); free(sc); vm_unassign_pptdev(ctx, bus, slot, func); } @@ -747,29 +934,29 @@ } static int -passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int coff, int bytes, uint32_t *rv) +passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, + int bytes, uint32_t *rv) { struct passthru_softc *sc; sc = pi->pi_arg; - /* - * PCI BARs and MSI capability is emulated. 
- */ - if (bar_access(coff) || msicap_access(sc, coff)) - return (-1); + return sc->psc_pcir_rhandler[coff](ctx, vcpu, pi, coff, bytes, rv); +} + +int +passthru_cfgread_default(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t *rv) +{ + struct passthru_softc *sc; + + sc = pi->pi_arg; -#ifdef LEGACY_SUPPORT /* - * Emulate PCIR_CAP_PTR if this device does not support MSI capability - * natively. + * MSI capability is emulated. */ - if (sc->psc_msi.emulated) { - if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) - return (-1); - } -#endif + if (msicap_access(sc, coff) || msixcap_access(sc, coff)) + return (-1); /* * Emulate the command register. If a single read reads both the @@ -790,9 +977,28 @@ return (0); } +int +passthru_cfgread_emulate(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t *rv) +{ + return (-1); +} + static int -passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int coff, int bytes, uint32_t val) +passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, + int bytes, uint32_t val) +{ + + struct passthru_softc *sc; + + sc = pi->pi_arg; + + return sc->psc_pcir_whandler[coff](ctx, vcpu, pi, coff, bytes, val); +} + +int +passthru_cfgwrite_default(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) { int error, msix_table_entries, i; struct passthru_softc *sc; @@ -800,12 +1006,6 @@ sc = pi->pi_arg; - /* - * PCI BARs are emulated - */ - if (bar_access(coff)) - return (-1); - /* * MSI capability is emulated */ @@ -871,6 +1071,13 @@ return (0); } +int +passthru_cfgwrite_emulate(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) +{ + return (-1); +} + static void passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) @@ -995,17 +1202,39 @@ } } +static void +passthru_addr_rom(struct pci_devinst *pi, int 
idx, int enabled) +{ + if (!enabled) + vm_munmap_memseg(pi->pi_vmctx, pi->pi_bar[idx].addr, + pi->pi_bar[idx].size); + else + vm_mmap_memseg(pi->pi_vmctx, pi->pi_bar[idx].addr, VM_PCIROM, + pi->pi_romoffset, pi->pi_bar[idx].size, + PROT_READ | PROT_EXEC); +} + static void passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx, int enabled, uint64_t address) { - - if (pi->pi_bar[baridx].type == PCIBAR_IO) - return; - if (baridx == pci_msix_table_bar(pi)) - passthru_msix_addr(ctx, pi, baridx, enabled, address); - else - passthru_mmio_addr(ctx, pi, baridx, enabled, address); + switch (pi->pi_bar[baridx].type) { + case PCIBAR_IO: + /* IO BARs are emulated */ + break; + case PCIBAR_ROM: + passthru_addr_rom(pi, baridx, enabled); + break; + case PCIBAR_MEM32: + case PCIBAR_MEM64: + if (baridx == pci_msix_table_bar(pi)) + passthru_msix_addr(ctx, pi, baridx, enabled, address); + else + passthru_mmio_addr(ctx, pi, baridx, enabled, address); + break; + default: + errx(4, "%s: invalid BAR type %d", __func__, pi->pi_bar[baridx].type); + } } struct pci_devemu passthru = { diff --git a/usr.sbin/bhyve/fwctl.h b/usr.sbin/bhyve/qemu_fwcfg.h rename from usr.sbin/bhyve/fwctl.h rename to usr.sbin/bhyve/qemu_fwcfg.h --- a/usr.sbin/bhyve/fwctl.h +++ b/usr.sbin/bhyve/qemu_fwcfg.h @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2015 Peter Grehan + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -13,10 +13,10 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR OR CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -28,29 +28,19 @@ * $FreeBSD$ */ -#ifndef _FWCTL_H_ -#define _FWCTL_H_ +#pragma once -#include +#include -/* - * Linker set api for export of information to guest firmware via - * a sysctl-like OID interface - */ -struct ctl { - const char *c_oid; - const void *c_data; - const int c_len; -}; +#define QEMU_FWCFG_MAX_ARCHS 0x2 +#define QEMU_FWCFG_MAX_ENTRIES 0x3FFF +#define QEMU_FWCFG_MAX_NAME 56 -#define CTL_NODE(oid, data, len) \ - static struct ctl __CONCAT(__ctl, __LINE__) = { \ - oid, \ - (data), \ - (len), \ - }; \ - DATA_SET(ctl_set, __CONCAT(__ctl, __LINE__)) - -void fwctl_init(void); +struct qemu_fwcfg_item { + uint32_t size; + uint8_t *data; +}; -#endif /* _FWCTL_H_ */ +int qemu_fwcfg_add_file(uint8_t name[QEMU_FWCFG_MAX_NAME], uint32_t size, + void *data); +int qemu_fwcfg_init(struct vmctx *ctx); diff --git a/usr.sbin/bhyve/qemu_fwcfg.c b/usr.sbin/bhyve/qemu_fwcfg.c new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/qemu_fwcfg.c @@ -0,0 +1,433 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR OR CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "acpi_device.h" +#include "inout.h" +#include "qemu_fwcfg.h" + +#define QEMU_FWCFG_ACPI_DEVICE_NAME "FWCF" +#define QEMU_FWCFG_ACPI_HARDWARE_ID "QEMU0002" + +#define QEMU_FWCFG_SELECTOR_PORT_NUMBER 0x510 +#define QEMU_FWCFG_SELECTOR_PORT_SIZE 1 +#define QEMU_FWCFG_SELECTOR_PORT_FLAGS IOPORT_F_INOUT +#define QEMU_FWCFG_DATA_PORT_NUMBER 0x511 +#define QEMU_FWCFG_DATA_PORT_SIZE 1 +#define QEMU_FWCFG_DATA_PORT_FLAGS \ + IOPORT_F_INOUT /* QEMU v2.4+ ignores writes */ + +#define QEMU_FWCFG_ARCHITECTURE_MASK 0x0001 +#define QEMU_FWCFG_INDEX_MASK 0x3FFF + +#define QEMU_FWCFG_SELECT_READ 0 +#define QEMU_FWCFG_SELECT_WRITE 1 + +#define QEMU_FWCFG_ARCHITECTURE_GENERIC 0 +#define QEMU_FWCFG_ARCHITECTURE_SPECIFIC 1 + +#define QEMU_FWCFG_INDEX_SIGNATURE 0x00 +#define QEMU_FWCFG_INDEX_ID 0x01 +#define QEMU_FWCFG_INDEX_FILE_DIR 0x19 + +#define QEMU_FWCFG_FIRST_FILE_INDEX 0x20 + +#define QEMU_FWCFG_MIN_FILES 10 + +#pragma pack(1) + +union qemu_fwcfg_selector { + struct { + uint16_t index : 14; + uint16_t writeable : 1; + /* + * 0 = generic | for all architectures + * 1 = specific | only for current architecture + */ + uint16_t architecture : 1; + }; + uint16_t bits; +}; + +struct qemu_fwcfg_signature { + uint8_t signature[4]; +}; + +struct qemu_fwcfg_id { + uint32_t interface : 1; /* always set */ + uint32_t DMA : 1; + uint32_t reserved : 30; +}; + +struct qemu_fwcfg_file { + uint32_t be_size; + uint16_t be_selector; + uint16_t reserved; + uint8_t name[QEMU_FWCFG_MAX_NAME]; +}; + +struct qemu_fwcfg_directory { + uint32_t be_count; + struct qemu_fwcfg_file files[0]; +}; + +struct qemu_fwcfg_softc { + struct acpi_device *acpi_dev; + + uint32_t data_offset; + union qemu_fwcfg_selector selector; + struct qemu_fwcfg_item items[QEMU_FWCFG_MAX_ARCHS] + [QEMU_FWCFG_MAX_ENTRIES]; + struct qemu_fwcfg_directory *directory; +}; + +#pragma 
pack() + +static struct qemu_fwcfg_softc sc; + +static int +qemu_fwcfg_selector_port_handler(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg) +{ + if (in) { + *eax = *(uint16_t *)&sc.selector; + return (0); + } + + sc.data_offset = 0; + sc.selector.bits = *eax; + + return (0); +} + +static int +qemu_fwcfg_data_port_handler(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg) +{ + if (!in) { + warnx("%s: Writes to qemu fwcfg data port aren't allowed", + __func__); + return (-1); + } + + /* get fwcfg item */ + struct qemu_fwcfg_item *item = + &sc.items[sc.selector.architecture][sc.selector.index]; + if (item->data == NULL) { + warnx( + "%s: qemu fwcfg item doesn't exist (architecture %s index 0x%x)", + __func__, sc.selector.architecture ? "specific" : "generic", + sc.selector.index); + *eax = 0x00; + return (0); + } else if (sc.data_offset >= item->size) { + warnx( + "%s: qemu fwcfg item read exceeds size (architecture %s index 0x%x size 0x%x offset 0x%x)", + __func__, sc.selector.architecture ? "specific" : "generic", + sc.selector.index, item->size, sc.data_offset); + *eax = 0x00; + return (0); + } + + /* return item data */ + *eax = item->data[sc.data_offset]; + sc.data_offset++; + + return (0); +} + +static int +qemu_fwcfg_add_item(uint16_t architecture, uint16_t index, uint32_t size, + void *data) +{ + /* truncate architecture and index to their desired size */ + architecture &= QEMU_FWCFG_ARCHITECTURE_MASK; + index &= QEMU_FWCFG_INDEX_MASK; + + /* get pointer to item specified by selector */ + struct qemu_fwcfg_item *fwcfg_item = &sc.items[architecture][index]; + + /* check if item is already used */ + if (fwcfg_item->data != NULL) { + warnx("%s: qemu fwcfg item exists (architecture %s index 0x%x)", + __func__, architecture ? 
"specific" : "generic", index); + return (-1); + } + + /* save data of the item */ + fwcfg_item->size = size; + fwcfg_item->data = data; + + return (0); +}
+
+/*
+ * Allocate the (initially empty) file directory and publish it as the
+ * generic QEMU_FWCFG_INDEX_FILE_DIR item.
+ *
+ * Storage for QEMU_FWCFG_MIN_FILES entries is reserved up front, but the
+ * item size only covers the directory header; qemu_fwcfg_add_file() raises
+ * the advertised size as entries are added.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, otherwise the error
+ * returned by qemu_fwcfg_add_item().
+ */
+static int
+qemu_fwcfg_add_item_file_dir(void)
+{
+	/* alloc directory */
+	uint64_t size = sizeof(struct qemu_fwcfg_directory) +
+	    QEMU_FWCFG_MIN_FILES * sizeof(struct qemu_fwcfg_file);
+	struct qemu_fwcfg_directory *fwcfg_directory = calloc(1, size);
+	if (fwcfg_directory == NULL) {
+		return (-ENOMEM);
+	}
+
+	/* init directory */
+	sc.directory = fwcfg_directory;
+
+	/* add directory */
+	const int error = qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC,
+	    QEMU_FWCFG_INDEX_FILE_DIR, sizeof(struct qemu_fwcfg_directory),
+	    (uint8_t *)sc.directory);
+	if (error != 0) {
+		/*
+		 * qemu_fwcfg_add_item() stores the data pointer only right
+		 * before it returns 0, so the directory is still owned here.
+		 */
+		free(fwcfg_directory);
+		sc.directory = NULL;
+	}
+
+	return (error);
+}
+
+/*
+ * Publish the generic QEMU_FWCFG_INDEX_ID item with interface = 1 and
+ * DMA = 0, stored in little endian byte order.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, otherwise the error
+ * returned by qemu_fwcfg_add_item().
+ */
+static int
+qemu_fwcfg_add_item_id(void)
+{
+	/* alloc id */
+	struct qemu_fwcfg_id *fwcfg_id = calloc(1,
+	    sizeof(struct qemu_fwcfg_id));
+	if (fwcfg_id == NULL) {
+		return (-ENOMEM);
+	}
+
+	/* init id */
+	fwcfg_id->interface = 1;
+	fwcfg_id->DMA = 0;
+
+	/*
+	 * QEMU specifies ID as little endian.
+	 * Convert fwcfg_id to little endian. The first 32 bits are moved
+	 * through memcpy() instead of a uint32_t pointer cast so the struct
+	 * is never accessed through an incompatible lvalue type.
+	 */
+	uint32_t le_fwcfg_id;
+	memcpy(&le_fwcfg_id, fwcfg_id, sizeof(le_fwcfg_id));
+	le_fwcfg_id = htole32(le_fwcfg_id);
+	memcpy(fwcfg_id, &le_fwcfg_id, sizeof(le_fwcfg_id));
+
+	/* add id */
+	const int error = qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC,
+	    QEMU_FWCFG_INDEX_ID, sizeof(struct qemu_fwcfg_id),
+	    (uint8_t *)fwcfg_id);
+	if (error != 0) {
+		/* the item list did not take the buffer, release it */
+		free(fwcfg_id);
+	}
+
+	return (error);
+}
+
+/*
+ * Publish the generic QEMU_FWCFG_INDEX_SIGNATURE item containing the four
+ * bytes 'Q', 'E', 'M', 'U'.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, otherwise the error
+ * returned by qemu_fwcfg_add_item().
+ */
+static int
+qemu_fwcfg_add_item_signature(void)
+{
+	/* alloc signature */
+	struct qemu_fwcfg_signature *fwcfg_signature = calloc(1,
+	    sizeof(struct qemu_fwcfg_signature));
+	if (fwcfg_signature == NULL) {
+		return (-ENOMEM);
+	}
+
+	/* init signature */
+	fwcfg_signature->signature[0] = 'Q';
+	fwcfg_signature->signature[1] = 'E';
+	fwcfg_signature->signature[2] = 'M';
+	fwcfg_signature->signature[3] = 'U';
+
+	/* add signature */
+	const int error = qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC,
+	    QEMU_FWCFG_INDEX_SIGNATURE, sizeof(struct qemu_fwcfg_signature),
+	    (uint8_t *)fwcfg_signature);
+	if (error != 0) {
+		/* the item list did not take the buffer, release it */
+		free(fwcfg_signature);
+	}
+
+	return (error);
+}
+
+/*
+ * Fill a zeroed struct inout_port from the arguments and hand it to
+ * register_inout().
+ *
+ * Returns whatever register_inout() returns.
+ */
+static int
+qemu_fwcfg_register_port(const char *name, int port, int size, int flags,
+    inout_func_t handler)
+{
+	struct inout_port iop;
+
+	bzero(&iop, sizeof(iop));
+	iop.name = name;
+	iop.port = port;
+	iop.size = size;
+	iop.flags = flags;
+	iop.handler = handler;
+
+	return (register_inout(&iop));
+}
+
+/*
+ * Add a named file to the fwcfg file directory.
+ *
+ * The data is registered as a generic item at selector
+ * QEMU_FWCFG_FIRST_FILE_INDEX + <current file count>, and a directory entry
+ * is inserted at the position that keeps the entries ordered by strcmp() on
+ * their names. The directory is reallocated once it holds more than
+ * QEMU_FWCFG_MIN_FILES entries.
+ *
+ * name: NUL-terminated file name; it is truncated if it does not fit into
+ *       the directory entry
+ * size: size of data in bytes
+ * data: file content; the pointer is stored by qemu_fwcfg_add_item(), the
+ *       content is not copied
+ *
+ * Returns 0 on success, -ENOMEM if the directory can not be grown, otherwise
+ * the error returned by qemu_fwcfg_add_item().
+ */
+int
+qemu_fwcfg_add_file(uint8_t name[QEMU_FWCFG_MAX_NAME], uint32_t size,
+    void *data)
+{
+	/*
+	 * QEMU specifies count as big endian.
+	 * Convert it to host endian to work with it.
+	 */
+	uint32_t count = be32toh(sc.directory->be_count);
+
+	/* add file to items list */
+	uint32_t index = QEMU_FWCFG_FIRST_FILE_INDEX + count;
+	const int error = qemu_fwcfg_add_item(QEMU_FWCFG_ARCHITECTURE_GENERIC,
+	    index, size, data);
+	if (error != 0) {
+		return (error);
+	}
+
+	/*
+	 * files should be sorted alphabetical, get index for new file
+	 */
+	uint32_t file_index;
+	for (file_index = 0; file_index < count; ++file_index) {
+		if (strcmp(name, sc.directory->files[file_index].name) < 0)
+			break;
+	}
+
+	/*
+	 * Number of existing entries that have to move up by one slot. This
+	 * has to be taken before count is incremented: using the incremented
+	 * count would move one entry too many and run past the end of both
+	 * the old and the new directory.
+	 */
+	const uint32_t files_behind = count - file_index;
+
+	++count;
+	if (count > QEMU_FWCFG_MIN_FILES) {
+		/* alloc new file directory */
+		uint64_t new_size = sizeof(struct qemu_fwcfg_directory) +
+		    count * sizeof(struct qemu_fwcfg_file);
+		struct qemu_fwcfg_directory *new_directory = calloc(1,
+		    new_size);
+		if (new_directory == NULL) {
+			/*
+			 * NOTE: the item registered above stays in the items
+			 * list without a directory entry.
+			 */
+			warnx(
+			    "%s: Unable to allocate a new qemu fwcfg files directory (count %u)",
+			    __func__, count);
+			return (-ENOMEM);
+		}
+
+		/* copy files below file_index to new directory */
+		memcpy(new_directory->files, sc.directory->files,
+		    file_index * sizeof(struct qemu_fwcfg_file));
+
+		/* copy files behind file_index to directory */
+		memcpy(&new_directory->files[file_index + 1],
+		    &sc.directory->files[file_index],
+		    files_behind * sizeof(struct qemu_fwcfg_file));
+
+		/* free old directory */
+		free(sc.directory);
+
+		/* set directory pointer to new directory */
+		sc.directory = new_directory;
+
+		/* adjust directory pointer */
+		sc.items[0][QEMU_FWCFG_INDEX_FILE_DIR].data = (uint8_t *)
+		    sc.directory;
+	} else {
+		/* shift files behind file_index (regions overlap) */
+		memmove(&sc.directory->files[file_index + 1],
+		    &sc.directory->files[file_index],
+		    files_behind * sizeof(struct qemu_fwcfg_file));
+	}
+
+	/*
+	 * QEMU specifies count, size and index as big endian.
+	 * Save these values in big endian to simplify guest reads of these
+	 * values.
+	 */
+	sc.directory->be_count = htobe32(count);
+	sc.directory->files[file_index].be_size = htobe32(size);
+	sc.directory->files[file_index].be_selector = htobe16(index);
+	/*
+	 * Bounded copy: the slot may still hold the entry that was shifted
+	 * away, and name is not guaranteed to fit.
+	 */
+	strlcpy((char *)sc.directory->files[file_index].name,
+	    (const char *)name, sizeof(sc.directory->files[file_index].name));
+
+	/* set new size for the fwcfg_file_directory */
+	sc.items[0][QEMU_FWCFG_INDEX_FILE_DIR].size =
+	    sizeof(struct qemu_fwcfg_directory) +
+	    count * sizeof(struct qemu_fwcfg_file);
+
+	return (0);
+}
+
+/*
+ * Set up the QEMU FwCfg device: create its ACPI device, add the fixed IO
+ * port resource, add the signature, id and file directory items and register
+ * the selector and data port handlers.
+ *
+ * Returns 0 on success or the error of the first step that failed. On
+ * failure the ACPI device is destroyed; items and port handlers that were
+ * already added are not removed.
+ */
+int
+qemu_fwcfg_init(struct vmctx *ctx)
+{
+	int error;
+
+	error = acpi_device_create(&sc.acpi_dev, ctx, QEMU_FWCFG_ACPI_DEVICE_NAME,
+	    QEMU_FWCFG_ACPI_HARDWARE_ID);
+	if (error) {
+		warnx("%s: failed to create ACPI device for QEMU FwCfg",
+		    __func__);
+		goto done;
+	}
+
+	/* presumably the length 2 covers selector and data port - TODO confirm */
+	error = acpi_device_add_res_fixed_ioport(sc.acpi_dev,
+	    QEMU_FWCFG_SELECTOR_PORT_NUMBER, 2);
+	if (error) {
+		warnx("%s: failed to add fixed IO port for QEMU FwCfg",
+		    __func__);
+		goto done;
+	}
+
+	/* add common fwcfg items */
+	if ((error = qemu_fwcfg_add_item_signature()) != 0) {
+		warnx("%s: Unable to add signature item", __func__);
+		goto done;
+	}
+	if ((error = qemu_fwcfg_add_item_id()) != 0) {
+		warnx("%s: Unable to add id item", __func__);
+		goto done;
+	}
+	if ((error = qemu_fwcfg_add_item_file_dir()) != 0) {
+		warnx("%s: Unable to add file_dir item", __func__);
+		goto done;
+	}
+
+	/* add handlers for fwcfg ports */
+	if ((error = qemu_fwcfg_register_port("qemu_fwcfg_selector",
+	    QEMU_FWCFG_SELECTOR_PORT_NUMBER, QEMU_FWCFG_SELECTOR_PORT_SIZE,
+	    QEMU_FWCFG_SELECTOR_PORT_FLAGS,
+	    qemu_fwcfg_selector_port_handler)) != 0) {
+		warnx("%s: Unable to register qemu fwcfg selector port 0x%x",
+		    __func__, QEMU_FWCFG_SELECTOR_PORT_NUMBER);
+		goto done;
+	}
+	if ((error = qemu_fwcfg_register_port("qemu_fwcfg_data",
+	    QEMU_FWCFG_DATA_PORT_NUMBER, QEMU_FWCFG_DATA_PORT_SIZE,
+	    QEMU_FWCFG_DATA_PORT_FLAGS, qemu_fwcfg_data_port_handler)) !=
+	    0) {
+		warnx("%s: Unable to register qemu fwcfg data port 0x%x",
+		    __func__, QEMU_FWCFG_DATA_PORT_NUMBER);
+		goto done;
+	}
+
+done:
+	/* single exit: tear down the ACPI device if any step above failed */
+	if (error) {
+		acpi_device_destroy(sc.acpi_dev);
+	}
+
+	return (error);
+}