Index: releng/12.1/sys/amd64/vmm/intel/vtd.c =================================================================== --- releng/12.1/sys/amd64/vmm/intel/vtd.c (revision 363021) +++ releng/12.1/sys/amd64/vmm/intel/vtd.c (revision 363022) @@ -1,690 +1,776 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "io/iommu.h" /* * Documented in the "Intel Virtualization Technology for Directed I/O", * Architecture Spec, September 2008. */ +#define VTD_DRHD_INCLUDE_PCI_ALL(Flags) (((Flags) >> 0) & 0x1) + /* Section 10.4 "Register Descriptions" */ struct vtdmap { volatile uint32_t version; volatile uint32_t res0; volatile uint64_t cap; volatile uint64_t ext_cap; volatile uint32_t gcr; volatile uint32_t gsr; volatile uint64_t rta; volatile uint64_t ccr; }; #define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) #define VTD_CAP_ND(cap) ((cap) & 0x7) #define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) #define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) #define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) #define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) #define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) #define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) #define VTD_GCR_WBF (1 << 27) #define VTD_GCR_SRTP (1 << 30) #define VTD_GCR_TE (1U << 31) #define VTD_GSR_WBFS (1 << 27) #define VTD_GSR_RTPS (1 << 30) #define VTD_GSR_TES (1U << 31) #define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ #define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ #define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ #define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ #define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ #define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ #define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ #define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ #define VTD_IIR_DOMAIN_P 32 #define VTD_ROOT_PRESENT 0x1 #define VTD_CTX_PRESENT 0x1 #define VTD_CTX_TT_ALL (1UL << 2) #define VTD_PTE_RD (1UL << 0) #define VTD_PTE_WR (1UL << 1) #define VTD_PTE_SUPERPAGE (1UL << 7) #define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) #define VTD_RID2IDX(rid) (((rid) & 0xff) * 2) struct domain { uint64_t *ptp; /* first level page table page */ int pt_levels; /* number of page table levels */ int addrwidth; /* 'AW' field in context entry */ int spsmask; /* supported super page sizes */ u_int id; /* domain id */ vm_paddr_t maxaddr; /* highest address to be mapped */ SLIST_ENTRY(domain) next; }; static SLIST_HEAD(, domain) domhead; #define DRHD_MAX_UNITS 8 -static int drhd_num; -static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; -static int max_domains; -typedef int (*drhd_ident_func_t)(void); +static ACPI_DMAR_HARDWARE_UNIT *drhds[DRHD_MAX_UNITS]; +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); static int vtd_max_domains(struct vtdmap *vtdmap) { int nd; nd = VTD_CAP_ND(vtdmap->cap); switch (nd) { case 0: return (16); case 1: return (64); case 2: return (256); case 3: return (1024); case 4: return (4 * 1024); case 5: return (16 * 1024); case 6: return (64 * 1024); default: panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); } } static u_int domain_id(void) { u_int id; struct domain *dom; /* Skip domain id 0 - it is reserved when Caching Mode field is set */ for (id = 1; id < max_domains; id++) { SLIST_FOREACH(dom, &domhead, next) { if (dom->id == id) break; } if (dom == NULL) break; /* found it */ } if (id >= max_domains) panic("domain ids exhausted"); return (id); } +static struct vtdmap * +vtd_device_scope(uint16_t rid) +{ + int i, remaining, pathremaining; + char *end, *pathend; + struct vtdmap *vtdmap; + ACPI_DMAR_HARDWARE_UNIT *drhd; + ACPI_DMAR_DEVICE_SCOPE *device_scope; + ACPI_DMAR_PCI_PATH *path; + + for (i = 0; i < drhd_num; i++) { + drhd = drhds[i]; + + if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) { + /* + * From Intel VT-d arch spec, version 3.0: + * If a DRHD structure with INCLUDE_PCI_ALL flag Set is reported + * for a Segment, it must be enumerated by BIOS after all other + * DRHD structures for the same Segment. + */ + vtdmap = vtdmaps[i]; + return(vtdmap); + } + + end = (char *)drhd + drhd->Header.Length; + remaining = drhd->Header.Length - sizeof(ACPI_DMAR_HARDWARE_UNIT); + while (remaining > sizeof(ACPI_DMAR_DEVICE_SCOPE)) { + device_scope = (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining); + remaining -= device_scope->Length; + + switch (device_scope->EntryType){ + /* 0x01 and 0x02 are PCI device entries */ + case 0x01: + case 0x02: + break; + default: + continue; + } + + if (PCI_RID2BUS(rid) != device_scope->Bus) + continue; + + pathend = (char *)device_scope + device_scope->Length; + pathremaining = device_scope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE); + while (pathremaining >= sizeof(ACPI_DMAR_PCI_PATH)) { + path = (ACPI_DMAR_PCI_PATH *)(pathend - pathremaining); + pathremaining -= sizeof(ACPI_DMAR_PCI_PATH); + + if (PCI_RID2SLOT(rid) != path->Device) + continue; + if (PCI_RID2FUNC(rid) != path->Function) + continue; + + vtdmap = vtdmaps[i]; + return (vtdmap); + } + } + } + + /* No matching scope */ + return (NULL); +} + static void vtd_wbflush(struct vtdmap *vtdmap) { if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) pmap_invalidate_cache(); if (VTD_CAP_RWBF(vtdmap->cap)) { vtdmap->gcr = VTD_GCR_WBF; while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) ; } } static void vtd_ctx_global_invalidate(struct vtdmap *vtdmap) { vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; while ((vtdmap->ccr & VTD_CCR_ICC) != 0) ; } static void vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) { int offset; volatile uint64_t *iotlb_reg, val; vtd_wbflush(vtdmap); offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; while (1) { val = *iotlb_reg; if ((val & VTD_IIR_IVT) == 0) break; } } static void vtd_translation_enable(struct vtdmap *vtdmap) { vtdmap->gcr = VTD_GCR_TE; while ((vtdmap->gsr & VTD_GSR_TES) == 0) ; } static void vtd_translation_disable(struct vtdmap *vtdmap) { vtdmap->gcr = 0; while ((vtdmap->gsr & VTD_GSR_TES) != 0) ; } static int vtd_init(void) { - int i, units, remaining; + int i, units, remaining, tmp; struct vtdmap *vtdmap; vm_paddr_t ctx_paddr; char *end, envname[32]; unsigned long mapaddr; ACPI_STATUS status; ACPI_TABLE_DMAR *dmar; ACPI_DMAR_HEADER *hdr; ACPI_DMAR_HARDWARE_UNIT *drhd; /* * Allow the user to override the ACPI DMAR table by specifying the * physical address of each remapping unit. * * The following example specifies two remapping units at * physical addresses 0xfed90000 and 0xfeda0000 respectively. * set vtd.regmap.0.addr=0xfed90000 * set vtd.regmap.1.addr=0xfeda0000 */ for (units = 0; units < DRHD_MAX_UNITS; units++) { snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units); if (getenv_ulong(envname, &mapaddr) == 0) break; vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr); } if (units > 0) goto skip_dmar; /* Search for DMAR table. */ status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar); if (ACPI_FAILURE(status)) return (ENXIO); end = (char *)dmar + dmar->Header.Length; remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR); while (remaining > sizeof(ACPI_DMAR_HEADER)) { hdr = (ACPI_DMAR_HEADER *)(end - remaining); if (hdr->Length > remaining) break; /* * From Intel VT-d arch spec, version 1.3: * BIOS implementations must report mapping structures * in numerical order, i.e. All remapping structures of * type 0 (DRHD) enumerated before remapping structures of * type 1 (RMRR) and so forth. */ if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT) break; drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr; - vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); - if (units >= DRHD_MAX_UNITS) + drhds[units] = drhd; + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); + if (++units >= DRHD_MAX_UNITS) break; remaining -= hdr->Length; } if (units <= 0) return (ENXIO); skip_dmar: drhd_num = units; - vtdmap = vtdmaps[0]; - if (VTD_CAP_CM(vtdmap->cap) != 0) - panic("vtd_init: invalid caching mode"); + max_domains = 64 * 1024; /* maximum valid value */ + for (i = 0; i < drhd_num; i++){ + vtdmap = vtdmaps[i]; - max_domains = vtd_max_domains(vtdmap); + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); + /* take most compatible (minimum) value */ + if ((tmp = vtd_max_domains(vtdmap)) < max_domains) + max_domains = tmp; + } + /* * Set up the root-table to point to the context-entry tables */ for (i = 0; i < 256; i++) { ctx_paddr = vtophys(ctx_tables[i]); if (ctx_paddr & PAGE_MASK) panic("ctx table (0x%0lx) not page aligned", ctx_paddr); root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; } return (0); } static void vtd_cleanup(void) { } static void vtd_enable(void) { int i; struct vtdmap *vtdmap; for (i = 0; i < drhd_num; i++) { vtdmap = vtdmaps[i]; vtd_wbflush(vtdmap); /* Update the root table address */ vtdmap->rta = vtophys(root_table); vtdmap->gcr = VTD_GCR_SRTP; while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) ; vtd_ctx_global_invalidate(vtdmap); vtd_iotlb_global_invalidate(vtdmap); vtd_translation_enable(vtdmap); } } static void vtd_disable(void) { int i; struct vtdmap *vtdmap; for (i = 0; i < drhd_num; i++) { vtdmap = vtdmaps[i]; vtd_translation_disable(vtdmap); } } static void vtd_add_device(void *arg, uint16_t rid) { int idx; uint64_t *ctxp; struct domain *dom = arg; vm_paddr_t pt_paddr; struct vtdmap *vtdmap; uint8_t bus; - vtdmap = vtdmaps[0]; bus = PCI_RID2BUS(rid); ctxp = ctx_tables[bus]; pt_paddr = vtophys(dom->ptp); idx = VTD_RID2IDX(rid); if (ctxp[idx] & VTD_CTX_PRESENT) { panic("vtd_add_device: device %x is already owned by " "domain %d", rid, (uint16_t)(ctxp[idx + 1] >> 8)); } + if ((vtdmap = vtd_device_scope(rid)) == NULL) + panic("vtd_add_device: device %x is not in scope for " + "any DMA remapping unit", rid); + /* * Order is important. The 'present' bit is set only after all fields * of the context pointer are initialized. */ ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); if (VTD_ECAP_DI(vtdmap->ext_cap)) ctxp[idx] = VTD_CTX_TT_ALL; else ctxp[idx] = 0; ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; /* * 'Not Present' entries are not cached in either the Context Cache * or in the IOTLB, so there is no need to invalidate either of them. */ } static void vtd_remove_device(void *arg, uint16_t rid) { int i, idx; uint64_t *ctxp; struct vtdmap *vtdmap; uint8_t bus; bus = PCI_RID2BUS(rid); ctxp = ctx_tables[bus]; idx = VTD_RID2IDX(rid); /* * Order is important. The 'present' bit is must be cleared first. */ ctxp[idx] = 0; ctxp[idx + 1] = 0; /* * Invalidate the Context Cache and the IOTLB. * * XXX use device-selective invalidation for Context Cache * XXX use domain-selective invalidation for IOTLB */ for (i = 0; i < drhd_num; i++) { vtdmap = vtdmaps[i]; vtd_ctx_global_invalidate(vtdmap); vtd_iotlb_global_invalidate(vtdmap); } } #define CREATE_MAPPING 0 #define REMOVE_MAPPING 1 static uint64_t vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, int remove) { struct domain *dom; int i, spshift, ptpshift, ptpindex, nlevels; uint64_t spsize, *ptp; dom = arg; ptpindex = 0; ptpshift = 0; KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, gpa, len)); KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); if (gpa & PAGE_MASK) panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); if (hpa & PAGE_MASK) panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); if (len & PAGE_MASK) panic("vtd_create_mapping: unaligned len 0x%0lx", len); /* * Compute the size of the mapping that we can accommodate. * * This is based on three factors: * - supported super page size * - alignment of the region starting at 'gpa' and 'hpa' * - length of the region 'len' */ spshift = 48; for (i = 3; i >= 0; i--) { spsize = 1UL << spshift; if ((dom->spsmask & (1 << i)) != 0 && (gpa & (spsize - 1)) == 0 && (hpa & (spsize - 1)) == 0 && (len >= spsize)) { break; } spshift -= 9; } ptp = dom->ptp; nlevels = dom->pt_levels; while (--nlevels >= 0) { ptpshift = 12 + nlevels * 9; ptpindex = (gpa >> ptpshift) & 0x1FF; /* We have reached the leaf mapping */ if (spshift >= ptpshift) { break; } /* * We are working on a non-leaf page table page. * * Create a downstream page table page if necessary and point * to it from the current page table. */ if (ptp[ptpindex] == 0) { void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; } ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); } if ((gpa & ((1UL << ptpshift) - 1)) != 0) panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); /* * Update the 'gpa' -> 'hpa' mapping */ if (remove) { ptp[ptpindex] = 0; } else { ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; if (nlevels > 0) ptp[ptpindex] |= VTD_PTE_SUPERPAGE; } return (1UL << ptpshift); } static uint64_t vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) { return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING)); } static uint64_t vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) { return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING)); } static void vtd_invalidate_tlb(void *dom) { int i; struct vtdmap *vtdmap; /* * Invalidate the IOTLB. * XXX use domain-selective invalidation for IOTLB */ for (i = 0; i < drhd_num; i++) { vtdmap = vtdmaps[i]; vtd_iotlb_global_invalidate(vtdmap); } } static void * vtd_create_domain(vm_paddr_t maxaddr) { struct domain *dom; vm_paddr_t addr; int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; struct vtdmap *vtdmap; if (drhd_num <= 0) panic("vtd_create_domain: no dma remapping hardware available"); - vtdmap = vtdmaps[0]; - /* * Calculate AGAW. * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. */ addr = 0; for (gaw = 0; addr < maxaddr; gaw++) addr = 1ULL << gaw; res = (gaw - 12) % 9; if (res == 0) agaw = gaw; else agaw = gaw + 9 - res; if (agaw > 64) agaw = 64; /* * Select the smallest Supported AGAW and the corresponding number * of page table levels. */ pt_levels = 2; sagaw = 30; addrwidth = 0; - tmp = VTD_CAP_SAGAW(vtdmap->cap); + + tmp = ~0; + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + /* take most compatible value */ + tmp &= VTD_CAP_SAGAW(vtdmap->cap); + } + for (i = 0; i < 5; i++) { if ((tmp & (1 << i)) != 0 && sagaw >= agaw) break; pt_levels++; addrwidth++; sagaw += 9; if (sagaw > 64) sagaw = 64; } if (i >= 5) { - panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", - VTD_CAP_SAGAW(vtdmap->cap), agaw); + panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d", + tmp, agaw); } dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); dom->pt_levels = pt_levels; dom->addrwidth = addrwidth; dom->id = domain_id(); dom->maxaddr = maxaddr; dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); if ((uintptr_t)dom->ptp & PAGE_MASK) panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); #ifdef notyet /* * XXX superpage mappings for the iommu do not work correctly. * * By default all physical memory is mapped into the host_domain. * When a VM is allocated wired memory the pages belonging to it * are removed from the host_domain and added to the vm's domain. * * If the page being removed was mapped using a superpage mapping * in the host_domain then we need to demote the mapping before * removing the page. * * There is not any code to deal with the demotion at the moment * so we disable superpage mappings altogether. */ - dom->spsmask = VTD_CAP_SPS(vtdmap->cap); + dom->spsmask = ~0; + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + /* take most compatible value */ + dom->spsmask &= VTD_CAP_SPS(vtdmap->cap); + } #endif SLIST_INSERT_HEAD(&domhead, dom, next); return (dom); } static void vtd_free_ptp(uint64_t *ptp, int level) { int i; uint64_t *nlp; if (level > 1) { for (i = 0; i < 512; i++) { if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) continue; if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) continue; nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); vtd_free_ptp(nlp, level - 1); } } bzero(ptp, PAGE_SIZE); free(ptp, M_VTD); } static void vtd_destroy_domain(void *arg) { struct domain *dom; dom = arg; SLIST_REMOVE(&domhead, dom, domain, next); vtd_free_ptp(dom->ptp, dom->pt_levels); free(dom, M_VTD); } struct iommu_ops iommu_ops_intel = { vtd_init, vtd_cleanup, vtd_enable, vtd_disable, vtd_create_domain, vtd_destroy_domain, vtd_create_mapping, vtd_remove_mapping, vtd_add_device, vtd_remove_device, vtd_invalidate_tlb, }; Index: releng/12.1/usr.sbin/bhyve/pci_emul.c =================================================================== --- releng/12.1/usr.sbin/bhyve/pci_emul.c (revision 363021) +++ releng/12.1/usr.sbin/bhyve/pci_emul.c (revision 363022) @@ -1,2124 +1,2127 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi.h" #include "bhyverun.h" #include "inout.h" #include "ioapic.h" #include "mem.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) struct funcinfo { char *fi_name; char *fi_param; struct pci_devinst *fi_devi; }; struct intxinfo { int ii_count; int ii_pirq_pin; int ii_ioapic_irq; }; struct slotinfo { struct intxinfo si_intpins[4]; struct funcinfo si_funcs[MAXFUNCS]; }; struct businfo { uint16_t iobase, iolimit; /* I/O window */ uint32_t membase32, memlimit32; /* mmio window below 4GB */ uint64_t membase64, memlimit64; /* mmio window above 4GB */ struct slotinfo slotinfo[MAXSLOTS]; }; static struct businfo *pci_businfo[MAXBUSES]; SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); #define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMBASE64 0xD000000000UL #define PCI_EMUL_MEMLIMIT64 0xFD00000000UL static struct pci_devemu *pci_emul_finddev(char *name); static void pci_lintr_route(struct pci_devinst *pi); static void pci_lintr_update(struct pci_devinst *pi); static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); static __inline void CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) { if (bytes == 1) pci_set_cfgdata8(pi, coff, val); else if (bytes == 2) pci_set_cfgdata16(pi, coff, val); else pci_set_cfgdata32(pi, coff, val); } static __inline uint32_t CFGREAD(struct pci_devinst *pi, int coff, int bytes) { if (bytes == 1) return (pci_get_cfgdata8(pi, coff)); else if (bytes == 2) return (pci_get_cfgdata16(pi, coff)); else return (pci_get_cfgdata32(pi, coff)); } /* * I/O access */ /* * Slot options are in the form: * * ::,[,] * [:],[,] * * slot is 0..31 * func is 0..7 * emul is a string describing the type of PCI device e.g. virtio-net * config is an optional string, depending on the device, that can be * used for configuration. * Examples are: * 1,virtio-net,tap0 * 3:0,dummy */ static void pci_parse_slot_usage(char *aopt) { fprintf(stderr, "Invalid PCI slot info field \"%s\"\n", aopt); } int pci_parse_slot(char *opt) { struct businfo *bi; struct slotinfo *si; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* :: */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* : */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } if (pci_businfo[bnum] == NULL) pci_businfo[bnum] = calloc(1, sizeof(struct businfo)); bi = pci_businfo[bnum]; si = &bi->slotinfo[snum]; if (si->si_funcs[fnum].fi_name != NULL) { fprintf(stderr, "pci slot %d:%d already occupied!\n", snum, fnum); goto done; } if (pci_emul_finddev(emul) == NULL) { fprintf(stderr, "pci slot %d:%d: unknown device \"%s\"\n", snum, fnum, emul); goto done; } error = 0; si->si_funcs[fnum].fi_name = emul; si->si_funcs[fnum].fi_param = config; done: if (error) free(str); return (error); } void pci_print_supported_devices() { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; printf("%s\n", pdp->pe_emu); } } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accommodate reads from * ddb. */ if (size != 1 && size != 4 && size != 8) return (retval); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned reads */ if ((msix_entry_offset % size) != 0) { return (retval); } tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index < pi->pi_msix.table_count) { /* valid MSI-X Table access */ dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 1) retval = *((uint8_t *)dest); else if (size == 4) retval = *((uint32_t *)dest); else retval = *((uint64_t *)dest); } else if (pci_valid_pba_offset(pi, offset)) { /* return 0 for PBA access */ retval = 0; } return (retval); } int pci_msix_table_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.table_bar); else return (-1); } int pci_msix_pba_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.pba_bar); else return (-1); } static int pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int i; for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && port >= pdi->pi_bar[i].addr && port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, offset, bytes); else (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, bytes, *eax); return (0); } } return (-1); } static int pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int) arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || pdi->pi_bar[bidx].type == PCIBAR_MEM64); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_WRITE) { if (size == 8) { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, 4, *val & 0xffffffff); (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4, 4, *val >> 32); } else { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val); } } else { if (size == 8) { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, 4); *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset + 4, 4) << 32; } else { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size); } } return (0); } static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { uint64_t base; assert((size & (size - 1)) == 0); /* must be a power of 2 */ base = roundup2(*baseptr, size); if (base + size <= limit) { *addr = base; *baseptr = base + size; return (0); } else return (-1); } int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); } /* * Register (or unregister) the MMIO or I/O region associated with the BAR * register 'idx' of an emulated pci device. */ static void modify_bar_registration(struct pci_devinst *pi, int idx, int registration) { int error; struct inout_port iop; struct mem_range mr; switch (pi->pi_bar[idx].type) { case PCIBAR_IO: bzero(&iop, sizeof(struct inout_port)); iop.name = pi->pi_name; iop.port = pi->pi_bar[idx].addr; iop.size = pi->pi_bar[idx].size; if (registration) { iop.flags = IOPORT_F_INOUT; iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); } else error = unregister_inout(&iop); break; case PCIBAR_MEM32: case PCIBAR_MEM64: bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_mem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); break; default: error = EINVAL; break; } assert(error == 0); } static void unregister_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 0); } static void register_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 1); } /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_PORTEN); } /* Are we decoding memory accesses for the emulated pci device? */ static int memen(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_MEMEN); } /* * Update the MMIO or I/O address that is decoded by the BAR register. * * If the pci device has enabled the address space decoding then intercept * the address range decoded by the BAR register. */ static void update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; if (pi->pi_bar[idx].type == PCIBAR_IO) decode = porten(pi); else decode = memen(pi); if (decode) unregister_bar(pi, idx); switch (type) { case PCIBAR_IO: case PCIBAR_MEM32: pi->pi_bar[idx].addr = addr; break; case PCIBAR_MEM64: pi->pi_bar[idx].addr &= ~0xffffffffUL; pi->pi_bar[idx].addr |= addr; break; case PCIBAR_MEMHI64: pi->pi_bar[idx].addr &= 0xffffffff; pi->pi_bar[idx].addr |= addr; break; default: assert(0); } if (decode) register_bar(pi, idx); } int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size) { int error; uint64_t *baseptr, limit, addr, mask, lobits, bar; assert(idx >= 0 && idx <= PCI_BARMAX); if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ /* Enforce minimum BAR sizes required by the PCI standard */ if (type == PCIBAR_IO) { if (size < 4) size = 4; } else { if (size < 16) size = 16; } switch (type) { case PCIBAR_NONE: baseptr = NULL; addr = mask = lobits = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; break; case PCIBAR_MEM64: /* * XXX * Some drivers do not work well if the 64-bit BAR is allocated * above 4GB. Allow for this by allocating small requests under * 4GB unless then allocation size is larger than some arbitrary * number (32MB currently). */ if (size > 32 * 1024 * 1024) { /* * XXX special case for device requiring peer-peer DMA */ if (size == 0x100000000UL) baseptr = &hostbase; else baseptr = &pci_emul_membase64; limit = PCI_EMUL_MEMLIMIT64; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; break; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); } if (baseptr != NULL) { error = pci_emul_alloc_resource(baseptr, limit, size, &addr); if (error != 0) return (error); } pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); if (type == PCIBAR_MEM64) { assert(idx + 1 <= PCI_BARMAX); pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } register_bar(pdi, idx); return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * pci_emul_finddev(char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pdi->pi_lintr.pirq_pin = 0; pdi->pi_lintr.ioapic_irq = 0; pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(ctx, pdi, fi->fi_param); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. * Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } -void +static void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off; off = offset - capoff; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } -void +static void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. */ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } void pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; bzero(&pciecap, sizeof(pciecap)); /* * Use the integrated endpoint type for endpoints on a root complex bus. * * NB: bhyve currently only supports a single PCI bus that is the root * complex bus, so all endpoints are integrated. */ if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0)) type = PCIEM_TYPE_ROOT_INT_EP; pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | type; if (type != PCIEM_TYPE_ROOT_INT_EP) { pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ } err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the - * config space. + * config space. A capoff parameter of zero will force a search for the + * offset and type. */ -static void -pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) +void +pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val, + uint8_t capoff, int capid) { - int capid; - uint8_t capoff, nextoff; + uint8_t nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; - /* Find the capability that we want to update */ - capoff = CAP_START_OFFSET; - while (1) { - nextoff = pci_get_cfgdata8(pi, capoff + 1); - if (nextoff == 0) - break; - if (offset >= capoff && offset < nextoff) - break; + if (capoff == 0) { + /* Find the capability that we want to update */ + capoff = CAP_START_OFFSET; + while (1) { + nextoff = pci_get_cfgdata8(pi, capoff + 1); + if (nextoff == 0) + break; + if (offset >= capoff && offset < nextoff) + break; - capoff = nextoff; + capoff = nextoff; + } + assert(offset >= capoff); + capid = pci_get_cfgdata8(pi, capoff); } - assert(offset >= capoff); /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. */ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } - capid = pci_get_cfgdata8(pi, capoff); switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. */ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } static int pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int bytes, uint64_t *val, void *arg1, long arg2) { int bus, slot, func, coff, in; coff = addr & 0xfff; func = (addr >> 12) & 0x7; slot = (addr >> 15) & 0x1f; bus = (addr >> 20) & 0xff; in = (dir == MEM_F_READ); if (in) *val = ~0UL; pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); return (0); } uint64_t pci_ecfg_base(void) { return (PCI_EMUL_ECFG_BASE); } #define BUSIO_ROUNDUP 32 #define BUSMEM_ROUNDUP (1024 * 1024) int init_pci(struct vmctx *ctx) { struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; size_t lowmem; int bus, slot, func; int error; pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = vm_get_lowmem_limit(ctx); pci_emul_membase64 = PCI_EMUL_MEMBASE64; for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; /* * Keep track of the i/o and memory resources allocated to * this bus. */ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_name == NULL) continue; pde = pci_emul_finddev(fi->fi_name); assert(pde != NULL); error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs. */ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. */ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } lpc_pirq_routed(); /* * The guest physical memory map looks like the following: * [0, lowmem) guest system memory * [lowmem, lowmem_limit) memory hole (may be absent) * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) * [0xE0000000, 0xF0000000) PCI extended config window * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) */ /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. */ lowmem = vm_get_lowmem_size(ctx); bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI hole"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = lowmem; mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; mr.handler = pci_emul_fallback_handler; error = register_mem_fallback(&mr); assert(error == 0); /* PCI extended config space */ bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI ECFG"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = PCI_EMUL_ECFG_BASE; mr.size = PCI_EMUL_ECFG_SIZE; mr.handler = pci_emul_ecfg_handler; error = register_mem(&mr); assert(error == 0); return (0); } static void pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { char *name; name = lpc_pirq_name(pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. */ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int count, func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. */ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Name (_ADR, Zero)"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit - bi->iobase); dsdt_line(" ,, , TypeStatic)"); /* mmio window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); count = pci_count_lintr(bus); if (count != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); done: dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. */ si = &bi->slotinfo[pi->pi_slot]; bestpin = 0; bestcount = si->si_intpins[0].ii_count; for (pin = 1; pin < 4; pin++) { if (si->si_intpins[pin].ii_count < bestcount) { bestpin = pin; bestcount = si->si_intpins[pin].ii_count; } } si->si_intpins[bestpin].ii_count++; pi->pi_lintr.pin = bestpin + 1; pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); } static void pci_lintr_route(struct pci_devinst *pi) { struct businfo *bi; struct intxinfo *ii; if (pi->pi_lintr.pin == 0) return; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; /* * Attempt to allocate an I/O APIC pin for this intpin if one * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi); assert(ii->ii_ioapic_irq > 0); /* * Attempt to allocate a PIRQ pin for this intpin if one is * not yet assigned. */ if (ii->ii_pirq_pin == 0) ii->ii_pirq_pin = pirq_alloc_pin(pi); assert(ii->ii_pirq_pin > 0); pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin)); } void pci_lintr_assert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == IDLE) { if (pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } else pi->pi_lintr.state = PENDING; } pthread_mutex_unlock(&pi->pi_lintr.lock); } void pci_lintr_deassert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED) { pi->pi_lintr.state = IDLE; pci_irq_deassert(pi); } else if (pi->pi_lintr.state == PENDING) pi->pi_lintr.state = IDLE; pthread_mutex_unlock(&pi->pi_lintr.lock); } static void pci_lintr_update(struct pci_devinst *pi) { pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { pci_irq_deassert(pi); pi->pi_lintr.state = PENDING; } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); } int pci_count_lintr(int bus) { int count, slot, pin; struct slotinfo *slotinfo; count = 0; if (pci_businfo[bus] != NULL) { for (slot = 0; slot < MAXSLOTS; slot++) { slotinfo = &pci_businfo[bus]->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { if (slotinfo->si_intpins[pin].ii_count != 0) count++; } } } return (count); } void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) { struct businfo *bi; struct slotinfo *si; struct intxinfo *ii; int slot, pin; if ((bi = pci_businfo[bus]) == NULL) return; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { ii = &si->si_intpins[pin]; if (ii->ii_count != 0) cb(bus, slot, pin + 1, ii->ii_pirq_pin, ii->ii_ioapic_irq, arg); } } } /* * Return 1 if the emulated device in 'slot' is a multi-function device. * Return 0 otherwise. */ static int pci_emul_is_mfdev(int bus, int slot) { struct businfo *bi; struct slotinfo *si; int f, numfuncs; numfuncs = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; for (f = 0; f < MAXFUNCS; f++) { if (si->si_funcs[f].fi_devi != NULL) { numfuncs++; } } } return (numfuncs > 1); } /* * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on * whether or not is a multi-function being emulated in the pci 'slot'. */ static void pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) { int mfdev; if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { mfdev = pci_emul_is_mfdev(bus, slot); switch (bytes) { case 1: case 2: *rv &= ~PCIM_MFDEV; if (mfdev) { *rv |= PCIM_MFDEV; } break; case 4: *rv &= ~(PCIM_MFDEV << 16); if (mfdev) { *rv |= (PCIM_MFDEV << 16); } break; } } } static void pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { int i, rshift; uint32_t cmd, cmd2, changed, old, readonly; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. * * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are * 'write 1 to clear'. However these bits are not set to '1' by * any device emulation so it is simpler to treat them as readonly. */ rshift = (coff & 0x3) * 8; readonly = 0xFFFFF880 >> rshift; old = CFGREAD(pi, coff, bytes); new &= ~readonly; new |= (old & readonly); CFGWRITE(pi, coff, new, bytes); /* update config */ cmd2 = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ changed = cmd ^ cmd2; /* * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. */ for (i = 0; i <= PCI_BARMAX; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: break; case PCIBAR_IO: /* I/O address space decoding changed? */ if (changed & PCIM_CMD_PORTEN) { if (porten(pi)) register_bar(pi, i); else unregister_bar(pi, i); } break; case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ if (changed & PCIM_CMD_MEMEN) { if (memen(pi)) register_bar(pi, i); else unregister_bar(pi, i); } break; default: assert(0); } } /* * If INTx has been unmasked and is pending, assert the * interrupt. */ pci_lintr_update(pi); } static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *eax) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; uint64_t addr, bar, mask; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; pi = si->si_funcs[func].fi_devi; } else pi = NULL; /* * Just return if there is no device at this slot:func or if the * the guest is doing an un-aligned access. */ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || (coff & (bytes - 1)) != 0) { if (in) *eax = 0xffffffff; return; } /* * Ignore all writes beyond the standard config space and return all * ones on reads. */ if (coff >= PCI_REGMAX + 1) { if (in) { *eax = 0xffffffff; /* * Extended capabilities begin at offset 256 in config * space. Absence of extended capabilities is signaled * with all 0s in the extended capability header at * offset 256. */ if (coff <= PCI_REGMAX + 4) *eax = 0x00000000; } return; } pe = pi->pi_d; /* * Config read */ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, eax); } else { needcfg = 1; } if (needcfg) *eax = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) return; /* * Special handling for write to BAR registers */ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) return; idx = (coff - PCIR_BAR(0)) / 4; mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *eax & mask; addr &= 0xffff; bar = addr | PCIM_BAR_IO_SPACE; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = ((uint64_t)*eax << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { - pci_emul_capwrite(pi, coff, bytes, *eax); + pci_emul_capwrite(pi, coff, bytes, *eax, 0, 0); } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { pci_emul_cmdsts_write(pi, coff, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } } } static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; static int pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { uint32_t x; if (bytes != 4) { if (in) *eax = (bytes == 2) ? 0xffff : 0xff; return (0); } if (in) { x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; if (cfgenable) x |= CONF1_ENABLE; *eax = x; } else { x = *eax; cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; cfgoff = x & PCI_REGMAX; cfgfunc = (x >> 8) & PCI_FUNCMAX; cfgslot = (x >> 11) & PCI_SLOTMAX; cfgbus = (x >> 16) & PCI_BUSMAX; } return (0); } INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); static int pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { int coff; assert(bytes == 1 || bytes == 2 || bytes == 4); coff = cfgoff + (port - CONF1_DATA_PORT); if (cfgenable) { pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax); } else { /* Ignore accesses to cfgdata if not enabled by cfgaddr */ if (in) *eax = 0xffffffff; } return (0); } INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { uint8_t ioregs[DIOSZ]; uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } i = baridx - 1; /* 'memregs' index */ if (size == 1) { sc->memregs[i][offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } /* * magic interrupt ?? */ } if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; int i; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } i = baridx - 1; /* 'memregs' index */ if (size == 1) { value = sc->memregs[i][offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, .pe_barread = pci_emul_dior }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ Index: releng/12.1/usr.sbin/bhyve/pci_emul.h =================================================================== --- releng/12.1/usr.sbin/bhyve/pci_emul.h (revision 363021) +++ releng/12.1/usr.sbin/bhyve/pci_emul.h (revision 363022) @@ -1,291 +1,289 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PCI_EMUL_H_ #define _PCI_EMUL_H_ #include #include #include #include #include #include #define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ struct vmctx; struct pci_devinst; struct memory_region; struct pci_devemu { char *pe_emu; /* Name of device emulation */ /* instance creation */ int (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts); /* ACPI DSDT enumeration */ void (*pe_write_dsdt)(struct pci_devinst *); /* config space read/write callbacks */ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t val); int (*pe_cfgread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t *retval); /* BAR read/write callbacks */ void (*pe_barwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); enum pcibar_type { PCIBAR_NONE, PCIBAR_IO, PCIBAR_MEM32, PCIBAR_MEM64, PCIBAR_MEMHI64 }; struct pcibar { enum pcibar_type type; /* io or memory */ uint64_t size; uint64_t addr; }; #define PI_NAMESZ 40 struct msix_table_entry { uint64_t addr; uint32_t msg_data; uint32_t vector_control; } __packed; /* * In case the structure is modified to hold extra information, use a define * for the size that should be emulated. */ #define MSIX_TABLE_ENTRY_SIZE 16 #define MAX_MSIX_TABLE_ENTRIES 2048 #define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) enum lintr_stat { IDLE, ASSERTED, PENDING }; struct pci_devinst { struct pci_devemu *pi_d; struct vmctx *pi_vmctx; uint8_t pi_bus, pi_slot, pi_func; char pi_name[PI_NAMESZ]; int pi_bar_getsize; int pi_prevcap; int pi_capend; struct { int8_t pin; enum lintr_stat state; int pirq_pin; int ioapic_irq; pthread_mutex_t lock; } pi_lintr; struct { int enabled; uint64_t addr; uint64_t msg_data; int maxmsgnum; } pi_msi; struct { int enabled; int table_bar; int pba_bar; uint32_t table_offset; int table_count; uint32_t pba_offset; int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ void *pba_page; int pba_page_offset; } pi_msix; void *pi_arg; /* devemu-private data */ u_char pi_cfgdata[PCI_REGMAX + 1]; struct pcibar pi_bar[PCI_BARMAX + 1]; }; struct msicap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t addrlo; uint32_t addrhi; uint16_t msgdata; } __packed; static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed"); struct msixcap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t table_info; /* bar index and offset within it */ uint32_t pba_info; /* bar index and offset within it */ } __packed; static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed"); struct pciecap { uint8_t capid; uint8_t nextptr; uint16_t pcie_capabilities; uint32_t dev_capabilities; /* all devices */ uint16_t dev_control; uint16_t dev_status; uint32_t link_capabilities; /* devices with links */ uint16_t link_control; uint16_t link_status; uint32_t slot_capabilities; /* ports with slots */ uint16_t slot_control; uint16_t slot_status; uint16_t root_control; /* root ports */ uint16_t root_capabilities; uint32_t root_status; uint32_t dev_capabilities2; /* all devices */ uint16_t dev_control2; uint16_t dev_status2; uint32_t link_capabilities2; /* devices with links */ uint16_t link_control2; uint16_t link_status2; uint32_t slot_capabilities2; /* ports with slots */ uint16_t slot_control2; uint16_t slot_status2; } __packed; static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed"); typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, int ioapic_irq, void *arg); int init_pci(struct vmctx *ctx); -void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, - int bytes, uint32_t val); -void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, - int bytes, uint32_t val); void pci_callback(void); int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size); int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); +void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, + uint32_t val, uint8_t capoff, int capid); void pci_generate_msi(struct pci_devinst *pi, int msgnum); void pci_generate_msix(struct pci_devinst *pi, int msgnum); void pci_lintr_assert(struct pci_devinst *pi); void pci_lintr_deassert(struct pci_devinst *pi); void pci_lintr_request(struct pci_devinst *pi); int pci_msi_enabled(struct pci_devinst *pi); int pci_msix_enabled(struct pci_devinst *pi); int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); int pci_msi_maxmsgnum(struct pci_devinst *pi); int pci_parse_slot(char *opt); void pci_print_supported_devices(); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value); uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); int pci_count_lintr(int bus); void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); void pci_write_dsdt(void); uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); static __inline void pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) { assert(offset <= PCI_REGMAX); *(uint8_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); *(uint16_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); *(uint32_t *)(pi->pi_cfgdata + offset) = val; } static __inline uint8_t pci_get_cfgdata8(struct pci_devinst *pi, int offset) { assert(offset <= PCI_REGMAX); return (*(uint8_t *)(pi->pi_cfgdata + offset)); } static __inline uint16_t pci_get_cfgdata16(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); return (*(uint16_t *)(pi->pi_cfgdata + offset)); } static __inline uint32_t pci_get_cfgdata32(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); return (*(uint32_t *)(pi->pi_cfgdata + offset)); } #endif /* _PCI_EMUL_H_ */ Index: releng/12.1/usr.sbin/bhyve/pci_passthru.c =================================================================== --- releng/12.1/usr.sbin/bhyve/pci_passthru.c (revision 363021) +++ releng/12.1/usr.sbin/bhyve/pci_passthru.c (revision 363022) @@ -1,937 +1,938 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include "pci_emul.h" #include "mem.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" #endif #ifndef _PATH_DEVIO #define _PATH_DEVIO "/dev/io" #endif #ifndef _PATH_MEM #define _PATH_MEM "/dev/mem" #endif #define LEGACY_SUPPORT 1 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 static int pcifd = -1; static int iofd = -1; static int memfd = -1; struct passthru_softc { struct pci_devinst *psc_pi; struct pcibar psc_bar[PCI_BARMAX + 1]; struct { int capoff; int msgctrl; int emulated; } psc_msi; struct { int capoff; } psc_msix; struct pcisel psc_sel; }; static int msi_caplen(int msgctrl) { int len; len = 10; /* minimum length of msi capability */ if (msgctrl & PCIM_MSICTRL_64BIT) len += 4; #if 0 /* * Ignore the 'mask' and 'pending' bits in the MSI capability. * We'll let the guest manipulate them directly. */ if (msgctrl & PCIM_MSICTRL_VECTOR) len += 10; #endif return (len); } static uint32_t read_config(const struct pcisel *sel, long reg, int width) { struct pci_io pi; bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; if (ioctl(pcifd, PCIOCREAD, &pi) < 0) return (0); /* XXX */ else return (pi.pi_data); } static void write_config(const struct pcisel *sel, long reg, int width, uint32_t data) { struct pci_io pi; bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; pi.pi_data = data; (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ } #ifdef LEGACY_SUPPORT static int passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) { int capoff, i; struct msicap msicap; u_char *capdata; pci_populate_msicap(&msicap, msgnum, nextptr); /* * XXX * Copy the msi capability structure in the last 16 bytes of the * config space. This is wrong because it could shadow something * useful to the device. */ capoff = 256 - roundup(sizeof(msicap), 4); capdata = (u_char *)&msicap; for (i = 0; i < sizeof(msicap); i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); return (capoff); } #endif /* LEGACY_SUPPORT */ static int cfginitmsi(struct passthru_softc *sc) { int i, ptr, capptr, cap, sts, caplen, table_size; uint32_t u32; struct pcisel sel; struct pci_devinst *pi; struct msixcap msixcap; uint32_t *msixcap_ptr; pi = sc->psc_pi; sel = sc->psc_sel; /* * Parse the capabilities and cache the location of the MSI * and MSI-X capabilities. */ sts = read_config(&sel, PCIR_STATUS, 2); if (sts & PCIM_STATUS_CAPPRESENT) { ptr = read_config(&sel, PCIR_CAP_PTR, 1); while (ptr != 0 && ptr != 0xff) { cap = read_config(&sel, ptr + PCICAP_ID, 1); if (cap == PCIY_MSI) { /* * Copy the MSI capability into the config * space of the emulated pci device */ sc->psc_msi.capoff = ptr; sc->psc_msi.msgctrl = read_config(&sel, ptr + 2, 2); sc->psc_msi.emulated = 0; caplen = msi_caplen(sc->psc_msi.msgctrl); capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; } } else if (cap == PCIY_MSIX) { /* * Copy the MSI-X capability */ sc->psc_msix.capoff = ptr; caplen = 12; msixcap_ptr = (uint32_t*) &msixcap; capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); *msixcap_ptr = u32; pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; msixcap_ptr++; } } ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); } } if (sc->psc_msix.capoff != 0) { pi->pi_msix.pba_bar = msixcap.pba_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.pba_offset = msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_bar = msixcap.table_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.table_offset = msixcap.table_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); /* Allocate the emulated MSI-X table array */ table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* Mask all table entries */ for (i = 0; i < pi->pi_msix.table_count; i++) { pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } } #ifdef LEGACY_SUPPORT /* * If the passthrough device does not support MSI then craft a * MSI capability for it. We link the new MSI capability at the * head of the list of capabilities. */ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { int origptr, msiptr; origptr = read_config(&sel, PCIR_CAP_PTR, 1); msiptr = passthru_add_msicap(pi, 1, origptr); sc->psc_msi.capoff = msiptr; sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); sc->psc_msi.emulated = 1; pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); } #endif /* Make sure one of the capabilities is present */ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) return (-1); else return (0); } static uint64_t msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *src8; uint16_t *src16; uint32_t *src32; uint64_t *src64; uint64_t data; size_t entry_offset; int index; pi = sc->psc_pi; if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset && offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { switch(size) { case 1: src8 = (uint8_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src8; break; case 2: src16 = (uint16_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src16; break; case 4: src32 = (uint32_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src32; break; case 8: src64 = (uint64_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src64; break; default: return (-1); } return (data); } if (offset < pi->pi_msix.table_offset) return (-1); offset -= pi->pi_msix.table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; if (index >= pi->pi_msix.table_count) return (-1); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; switch(size) { case 1: src8 = (uint8_t *)((void *)entry + entry_offset); data = *src8; break; case 2: src16 = (uint16_t *)((void *)entry + entry_offset); data = *src16; break; case 4: src32 = (uint32_t *)((void *)entry + entry_offset); data = *src32; break; case 8: src64 = (uint64_t *)((void *)entry + entry_offset); data = *src64; break; default: return (-1); } return (data); } static void msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, uint64_t offset, int size, uint64_t data) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *dest8; uint16_t *dest16; uint32_t *dest32; uint64_t *dest64; size_t entry_offset; uint32_t vector_control; int index; pi = sc->psc_pi; if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset && offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { switch(size) { case 1: dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest8 = data; break; case 2: dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest16 = data; break; case 4: dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest32 = data; break; case 8: dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest64 = data; break; default: break; } return; } if (offset < pi->pi_msix.table_offset) return; offset -= pi->pi_msix.table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; if (index >= pi->pi_msix.table_count) return; entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* Only 4 byte naturally-aligned writes are supported */ assert(size == 4); assert(entry_offset % 4 == 0); vector_control = entry->vector_control; dest32 = (uint32_t *)((void *)entry + entry_offset); *dest32 = data; /* If MSI-X hasn't been enabled, do nothing */ if (pi->pi_msix.enabled) { /* If the entry is masked, don't set it up */ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { (void)vm_setup_pptdev_msix(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, index, entry->addr, entry->msg_data, entry->vector_control); } } } static int init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) { int b, s, f; int error, idx; size_t len, remaining; uint32_t table_size, table_offset; uint32_t pba_size, pba_offset; vm_paddr_t start; struct pci_devinst *pi = sc->psc_pi; assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); b = sc->psc_sel.pc_bus; s = sc->psc_sel.pc_dev; f = sc->psc_sel.pc_func; /* * If the MSI-X table BAR maps memory intended for * other uses, it is at least assured that the table * either resides in its own page within the region, * or it resides in a page shared with only the PBA. */ table_offset = rounddown2(pi->pi_msix.table_offset, 4096); table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); idx = pi->pi_msix.table_bar; start = pi->pi_bar[idx].addr; remaining = pi->pi_bar[idx].size; if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { pba_offset = pi->pi_msix.pba_offset; pba_size = pi->pi_msix.pba_size; if (pba_offset >= table_offset + table_size || table_offset >= pba_offset + pba_size) { /* * If the PBA does not share a page with the MSI-x * tables, no PBA emulation is required. */ pi->pi_msix.pba_page = NULL; pi->pi_msix.pba_page_offset = 0; } else { /* * The PBA overlaps with either the first or last * page of the MSI-X table region. Map the * appropriate page. */ if (pba_offset <= table_offset) pi->pi_msix.pba_page_offset = table_offset; else pi->pi_msix.pba_page_offset = table_offset + table_size - 4096; pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, start + pi->pi_msix.pba_page_offset); if (pi->pi_msix.pba_page == MAP_FAILED) { warn( "Failed to map PBA page for MSI-X on %d/%d/%d", b, s, f); return (-1); } } } /* Map everything before the MSI-X table */ if (table_offset > 0) { len = table_offset; error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); if (error) return (error); base += len; start += len; remaining -= len; } /* Skip the MSI-X table */ base += table_size; start += table_size; remaining -= table_size; /* Map everything beyond the end of the MSI-X table */ if (remaining > 0) { len = remaining; error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); if (error) return (error); } return (0); } static int cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) { int i, error; struct pci_devinst *pi; struct pci_bar_io bar; enum pcibar_type bartype; uint64_t base, size; pi = sc->psc_pi; /* * Initialize BAR registers */ for (i = 0; i <= PCI_BARMAX; i++) { bzero(&bar, sizeof(bar)); bar.pbi_sel = sc->psc_sel; bar.pbi_reg = PCIR_BAR(i); if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) continue; if (PCI_BAR_IO(bar.pbi_base)) { bartype = PCIBAR_IO; base = bar.pbi_base & PCIM_BAR_IO_BASE; } else { switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { case PCIM_BAR_MEM_64: bartype = PCIBAR_MEM64; break; default: bartype = PCIBAR_MEM32; break; } base = bar.pbi_base & PCIM_BAR_MEM_BASE; } size = bar.pbi_length; if (bartype != PCIBAR_IO) { if (((base | size) & PAGE_MASK) != 0) { warnx("passthru device %d/%d/%d BAR %d: " "base %#lx or size %#lx not page aligned\n", sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, base, size); return (-1); } } /* Cache information about the "real" BAR */ sc->psc_bar[i].type = bartype; sc->psc_bar[i].size = size; sc->psc_bar[i].addr = base; /* Allocate the BAR in the guest I/O or MMIO space */ error = pci_emul_alloc_pbar(pi, i, base, bartype, size); if (error) return (-1); /* The MSI-X table needs special handling */ if (i == pci_msix_table_bar(pi)) { error = init_msix_table(ctx, sc, base); if (error) return (-1); } else if (bartype != PCIBAR_IO) { /* Map the physical BAR in the guest MMIO space */ error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_bar[i].addr, pi->pi_bar[i].size, base); if (error) return (-1); } /* * 64-bit BAR takes up two slots so skip the next one. */ if (bartype == PCIBAR_MEM64) { i++; assert(i <= PCI_BARMAX); sc->psc_bar[i].type = PCIBAR_MEMHI64; } } return (0); } static int cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) { int error; struct passthru_softc *sc; error = 1; sc = pi->pi_arg; bzero(&sc->psc_sel, sizeof(struct pcisel)); sc->psc_sel.pc_bus = bus; sc->psc_sel.pc_dev = slot; sc->psc_sel.pc_func = func; if (cfginitmsi(sc) != 0) { warnx("failed to initialize MSI for PCI %d/%d/%d", bus, slot, func); goto done; } if (cfginitbar(ctx, sc) != 0) { warnx("failed to initialize BARs for PCI %d/%d/%d", bus, slot, func); goto done; } error = 0; /* success */ done: return (error); } static int passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { int bus, slot, func, error, memflags; struct passthru_softc *sc; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR }; cap_ioctl_t io_ioctls[] = { IODEV_PIO }; #endif sc = NULL; error = 1; #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE); #endif memflags = vm_get_memflags(ctx); if (!(memflags & VM_MEM_F_WIRED)) { warnx("passthru requires guest memory to be wired"); goto done; } if (pcifd < 0) { pcifd = open(_PATH_DEVPCI, O_RDWR, 0); if (pcifd < 0) { warn("failed to open %s", _PATH_DEVPCI); goto done; } } #ifndef WITHOUT_CAPSICUM if (caph_rights_limit(pcifd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (iofd < 0) { iofd = open(_PATH_DEVIO, O_RDWR, 0); if (iofd < 0) { warn("failed to open %s", _PATH_DEVIO); goto done; } } #ifndef WITHOUT_CAPSICUM if (caph_rights_limit(iofd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (memfd < 0) { memfd = open(_PATH_MEM, O_RDWR, 0); if (memfd < 0) { warn("failed to open %s", _PATH_MEM); goto done; } } #ifndef WITHOUT_CAPSICUM cap_rights_clear(&rights, CAP_IOCTL); cap_rights_set(&rights, CAP_MMAP_RW); if (caph_rights_limit(memfd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (opts == NULL || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) { warnx("invalid passthru options"); goto done; } if (vm_assign_pptdev(ctx, bus, slot, func) != 0) { warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", bus, slot, func); goto done; } sc = calloc(1, sizeof(struct passthru_softc)); pi->pi_arg = sc; sc->psc_pi = pi; /* initialize config space */ if ((error = cfginit(ctx, pi, bus, slot, func)) != 0) goto done; error = 0; /* success */ done: if (error) { free(sc); vm_unassign_pptdev(ctx, bus, slot, func); } return (error); } static int bar_access(int coff) { if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) return (1); else return (0); } static int msicap_access(struct passthru_softc *sc, int coff) { int caplen; if (sc->psc_msi.capoff == 0) return (0); caplen = msi_caplen(sc->psc_msi.msgctrl); if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) return (1); else return (0); } static int msixcap_access(struct passthru_softc *sc, int coff) { if (sc->psc_msix.capoff == 0) return (0); return (coff >= sc->psc_msix.capoff && coff < sc->psc_msix.capoff + MSIX_CAPLEN); } static int passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t *rv) { struct passthru_softc *sc; sc = pi->pi_arg; /* * PCI BARs and MSI capability is emulated. */ if (bar_access(coff) || msicap_access(sc, coff)) return (-1); #ifdef LEGACY_SUPPORT /* * Emulate PCIR_CAP_PTR if this device does not support MSI capability * natively. */ if (sc->psc_msi.emulated) { if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) return (-1); } #endif /* Everything else just read from the device's config space */ *rv = read_config(&sc->psc_sel, coff, bytes); return (0); } static int passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int error, msix_table_entries, i; struct passthru_softc *sc; sc = pi->pi_arg; /* * PCI BARs are emulated */ if (bar_access(coff)) return (-1); /* * MSI capability is emulated */ if (msicap_access(sc, coff)) { - msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val); - + pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff, + PCIY_MSI); error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum); if (error != 0) err(1, "vm_setup_pptdev_msi"); return (0); } if (msixcap_access(sc, coff)) { - msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val); + pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff, + PCIY_MSIX); if (pi->pi_msix.enabled) { msix_table_entries = pi->pi_msix.table_count; for (i = 0; i < msix_table_entries; i++) { error = vm_setup_pptdev_msix(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, pi->pi_msix.table[i].addr, pi->pi_msix.table[i].msg_data, pi->pi_msix.table[i].vector_control); if (error) err(1, "vm_setup_pptdev_msix"); } } return (0); } #ifdef LEGACY_SUPPORT /* * If this device does not support MSI natively then we cannot let * the guest disable legacy interrupts from the device. It is the * legacy interrupt that is triggering the virtual MSI to the guest. */ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { if (coff == PCIR_COMMAND && bytes == 2) val &= ~PCIM_CMD_INTxDIS; } #endif write_config(&sc->psc_sel, coff, bytes, val); return (0); } static void passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct passthru_softc *sc; struct iodev_pio_req pio; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { msix_table_write(ctx, vcpu, sc, offset, size, value); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); bzero(&pio, sizeof(struct iodev_pio_req)); pio.access = IODEV_PIO_WRITE; pio.port = sc->psc_bar[baridx].addr + offset; pio.width = size; pio.val = value; (void)ioctl(iofd, IODEV_PIO, &pio); } } static uint64_t passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct passthru_softc *sc; struct iodev_pio_req pio; uint64_t val; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { val = msix_table_read(sc, offset, size); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); bzero(&pio, sizeof(struct iodev_pio_req)); pio.access = IODEV_PIO_READ; pio.port = sc->psc_bar[baridx].addr + offset; pio.width = size; pio.val = 0; (void)ioctl(iofd, IODEV_PIO, &pio); val = pio.val; } return (val); } struct pci_devemu passthru = { .pe_emu = "passthru", .pe_init = passthru_init, .pe_cfgwrite = passthru_cfgwrite, .pe_cfgread = passthru_cfgread, .pe_barwrite = passthru_write, .pe_barread = passthru_read, }; PCI_EMUL_SET(passthru);