diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index 031a6113fac4..5b6a17119960 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -1,307 +1,307 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PCI_EMUL_H_ #define _PCI_EMUL_H_ #include #include #include #include #include #include #include #define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ struct vmctx; struct pci_devinst; struct memory_region; struct vm_snapshot_meta; struct pci_devemu { char *pe_emu; /* Name of device emulation */ /* instance creation */ int (*pe_init)(struct vmctx *, struct pci_devinst *, nvlist_t *); int (*pe_legacy_config)(nvlist_t *, const char *); const char *pe_alias; /* ACPI DSDT enumeration */ void (*pe_write_dsdt)(struct pci_devinst *); /* config space read/write callbacks */ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t val); int (*pe_cfgread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t *retval); /* BAR read/write callbacks */ void (*pe_barwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); void (*pe_baraddr)(struct vmctx *ctx, struct pci_devinst *pi, int baridx, int enabled, uint64_t address); /* Save/restore device state */ int (*pe_snapshot)(struct vm_snapshot_meta *meta); int (*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi); int (*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi); }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); enum pcibar_type { PCIBAR_NONE, PCIBAR_IO, PCIBAR_MEM32, PCIBAR_MEM64, PCIBAR_MEMHI64 }; struct pcibar { enum pcibar_type type; /* io or memory */ uint64_t size; uint64_t addr; }; #define PI_NAMESZ 40 struct msix_table_entry { uint64_t addr; uint32_t msg_data; uint32_t vector_control; } __packed; /* * In case the structure is modified to hold extra information, use a define * for the size that should be emulated. */ #define MSIX_TABLE_ENTRY_SIZE 16 #define MAX_MSIX_TABLE_ENTRIES 2048 #define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) enum lintr_stat { IDLE, ASSERTED, PENDING }; struct pci_devinst { struct pci_devemu *pi_d; struct vmctx *pi_vmctx; uint8_t pi_bus, pi_slot, pi_func; char pi_name[PI_NAMESZ]; int pi_bar_getsize; int pi_prevcap; int pi_capend; struct { int8_t pin; enum lintr_stat state; int pirq_pin; int ioapic_irq; pthread_mutex_t lock; } pi_lintr; struct { int enabled; uint64_t addr; uint64_t msg_data; int maxmsgnum; } pi_msi; struct { int enabled; int table_bar; int pba_bar; uint32_t table_offset; int table_count; uint32_t pba_offset; int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ - void *pba_page; - int pba_page_offset; + uint8_t *mapped_addr; + size_t mapped_size; } pi_msix; void *pi_arg; /* devemu-private data */ u_char pi_cfgdata[PCI_REGMAX + 1]; struct pcibar pi_bar[PCI_BARMAX + 1]; }; struct msicap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t addrlo; uint32_t addrhi; uint16_t msgdata; } __packed; static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed"); struct msixcap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t table_info; /* bar index and offset within it */ uint32_t pba_info; /* bar index and offset within it */ } __packed; static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed"); struct pciecap { uint8_t capid; uint8_t nextptr; uint16_t pcie_capabilities; uint32_t dev_capabilities; /* all devices */ uint16_t dev_control; uint16_t dev_status; uint32_t link_capabilities; /* devices with links */ uint16_t link_control; uint16_t link_status; uint32_t slot_capabilities; /* ports with slots */ uint16_t slot_control; uint16_t slot_status; uint16_t root_control; /* root ports */ uint16_t root_capabilities; uint32_t root_status; uint32_t dev_capabilities2; /* all devices */ uint16_t dev_control2; uint16_t dev_status2; uint32_t link_capabilities2; /* devices with links */ uint16_t link_control2; uint16_t link_status2; uint32_t slot_capabilities2; /* ports with slots */ uint16_t slot_control2; uint16_t slot_status2; } __packed; static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed"); typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, int ioapic_irq, void *arg); int init_pci(struct vmctx *ctx); void pci_callback(void); int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val, uint8_t capoff, int capid); void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old); void pci_generate_msi(struct pci_devinst *pi, int msgnum); void pci_generate_msix(struct pci_devinst *pi, int msgnum); void pci_lintr_assert(struct pci_devinst *pi); void pci_lintr_deassert(struct pci_devinst *pi); void pci_lintr_request(struct pci_devinst *pi); int pci_msi_enabled(struct pci_devinst *pi); int pci_msix_enabled(struct pci_devinst *pi); int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); int pci_msi_maxmsgnum(struct pci_devinst *pi); int pci_parse_legacy_config(nvlist_t *nvl, const char *opt); int pci_parse_slot(char *opt); void pci_print_supported_devices(); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value); uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); int pci_count_lintr(int bus); void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); void pci_write_dsdt(void); uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); #ifdef BHYVE_SNAPSHOT int pci_snapshot(struct vm_snapshot_meta *meta); int pci_pause(struct vmctx *ctx, const char *dev_name); int pci_resume(struct vmctx *ctx, const char *dev_name); #endif static __inline void pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) { assert(offset <= PCI_REGMAX); *(uint8_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); *(uint16_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); *(uint32_t *)(pi->pi_cfgdata + offset) = val; } static __inline uint8_t pci_get_cfgdata8(struct pci_devinst *pi, int offset) { assert(offset <= PCI_REGMAX); return (*(uint8_t *)(pi->pi_cfgdata + offset)); } static __inline uint16_t pci_get_cfgdata16(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); return (*(uint16_t *)(pi->pi_cfgdata + offset)); } static __inline uint32_t pci_get_cfgdata32(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); return (*(uint32_t *)(pi->pi_cfgdata + offset)); } #endif /* _PCI_EMUL_H_ */ diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c index 2c6a2ebd8dd9..bf99c646c480 100644 --- a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -1,1044 +1,1006 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include +#include + #include +#include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include "config.h" #include "debug.h" #include "pci_emul.h" #include "mem.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" #endif -#ifndef _PATH_MEM -#define _PATH_MEM "/dev/mem" -#endif - #define LEGACY_SUPPORT 1 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 static int pcifd = -1; -static int memfd = -1; struct passthru_softc { struct pci_devinst *psc_pi; struct pcibar psc_bar[PCI_BARMAX + 1]; struct { int capoff; int msgctrl; int emulated; } psc_msi; struct { int capoff; } psc_msix; struct pcisel psc_sel; }; static int msi_caplen(int msgctrl) { int len; len = 10; /* minimum length of msi capability */ if (msgctrl & PCIM_MSICTRL_64BIT) len += 4; #if 0 /* * Ignore the 'mask' and 'pending' bits in the MSI capability. * We'll let the guest manipulate them directly. */ if (msgctrl & PCIM_MSICTRL_VECTOR) len += 10; #endif return (len); } static uint32_t read_config(const struct pcisel *sel, long reg, int width) { struct pci_io pi; bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; if (ioctl(pcifd, PCIOCREAD, &pi) < 0) return (0); /* XXX */ else return (pi.pi_data); } static void write_config(const struct pcisel *sel, long reg, int width, uint32_t data) { struct pci_io pi; bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; pi.pi_data = data; (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ } #ifdef LEGACY_SUPPORT static int passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) { int capoff, i; struct msicap msicap; u_char *capdata; pci_populate_msicap(&msicap, msgnum, nextptr); /* * XXX * Copy the msi capability structure in the last 16 bytes of the * config space. This is wrong because it could shadow something * useful to the device. */ capoff = 256 - roundup(sizeof(msicap), 4); capdata = (u_char *)&msicap; for (i = 0; i < sizeof(msicap); i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); return (capoff); } #endif /* LEGACY_SUPPORT */ static int cfginitmsi(struct passthru_softc *sc) { int i, ptr, capptr, cap, sts, caplen, table_size; uint32_t u32; struct pcisel sel; struct pci_devinst *pi; struct msixcap msixcap; uint32_t *msixcap_ptr; pi = sc->psc_pi; sel = sc->psc_sel; /* * Parse the capabilities and cache the location of the MSI * and MSI-X capabilities. */ sts = read_config(&sel, PCIR_STATUS, 2); if (sts & PCIM_STATUS_CAPPRESENT) { ptr = read_config(&sel, PCIR_CAP_PTR, 1); while (ptr != 0 && ptr != 0xff) { cap = read_config(&sel, ptr + PCICAP_ID, 1); if (cap == PCIY_MSI) { /* * Copy the MSI capability into the config * space of the emulated pci device */ sc->psc_msi.capoff = ptr; sc->psc_msi.msgctrl = read_config(&sel, ptr + 2, 2); sc->psc_msi.emulated = 0; caplen = msi_caplen(sc->psc_msi.msgctrl); capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; } } else if (cap == PCIY_MSIX) { /* * Copy the MSI-X capability */ sc->psc_msix.capoff = ptr; caplen = 12; msixcap_ptr = (uint32_t*) &msixcap; capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); *msixcap_ptr = u32; pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; msixcap_ptr++; } } ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); } } if (sc->psc_msix.capoff != 0) { pi->pi_msix.pba_bar = msixcap.pba_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.pba_offset = msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_bar = msixcap.table_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.table_offset = msixcap.table_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); /* Allocate the emulated MSI-X table array */ table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* Mask all table entries */ for (i = 0; i < pi->pi_msix.table_count; i++) { pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } } #ifdef LEGACY_SUPPORT /* * If the passthrough device does not support MSI then craft a * MSI capability for it. We link the new MSI capability at the * head of the list of capabilities. */ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { int origptr, msiptr; origptr = read_config(&sel, PCIR_CAP_PTR, 1); msiptr = passthru_add_msicap(pi, 1, origptr); sc->psc_msi.capoff = msiptr; sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); sc->psc_msi.emulated = 1; pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); } #endif /* Make sure one of the capabilities is present */ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) return (-1); else return (0); } static uint64_t msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *src8; uint16_t *src16; uint32_t *src32; uint64_t *src64; uint64_t data; size_t entry_offset; - int index; + uint32_t table_offset; + int index, table_count; pi = sc->psc_pi; - if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset && - offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { - switch(size) { + + table_offset = pi->pi_msix.table_offset; + table_count = pi->pi_msix.table_count; + if (offset < table_offset || + offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) { + switch (size) { case 1: - src8 = (uint8_t *)(pi->pi_msix.pba_page + offset - - pi->pi_msix.pba_page_offset); + src8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset); data = *src8; break; case 2: - src16 = (uint16_t *)(pi->pi_msix.pba_page + offset - - pi->pi_msix.pba_page_offset); + src16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset); data = *src16; break; case 4: - src32 = (uint32_t *)(pi->pi_msix.pba_page + offset - - pi->pi_msix.pba_page_offset); + src32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset); data = *src32; break; case 8: - src64 = (uint64_t *)(pi->pi_msix.pba_page + offset - - pi->pi_msix.pba_page_offset); + src64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset); data = *src64; break; default: return (-1); } return (data); } - if (offset < pi->pi_msix.table_offset) - return (-1); - - offset -= pi->pi_msix.table_offset; + offset -= table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; - if (index >= pi->pi_msix.table_count) - return (-1); + assert(index < table_count); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; - switch(size) { + switch (size) { case 1: - src8 = (uint8_t *)((void *)entry + entry_offset); + src8 = (uint8_t *)((uint8_t *)entry + entry_offset); data = *src8; break; case 2: - src16 = (uint16_t *)((void *)entry + entry_offset); + src16 = (uint16_t *)((uint8_t *)entry + entry_offset); data = *src16; break; case 4: - src32 = (uint32_t *)((void *)entry + entry_offset); + src32 = (uint32_t *)((uint8_t *)entry + entry_offset); data = *src32; break; case 8: - src64 = (uint64_t *)((void *)entry + entry_offset); + src64 = (uint64_t *)((uint8_t *)entry + entry_offset); data = *src64; break; default: return (-1); } return (data); } static void msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, uint64_t offset, int size, uint64_t data) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *dest8; uint16_t *dest16; uint32_t *dest32; uint64_t *dest64; size_t entry_offset; - uint32_t vector_control; - int index; + uint32_t table_offset, vector_control; + int index, table_count; pi = sc->psc_pi; - if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset && - offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { - switch(size) { + + table_offset = pi->pi_msix.table_offset; + table_count = pi->pi_msix.table_count; + if (offset < table_offset || + offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) { + switch (size) { case 1: - dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset - - pi->pi_msix.pba_page_offset); + dest8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset); *dest8 = data; break; case 2: - dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset - - pi->pi_msix.pba_page_offset); + dest16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset); *dest16 = data; break; case 4: - dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset - - pi->pi_msix.pba_page_offset); + dest32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset); *dest32 = data; break; case 8: - dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset - - pi->pi_msix.pba_page_offset); + dest64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset); *dest64 = data; break; - default: - break; } return; } - if (offset < pi->pi_msix.table_offset) - return; - - offset -= pi->pi_msix.table_offset; + offset -= table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; - if (index >= pi->pi_msix.table_count) - return; + assert(index < table_count); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* Only 4 byte naturally-aligned writes are supported */ assert(size == 4); assert(entry_offset % 4 == 0); vector_control = entry->vector_control; dest32 = (uint32_t *)((void *)entry + entry_offset); *dest32 = data; /* If MSI-X hasn't been enabled, do nothing */ if (pi->pi_msix.enabled) { /* If the entry is masked, don't set it up */ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { (void)vm_setup_pptdev_msix(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, index, entry->addr, entry->msg_data, entry->vector_control); } } } static int init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) { + struct pci_devinst *pi = sc->psc_pi; + struct pci_bar_mmap pbm; int b, s, f; - int idx; - size_t remaining; uint32_t table_size, table_offset; - uint32_t pba_size, pba_offset; - vm_paddr_t start; - struct pci_devinst *pi = sc->psc_pi; assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); b = sc->psc_sel.pc_bus; s = sc->psc_sel.pc_dev; f = sc->psc_sel.pc_func; - /* - * If the MSI-X table BAR maps memory intended for - * other uses, it is at least assured that the table - * either resides in its own page within the region, - * or it resides in a page shared with only the PBA. + /* + * Map the region of the BAR containing the MSI-X table. This is + * necessary for two reasons: + * 1. The PBA may reside in the first or last page containing the MSI-X + * table. + * 2. While PCI devices are not supposed to use the page(s) containing + * the MSI-X table for other purposes, some do in practice. */ + memset(&pbm, 0, sizeof(pbm)); + pbm.pbm_sel = sc->psc_sel; + pbm.pbm_flags = PCIIO_BAR_MMAP_RW; + pbm.pbm_reg = PCIR_BAR(pi->pi_msix.pba_bar); + pbm.pbm_memattr = VM_MEMATTR_DEVICE; + + if (ioctl(pcifd, PCIOCBARMMAP, &pbm) != 0) { + warn("Failed to map MSI-X table BAR on %d/%d/%d", b, s, f); + return (-1); + } + assert(pbm.pbm_bar_off == 0); + pi->pi_msix.mapped_addr = (uint8_t *)(uintptr_t)pbm.pbm_map_base; + pi->pi_msix.mapped_size = pbm.pbm_map_length; + table_offset = rounddown2(pi->pi_msix.table_offset, 4096); table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); - idx = pi->pi_msix.table_bar; - start = pi->pi_bar[idx].addr; - remaining = pi->pi_bar[idx].size; - - if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { - pba_offset = pi->pi_msix.pba_offset; - pba_size = pi->pi_msix.pba_size; - if (pba_offset >= table_offset + table_size || - table_offset >= pba_offset + pba_size) { - /* - * If the PBA does not share a page with the MSI-x - * tables, no PBA emulation is required. - */ - pi->pi_msix.pba_page = NULL; - pi->pi_msix.pba_page_offset = 0; - } else { - /* - * The PBA overlaps with either the first or last - * page of the MSI-X table region. Map the - * appropriate page. - */ - if (pba_offset <= table_offset) - pi->pi_msix.pba_page_offset = table_offset; - else - pi->pi_msix.pba_page_offset = table_offset + - table_size - 4096; - pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | - PROT_WRITE, MAP_SHARED, memfd, start + - pi->pi_msix.pba_page_offset); - if (pi->pi_msix.pba_page == MAP_FAILED) { - warn( - "Failed to map PBA page for MSI-X on %d/%d/%d", - b, s, f); - return (-1); - } - } - } + /* + * Unmap any pages not covered by the table, we do not need to emulate + * accesses to them. Avoid releasing address space to help ensure that + * a buggy out-of-bounds access causes a crash. + */ + if (table_offset != 0) + if (mprotect(pi->pi_msix.mapped_addr, table_offset, + PROT_NONE) != 0) + warn("Failed to unmap MSI-X table BAR region"); + if (table_offset + table_size != pi->pi_msix.mapped_size) + if (mprotect(pi->pi_msix.mapped_addr, + pi->pi_msix.mapped_size - (table_offset + table_size), + PROT_NONE) != 0) + warn("Failed to unmap MSI-X table BAR region"); return (0); } static int cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) { int i, error; struct pci_devinst *pi; struct pci_bar_io bar; enum pcibar_type bartype; uint64_t base, size; pi = sc->psc_pi; /* * Initialize BAR registers */ for (i = 0; i <= PCI_BARMAX; i++) { bzero(&bar, sizeof(bar)); bar.pbi_sel = sc->psc_sel; bar.pbi_reg = PCIR_BAR(i); if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) continue; if (PCI_BAR_IO(bar.pbi_base)) { bartype = PCIBAR_IO; base = bar.pbi_base & PCIM_BAR_IO_BASE; } else { switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { case PCIM_BAR_MEM_64: bartype = PCIBAR_MEM64; break; default: bartype = PCIBAR_MEM32; break; } base = bar.pbi_base & PCIM_BAR_MEM_BASE; } size = bar.pbi_length; if (bartype != PCIBAR_IO) { if (((base | size) & PAGE_MASK) != 0) { warnx("passthru device %d/%d/%d BAR %d: " "base %#lx or size %#lx not page aligned\n", sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, base, size); return (-1); } } /* Cache information about the "real" BAR */ sc->psc_bar[i].type = bartype; sc->psc_bar[i].size = size; sc->psc_bar[i].addr = base; /* Allocate the BAR in the guest I/O or MMIO space */ error = pci_emul_alloc_bar(pi, i, bartype, size); if (error) return (-1); /* The MSI-X table needs special handling */ if (i == pci_msix_table_bar(pi)) { error = init_msix_table(ctx, sc, base); if (error) return (-1); } /* * 64-bit BAR takes up two slots so skip the next one. */ if (bartype == PCIBAR_MEM64) { i++; assert(i <= PCI_BARMAX); sc->psc_bar[i].type = PCIBAR_MEMHI64; } } return (0); } static int cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) { int error; struct passthru_softc *sc; error = 1; sc = pi->pi_arg; bzero(&sc->psc_sel, sizeof(struct pcisel)); sc->psc_sel.pc_bus = bus; sc->psc_sel.pc_dev = slot; sc->psc_sel.pc_func = func; if (cfginitmsi(sc) != 0) { warnx("failed to initialize MSI for PCI %d/%d/%d", bus, slot, func); goto done; } if (cfginitbar(ctx, sc) != 0) { warnx("failed to initialize BARs for PCI %d/%d/%d", bus, slot, func); goto done; } pci_set_cfgdata16(pi, PCIR_COMMAND, read_config(&sc->psc_sel, PCIR_COMMAND, 2)); error = 0; /* success */ done: return (error); } static int passthru_legacy_config(nvlist_t *nvl, const char *opts) { char value[16]; int bus, slot, func; if (opts == NULL) return (0); if (sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) { EPRINTLN("passthru: invalid options \"%s\"", opts); return (-1); } snprintf(value, sizeof(value), "%d", bus); set_config_value_node(nvl, "bus", value); snprintf(value, sizeof(value), "%d", slot); set_config_value_node(nvl, "slot", value); snprintf(value, sizeof(value), "%d", func); set_config_value_node(nvl, "func", value); return (0); } static int passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) { int bus, slot, func, error, memflags; struct passthru_softc *sc; const char *value; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t pci_ioctls[] = - { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR, PCIOCBARIO }; + { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR, PCIOCBARIO, PCIOCBARMMAP }; #endif sc = NULL; error = 1; #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE); #endif memflags = vm_get_memflags(ctx); if (!(memflags & VM_MEM_F_WIRED)) { warnx("passthru requires guest memory to be wired"); return (error); } if (pcifd < 0) { pcifd = open(_PATH_DEVPCI, O_RDWR, 0); if (pcifd < 0) { warn("failed to open %s", _PATH_DEVPCI); return (error); } } #ifndef WITHOUT_CAPSICUM if (caph_rights_limit(pcifd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif - if (memfd < 0) { - memfd = open(_PATH_MEM, O_RDWR, 0); - if (memfd < 0) { - warn("failed to open %s", _PATH_MEM); - return (error); - } - } - -#ifndef WITHOUT_CAPSICUM - cap_rights_clear(&rights, CAP_IOCTL); - cap_rights_set(&rights, CAP_MMAP_RW); - if (caph_rights_limit(memfd, &rights) == -1) - errx(EX_OSERR, "Unable to apply rights for sandbox"); -#endif - #define GET_INT_CONFIG(var, name) do { \ value = get_config_value_node(nvl, name); \ if (value == NULL) { \ EPRINTLN("passthru: missing required %s setting", name); \ return (error); \ } \ var = atoi(value); \ } while (0) GET_INT_CONFIG(bus, "bus"); GET_INT_CONFIG(slot, "slot"); GET_INT_CONFIG(func, "func"); if (vm_assign_pptdev(ctx, bus, slot, func) != 0) { warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", bus, slot, func); goto done; } sc = calloc(1, sizeof(struct passthru_softc)); pi->pi_arg = sc; sc->psc_pi = pi; /* initialize config space */ error = cfginit(ctx, pi, bus, slot, func); done: if (error) { free(sc); vm_unassign_pptdev(ctx, bus, slot, func); } return (error); } static int bar_access(int coff) { if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) return (1); else return (0); } static int msicap_access(struct passthru_softc *sc, int coff) { int caplen; if (sc->psc_msi.capoff == 0) return (0); caplen = msi_caplen(sc->psc_msi.msgctrl); if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) return (1); else return (0); } static int msixcap_access(struct passthru_softc *sc, int coff) { if (sc->psc_msix.capoff == 0) return (0); return (coff >= sc->psc_msix.capoff && coff < sc->psc_msix.capoff + MSIX_CAPLEN); } static int passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t *rv) { struct passthru_softc *sc; sc = pi->pi_arg; /* * PCI BARs and MSI capability is emulated. */ if (bar_access(coff) || msicap_access(sc, coff)) return (-1); #ifdef LEGACY_SUPPORT /* * Emulate PCIR_CAP_PTR if this device does not support MSI capability * natively. */ if (sc->psc_msi.emulated) { if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) return (-1); } #endif /* * Emulate the command register. If a single read reads both the * command and status registers, read the status register from the * device's config space. */ if (coff == PCIR_COMMAND) { if (bytes <= 2) return (-1); *rv = read_config(&sc->psc_sel, PCIR_STATUS, 2) << 16 | pci_get_cfgdata16(pi, PCIR_COMMAND); return (0); } /* Everything else just read from the device's config space */ *rv = read_config(&sc->psc_sel, coff, bytes); return (0); } static int passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int error, msix_table_entries, i; struct passthru_softc *sc; uint16_t cmd_old; sc = pi->pi_arg; /* * PCI BARs are emulated */ if (bar_access(coff)) return (-1); /* * MSI capability is emulated */ if (msicap_access(sc, coff)) { pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff, PCIY_MSI); error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum); if (error != 0) err(1, "vm_setup_pptdev_msi"); return (0); } if (msixcap_access(sc, coff)) { pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff, PCIY_MSIX); if (pi->pi_msix.enabled) { msix_table_entries = pi->pi_msix.table_count; for (i = 0; i < msix_table_entries; i++) { error = vm_setup_pptdev_msix(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, pi->pi_msix.table[i].addr, pi->pi_msix.table[i].msg_data, pi->pi_msix.table[i].vector_control); if (error) err(1, "vm_setup_pptdev_msix"); } } else { error = vm_disable_pptdev_msix(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func); if (error) err(1, "vm_disable_pptdev_msix"); } return (0); } #ifdef LEGACY_SUPPORT /* * If this device does not support MSI natively then we cannot let * the guest disable legacy interrupts from the device. It is the * legacy interrupt that is triggering the virtual MSI to the guest. */ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { if (coff == PCIR_COMMAND && bytes == 2) val &= ~PCIM_CMD_INTxDIS; } #endif write_config(&sc->psc_sel, coff, bytes, val); if (coff == PCIR_COMMAND) { cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND); if (bytes == 1) pci_set_cfgdata8(pi, PCIR_COMMAND, val); else if (bytes == 2) pci_set_cfgdata16(pi, PCIR_COMMAND, val); pci_emul_cmd_changed(pi, cmd_old); } return (0); } static void passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct passthru_softc *sc; struct pci_bar_ioreq pio; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { msix_table_write(ctx, vcpu, sc, offset, size, value); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); assert(size == 1 || size == 2 || size == 4); assert(offset <= UINT32_MAX && offset + size <= UINT32_MAX); bzero(&pio, sizeof(pio)); pio.pbi_sel = sc->psc_sel; pio.pbi_op = PCIBARIO_WRITE; pio.pbi_bar = baridx; pio.pbi_offset = (uint32_t)offset; pio.pbi_width = size; pio.pbi_value = (uint32_t)value; (void)ioctl(pcifd, PCIOCBARIO, &pio); } } static uint64_t passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct passthru_softc *sc; struct pci_bar_ioreq pio; uint64_t val; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { val = msix_table_read(sc, offset, size); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); assert(size == 1 || size == 2 || size == 4); assert(offset <= UINT32_MAX && offset + size <= UINT32_MAX); bzero(&pio, sizeof(pio)); pio.pbi_sel = sc->psc_sel; pio.pbi_op = PCIBARIO_READ; pio.pbi_bar = baridx; pio.pbi_offset = (uint32_t)offset; pio.pbi_width = size; (void)ioctl(pcifd, PCIOCBARIO, &pio); val = pio.pbi_value; } return (val); } static void passthru_msix_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx, int enabled, uint64_t address) { struct passthru_softc *sc; size_t remaining; uint32_t table_size, table_offset; sc = pi->pi_arg; table_offset = rounddown2(pi->pi_msix.table_offset, 4096); if (table_offset > 0) { if (!enabled) { if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, table_offset) != 0) warnx("pci_passthru: unmap_pptdev_mmio failed"); } else { if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, table_offset, sc->psc_bar[baridx].addr) != 0) warnx("pci_passthru: map_pptdev_mmio failed"); } } table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); remaining = pi->pi_bar[baridx].size - table_offset - table_size; if (remaining > 0) { address += table_offset + table_size; if (!enabled) { if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, remaining) != 0) warnx("pci_passthru: unmap_pptdev_mmio failed"); } else { if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, remaining, sc->psc_bar[baridx].addr + table_offset + table_size) != 0) warnx("pci_passthru: map_pptdev_mmio failed"); } } } static void passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx, int enabled, uint64_t address) { struct passthru_softc *sc; sc = pi->pi_arg; if (!enabled) { if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, sc->psc_bar[baridx].size) != 0) warnx("pci_passthru: unmap_pptdev_mmio failed"); } else { if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, sc->psc_bar[baridx].size, sc->psc_bar[baridx].addr) != 0) warnx("pci_passthru: map_pptdev_mmio failed"); } } static void passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx, int enabled, uint64_t address) { if (pi->pi_bar[baridx].type == PCIBAR_IO) return; if (baridx == pci_msix_table_bar(pi)) passthru_msix_addr(ctx, pi, baridx, enabled, address); else passthru_mmio_addr(ctx, pi, baridx, enabled, address); } struct pci_devemu passthru = { .pe_emu = "passthru", .pe_init = passthru_init, .pe_legacy_config = passthru_legacy_config, .pe_cfgwrite = passthru_cfgwrite, .pe_cfgread = passthru_cfgread, .pe_barwrite = passthru_write, .pe_barread = passthru_read, .pe_baraddr = passthru_addr, }; PCI_EMUL_SET(passthru);