Index: sys/amd64/vmm/vmm_instruction_emul.c =================================================================== --- sys/amd64/vmm/vmm_instruction_emul.c +++ sys/amd64/vmm/vmm_instruction_emul.c @@ -82,6 +82,10 @@ .op_byte = 0xB6, .op_type = VIE_OP_TYPE_MOVZX, }, + [0xB7] = { + .op_byte = 0xB7, + .op_type = VIE_OP_TYPE_MOVZX, + }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, @@ -505,6 +509,25 @@ /* write the result */ error = vie_update_register(vm, vcpuid, reg, val, size); break; + case 0xB7: + /* + * MOV and zero extend word from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B7/r movzx r32, r/m16 + * REX.W + 0F B7/r movzx r64, r/m16 + */ + error = memread(vm, vcpuid, gpa, &val, 2, arg); + if (error) + return (error); + + reg = gpr_map[vie->reg]; + + /* zero-extend word */ + val = (uint16_t)val; + + error = vie_update_register(vm, vcpuid, reg, val, size); + break; case 0xBE: /* * MOV and sign extend byte from mem (ModRM:r/m) to Index: usr.sbin/bhyve/acpi.c =================================================================== --- usr.sbin/bhyve/acpi.c +++ usr.sbin/bhyve/acpi.c @@ -40,12 +40,13 @@ * Layout * ------ * RSDP -> 0xf2400 (36 bytes fixed) - * RSDT -> 0xf2440 (36 bytes + 4*N table addrs, 2 used) - * XSDT -> 0xf2480 (36 bytes + 8*N table addrs, 2 used) + * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used) + * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used) * MADT -> 0xf2500 (depends on #CPUs) * FADT -> 0xf2600 (268 bytes) * HPET -> 0xf2740 (56 bytes) - * FACS -> 0xf2780 (64 bytes) + * MCFG -> 0xf2780 (60 bytes) + * FACS -> 0xf27C0 (64 bytes) * DSDT -> 0xf2800 (variable - can go up to 0x100000) */ @@ -80,7 +81,8 @@ #define MADT_OFFSET 0x100 #define FADT_OFFSET 0x200 #define HPET_OFFSET 0x340 -#define FACS_OFFSET 0x380 +#define MCFG_OFFSET 0x380 +#define FACS_OFFSET 0x3C0 #define DSDT_OFFSET 0x400 #define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX" @@ -178,6 +180,8 @@ basl_acpi_base + FADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 
%08X\n", basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", + basl_acpi_base + MCFG_OFFSET); EFFLUSH(fp); @@ -216,6 +220,8 @@ basl_acpi_base + FADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n", basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", + basl_acpi_base + MCFG_OFFSET); EFFLUSH(fp); @@ -583,6 +589,39 @@ } static int +basl_fwrite_mcfg(FILE *fp) +{ + int err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MCFG template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); + EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); + EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); + EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); + EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); + EFFLUSH(fp); + return (0); +err_exit: + return (errno); +} + +static int basl_fwrite_facs(FILE *fp) { int err; @@ -921,6 +960,7 @@ { basl_fwrite_madt, MADT_OFFSET }, { basl_fwrite_fadt, FADT_OFFSET }, { basl_fwrite_hpet, HPET_OFFSET }, + { basl_fwrite_mcfg, MCFG_OFFSET }, { basl_fwrite_facs, FACS_OFFSET }, { basl_fwrite_dsdt, DSDT_OFFSET }, { NULL } Index: usr.sbin/bhyve/mem.h =================================================================== --- usr.sbin/bhyve/mem.h +++ usr.sbin/bhyve/mem.h @@ -48,6 +48,7 @@ #define MEM_F_READ 0x1 #define 
MEM_F_WRITE 0x2 #define MEM_F_RW 0x3 +#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */ void init_mem(void); int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, Index: usr.sbin/bhyve/mem.c =================================================================== --- usr.sbin/bhyve/mem.c +++ usr.sbin/bhyve/mem.c @@ -162,7 +162,7 @@ { struct mmio_rb_range *entry; - int err; + int err, immutable; pthread_rwlock_rdlock(&mmio_rwlock); /* @@ -186,9 +186,27 @@ } assert(entry != NULL); + + /* + * An 'immutable' memory range is guaranteed to be never removed + * so there is no need to hold 'mmio_rwlock' while calling the + * handler. + * + * XXX writes to the PCIR_COMMAND register can cause register_mem() + * to be called. If the guest is using PCI extended config space + * to modify the PCIR_COMMAND register then register_mem() can + * deadlock on 'mmio_rwlock'. However by registering the extended + * config space window as 'immutable' the deadlock can be avoided. 
+ */ + immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); + if (immutable) + pthread_rwlock_unlock(&mmio_rwlock); + err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging, mem_read, mem_write, &entry->mr_param); - pthread_rwlock_unlock(&mmio_rwlock); + + if (!immutable) + pthread_rwlock_unlock(&mmio_rwlock); return (err); } @@ -246,6 +264,7 @@ mr = &entry->mr_param; assert(mr->name == memp->name); assert(mr->base == memp->base && mr->size == memp->size); + assert((mr->flags & MEM_F_IMMUTABLE) == 0); RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry); /* flush Per-vCPU cache */ Index: usr.sbin/bhyve/pci_emul.h =================================================================== --- usr.sbin/bhyve/pci_emul.h +++ usr.sbin/bhyve/pci_emul.h @@ -235,6 +235,7 @@ int pci_count_lintr(int bus); void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); void pci_write_dsdt(void); +uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); static __inline void Index: usr.sbin/bhyve/pci_emul.c =================================================================== --- usr.sbin/bhyve/pci_emul.c +++ usr.sbin/bhyve/pci_emul.c @@ -109,16 +109,20 @@ #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 -#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ +SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); + +#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMBASE64 0xD000000000UL #define PCI_EMUL_MEMLIMIT64 0xFD00000000UL static struct pci_devemu *pci_emul_finddev(char *name); -static void pci_lintr_route(struct pci_devinst *pi); -static void pci_lintr_update(struct pci_devinst *pi); - -static struct mem_range pci_mem_hole; +static void pci_lintr_route(struct pci_devinst *pi); +static void pci_lintr_update(struct pci_devinst *pi); +static void cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, + int func, int coff, int 
bytes, uint32_t *val); /* * I/O access @@ -1023,12 +1027,37 @@ return (0); } +static int +pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int bytes, uint64_t *val, void *arg1, long arg2) +{ + int bus, slot, func, coff, in; + + coff = addr & 0xfff; + func = (addr >> 12) & 0x7; + slot = (addr >> 15) & 0x1f; + bus = (addr >> 20) & 0xff; + in = (dir == MEM_F_READ); + if (in) + *val = ~0UL; + cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); + return (0); +} + +uint64_t +pci_ecfg_base(void) +{ + + return (PCI_EMUL_ECFG_BASE); +} + #define BUSIO_ROUNDUP 32 #define BUSMEM_ROUNDUP (1024 * 1024) int init_pci(struct vmctx *ctx) { + struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; @@ -1112,22 +1141,34 @@ * The guest physical memory map looks like the following: * [0, lowmem) guest system memory * [lowmem, lowmem_limit) memory hole (may be absent) - * [lowmem_limit, 4GB) PCI hole (32-bit BAR allocation) + * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) + * [0xE0000000, 0xF0000000) PCI extended config window + * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) - * + */ + + /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. 
*/ lowmem = vm_get_lowmem_size(ctx); + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI hole"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = lowmem; + mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; + mr.handler = pci_emul_fallback_handler; + error = register_mem_fallback(&mr); + assert(error == 0); - memset(&pci_mem_hole, 0, sizeof(struct mem_range)); - pci_mem_hole.name = "PCI hole"; - pci_mem_hole.flags = MEM_F_RW; - pci_mem_hole.base = lowmem; - pci_mem_hole.size = (4ULL * 1024 * 1024 * 1024) - lowmem; - pci_mem_hole.handler = pci_emul_fallback_handler; - - error = register_mem_fallback(&pci_mem_hole); + /* PCI extended config space */ + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI ECFG"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = PCI_EMUL_ECFG_BASE; + mr.size = PCI_EMUL_ECFG_SIZE; + mr.handler = pci_emul_ecfg_handler; + error = register_mem(&mr); assert(error == 0); return (0); @@ -1612,41 +1653,6 @@ } } -static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; - -static int -pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - uint32_t x; - - if (bytes != 4) { - if (in) - *eax = (bytes == 2) ? 
0xffff : 0xff; - return (0); - } - - if (in) { - x = (cfgbus << 16) | - (cfgslot << 11) | - (cfgfunc << 8) | - cfgoff; - if (cfgenable) - x |= CONF1_ENABLE; - *eax = x; - } else { - x = *eax; - cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; - cfgoff = x & PCI_REGMAX; - cfgfunc = (x >> 8) & PCI_FUNCMAX; - cfgslot = (x >> 11) & PCI_SLOTMAX; - cfgbus = (x >> 16) & PCI_BUSMAX; - } - - return (0); -} -INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); - static uint32_t bits_changed(uint32_t old, uint32_t new, uint32_t mask) { @@ -1709,41 +1715,51 @@ pci_lintr_update(pi); } -static int -pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) +static void +cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, + int coff, int bytes, uint32_t *eax) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; - int coff, idx, needcfg; + int idx, needcfg; uint64_t addr, bar, mask; - assert(bytes == 1 || bytes == 2 || bytes == 4); - - if ((bi = pci_businfo[cfgbus]) != NULL) { - si = &bi->slotinfo[cfgslot]; - pi = si->si_funcs[cfgfunc].fi_devi; + if ((bi = pci_businfo[bus]) != NULL) { + si = &bi->slotinfo[slot]; + pi = si->si_funcs[func].fi_devi; } else pi = NULL; - coff = cfgoff + (port - CONF1_DATA_PORT); - -#if 0 - printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r", - in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc); -#endif - /* - * Just return if there is no device at this cfgslot:cfgfunc, - * if the guest is doing an un-aligned access, or if the config - * address word isn't enabled. + * Just return if there is no device at this slot:func or if the + * guest is doing an un-aligned access.
*/ - if (!cfgenable || pi == NULL || (coff & (bytes - 1)) != 0) { + if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || + (coff & (bytes - 1)) != 0) { if (in) *eax = 0xffffffff; - return (0); + return; + } + + /* + * Ignore all writes beyond the standard config space and return all + * ones on reads. + */ + if (coff >= PCI_REGMAX + 1) { + if (in) { + *eax = 0xffffffff; + /* + * Extended capabilities begin at offset 256 in config + * space. Absence of extended capabilities is signaled + * with all 0s in the extended capability header at + * offset 256. + */ + if (coff <= PCI_REGMAX + 4) + *eax = 0x00000000; + } + return; } pe = pi->pi_d; @@ -1754,8 +1770,8 @@ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { - needcfg = pe->pe_cfgread(ctx, vcpu, pi, - coff, bytes, eax); + needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, + eax); } else { needcfg = 1; } @@ -1769,12 +1785,12 @@ *eax = pci_get_cfgdata32(pi, coff); } - pci_emul_hdrtype_fixup(cfgbus, cfgslot, coff, bytes, eax); + pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) - return (0); + return; /* * Special handling for write to BAR registers @@ -1785,7 +1801,7 @@ * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) - return (0); + return; idx = (coff - PCIR_BAR(0)) / 4; mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { @@ -1843,7 +1859,60 @@ CFGWRITE(pi, coff, *eax, bytes); } } +} + +static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + uint32_t x; + + if (bytes != 4) { + if (in) + *eax = (bytes == 2) ? 
0xffff : 0xff; + return (0); + } + if (in) { + x = (cfgbus << 16) | + (cfgslot << 11) | + (cfgfunc << 8) | + cfgoff; + if (cfgenable) + x |= CONF1_ENABLE; + *eax = x; + } else { + x = *eax; + cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; + cfgoff = x & PCI_REGMAX; + cfgfunc = (x >> 8) & PCI_FUNCMAX; + cfgslot = (x >> 11) & PCI_SLOTMAX; + cfgbus = (x >> 16) & PCI_BUSMAX; + } + + return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int coff; + + assert(bytes == 1 || bytes == 2 || bytes == 4); + + coff = cfgoff + (port - CONF1_DATA_PORT); + if (cfgenable) { + cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, + eax); + } else { + /* Ignore accesses to cfgdata if not enabled by cfgaddr */ + if (in) + *eax = 0xffffffff; + } return (0); }