diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 2e77ece96742..a65b1251e52b 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -1,1744 +1,1767 @@ /*- * Copyright (c) 2012 Sandvine, Inc. * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include #include #include #include #include #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #include #define KASSERT(exp,msg) assert((exp)) #endif /* _KERNEL */ #include #include #include /* struct vie_op.op_type */ enum { VIE_OP_TYPE_NONE = 0, VIE_OP_TYPE_MOV, VIE_OP_TYPE_MOVSX, VIE_OP_TYPE_MOVZX, VIE_OP_TYPE_AND, VIE_OP_TYPE_OR, VIE_OP_TYPE_TWO_BYTE, VIE_OP_TYPE_PUSH, VIE_OP_TYPE_CMP, VIE_OP_TYPE_LAST }; /* struct vie_op.op_flags */ #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ #define VIE_OP_F_NO_MODRM (1 << 3) static const struct vie_op two_byte_opcodes[256] = { [0xB6] = { .op_byte = 0xB6, .op_type = VIE_OP_TYPE_MOVZX, }, + [0xB7] = { + .op_byte = 0xB7, + .op_type = VIE_OP_TYPE_MOVZX, + }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, }, }; static const struct vie_op one_byte_opcodes[256] = { [0x0F] = { .op_byte = 0x0F, .op_type = VIE_OP_TYPE_TWO_BYTE }, [0x3B] = { .op_byte = 0x3B, .op_type = VIE_OP_TYPE_CMP, }, [0x88] = { .op_byte = 0x88, .op_type = VIE_OP_TYPE_MOV, }, [0x89] = { .op_byte = 0x89, .op_type = VIE_OP_TYPE_MOV, }, [0x8A] = { .op_byte = 0x8A, .op_type = VIE_OP_TYPE_MOV, }, [0x8B] = { .op_byte = 0x8B, .op_type = VIE_OP_TYPE_MOV, }, [0xA1] = { .op_byte = 0xA1, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, [0xA3] = { .op_byte = 0xA3, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, [0xC6] = { /* XXX Group 11 extended opcode - not just MOV */ .op_byte = 0xC6, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_IMM8, }, [0xC7] = { .op_byte = 0xC7, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_IMM, }, [0x23] = { .op_byte = 0x23, .op_type = VIE_OP_TYPE_AND, }, [0x81] = { /* XXX Group 1 extended opcode - not just AND */ .op_byte = 0x81, .op_type = VIE_OP_TYPE_AND, .op_flags = VIE_OP_F_IMM, }, [0x83] = { /* XXX Group 1 extended opcode - not just OR */ .op_byte = 0x83, .op_type = VIE_OP_TYPE_OR, .op_flags = VIE_OP_F_IMM8, }, [0xFF] = { /* XXX Group 5 extended opcode - not just PUSH */ .op_byte = 0xFF, .op_type = VIE_OP_TYPE_PUSH, } }; /* struct vie.mod */ #define VIE_MOD_INDIRECT 0 #define VIE_MOD_INDIRECT_DISP8 1 #define VIE_MOD_INDIRECT_DISP32 2 #define VIE_MOD_DIRECT 3 /* struct vie.rm */ #define VIE_RM_SIB 4 #define VIE_RM_DISP32 5 #define GB (1024 * 1024 * 1024) static enum vm_reg_name gpr_map[16] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RBX, VM_REG_GUEST_RSP, VM_REG_GUEST_RBP, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15 }; static uint64_t size2mask[] = { [1] = 0xff, [2] = 0xffff, [4] = 0xffffffff, [8] = 0xffffffffffffffff, }; static int vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) { int error; error = vm_get_register(vm, vcpuid, reg, rval); return (error); } static void vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) { *lhbr = 0; *reg = gpr_map[vie->reg]; /* * 64-bit mode imposes limitations on accessing legacy high byte * registers (lhbr). * * The legacy high-byte registers cannot be addressed if the REX * prefix is present. In this case the values 4, 5, 6 and 7 of the * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. * * If the REX prefix is not present then the values 4, 5, 6 and 7 * of the 'ModRM:reg' field address the legacy high-byte registers, * %ah, %ch, %dh and %bh respectively. */ if (!vie->rex_present) { if (vie->reg & 0x4) { *lhbr = 1; *reg = gpr_map[vie->reg & 0x3]; } } } static int vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) { uint64_t val; int error, lhbr; enum vm_reg_name reg; vie_calc_bytereg(vie, ®, &lhbr); error = vm_get_register(vm, vcpuid, reg, &val); /* * To obtain the value of a legacy high byte register shift the * base register right by 8 bits (%ah = %rax >> 8). */ if (lhbr) *rval = val >> 8; else *rval = val; return (error); } static int vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) { uint64_t origval, val, mask; int error, lhbr; enum vm_reg_name reg; vie_calc_bytereg(vie, ®, &lhbr); error = vm_get_register(vm, vcpuid, reg, &origval); if (error == 0) { val = byte; mask = 0xff; if (lhbr) { /* * Shift left by 8 to store 'byte' in a legacy high * byte register. */ val <<= 8; mask <<= 8; } val |= origval & ~mask; error = vm_set_register(vm, vcpuid, reg, val); } return (error); } int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, int size) { int error; uint64_t origval; switch (size) { case 1: case 2: error = vie_read_register(vm, vcpuid, reg, &origval); if (error) return (error); val &= size2mask[size]; val |= origval & ~size2mask[size]; break; case 4: val &= 0xffffffffUL; break; case 8: break; default: return (EINVAL); } error = vm_set_register(vm, vcpuid, reg, val); return (error); } /* * Return the status flags that would result from doing (x - y). */ static u_long getcc16(uint16_t x, uint16_t y) { u_long rflags; __asm __volatile("sub %1,%2; pushfq; popq %0" : "=r" (rflags) : "m" (y), "r" (x)); return (rflags); } static u_long getcc32(uint32_t x, uint32_t y) { u_long rflags; __asm __volatile("sub %1,%2; pushfq; popq %0" : "=r" (rflags) : "m" (y), "r" (x)); return (rflags); } static u_long getcc64(uint64_t x, uint64_t y) { u_long rflags; __asm __volatile("sub %1,%2; pushfq; popq %0" : "=r" (rflags) : "m" (y), "r" (x)); return (rflags); } static u_long getcc(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 2 || opsize == 4 || opsize == 8, ("getcc: invalid operand size %d", opsize)); if (opsize == 2) return (getcc16(x, y)); else if (opsize == 4) return (getcc32(x, y)); else return (getcc64(x, y)); } static int emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint8_t byte; uint64_t val; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x88: /* * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) * 88/r: mov r/m8, r8 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) */ size = 1; /* override for byte operation */ error = vie_read_bytereg(vm, vcpuid, vie, &byte); if (error == 0) error = memwrite(vm, vcpuid, gpa, byte, size, arg); break; case 0x89: /* * MOV from reg (ModRM:reg) to mem (ModRM:r/m) * 89/r: mov r/m16, r16 * 89/r: mov r/m32, r32 * REX.W + 89/r mov r/m64, r64 */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val); if (error == 0) { val &= size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); } break; case 0x8A: /* * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) * 8A/r: mov r8, r/m8 * REX + 8A/r: mov r8, r/m8 */ size = 1; /* override for byte operation */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) error = vie_write_bytereg(vm, vcpuid, vie, val); break; case 0x8B: /* * MOV from mem (ModRM:r/m) to reg (ModRM:reg) * 8B/r: mov r16, r/m16 * 8B/r: mov r32, r/m32 * REX.W 8B/r: mov r64, r/m64 */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) { reg = gpr_map[vie->reg]; error = vie_update_register(vm, vcpuid, reg, val, size); } break; case 0xA1: /* * MOV from seg:moffset to AX/EAX/RAX * A1: mov AX, moffs16 * A1: mov EAX, moffs32 * REX.W + A1: mov RAX, moffs64 */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) { reg = VM_REG_GUEST_RAX; error = vie_update_register(vm, vcpuid, reg, val, size); } break; case 0xA3: /* * MOV from AX/EAX/RAX to seg:moffset * A3: mov moffs16, AX * A3: mov moffs32, EAX * REX.W + A3: mov moffs64, RAX */ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); if (error == 0) { val &= size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); } break; case 0xC6: /* * MOV from imm8 to mem (ModRM:r/m) * C6/0 mov r/m8, imm8 * REX + C6/0 mov r/m8, imm8 */ size = 1; /* override for byte operation */ error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); break; case 0xC7: /* * MOV from imm16/imm32 to mem (ModRM:r/m) * C7/0 mov r/m16, imm16 * C7/0 mov r/m32, imm32 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) */ val = vie->immediate & size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); break; default: break; } return (error); } static int emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t val; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0xB6: /* * MOV and zero extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F B6/r movzx r16, r/m8 * 0F B6/r movzx r32, r/m8 * REX.W + 0F B6/r movzx r64, r/m8 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val, 1, arg); if (error) break; /* get the second operand */ reg = gpr_map[vie->reg]; /* zero-extend byte */ val = (uint8_t)val; /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xB7: + /* + * MOV and zero extend word from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B7/r movzx r32, r/m16 + * REX.W + 0F B7/r movzx r64, r/m16 + */ + error = memread(vm, vcpuid, gpa, &val, 2, arg); + if (error) + return (error); + + reg = gpr_map[vie->reg]; + + /* zero-extend word */ + val = (uint16_t)val; + error = vie_update_register(vm, vcpuid, reg, val, size); break; case 0xBE: /* * MOV and sign extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F BE/r movsx r16, r/m8 * 0F BE/r movsx r32, r/m8 * REX.W + 0F BE/r movsx r64, r/m8 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val, 1, arg); if (error) break; /* get the second operand */ reg = gpr_map[vie->reg]; /* sign extend byte */ val = (int8_t)val; /* write the result */ error = vie_update_register(vm, vcpuid, reg, val, size); break; default: break; } return (error); } static int emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x23: /* * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the * result in reg. * * 23/r and r16, r/m16 * 23/r and r32, r/m32 * REX.W + 23/r and r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ val1 &= val2; error = vie_update_register(vm, vcpuid, reg, val1, size); break; case 0x81: /* * AND mem (ModRM:r/m) with immediate and store the * result in mem. * * 81 /4 and r/m16, imm16 * 81 /4 and r/m32, imm32 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 * * Currently, only the AND operation of the 0x81 opcode * is implemented (ModRM:reg = b100). */ if ((vie->reg & 7) != 4) break; /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); if (error) break; /* * perform the operation with the pre-fetched immediate * operand and write the result */ val1 &= vie->immediate; error = memwrite(vm, vcpuid, gpa, val1, size, arg); break; default: break; } return (error); } static int emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; uint64_t val1; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x83: /* * OR mem (ModRM:r/m) with immediate and store the * result in mem. * * 83 /1 OR r/m16, imm8 sign-extended to 16 * 83 /1 OR r/m32, imm8 sign-extended to 32 * REX.W + 83/1 OR r/m64, imm8 sign-extended to 64 * * Currently, only the OR operation of the 0x83 opcode * is implemented (ModRM:reg = b001). */ if ((vie->reg & 7) != 1) break; /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); if (error) break; /* * perform the operation with the pre-fetched immediate * operand and write the result */ val1 |= vie->immediate; error = memwrite(vm, vcpuid, gpa, val1, size, arg); break; default: break; } return (error); } #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) static int emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; uint64_t op1, op2, rflags, rflags2; enum vm_reg_name reg; size = vie->opsize; switch (vie->op.op_byte) { case 0x3B: /* * 3B/r CMP r16, r/m16 * 3B/r CMP r32, r/m32 * REX.W + 3B/r CMP r64, r/m64 * * Compare first operand (reg) with second operand (r/m) and * set status flags in EFLAGS register. The comparison is * performed by subtracting the second operand from the first * operand and then setting the status flags. */ /* Get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &op1); if (error) return (error); /* Get the second operand */ error = memread(vm, vcpuid, gpa, &op2, size, arg); if (error) return (error); break; default: return (EINVAL); } rflags2 = getcc(size, op1, op2); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { #ifdef _KERNEL struct vm_copyinfo copyinfo[2]; #else struct iovec copyinfo[2]; #endif struct seg_desc ss_desc; uint64_t cr0, rflags, rsp, stack_gla, val; int error, size, stackaddrsize; /* * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. * * PUSH is part of the group 5 extended opcodes and is identified * by ModRM:reg = b110. */ if ((vie->reg & 7) != 6) return (EINVAL); size = vie->opsize; /* * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 */ if (paging->cpu_mode == CPU_MODE_REAL) { stackaddrsize = 2; } else if (paging->cpu_mode == CPU_MODE_64BIT) { /* * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 * - Stack pointer size is always 64-bits. * - PUSH/POP of 32-bit values is not possible in 64-bit mode. * - 16-bit PUSH/POP is supported by using the operand size * override prefix (66H). */ stackaddrsize = 8; size = vie->opsize_override ? 2 : 8; } else { /* * In protected or compability mode the 'B' flag in the * stack-segment descriptor determines the size of the * stack pointer. */ error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); KASSERT(error == 0, ("%s: error %d getting SS descriptor", __func__, error)); if (SEG_DESC_DEF32(ss_desc.access)) stackaddrsize = 4; else stackaddrsize = 2; } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); rsp -= size; if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) { vm_inject_ss(vm, vcpuid, 0); return (0); } if (vie_canonical_check(paging->cpu_mode, stack_gla)) { vm_inject_ss(vm, vcpuid, 0); return (0); } if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { vm_inject_ac(vm, vcpuid, 0); return (0); } error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE, copyinfo, nitems(copyinfo)); if (error == -1) { /* * XXX cannot return a negative error value here because it * ends up being the return value of the VM_RUN() ioctl and * is interpreted as a pseudo-error (for e.g. ERESTART). */ return (EFAULT); } else if (error == 1) { /* Resume guest execution to handle page fault */ return (0); } error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); if (error == 0) { vm_copyout(vm, vcpuid, &val, copyinfo, size); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, stackaddrsize); KASSERT(error == 0, ("error %d updating rsp", error)); } #ifdef _KERNEL vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); #endif return (error); } int vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) { int error; if (!vie->decoded) return (EINVAL); switch (vie->op.op_type) { case VIE_OP_TYPE_PUSH: error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_CMP: error = emulate_cmp(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOV: error = emulate_mov(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOVSX: case VIE_OP_TYPE_MOVZX: error = emulate_movx(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_AND: error = emulate_and(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_OR: error = emulate_or(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; default: error = EINVAL; break; } return (error); } int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) { KASSERT(size == 1 || size == 2 || size == 4 || size == 8, ("%s: invalid size %d", __func__, size)); KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) return (0); return ((gla & (size - 1)) ? 1 : 0); } int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) { uint64_t mask; if (cpu_mode != CPU_MODE_64BIT) return (0); /* * The value of the bit 47 in the 'gla' should be replicated in the * most significant 16 bits. */ mask = ~((1UL << 48) - 1); if (gla & (1UL << 47)) return ((gla & mask) != mask); else return ((gla & mask) != 0); } uint64_t vie_size2mask(int size) { KASSERT(size == 1 || size == 2 || size == 4 || size == 8, ("vie_size2mask: invalid size %d", size)); return (size2mask[size]); } int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, struct seg_desc *desc, uint64_t offset, int length, int addrsize, int prot, uint64_t *gla) { uint64_t firstoff, low_limit, high_limit, segbase; int glasize, type; KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, ("%s: invalid segment %d", __func__, seg)); KASSERT(length == 1 || length == 2 || length == 4 || length == 8, ("%s: invalid operand size %d", __func__, length)); KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, ("%s: invalid prot %#x", __func__, prot)); firstoff = offset; if (cpu_mode == CPU_MODE_64BIT) { KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); glasize = 8; } else { KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); glasize = 4; /* * If the segment selector is loaded with a NULL selector * then the descriptor is unusable and attempting to use * it results in a #GP(0). */ if (SEG_DESC_UNUSABLE(desc->access)) return (-1); /* * The processor generates a #NP exception when a segment * register is loaded with a selector that points to a * descriptor that is not present. If this was the case then * it would have been checked before the VM-exit. */ KASSERT(SEG_DESC_PRESENT(desc->access), ("segment %d not present: %#x", seg, desc->access)); /* * The descriptor type must indicate a code/data segment. */ type = SEG_DESC_TYPE(desc->access); KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " "descriptor type %#x", seg, type)); if (prot & PROT_READ) { /* #GP on a read access to a exec-only code segment */ if ((type & 0xA) == 0x8) return (-1); } if (prot & PROT_WRITE) { /* * #GP on a write access to a code segment or a * read-only data segment. */ if (type & 0x8) /* code segment */ return (-1); if ((type & 0xA) == 0) /* read-only data seg */ return (-1); } /* * 'desc->limit' is fully expanded taking granularity into * account. */ if ((type & 0xC) == 0x4) { /* expand-down data segment */ low_limit = desc->limit + 1; high_limit = SEG_DESC_DEF32(desc->access) ? 0xffffffff : 0xffff; } else { /* code segment or expand-up data segment */ low_limit = 0; high_limit = desc->limit; } while (length > 0) { offset &= vie_size2mask(addrsize); if (offset < low_limit || offset > high_limit) return (-1); offset++; length--; } } /* * In 64-bit mode all segments except %fs and %gs have a segment * base address of 0. */ if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { segbase = 0; } else { segbase = desc->base; } /* * Truncate 'firstoff' to the effective address size before adding * it to the segment base. */ firstoff &= vie_size2mask(addrsize); *gla = (segbase + firstoff) & vie_size2mask(glasize); return (0); } #ifdef _KERNEL void vie_init(struct vie *vie) { bzero(vie, sizeof(struct vie)); vie->base_register = VM_REG_LAST; vie->index_register = VM_REG_LAST; } static int pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) { int error_code = 0; if (pte & PG_V) error_code |= PGEX_P; if (prot & VM_PROT_WRITE) error_code |= PGEX_W; if (usermode) error_code |= PGEX_U; if (rsvd) error_code |= PGEX_RSV; if (prot & VM_PROT_EXECUTE) error_code |= PGEX_I; return (error_code); } static void ptp_release(void **cookie) { if (*cookie != NULL) { vm_gpa_release(*cookie); *cookie = NULL; } } static void * ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) { void *ptr; ptp_release(cookie); ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); return (ptr); } int vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa) { int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; u_int retries; uint64_t *ptpbase, ptpphys, pte, pgsize; uint32_t *ptpbase32, pte32; void *cookie; usermode = (paging->cpl == 3 ? 1 : 0); writable = prot & VM_PROT_WRITE; cookie = NULL; retval = 0; retries = 0; restart: ptpphys = paging->cr3; /* root of the page tables */ ptp_release(&cookie); if (retries++ > 0) maybe_yield(); if (vie_canonical_check(paging->cpu_mode, gla)) { /* * XXX assuming a non-stack reference otherwise a stack fault * should be generated. */ vm_inject_gp(vm, vcpuid); goto fault; } if (paging->paging_mode == PAGING_MODE_FLAT) { *gpa = gla; goto done; } if (paging->paging_mode == PAGING_MODE_32) { nlevels = 2; while (--nlevels >= 0) { /* Zero out the lower 12 bits. */ ptpphys &= ~0xfff; ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); if (ptpbase32 == NULL) goto error; ptpshift = PAGE_SHIFT + nlevels * 10; ptpindex = (gla >> ptpshift) & 0x3FF; pgsize = 1UL << ptpshift; pte32 = ptpbase32[ptpindex]; if ((pte32 & PG_V) == 0 || (usermode && (pte32 & PG_U) == 0) || (writable && (pte32 & PG_RW) == 0)) { pfcode = pf_error_code(usermode, prot, 0, pte32); vm_inject_pf(vm, vcpuid, pfcode, gla); goto fault; } /* * Emulate the x86 MMU's management of the accessed * and dirty flags. While the accessed flag is set * at every level of the page table, the dirty flag * is only set at the last level providing the guest * physical address. */ if ((pte32 & PG_A) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_A) == 0) { goto restart; } } /* XXX must be ignored if CR4.PSE=0 */ if (nlevels > 0 && (pte32 & PG_PS) != 0) break; ptpphys = pte32; } /* Set the dirty bit in the page table entry if necessary */ if (writable && (pte32 & PG_M) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_M) == 0) { goto restart; } } /* Zero out the lower 'ptpshift' bits */ pte32 >>= ptpshift; pte32 <<= ptpshift; *gpa = pte32 | (gla & (pgsize - 1)); goto done; } if (paging->paging_mode == PAGING_MODE_PAE) { /* Zero out the lower 5 bits and the upper 32 bits */ ptpphys &= 0xffffffe0UL; ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); if (ptpbase == NULL) goto error; ptpindex = (gla >> 30) & 0x3; pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0) { pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); goto fault; } ptpphys = pte; nlevels = 2; } else nlevels = 4; while (--nlevels >= 0) { /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); if (ptpbase == NULL) goto error; ptpshift = PAGE_SHIFT + nlevels * 9; ptpindex = (gla >> ptpshift) & 0x1FF; pgsize = 1UL << ptpshift; pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0 || (usermode && (pte & PG_U) == 0) || (writable && (pte & PG_RW) == 0)) { pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); goto fault; } /* Set the accessed bit in the page table entry */ if ((pte & PG_A) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_A) == 0) { goto restart; } } if (nlevels > 0 && (pte & PG_PS) != 0) { if (pgsize > 1 * GB) { pfcode = pf_error_code(usermode, prot, 1, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); goto fault; } break; } ptpphys = pte; } /* Set the dirty bit in the page table entry if necessary */ if (writable && (pte & PG_M) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) goto restart; } /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; *gpa = pte | (gla & (pgsize - 1)); done: ptp_release(&cookie); return (retval); error: retval = -1; goto done; fault: retval = 1; goto done; } int vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t rip, int inst_length, struct vie *vie) { struct vm_copyinfo copyinfo[2]; int error, prot; if (inst_length > VIE_INST_SIZE) panic("vmm_fetch_instruction: invalid length %d", inst_length); prot = PROT_READ | PROT_EXEC; error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, copyinfo, nitems(copyinfo)); if (error == 0) { vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); vie->num_valid = inst_length; } return (error); } static int vie_peek(struct vie *vie, uint8_t *x) { if (vie->num_processed < vie->num_valid) { *x = vie->inst[vie->num_processed]; return (0); } else return (-1); } static void vie_advance(struct vie *vie) { vie->num_processed++; } static int decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) { uint8_t x; while (1) { if (vie_peek(vie, &x)) return (-1); if (x == 0x66) vie->opsize_override = 1; else if (x == 0x67) vie->addrsize_override = 1; else break; vie_advance(vie); } /* * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: * - Only one REX prefix is allowed per instruction. * - The REX prefix must immediately precede the opcode byte or the * escape opcode byte. * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) * the mandatory prefix must come before the REX prefix. */ if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { vie->rex_present = 1; vie->rex_w = x & 0x8 ? 1 : 0; vie->rex_r = x & 0x4 ? 1 : 0; vie->rex_x = x & 0x2 ? 1 : 0; vie->rex_b = x & 0x1 ? 1 : 0; vie_advance(vie); } /* * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 */ if (cpu_mode == CPU_MODE_64BIT) { /* * Default address size is 64-bits and default operand size * is 32-bits. */ vie->addrsize = vie->addrsize_override ? 4 : 8; if (vie->rex_w) vie->opsize = 8; else if (vie->opsize_override) vie->opsize = 2; else vie->opsize = 4; } else if (cs_d) { /* Default address and operand sizes are 32-bits */ vie->addrsize = vie->addrsize_override ? 2 : 4; vie->opsize = vie->opsize_override ? 2 : 4; } else { /* Default address and operand sizes are 16-bits */ vie->addrsize = vie->addrsize_override ? 4 : 2; vie->opsize = vie->opsize_override ? 4 : 2; } return (0); } static int decode_two_byte_opcode(struct vie *vie) { uint8_t x; if (vie_peek(vie, &x)) return (-1); vie->op = two_byte_opcodes[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); return (0); } static int decode_opcode(struct vie *vie) { uint8_t x; if (vie_peek(vie, &x)) return (-1); vie->op = one_byte_opcodes[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) return (decode_two_byte_opcode(vie)); return (0); } static int decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) { uint8_t x; if (cpu_mode == CPU_MODE_REAL) return (-1); if (vie->op.op_flags & VIE_OP_F_NO_MODRM) return (0); if (vie_peek(vie, &x)) return (-1); vie->mod = (x >> 6) & 0x3; vie->rm = (x >> 0) & 0x7; vie->reg = (x >> 3) & 0x7; /* * A direct addressing mode makes no sense in the context of an EPT * fault. There has to be a memory access involved to cause the * EPT fault. */ if (vie->mod == VIE_MOD_DIRECT) return (-1); if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { /* * Table 2-5: Special Cases of REX Encodings * * mod=0, r/m=5 is used in the compatibility mode to * indicate a disp32 without a base register. * * mod!=3, r/m=4 is used in the compatibility mode to * indicate that the SIB byte is present. * * The 'b' bit in the REX prefix is don't care in * this case. */ } else { vie->rm |= (vie->rex_b << 3); } vie->reg |= (vie->rex_r << 3); /* SIB */ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) goto done; vie->base_register = gpr_map[vie->rm]; switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; break; case VIE_MOD_INDIRECT_DISP32: vie->disp_bytes = 4; break; case VIE_MOD_INDIRECT: if (vie->rm == VIE_RM_DISP32) { vie->disp_bytes = 4; /* * Table 2-7. RIP-Relative Addressing * * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 * whereas in compatibility mode it just implies disp32. */ if (cpu_mode == CPU_MODE_64BIT) vie->base_register = VM_REG_GUEST_RIP; else vie->base_register = VM_REG_LAST; } break; } done: vie_advance(vie); return (0); } static int decode_sib(struct vie *vie) { uint8_t x; /* Proceed only if SIB byte is present */ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) return (0); if (vie_peek(vie, &x)) return (-1); /* De-construct the SIB byte */ vie->ss = (x >> 6) & 0x3; vie->index = (x >> 3) & 0x7; vie->base = (x >> 0) & 0x7; /* Apply the REX prefix modifiers */ vie->index |= vie->rex_x << 3; vie->base |= vie->rex_b << 3; switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; break; case VIE_MOD_INDIRECT_DISP32: vie->disp_bytes = 4; break; } if (vie->mod == VIE_MOD_INDIRECT && (vie->base == 5 || vie->base == 13)) { /* * Special case when base register is unused if mod = 0 * and base = %rbp or %r13. * * Documented in: * Table 2-3: 32-bit Addressing Forms with the SIB Byte * Table 2-5: Special Cases of REX Encodings */ vie->disp_bytes = 4; } else { vie->base_register = gpr_map[vie->base]; } /* * All encodings of 'index' are valid except for %rsp (4). * * Documented in: * Table 2-3: 32-bit Addressing Forms with the SIB Byte * Table 2-5: Special Cases of REX Encodings */ if (vie->index != 4) vie->index_register = gpr_map[vie->index]; /* 'scale' makes sense only in the context of an index register */ if (vie->index_register < VM_REG_LAST) vie->scale = 1 << vie->ss; vie_advance(vie); return (0); } static int decode_displacement(struct vie *vie) { int n, i; uint8_t x; union { char buf[4]; int8_t signed8; int32_t signed32; } u; if ((n = vie->disp_bytes) == 0) return (0); if (n != 1 && n != 4) panic("decode_displacement: invalid disp_bytes %d", n); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } if (n == 1) vie->displacement = u.signed8; /* sign-extended */ else vie->displacement = u.signed32; /* sign-extended */ return (0); } static int decode_immediate(struct vie *vie) { int i, n; uint8_t x; union { char buf[4]; int8_t signed8; int16_t signed16; int32_t signed32; } u; /* Figure out immediate operand size (if any) */ if (vie->op.op_flags & VIE_OP_F_IMM) { /* * Section 2.2.1.5 "Immediates", Intel SDM: * In 64-bit mode the typical size of immediate operands * remains 32-bits. When the operand size if 64-bits, the * processor sign-extends all immediates to 64-bits prior * to their use. */ if (vie->opsize == 4 || vie->opsize == 8) vie->imm_bytes = 4; else vie->imm_bytes = 2; } else if (vie->op.op_flags & VIE_OP_F_IMM8) { vie->imm_bytes = 1; } if ((n = vie->imm_bytes) == 0) return (0); KASSERT(n == 1 || n == 2 || n == 4, ("%s: invalid number of immediate bytes: %d", __func__, n)); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } /* sign-extend the immediate value before use */ if (n == 1) vie->immediate = u.signed8; else if (n == 2) vie->immediate = u.signed16; else vie->immediate = u.signed32; return (0); } static int decode_moffset(struct vie *vie) { int i, n; uint8_t x; union { char buf[8]; uint64_t u64; } u; if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) return (0); /* * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: * The memory offset size follows the address-size of the instruction. */ n = vie->addrsize; KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); u.u64 = 0; for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } vie->displacement = u.u64; return (0); } /* * Verify that all the bytes in the instruction buffer were consumed. */ static int verify_inst_length(struct vie *vie) { if (vie->num_processed == vie->num_valid) return (0); else return (-1); } /* * Verify that the 'guest linear address' provided as collateral of the nested * page table fault matches with our instruction decoding. */ static int verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) { int error; uint64_t base, idx, gla2; /* Skip 'gla' verification */ if (gla == VIE_INVALID_GLA) return (0); base = 0; if (vie->base_register != VM_REG_LAST) { error = vm_get_register(vm, cpuid, vie->base_register, &base); if (error) { printf("verify_gla: error %d getting base reg %d\n", error, vie->base_register); return (-1); } /* * RIP-relative addressing starts from the following * instruction */ if (vie->base_register == VM_REG_GUEST_RIP) base += vie->num_valid; } idx = 0; if (vie->index_register != VM_REG_LAST) { error = vm_get_register(vm, cpuid, vie->index_register, &idx); if (error) { printf("verify_gla: error %d getting index reg %d\n", error, vie->index_register); return (-1); } } /* XXX assuming that the base address of the segment is 0 */ gla2 = base + vie->scale * idx + vie->displacement; gla2 &= size2mask[vie->addrsize]; if (gla != gla2) { printf("verify_gla mismatch: " "base(0x%0lx), scale(%d), index(0x%0lx), " "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", base, vie->scale, idx, vie->displacement, gla, gla2); return (-1); } return (0); } int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) { if (decode_prefixes(vie, cpu_mode, cs_d)) return (-1); if (decode_opcode(vie)) return (-1); if (decode_modrm(vie, cpu_mode)) return (-1); if (decode_sib(vie)) return (-1); if (decode_displacement(vie)) return (-1); if (decode_immediate(vie)) return (-1); if (decode_moffset(vie)) return (-1); if (verify_inst_length(vie)) return (-1); if (verify_gla(vm, cpuid, gla, vie)) return (-1); vie->decoded = 1; /* success */ return (0); } #endif /* _KERNEL */ diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c index c4ec020bd5e1..5dea3001de22 100644 --- a/usr.sbin/bhyve/acpi.c +++ b/usr.sbin/bhyve/acpi.c @@ -1,969 +1,1009 @@ /*- * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * bhyve ACPI table generator. * * Create the minimal set of ACPI tables required to boot FreeBSD (and * hopefully other o/s's) by writing out ASL template files for each of * the tables and the compiling them to AML with the Intel iasl compiler. * The AML files are then read into guest memory. * * The tables are placed in the guest's ROM area just below 1MB physical, * above the MPTable. * * Layout * ------ * RSDP -> 0xf2400 (36 bytes fixed) - * RSDT -> 0xf2440 (36 bytes + 4*N table addrs, 2 used) - * XSDT -> 0xf2480 (36 bytes + 8*N table addrs, 2 used) + * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used) + * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used) * MADT -> 0xf2500 (depends on #CPUs) * FADT -> 0xf2600 (268 bytes) * HPET -> 0xf2740 (56 bytes) - * FACS -> 0xf2780 (64 bytes) + * MCFG -> 0xf2780 (60 bytes) + * FACS -> 0xf27C0 (64 bytes) * DSDT -> 0xf2800 (variable - can go up to 0x100000) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "acpi.h" #include "pci_emul.h" /* * Define the base address of the ACPI tables, and the offsets to * the individual tables */ #define BHYVE_ACPI_BASE 0xf2400 #define RSDT_OFFSET 0x040 #define XSDT_OFFSET 0x080 #define MADT_OFFSET 0x100 #define FADT_OFFSET 0x200 #define HPET_OFFSET 0x340 -#define FACS_OFFSET 0x380 +#define MCFG_OFFSET 0x380 +#define FACS_OFFSET 0x3C0 #define DSDT_OFFSET 0x400 #define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX" #define BHYVE_ASL_SUFFIX ".aml" #define BHYVE_ASL_COMPILER "/usr/sbin/iasl" static int basl_keep_temps; static int basl_verbose_iasl; static int basl_ncpu; static uint32_t basl_acpi_base = BHYVE_ACPI_BASE; static uint32_t hpet_capabilities; /* * Contains the full pathname of the template to be passed * to mkstemp/mktemps(3) */ static char basl_template[MAXPATHLEN]; static char basl_stemplate[MAXPATHLEN]; /* * State for dsdt_line(), dsdt_indent(), and dsdt_unindent(). */ static FILE *dsdt_fp; static int dsdt_indent_level; static int dsdt_error; struct basl_fio { int fd; FILE *fp; char f_name[MAXPATHLEN]; }; #define EFPRINTF(...) \ err = fprintf(__VA_ARGS__); if (err < 0) goto err_exit; #define EFFLUSH(x) \ err = fflush(x); if (err != 0) goto err_exit; static int basl_fwrite_rsdp(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve RSDP template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 43\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0001]\t\tRevision : 02\n"); EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n", basl_acpi_base + RSDT_OFFSET); EFPRINTF(fp, "[0004]\t\tLength : 00000024\n"); EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n", basl_acpi_base + XSDT_OFFSET); EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n"); EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_rsdt(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve RSDT template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "\n"); /* Add in pointers to the MADT, FADT and HPET */ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n", basl_acpi_base + MADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n", basl_acpi_base + FADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n", basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", + basl_acpi_base + MCFG_OFFSET); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_xsdt(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve XSDT template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "\n"); /* Add in pointers to the MADT, FADT and HPET */ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n", basl_acpi_base + MADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n", basl_acpi_base + FADT_OFFSET); EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n", basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", + basl_acpi_base + MCFG_OFFSET); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_madt(FILE *fp) { int err; int i; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve MADT template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n"); EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n"); EFPRINTF(fp, "\n"); /* Add a Processor Local APIC entry for each CPU */ for (i = 0; i < basl_ncpu; i++) { EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n"); EFPRINTF(fp, "[0001]\t\tLength : 08\n"); /* iasl expects hex values for the proc and apic id's */ EFPRINTF(fp, "[0001]\t\tProcessor ID : %02x\n", i); EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02x\n", i); EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n"); EFPRINTF(fp, "\n"); } /* Always a single IOAPIC entry, with ID 0 */ EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n"); EFPRINTF(fp, "[0001]\t\tLength : 0C\n"); /* iasl expects a hex value for the i/o apic id */ EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02x\n", 0); EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n"); EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n"); EFPRINTF(fp, "\n"); /* Legacy IRQ0 is connected to pin 2 of the IOAPIC */ EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); EFPRINTF(fp, "[0001]\t\tBus : 00\n"); EFPRINTF(fp, "[0001]\t\tSource : 00\n"); EFPRINTF(fp, "[0004]\t\tInterrupt : 00000002\n"); EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); EFPRINTF(fp, "\t\t\tPolarity : 1\n"); EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); EFPRINTF(fp, "[0001]\t\tBus : 00\n"); EFPRINTF(fp, "[0001]\t\tSource : %02X\n", SCI_INT); EFPRINTF(fp, "[0004]\t\tInterrupt : %08X\n", SCI_INT); EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n"); EFPRINTF(fp, "\t\t\tPolarity : 3\n"); EFPRINTF(fp, "\t\t\tTrigger Mode : 3\n"); EFPRINTF(fp, "\n"); /* Local APIC NMI is connected to LINT 1 on all CPUs */ EFPRINTF(fp, "[0001]\t\tSubtable Type : 04\n"); EFPRINTF(fp, "[0001]\t\tLength : 06\n"); EFPRINTF(fp, "[0001]\t\tProcessorId : FF\n"); EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); EFPRINTF(fp, "\t\t\tPolarity : 1\n"); EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); EFPRINTF(fp, "[0001]\t\tInterrupt : 01\n"); EFPRINTF(fp, "\n"); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_fadt(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve FADT template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n"); EFPRINTF(fp, "[0001]\t\tRevision : 05\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVFACP \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n", basl_acpi_base + FACS_OFFSET); EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n", basl_acpi_base + DSDT_OFFSET); EFPRINTF(fp, "[0001]\t\tModel : 01\n"); EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n"); EFPRINTF(fp, "[0002]\t\tSCI Interrupt : %04X\n", SCI_INT); EFPRINTF(fp, "[0004]\t\tSMI Command Port : %08X\n", SMI_CMD); EFPRINTF(fp, "[0001]\t\tACPI Enable Value : %02X\n", BHYVE_ACPI_ENABLE); EFPRINTF(fp, "[0001]\t\tACPI Disable Value : %02X\n", BHYVE_ACPI_DISABLE); EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n"); EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n"); EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : %08X\n", PM1A_EVT_ADDR); EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n"); EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : %08X\n", PM1A_CNT_ADDR); EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n"); EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n"); EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n", IO_PMTMR); EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n"); EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n"); EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n"); EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n"); EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n"); EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n"); EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n"); EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n"); EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n"); EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n"); EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n"); EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n"); EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n"); EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n"); EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n"); EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n"); EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n"); EFPRINTF(fp, "[0001]\t\tRTC Century Index : 00\n"); EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n"); EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n"); EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n"); EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n"); EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n"); EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n"); EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n"); EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n"); EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n"); EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 1\n"); EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n"); EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 0\n"); EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n"); EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n"); EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n"); EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n"); EFPRINTF(fp, "\t\t\tDocking Supported (V1) : 0\n"); EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 1\n"); EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n"); EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n"); EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n"); EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n"); EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n"); EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n"); EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n"); EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n"); EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n"); EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n"); EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tReset Register : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000CF9\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0001]\t\tValue to cause reset : 06\n"); EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n", basl_acpi_base + FACS_OFFSET); EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n", basl_acpi_base + DSDT_OFFSET); EFPRINTF(fp, "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", PM1A_EVT_ADDR); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 10\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", PM1A_CNT_ADDR); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); /* Valid for bhyve */ EFPRINTF(fp, "[0012]\t\tPM Timer Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 32\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", IO_PMTMR); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 80\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tSleep Control Register : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0012]\t\tSleep Status Register : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_fwrite_hpet(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve HPET template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"HPET\"\n"); EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVHPET \"\n"); EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); /* iasl will fill in the compiler ID/revision fields */ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0004]\t\tTimer Block ID : %08X\n", hpet_capabilities); EFPRINTF(fp, "[0012]\t\tTimer Block Register : [Generic Address Structure]\n"); EFPRINTF(fp, "[0001]\t\tSpace ID : 00 [SystemMemory]\n"); EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); EFPRINTF(fp, "[0008]\t\tAddress : 00000000FED00000\n"); EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0001]\t\tHPET Number : 00\n"); EFPRINTF(fp, "[0002]\t\tMinimum Clock Ticks : 0000\n"); EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); EFPRINTF(fp, "\t\t\t4K Page Protect : 1\n"); EFPRINTF(fp, "\t\t\t64K Page Protect : 0\n"); EFPRINTF(fp, "\n"); EFFLUSH(fp); return (0); err_exit: return (errno); } +static int +basl_fwrite_mcfg(FILE *fp) +{ + int err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MCFG template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); + EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); + EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); + EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); + EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); + EFFLUSH(fp); + return (0); +err_exit: + return (errno); +} + static int basl_fwrite_facs(FILE *fp) { int err; err = 0; EFPRINTF(fp, "/*\n"); EFPRINTF(fp, " * bhyve FACS template\n"); EFPRINTF(fp, " */\n"); EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n"); EFPRINTF(fp, "[0004]\t\tLength : 00000040\n"); EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n"); EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n"); EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n"); EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n"); EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n"); EFPRINTF(fp, "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n"); EFPRINTF(fp, "[0001]\t\tVersion : 02\n"); EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n"); EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n"); EFFLUSH(fp); return (0); err_exit: return (errno); } /* * Helper routines for writing to the DSDT from other modules. */ void dsdt_line(const char *fmt, ...) { va_list ap; int err; if (dsdt_error != 0) return; if (strcmp(fmt, "") != 0) { if (dsdt_indent_level != 0) EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' '); va_start(ap, fmt); if (vfprintf(dsdt_fp, fmt, ap) < 0) goto err_exit; va_end(ap); } EFPRINTF(dsdt_fp, "\n"); return; err_exit: dsdt_error = errno; } void dsdt_indent(int levels) { dsdt_indent_level += levels; assert(dsdt_indent_level >= 0); } void dsdt_unindent(int levels) { assert(dsdt_indent_level >= levels); dsdt_indent_level -= levels; } void dsdt_fixed_ioport(uint16_t iobase, uint16_t length) { dsdt_line("IO (Decode16,"); dsdt_line(" 0x%04X, // Range Minimum", iobase); dsdt_line(" 0x%04X, // Range Maximum", iobase); dsdt_line(" 0x01, // Alignment"); dsdt_line(" 0x%02X, // Length", length); dsdt_line(" )"); } void dsdt_fixed_irq(uint8_t irq) { dsdt_line("IRQNoFlags ()"); dsdt_line(" {%d}", irq); } void dsdt_fixed_mem32(uint32_t base, uint32_t length) { dsdt_line("Memory32Fixed (ReadWrite,"); dsdt_line(" 0x%08X, // Address Base", base); dsdt_line(" 0x%08X, // Address Length", length); dsdt_line(" )"); } static int basl_fwrite_dsdt(FILE *fp) { int err; err = 0; dsdt_fp = fp; dsdt_error = 0; dsdt_indent_level = 0; dsdt_line("/*"); dsdt_line(" * bhyve DSDT template"); dsdt_line(" */"); dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2," "\"BHYVE \", \"BVDSDT \", 0x00000001)"); dsdt_line("{"); dsdt_line(" Name (_S5, Package ()"); dsdt_line(" {"); dsdt_line(" 0x05,"); dsdt_line(" Zero,"); dsdt_line(" })"); pci_write_dsdt(); dsdt_line(""); dsdt_line(" Scope (_SB.PC00)"); dsdt_line(" {"); dsdt_line(" Device (HPET)"); dsdt_line(" {"); dsdt_line(" Name (_HID, EISAID(\"PNP0103\"))"); dsdt_line(" Name (_UID, 0)"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(4); dsdt_fixed_mem32(0xFED00000, 0x400); dsdt_unindent(4); dsdt_line(" })"); dsdt_line(" }"); dsdt_line(" }"); dsdt_line("}"); if (dsdt_error != 0) return (dsdt_error); EFFLUSH(fp); return (0); err_exit: return (errno); } static int basl_open(struct basl_fio *bf, int suffix) { int err; err = 0; if (suffix) { strncpy(bf->f_name, basl_stemplate, MAXPATHLEN); bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX)); } else { strncpy(bf->f_name, basl_template, MAXPATHLEN); bf->fd = mkstemp(bf->f_name); } if (bf->fd > 0) { bf->fp = fdopen(bf->fd, "w+"); if (bf->fp == NULL) { unlink(bf->f_name); close(bf->fd); } } else { err = 1; } return (err); } static void basl_close(struct basl_fio *bf) { if (!basl_keep_temps) unlink(bf->f_name); fclose(bf->fp); } static int basl_start(struct basl_fio *in, struct basl_fio *out) { int err; err = basl_open(in, 0); if (!err) { err = basl_open(out, 1); if (err) { basl_close(in); } } return (err); } static void basl_end(struct basl_fio *in, struct basl_fio *out) { basl_close(in); basl_close(out); } static int basl_load(struct vmctx *ctx, int fd, uint64_t off) { struct stat sb; void *gaddr; if (fstat(fd, &sb) < 0) return (errno); gaddr = paddr_guest2host(ctx, basl_acpi_base + off, sb.st_size); if (gaddr == NULL) return (EFAULT); if (read(fd, gaddr, sb.st_size) < 0) return (errno); return (0); } static int basl_compile(struct vmctx *ctx, int (*fwrite_section)(FILE *), uint64_t offset) { struct basl_fio io[2]; static char iaslbuf[3*MAXPATHLEN + 10]; char *fmt; int err; err = basl_start(&io[0], &io[1]); if (!err) { err = (*fwrite_section)(io[0].fp); if (!err) { /* * iasl sends the results of the compilation to * stdout. Shut this down by using the shell to * redirect stdout to /dev/null, unless the user * has requested verbose output for debugging * purposes */ fmt = basl_verbose_iasl ? "%s -p %s %s" : "/bin/sh -c \"%s -p %s %s\" 1> /dev/null"; snprintf(iaslbuf, sizeof(iaslbuf), fmt, BHYVE_ASL_COMPILER, io[1].f_name, io[0].f_name); err = system(iaslbuf); if (!err) { /* * Copy the aml output file into guest * memory at the specified location */ err = basl_load(ctx, io[1].fd, offset); } } basl_end(&io[0], &io[1]); } return (err); } static int basl_make_templates(void) { const char *tmpdir; int err; int len; err = 0; /* * */ if ((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0' || (tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0') { tmpdir = _PATH_TMP; } len = strlen(tmpdir); if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) { strcpy(basl_template, tmpdir); while (len > 0 && basl_template[len - 1] == '/') len--; basl_template[len] = '/'; strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE); } else err = E2BIG; if (!err) { /* * len has been intialized (and maybe adjusted) above */ if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 + sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) { strcpy(basl_stemplate, tmpdir); basl_stemplate[len] = '/'; strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE); len = strlen(basl_stemplate); strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX); } else err = E2BIG; } return (err); } static struct { int (*wsect)(FILE *fp); uint64_t offset; } basl_ftables[] = { { basl_fwrite_rsdp, 0}, { basl_fwrite_rsdt, RSDT_OFFSET }, { basl_fwrite_xsdt, XSDT_OFFSET }, { basl_fwrite_madt, MADT_OFFSET }, { basl_fwrite_fadt, FADT_OFFSET }, { basl_fwrite_hpet, HPET_OFFSET }, + { basl_fwrite_mcfg, MCFG_OFFSET }, { basl_fwrite_facs, FACS_OFFSET }, { basl_fwrite_dsdt, DSDT_OFFSET }, { NULL } }; int acpi_build(struct vmctx *ctx, int ncpu) { int err; int i; basl_ncpu = ncpu; err = vm_get_hpet_capabilities(ctx, &hpet_capabilities); if (err != 0) return (err); /* * For debug, allow the user to have iasl compiler output sent * to stdout rather than /dev/null */ if (getenv("BHYVE_ACPI_VERBOSE_IASL")) basl_verbose_iasl = 1; /* * Allow the user to keep the generated ASL files for debugging * instead of deleting them following use */ if (getenv("BHYVE_ACPI_KEEPTMPS")) basl_keep_temps = 1; i = 0; err = basl_make_templates(); /* * Run through all the ASL files, compiling them and * copying them into guest memory */ while (!err && basl_ftables[i].wsect != NULL) { err = basl_compile(ctx, basl_ftables[i].wsect, basl_ftables[i].offset); i++; } return (err); } diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c index 37cf055f2ccf..2a9f430c8262 100644 --- a/usr.sbin/bhyve/mem.c +++ b/usr.sbin/bhyve/mem.c @@ -1,272 +1,291 @@ /*- * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Memory ranges are represented with an RB tree. On insertion, the range * is checked for overlaps. On lookup, the key has the same base and limit * so it can be searched within the range. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "mem.h" struct mmio_rb_range { RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ struct mem_range mr_param; uint64_t mr_base; uint64_t mr_end; }; struct mmio_rb_tree; RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback; /* * Per-vCPU cache. Since most accesses from a vCPU will be to * consecutive addresses in a range, it makes sense to cache the * result of a lookup. */ static struct mmio_rb_range *mmio_hint[VM_MAXCPU]; static pthread_rwlock_t mmio_rwlock; static int mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b) { if (a->mr_end < b->mr_base) return (-1); else if (a->mr_base > b->mr_end) return (1); return (0); } static int mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr, struct mmio_rb_range **entry) { struct mmio_rb_range find, *res; find.mr_base = find.mr_end = addr; res = RB_FIND(mmio_rb_tree, rbt, &find); if (res != NULL) { *entry = res; return (0); } return (ENOENT); } static int mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) { struct mmio_rb_range *overlap; overlap = RB_INSERT(mmio_rb_tree, rbt, new); if (overlap != NULL) { #ifdef RB_DEBUG printf("overlap detected: new %lx:%lx, tree %lx:%lx\n", new->mr_base, new->mr_end, overlap->mr_base, overlap->mr_end); #endif return (EEXIST); } return (0); } #if 0 static void mmio_rb_dump(struct mmio_rb_tree *rbt) { struct mmio_rb_range *np; pthread_rwlock_rdlock(&mmio_rwlock); RB_FOREACH(np, mmio_rb_tree, rbt) { printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, np->mr_param.name); } pthread_rwlock_unlock(&mmio_rwlock); } #endif RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); static int mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) { int error; struct mem_range *mr = arg; error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size, rval, mr->arg1, mr->arg2); return (error); } static int mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) { int error; struct mem_range *mr = arg; error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size, &wval, mr->arg1, mr->arg2); return (error); } int emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, struct vm_guest_paging *paging) { struct mmio_rb_range *entry; - int err; + int err, immutable; pthread_rwlock_rdlock(&mmio_rwlock); /* * First check the per-vCPU cache */ if (mmio_hint[vcpu] && paddr >= mmio_hint[vcpu]->mr_base && paddr <= mmio_hint[vcpu]->mr_end) { entry = mmio_hint[vcpu]; } else entry = NULL; if (entry == NULL) { if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) { /* Update the per-vCPU cache */ mmio_hint[vcpu] = entry; } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { pthread_rwlock_unlock(&mmio_rwlock); return (ESRCH); } } assert(entry != NULL); + + /* + * An 'immutable' memory range is guaranteed to be never removed + * so there is no need to hold 'mmio_rwlock' while calling the + * handler. + * + * XXX writes to the PCIR_COMMAND register can cause register_mem() + * to be called. If the guest is using PCI extended config space + * to modify the PCIR_COMMAND register then register_mem() can + * deadlock on 'mmio_rwlock'. However by registering the extended + * config space window as 'immutable' the deadlock can be avoided. + */ + immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); + if (immutable) + pthread_rwlock_unlock(&mmio_rwlock); + err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging, mem_read, mem_write, &entry->mr_param); - pthread_rwlock_unlock(&mmio_rwlock); + + if (!immutable) + pthread_rwlock_unlock(&mmio_rwlock); return (err); } static int register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) { struct mmio_rb_range *entry, *mrp; int err; err = 0; mrp = malloc(sizeof(struct mmio_rb_range)); if (mrp != NULL) { mrp->mr_param = *memp; mrp->mr_base = memp->base; mrp->mr_end = memp->base + memp->size - 1; pthread_rwlock_wrlock(&mmio_rwlock); if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) err = mmio_rb_add(rbt, mrp); pthread_rwlock_unlock(&mmio_rwlock); if (err) free(mrp); } else err = ENOMEM; return (err); } int register_mem(struct mem_range *memp) { return (register_mem_int(&mmio_rb_root, memp)); } int register_mem_fallback(struct mem_range *memp) { return (register_mem_int(&mmio_rb_fallback, memp)); } int unregister_mem(struct mem_range *memp) { struct mem_range *mr; struct mmio_rb_range *entry = NULL; int err, i; pthread_rwlock_wrlock(&mmio_rwlock); err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); if (err == 0) { mr = &entry->mr_param; assert(mr->name == memp->name); assert(mr->base == memp->base && mr->size == memp->size); + assert((mr->flags & MEM_F_IMMUTABLE) == 0); RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry); /* flush Per-vCPU cache */ for (i=0; i < VM_MAXCPU; i++) { if (mmio_hint[i] == entry) mmio_hint[i] = NULL; } } pthread_rwlock_unlock(&mmio_rwlock); if (entry) free(entry); return (err); } void init_mem(void) { RB_INIT(&mmio_rb_root); RB_INIT(&mmio_rb_fallback); pthread_rwlock_init(&mmio_rwlock, NULL); } diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h index eb648c145df6..f671eaedf786 100644 --- a/usr.sbin/bhyve/mem.h +++ b/usr.sbin/bhyve/mem.h @@ -1,60 +1,61 @@ /*- * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MEM_H_ #define _MEM_H_ #include struct vmctx; typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2); struct mem_range { const char *name; int flags; mem_func_t handler; void *arg1; long arg2; uint64_t base; uint64_t size; }; #define MEM_F_READ 0x1 #define MEM_F_WRITE 0x2 #define MEM_F_RW 0x3 +#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */ void init_mem(void); int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, struct vm_guest_paging *paging); int register_mem(struct mem_range *memp); int register_mem_fallback(struct mem_range *memp); int unregister_mem(struct mem_range *memp); #endif /* _MEM_H_ */ diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 458ba76480b1..6b906ede42fc 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -1,2023 +1,2089 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi.h" #include "bhyverun.h" #include "inout.h" #include "ioapic.h" #include "mem.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul #define CFGWRITE(pi,off,val,b) \ do { \ if ((b) == 1) { \ pci_set_cfgdata8((pi),(off),(val)); \ } else if ((b) == 2) { \ pci_set_cfgdata16((pi),(off),(val)); \ } else { \ pci_set_cfgdata32((pi),(off),(val)); \ } \ } while (0) #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) struct funcinfo { char *fi_name; char *fi_param; struct pci_devinst *fi_devi; }; struct intxinfo { int ii_count; int ii_pirq_pin; int ii_ioapic_irq; }; struct slotinfo { struct intxinfo si_intpins[4]; struct funcinfo si_funcs[MAXFUNCS]; }; struct businfo { uint16_t iobase, iolimit; /* I/O window */ uint32_t membase32, memlimit32; /* mmio window below 4GB */ uint64_t membase64, memlimit64; /* mmio window above 4GB */ struct slotinfo slotinfo[MAXSLOTS]; }; static struct businfo *pci_businfo[MAXBUSES]; SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 -#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ +SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); + +#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMBASE64 0xD000000000UL #define PCI_EMUL_MEMLIMIT64 0xFD00000000UL static struct pci_devemu *pci_emul_finddev(char *name); -static void pci_lintr_route(struct pci_devinst *pi); -static void pci_lintr_update(struct pci_devinst *pi); - -static struct mem_range pci_mem_hole; +static void pci_lintr_route(struct pci_devinst *pi); +static void pci_lintr_update(struct pci_devinst *pi); +static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, + int func, int coff, int bytes, uint32_t *val); /* * I/O access */ /* * Slot options are in the form: * * ::,[,] * [:],[,] * * slot is 0..31 * func is 0..7 * emul is a string describing the type of PCI device e.g. virtio-net * config is an optional string, depending on the device, that can be * used for configuration. * Examples are: * 1,virtio-net,tap0 * 3:0,dummy */ static void pci_parse_slot_usage(char *aopt) { fprintf(stderr, "Invalid PCI slot info field \"%s\"\n", aopt); } int pci_parse_slot(char *opt) { struct businfo *bi; struct slotinfo *si; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* :: */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* : */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } if (pci_businfo[bnum] == NULL) pci_businfo[bnum] = calloc(1, sizeof(struct businfo)); bi = pci_businfo[bnum]; si = &bi->slotinfo[snum]; if (si->si_funcs[fnum].fi_name != NULL) { fprintf(stderr, "pci slot %d:%d already occupied!\n", snum, fnum); goto done; } if (pci_emul_finddev(emul) == NULL) { fprintf(stderr, "pci slot %d:%d: unknown device \"%s\"\n", snum, fnum, emul); goto done; } error = 0; si->si_funcs[fnum].fi_name = emul; si->si_funcs[fnum].fi_param = config; done: if (error) free(str); return (error); } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accomodate reads from * ddb. */ if (size != 1 && size != 4 && size != 8) return (retval); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned reads */ if ((msix_entry_offset % size) != 0) { return (retval); } tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index < pi->pi_msix.table_count) { /* valid MSI-X Table access */ dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 1) retval = *((uint8_t *)dest); else if (size == 4) retval = *((uint32_t *)dest); else retval = *((uint64_t *)dest); } else if (pci_valid_pba_offset(pi, offset)) { /* return 0 for PBA access */ retval = 0; } return (retval); } int pci_msix_table_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.table_bar); else return (-1); } int pci_msix_pba_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.pba_bar); else return (-1); } static int pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int i; for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && port >= pdi->pi_bar[i].addr && port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, offset, bytes); else (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, bytes, *eax); return (0); } } return (-1); } static int pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int) arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || pdi->pi_bar[bidx].type == PCIBAR_MEM64); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_WRITE) { if (size == 8) { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, 4, *val & 0xffffffff); (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4, 4, *val >> 32); } else { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val); } } else { if (size == 8) { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, 4); *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset + 4, 4) << 32; } else { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size); } } return (0); } static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { uint64_t base; assert((size & (size - 1)) == 0); /* must be a power of 2 */ base = roundup2(*baseptr, size); if (base + size <= limit) { *addr = base; *baseptr = base + size; return (0); } else return (-1); } int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); } /* * Register (or unregister) the MMIO or I/O region associated with the BAR * register 'idx' of an emulated pci device. */ static void modify_bar_registration(struct pci_devinst *pi, int idx, int registration) { int error; struct inout_port iop; struct mem_range mr; switch (pi->pi_bar[idx].type) { case PCIBAR_IO: bzero(&iop, sizeof(struct inout_port)); iop.name = pi->pi_name; iop.port = pi->pi_bar[idx].addr; iop.size = pi->pi_bar[idx].size; if (registration) { iop.flags = IOPORT_F_INOUT; iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); } else error = unregister_inout(&iop); break; case PCIBAR_MEM32: case PCIBAR_MEM64: bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_mem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); break; default: error = EINVAL; break; } assert(error == 0); } static void unregister_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 0); } static void register_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 1); } /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_PORTEN); } /* Are we decoding memory accesses for the emulated pci device? */ static int memen(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_MEMEN); } /* * Update the MMIO or I/O address that is decoded by the BAR register. * * If the pci device has enabled the address space decoding then intercept * the address range decoded by the BAR register. */ static void update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; if (pi->pi_bar[idx].type == PCIBAR_IO) decode = porten(pi); else decode = memen(pi); if (decode) unregister_bar(pi, idx); switch (type) { case PCIBAR_IO: case PCIBAR_MEM32: pi->pi_bar[idx].addr = addr; break; case PCIBAR_MEM64: pi->pi_bar[idx].addr &= ~0xffffffffUL; pi->pi_bar[idx].addr |= addr; break; case PCIBAR_MEMHI64: pi->pi_bar[idx].addr &= 0xffffffff; pi->pi_bar[idx].addr |= addr; break; default: assert(0); } if (decode) register_bar(pi, idx); } int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size) { int error; uint64_t *baseptr, limit, addr, mask, lobits, bar; assert(idx >= 0 && idx <= PCI_BARMAX); if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ /* Enforce minimum BAR sizes required by the PCI standard */ if (type == PCIBAR_IO) { if (size < 4) size = 4; } else { if (size < 16) size = 16; } switch (type) { case PCIBAR_NONE: baseptr = NULL; addr = mask = lobits = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; break; case PCIBAR_MEM64: /* * XXX * Some drivers do not work well if the 64-bit BAR is allocated * above 4GB. Allow for this by allocating small requests under * 4GB unless then allocation size is larger than some arbitrary * number (32MB currently). */ if (size > 32 * 1024 * 1024) { /* * XXX special case for device requiring peer-peer DMA */ if (size == 0x100000000UL) baseptr = &hostbase; else baseptr = &pci_emul_membase64; limit = PCI_EMUL_MEMLIMIT64; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; break; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); } if (baseptr != NULL) { error = pci_emul_alloc_resource(baseptr, limit, size, &addr); if (error != 0) return (error); } pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); if (type == PCIBAR_MEM64) { assert(idx + 1 <= PCI_BARMAX); pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } register_bar(pdi, idx); return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * pci_emul_finddev(char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pdi->pi_lintr.pirq_pin = 0; pdi->pi_lintr.ioapic_irq = 0; pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(ctx, pdi, fi->fi_param); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; CTASSERT(sizeof(struct msicap) == 14); /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { CTASSERT(sizeof(struct msixcap) == 12); assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. * Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off, table_bar; off = offset - capoff; table_bar = pi->pi_msix.table_bar; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. */ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } void pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; CTASSERT(sizeof(struct pciecap) == 60); if (type != PCIEM_TYPE_ROOT_PORT) return (-1); bzero(&pciecap, sizeof(pciecap)); pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT; pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the * config space. */ static void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) { int capid; uint8_t capoff, nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; /* Find the capability that we want to update */ capoff = CAP_START_OFFSET; while (1) { nextoff = pci_get_cfgdata8(pi, capoff + 1); if (nextoff == 0) break; if (offset >= capoff && offset < nextoff) break; capoff = nextoff; } assert(offset >= capoff); /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. */ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } capid = pci_get_cfgdata8(pi, capoff); switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. */ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } +static int +pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int bytes, uint64_t *val, void *arg1, long arg2) +{ + int bus, slot, func, coff, in; + + coff = addr & 0xfff; + func = (addr >> 12) & 0x7; + slot = (addr >> 15) & 0x1f; + bus = (addr >> 20) & 0xff; + in = (dir == MEM_F_READ); + if (in) + *val = ~0UL; + pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); + return (0); +} + +uint64_t +pci_ecfg_base(void) +{ + + return (PCI_EMUL_ECFG_BASE); +} + #define BUSIO_ROUNDUP 32 #define BUSMEM_ROUNDUP (1024 * 1024) int init_pci(struct vmctx *ctx) { + struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; size_t lowmem; int bus, slot, func; int error; pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = vm_get_lowmem_limit(ctx); pci_emul_membase64 = PCI_EMUL_MEMBASE64; for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; /* * Keep track of the i/o and memory resources allocated to * this bus. */ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_name == NULL) continue; pde = pci_emul_finddev(fi->fi_name); assert(pde != NULL); error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs. */ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. */ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } lpc_pirq_routed(); /* * The guest physical memory map looks like the following: * [0, lowmem) guest system memory * [lowmem, lowmem_limit) memory hole (may be absent) - * [lowmem_limit, 4GB) PCI hole (32-bit BAR allocation) + * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) + * [0xE0000000, 0xF0000000) PCI extended config window + * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) - * + */ + + /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. */ lowmem = vm_get_lowmem_size(ctx); + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI hole"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = lowmem; + mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; + mr.handler = pci_emul_fallback_handler; + error = register_mem_fallback(&mr); + assert(error == 0); - memset(&pci_mem_hole, 0, sizeof(struct mem_range)); - pci_mem_hole.name = "PCI hole"; - pci_mem_hole.flags = MEM_F_RW; - pci_mem_hole.base = lowmem; - pci_mem_hole.size = (4ULL * 1024 * 1024 * 1024) - lowmem; - pci_mem_hole.handler = pci_emul_fallback_handler; - - error = register_mem_fallback(&pci_mem_hole); + /* PCI extended config space */ + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI ECFG"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = PCI_EMUL_ECFG_BASE; + mr.size = PCI_EMUL_ECFG_SIZE; + mr.handler = pci_emul_ecfg_handler; + error = register_mem(&mr); assert(error == 0); return (0); } static void pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { char *name; name = lpc_pirq_name(pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. */ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int count, func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. */ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Name (_ADR, Zero)"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit - bi->iobase); dsdt_line(" ,, , TypeStatic)"); /* mmio window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); count = pci_count_lintr(bus); if (count != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); done: dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. */ si = &bi->slotinfo[pi->pi_slot]; bestpin = 0; bestcount = si->si_intpins[0].ii_count; for (pin = 1; pin < 4; pin++) { if (si->si_intpins[pin].ii_count < bestcount) { bestpin = pin; bestcount = si->si_intpins[pin].ii_count; } } si->si_intpins[bestpin].ii_count++; pi->pi_lintr.pin = bestpin + 1; pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); } static void pci_lintr_route(struct pci_devinst *pi) { struct businfo *bi; struct intxinfo *ii; if (pi->pi_lintr.pin == 0) return; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; /* * Attempt to allocate an I/O APIC pin for this intpin if one * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) ii->ii_ioapic_irq = ioapic_pci_alloc_irq(); assert(ii->ii_ioapic_irq > 0); /* * Attempt to allocate a PIRQ pin for this intpin if one is * not yet assigned. */ if (ii->ii_pirq_pin == 0) ii->ii_pirq_pin = pirq_alloc_pin(pi->pi_vmctx); assert(ii->ii_pirq_pin > 0); pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin)); } void pci_lintr_assert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == IDLE) { if (pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } else pi->pi_lintr.state = PENDING; } pthread_mutex_unlock(&pi->pi_lintr.lock); } void pci_lintr_deassert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED) { pi->pi_lintr.state = IDLE; pci_irq_deassert(pi); } else if (pi->pi_lintr.state == PENDING) pi->pi_lintr.state = IDLE; pthread_mutex_unlock(&pi->pi_lintr.lock); } static void pci_lintr_update(struct pci_devinst *pi) { pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { pci_irq_deassert(pi); pi->pi_lintr.state = PENDING; } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); } int pci_count_lintr(int bus) { int count, slot, pin; struct slotinfo *slotinfo; count = 0; if (pci_businfo[bus] != NULL) { for (slot = 0; slot < MAXSLOTS; slot++) { slotinfo = &pci_businfo[bus]->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { if (slotinfo->si_intpins[pin].ii_count != 0) count++; } } } return (count); } void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) { struct businfo *bi; struct slotinfo *si; struct intxinfo *ii; int slot, pin; if ((bi = pci_businfo[bus]) == NULL) return; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { ii = &si->si_intpins[pin]; if (ii->ii_count != 0) cb(bus, slot, pin + 1, ii->ii_pirq_pin, ii->ii_ioapic_irq, arg); } } } /* * Return 1 if the emulated device in 'slot' is a multi-function device. * Return 0 otherwise. */ static int pci_emul_is_mfdev(int bus, int slot) { struct businfo *bi; struct slotinfo *si; int f, numfuncs; numfuncs = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; for (f = 0; f < MAXFUNCS; f++) { if (si->si_funcs[f].fi_devi != NULL) { numfuncs++; } } } return (numfuncs > 1); } /* * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on * whether or not is a multi-function being emulated in the pci 'slot'. */ static void pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) { int mfdev; if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { mfdev = pci_emul_is_mfdev(bus, slot); switch (bytes) { case 1: case 2: *rv &= ~PCIM_MFDEV; if (mfdev) { *rv |= PCIM_MFDEV; } break; case 4: *rv &= ~(PCIM_MFDEV << 16); if (mfdev) { *rv |= (PCIM_MFDEV << 16); } break; } } } -static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; - -static int -pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - uint32_t x; - - if (bytes != 4) { - if (in) - *eax = (bytes == 2) ? 0xffff : 0xff; - return (0); - } - - if (in) { - x = (cfgbus << 16) | - (cfgslot << 11) | - (cfgfunc << 8) | - cfgoff; - if (cfgenable) - x |= CONF1_ENABLE; - *eax = x; - } else { - x = *eax; - cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; - cfgoff = x & PCI_REGMAX; - cfgfunc = (x >> 8) & PCI_FUNCMAX; - cfgslot = (x >> 11) & PCI_SLOTMAX; - cfgbus = (x >> 16) & PCI_BUSMAX; - } - - return (0); -} -INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); - static uint32_t bits_changed(uint32_t old, uint32_t new, uint32_t mask) { return ((old ^ new) & mask); } static void pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) { int i; uint16_t old; /* * The command register is at an offset of 4 bytes and thus the * guest could write 1, 2 or 4 bytes starting at this offset. */ old = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ CFGWRITE(pi, PCIR_COMMAND, new, bytes); /* update config */ new = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ /* * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. */ for (i = 0; i <= PCI_BARMAX; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: break; case PCIBAR_IO: /* I/O address space decoding changed? */ if (bits_changed(old, new, PCIM_CMD_PORTEN)) { if (porten(pi)) register_bar(pi, i); else unregister_bar(pi, i); } break; case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ if (bits_changed(old, new, PCIM_CMD_MEMEN)) { if (memen(pi)) register_bar(pi, i); else unregister_bar(pi, i); } break; default: assert(0); } } /* * If INTx has been unmasked and is pending, assert the * interrupt. */ pci_lintr_update(pi); } -static int -pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) +static void +pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, + int coff, int bytes, uint32_t *eax) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; - int coff, idx, needcfg; + int idx, needcfg; uint64_t addr, bar, mask; - assert(bytes == 1 || bytes == 2 || bytes == 4); - - if ((bi = pci_businfo[cfgbus]) != NULL) { - si = &bi->slotinfo[cfgslot]; - pi = si->si_funcs[cfgfunc].fi_devi; + if ((bi = pci_businfo[bus]) != NULL) { + si = &bi->slotinfo[slot]; + pi = si->si_funcs[func].fi_devi; } else pi = NULL; - coff = cfgoff + (port - CONF1_DATA_PORT); - -#if 0 - printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r", - in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc); -#endif - /* - * Just return if there is no device at this cfgslot:cfgfunc, - * if the guest is doing an un-aligned access, or if the config - * address word isn't enabled. + * Just return if there is no device at this slot:func or if the + * the guest is doing an un-aligned access. */ - if (!cfgenable || pi == NULL || (coff & (bytes - 1)) != 0) { + if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || + (coff & (bytes - 1)) != 0) { if (in) *eax = 0xffffffff; - return (0); + return; + } + + /* + * Ignore all writes beyond the standard config space and return all + * ones on reads. + */ + if (coff >= PCI_REGMAX + 1) { + if (in) { + *eax = 0xffffffff; + /* + * Extended capabilities begin at offset 256 in config + * space. Absence of extended capabilities is signaled + * with all 0s in the extended capability header at + * offset 256. + */ + if (coff <= PCI_REGMAX + 4) + *eax = 0x00000000; + } + return; } pe = pi->pi_d; /* * Config read */ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { - needcfg = pe->pe_cfgread(ctx, vcpu, pi, - coff, bytes, eax); + needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, + eax); } else { needcfg = 1; } if (needcfg) { if (bytes == 1) *eax = pci_get_cfgdata8(pi, coff); else if (bytes == 2) *eax = pci_get_cfgdata16(pi, coff); else *eax = pci_get_cfgdata32(pi, coff); } - pci_emul_hdrtype_fixup(cfgbus, cfgslot, coff, bytes, eax); + pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) - return (0); + return; /* * Special handling for write to BAR registers */ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) - return (0); + return; idx = (coff - PCIR_BAR(0)) / 4; mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *eax & mask; addr &= 0xffff; bar = addr | PCIM_BAR_IO_SPACE; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = ((uint64_t)*eax << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax); } else if (coff == PCIR_COMMAND) { pci_emul_cmdwrite(pi, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } } +} + +static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + uint32_t x; + + if (bytes != 4) { + if (in) + *eax = (bytes == 2) ? 0xffff : 0xff; + return (0); + } + + if (in) { + x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; + if (cfgenable) + x |= CONF1_ENABLE; + *eax = x; + } else { + x = *eax; + cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; + cfgoff = x & PCI_REGMAX; + cfgfunc = (x >> 8) & PCI_FUNCMAX; + cfgslot = (x >> 11) & PCI_SLOTMAX; + cfgbus = (x >> 16) & PCI_BUSMAX; + } + + return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int coff; + assert(bytes == 1 || bytes == 2 || bytes == 4); + + coff = cfgoff + (port - CONF1_DATA_PORT); + if (cfgenable) { + pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, + eax); + } else { + /* Ignore accesses to cfgdata if not enabled by cfgaddr */ + if (in) + *eax = 0xffffffff; + } return (0); } INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { uint8_t ioregs[DIOSZ]; uint8_t memregs[DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->memregs[offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[offset] = value; } else { printf("diow: memw unknown size %d\n", size); } /* * magic interrupt ?? */ } if (baridx > 1) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } if (size == 1) { value = sc->memregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 1) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, .pe_barread = pci_emul_dior }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index 866ffc5b8224..6b8c4e0fd31c 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -1,282 +1,283 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PCI_EMUL_H_ #define _PCI_EMUL_H_ #include #include #include #include #include #include #define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ struct vmctx; struct pci_devinst; struct memory_region; struct pci_devemu { char *pe_emu; /* Name of device emulation */ /* instance creation */ int (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts); /* ACPI DSDT enumeration */ void (*pe_write_dsdt)(struct pci_devinst *); /* config space read/write callbacks */ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t val); int (*pe_cfgread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t *retval); /* BAR read/write callbacks */ void (*pe_barwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); enum pcibar_type { PCIBAR_NONE, PCIBAR_IO, PCIBAR_MEM32, PCIBAR_MEM64, PCIBAR_MEMHI64 }; struct pcibar { enum pcibar_type type; /* io or memory */ uint64_t size; uint64_t addr; }; #define PI_NAMESZ 40 struct msix_table_entry { uint64_t addr; uint32_t msg_data; uint32_t vector_control; } __packed; /* * In case the structure is modified to hold extra information, use a define * for the size that should be emulated. */ #define MSIX_TABLE_ENTRY_SIZE 16 #define MAX_MSIX_TABLE_ENTRIES 2048 #define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) enum lintr_stat { IDLE, ASSERTED, PENDING }; struct pci_devinst { struct pci_devemu *pi_d; struct vmctx *pi_vmctx; uint8_t pi_bus, pi_slot, pi_func; char pi_name[PI_NAMESZ]; int pi_bar_getsize; int pi_prevcap; int pi_capend; struct { int8_t pin; enum lintr_stat state; int pirq_pin; int ioapic_irq; pthread_mutex_t lock; } pi_lintr; struct { int enabled; uint64_t addr; uint64_t msg_data; int maxmsgnum; } pi_msi; struct { int enabled; int table_bar; int pba_bar; uint32_t table_offset; int table_count; uint32_t pba_offset; int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ } pi_msix; void *pi_arg; /* devemu-private data */ u_char pi_cfgdata[PCI_REGMAX + 1]; struct pcibar pi_bar[PCI_BARMAX + 1]; }; struct msicap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t addrlo; uint32_t addrhi; uint16_t msgdata; } __packed; struct msixcap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t table_info; /* bar index and offset within it */ uint32_t pba_info; /* bar index and offset within it */ } __packed; struct pciecap { uint8_t capid; uint8_t nextptr; uint16_t pcie_capabilities; uint32_t dev_capabilities; /* all devices */ uint16_t dev_control; uint16_t dev_status; uint32_t link_capabilities; /* devices with links */ uint16_t link_control; uint16_t link_status; uint32_t slot_capabilities; /* ports with slots */ uint16_t slot_control; uint16_t slot_status; uint16_t root_control; /* root ports */ uint16_t root_capabilities; uint32_t root_status; uint32_t dev_capabilities2; /* all devices */ uint16_t dev_control2; uint16_t dev_status2; uint32_t link_capabilities2; /* devices with links */ uint16_t link_control2; uint16_t link_status2; uint32_t slot_capabilities2; /* ports with slots */ uint16_t slot_control2; uint16_t slot_status2; } __packed; typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, int ioapic_irq, void *arg); int init_pci(struct vmctx *ctx); void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val); void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val); void pci_callback(void); int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size); int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); void pci_generate_msi(struct pci_devinst *pi, int msgnum); void pci_generate_msix(struct pci_devinst *pi, int msgnum); void pci_lintr_assert(struct pci_devinst *pi); void pci_lintr_deassert(struct pci_devinst *pi); void pci_lintr_request(struct pci_devinst *pi); int pci_msi_enabled(struct pci_devinst *pi); int pci_msix_enabled(struct pci_devinst *pi); int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); int pci_msi_msgnum(struct pci_devinst *pi); int pci_parse_slot(char *opt); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value); uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); int pci_count_lintr(int bus); void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); void pci_write_dsdt(void); +uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); static __inline void pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) { assert(offset <= PCI_REGMAX); *(uint8_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); *(uint16_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); *(uint32_t *)(pi->pi_cfgdata + offset) = val; } static __inline uint8_t pci_get_cfgdata8(struct pci_devinst *pi, int offset) { assert(offset <= PCI_REGMAX); return (*(uint8_t *)(pi->pi_cfgdata + offset)); } static __inline uint16_t pci_get_cfgdata16(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); return (*(uint16_t *)(pi->pi_cfgdata + offset)); } static __inline uint32_t pci_get_cfgdata32(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); return (*(uint32_t *)(pi->pi_cfgdata + offset)); } #endif /* _PCI_EMUL_H_ */