diff --git a/lib/libvmmapi/Makefile b/lib/libvmmapi/Makefile index 9c410833e09c..e5e513d84a51 100644 --- a/lib/libvmmapi/Makefile +++ b/lib/libvmmapi/Makefile @@ -1,12 +1,13 @@ # $FreeBSD$ PACKAGE=lib${LIB} LIB= vmmapi +SHLIB_MAJOR= 6 SRCS= vmmapi.c vmmapi_freebsd.c INCS= vmmapi.h LIBADD= util CFLAGS+= -I${.CURDIR} .include diff --git a/lib/libvmmapi/internal.h b/lib/libvmmapi/internal.h new file mode 100644 index 000000000000..142026e76345 --- /dev/null +++ b/lib/libvmmapi/internal.h @@ -0,0 +1,17 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 John Baldwin + */ + +#ifndef __VMMAPI_INTERNAL_H__ +#define __VMMAPI_INTERNAL_H__ + +struct vmctx; + +struct vcpu { + struct vmctx *ctx; + int vcpuid; +}; + +#endif /* !__VMMAPI_INTERNAL_H__ */ diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 451d679dbbba..2d0cc21aeb25 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -1,1802 +1,1836 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmmapi.h" +#include "internal.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) /* * Size of the guard region before and after the virtual address space * mapping the guest physical memory. This must be a multiple of the * superpage size for performance reasons. 
*/ #define VM_MMAP_GUARD_SIZE (4 * MB) #define PROT_RW (PROT_READ | PROT_WRITE) #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) struct vmctx { int fd; uint32_t lowmem_limit; int memflags; size_t lowmem; size_t highmem; char *baseaddr; char *name; }; #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) static int vm_device_open(const char *name) { int fd, len; char *vmfile; len = strlen("/dev/vmm/") + strlen(name) + 1; vmfile = malloc(len); assert(vmfile != NULL); snprintf(vmfile, len, "/dev/vmm/%s", name); /* Open the device file */ fd = open(vmfile, O_RDWR, 0); free(vmfile); return (fd); } int vm_create(const char *name) { /* Try to load vmm(4) module before creating a guest. */ if (modfind("vmm") < 0) kldload("vmm"); return (CREATE(name)); } struct vmctx * vm_open(const char *name) { struct vmctx *vm; int saved_errno; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); vm->fd = -1; vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); if ((vm->fd = vm_device_open(vm->name)) < 0) goto err; return (vm); err: saved_errno = errno; free(vm); errno = saved_errno; return (NULL); } void vm_close(struct vmctx *vm) { assert(vm != NULL); close(vm->fd); free(vm); } void vm_destroy(struct vmctx *vm) { assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); DESTROY(vm->name); free(vm); } +struct vcpu * +vm_vcpu_open(struct vmctx *ctx, int vcpuid) +{ + struct vcpu *vcpu; + + vcpu = malloc(sizeof(*vcpu)); + vcpu->ctx = ctx; + vcpu->vcpuid = vcpuid; + return (vcpu); +} + +void +vm_vcpu_close(struct vcpu *vcpu) +{ + free(vcpu); +} + +int +vcpu_id(struct vcpu *vcpu) +{ + return (vcpu->vcpuid); +} + int vm_parse_memsize(const char *opt, size_t *ret_memsize) { char *endptr; size_t optval; int error; optval = strtoul(opt, &endptr, 0); if (*opt != '\0' && *endptr == '\0') { /* * For the sake of backward compatibility if the memory size * specified on the command line is less than a megabyte then * it is interpreted as being in units of MB. */ if (optval < MB) optval *= MB; *ret_memsize = optval; error = 0; } else error = expand_number(opt, ret_memsize); return (error); } uint32_t vm_get_lowmem_limit(struct vmctx *ctx) { return (ctx->lowmem_limit); } void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { ctx->lowmem_limit = limit; } void vm_set_memflags(struct vmctx *ctx, int flags) { ctx->memflags = flags; } int vm_get_memflags(struct vmctx *ctx) { return (ctx->memflags); } /* * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot) { struct vm_memmap memmap; int error, flags; memmap.gpa = gpa; memmap.segid = segid; memmap.segoff = off; memmap.len = len; memmap.prot = prot; memmap.flags = 0; if (ctx->memflags & VM_MEM_F_WIRED) memmap.flags |= VM_MEMMAP_F_WIRED; /* * If this mapping already exists then don't create it again. This * is the common case for SYSMEM mappings created by bhyveload(8). 
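Usage sketch (not part of this diff): vm_vcpu_open() and vm_vcpu_close(), added above, are how a consumer obtains and releases the per-vcpu handle that every converted entry point now takes. The helper name and allocation pattern below are illustrative only; vm_vcpu_close() frees just the handle and does not tear anything down in the kernel.

/* Illustrative sketch: allocate one handle per vcpu of an open VM. */
#include <sys/param.h>
#include <stdlib.h>
#include <vmmapi.h>

static struct vcpu **
open_all_vcpus(struct vmctx *ctx, int maxcpus)
{
	struct vcpu **vcpus;
	int i;

	vcpus = calloc(maxcpus, sizeof(*vcpus));
	if (vcpus == NULL)
		return (NULL);
	for (i = 0; i < maxcpus; i++)
		vcpus[i] = vm_vcpu_open(ctx, i);	/* thin per-vcpu wrapper */
	return (vcpus);
}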
*/ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); if (error == 0 && gpa == memmap.gpa) { if (segid != memmap.segid || off != memmap.segoff || prot != memmap.prot || flags != memmap.flags) { errno = EEXIST; return (-1); } else { return (0); } } error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size) { *guest_baseaddr = ctx->baseaddr; *lowmem_size = ctx->lowmem; *highmem_size = ctx->highmem; return (0); } int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len) { struct vm_munmap munmap; int error; munmap.gpa = gpa; munmap.len = len; error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap); return (error); } int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct vm_memmap memmap; int error; bzero(&memmap, sizeof(struct vm_memmap)); memmap.gpa = *gpa; error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); if (error == 0) { *gpa = memmap.gpa; *segid = memmap.segid; *segoff = memmap.segoff; *len = memmap.len; *prot = memmap.prot; *flags = memmap.flags; } return (error); } /* * Return 0 if the segments are identical and non-zero otherwise. * * This is slightly complicated by the fact that only device memory segments * are named. */ static int cmpseg(size_t len, const char *str, size_t len2, const char *str2) { if (len == len2) { if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) return (0); } return (-1); } static int vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { struct vm_memseg memseg; size_t n; int error; /* * If the memory segment has already been created then just return. * This is the usual case for the SYSMEM segment created by userspace * loaders like bhyveload(8). 
*/ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, sizeof(memseg.name)); if (error) return (error); if (memseg.len != 0) { if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { errno = EINVAL; return (-1); } else { return (0); } } bzero(&memseg, sizeof(struct vm_memseg)); memseg.segid = segid; memseg.len = len; if (name != NULL) { n = strlcpy(memseg.name, name, sizeof(memseg.name)); if (n >= sizeof(memseg.name)) { errno = ENAMETOOLONG; return (-1); } } error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); return (error); } int vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, size_t bufsize) { struct vm_memseg memseg; size_t n; int error; memseg.segid = segid; error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); if (error == 0) { *lenp = memseg.len; n = strlcpy(namebuf, memseg.name, bufsize); if (n >= bufsize) { errno = ENAMETOOLONG; error = -1; } } return (error); } static int setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) { char *ptr; int error, flags; /* Map 'len' bytes starting at 'gpa' in the guest address space */ error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); if (error) return (error); flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap into the process address space on the host */ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); if (ptr == MAP_FAILED) return (-1); return (0); } int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { size_t objsize, len; vm_paddr_t gpa; char *baseaddr, *ptr; int error; assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then * create another 'highmem' segment above 4GB for the remainder. */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; ctx->highmem = memsize - ctx->lowmem_limit; objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; objsize = ctx->lowmem; } error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); if (error) return (error); /* * Stake out a contiguous region covering the guest physical memory * and the adjoining guard regions. */ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (ptr == MAP_FAILED) return (-1); baseaddr = ptr + VM_MMAP_GUARD_SIZE; if (ctx->highmem > 0) { gpa = 4*GB; len = ctx->highmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } if (ctx->lowmem > 0) { gpa = 0; len = ctx->lowmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } ctx->baseaddr = baseaddr; return (0); } /* * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in * the lowmem or highmem regions. * * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. * The instruction emulation code depends on this behavior. 
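Usage sketch (not part of this diff): vm_setup_memory() and vm_map_gpa() are untouched by this change, but they remain the vmctx-level half of the API that the new vcpu handles point back to. The memory size and guest-physical address below are illustrative.

/* Illustrative sketch: give the guest 1 GB of system memory and obtain a
 * host pointer to guest-physical address 0x1000. */
#include <sys/param.h>
#include <stddef.h>
#include <vmmapi.h>

static void *
setup_and_map(struct vmctx *ctx)
{
	if (vm_setup_memory(ctx, 1024 * 1024 * 1024UL, VM_MMAP_ALL) != 0)
		return (NULL);
	/* Returns NULL if the range is not backed by lowmem or highmem. */
	return (vm_map_gpa(ctx, 0x1000, 4096));
}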
*/ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { if (ctx->lowmem > 0) { if (gaddr < ctx->lowmem && len <= ctx->lowmem && gaddr + len <= ctx->lowmem) return (ctx->baseaddr + gaddr); } if (ctx->highmem > 0) { if (gaddr >= 4*GB) { if (gaddr < 4*GB + ctx->highmem && len <= ctx->highmem && gaddr + len <= 4*GB + ctx->highmem) return (ctx->baseaddr + gaddr); } } return (NULL); } vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr) { vm_paddr_t offaddr; offaddr = (char *)addr - ctx->baseaddr; if (ctx->lowmem > 0) if (offaddr <= ctx->lowmem) return (offaddr); if (ctx->highmem > 0) if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem) return (offaddr); return ((vm_paddr_t)-1); } const char * vm_get_name(struct vmctx *ctx) { return (ctx->name); } size_t vm_get_lowmem_size(struct vmctx *ctx) { return (ctx->lowmem); } size_t vm_get_highmem_size(struct vmctx *ctx) { return (ctx->highmem); } void * vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) { char pathname[MAXPATHLEN]; size_t len2; char *base, *ptr; int fd, error, flags; fd = -1; ptr = MAP_FAILED; if (name == NULL || strlen(name) == 0) { errno = EINVAL; goto done; } error = vm_alloc_memseg(ctx, segid, len, name); if (error) goto done; strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); strlcat(pathname, ctx->name, sizeof(pathname)); strlcat(pathname, ".", sizeof(pathname)); strlcat(pathname, name, sizeof(pathname)); fd = open(pathname, O_RDWR); if (fd < 0) goto done; /* * Stake out a contiguous region covering the device memory and the * adjoining guard regions. */ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (base == MAP_FAILED) goto done; flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap the devmem region in the host address space */ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); done: if (fd >= 0) close(fd); return (ptr); } +static int +vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg) +{ + /* + * XXX: fragile, handle with care + * Assumes that the first field of the ioctl data + * is the vcpuid. 
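Usage sketch (not part of this diff): the vcpu_ioctl() helper whose comment ends above relies on each per-vcpu ioctl request structure beginning with an int vcpu id, which is why the converted wrappers that follow (vm_set_register(), vm_run(), and friends) no longer fill in a cpuid field themselves. A minimal run loop against the new handle-based interface, assuming the vcpu registers and guest memory are already configured, might look like:

/* Illustrative sketch: drive one vcpu until it executes HLT or fails. */
#include <sys/param.h>
#include <machine/vmm.h>
#include <vmmapi.h>

static int
run_until_hlt(struct vcpu *vcpu)
{
	struct vm_exit vmexit;
	int error;

	error = vm_activate_cpu(vcpu);		/* handle-based variant */
	while (error == 0) {
		error = vm_run(vcpu, &vmexit);
		if (error != 0 || vmexit.exitcode == VM_EXITCODE_HLT)
			break;
		/* Handle other exit reasons here. */
	}
	return (error);
}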
+ */ + *(int *)arg = vcpu->vcpuid; + return (ioctl(vcpu->ctx->fd, cmd, arg)); +} + int -vm_set_desc(struct vmctx *ctx, int vcpu, int reg, +vm_set_desc(struct vcpu *vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); - vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; vmsegdesc.desc.base = base; vmsegdesc.desc.limit = limit; vmsegdesc.desc.access = access; - error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); + error = vcpu_ioctl(vcpu, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); return (error); } int -vm_get_desc(struct vmctx *ctx, int vcpu, int reg, - uint64_t *base, uint32_t *limit, uint32_t *access) +vm_get_desc(struct vcpu *vcpu, int reg, uint64_t *base, uint32_t *limit, + uint32_t *access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); - vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; - error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); + error = vcpu_ioctl(vcpu, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); if (error == 0) { *base = vmsegdesc.desc.base; *limit = vmsegdesc.desc.limit; *access = vmsegdesc.desc.access; } return (error); } int -vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) +vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *seg_desc) { int error; - error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, + error = vm_get_desc(vcpu, reg, &seg_desc->base, &seg_desc->limit, &seg_desc->access); return (error); } int -vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); - vmreg.cpuid = vcpu; vmreg.regnum = reg; vmreg.regval = val; - error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); + error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg); return (error); } int -vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) +vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); - vmreg.cpuid = vcpu; vmreg.regnum = reg; - error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); + error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int -vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, +vm_set_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); - vmregset.cpuid = vcpu; vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; - error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset); + error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset); return (error); } int -vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, +vm_get_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); - vmregset.cpuid = vcpu; vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; - error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset); + error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset); return (error); } int -vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) +vm_run(struct vcpu *vcpu, struct vm_exit *vmexit) { int error; struct vm_run vmrun; bzero(&vmrun, sizeof(vmrun)); - vmrun.cpuid = vcpu; - error = ioctl(ctx->fd, 
VM_RUN, &vmrun); + error = vcpu_ioctl(vcpu, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); return (error); } int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { struct vm_suspend vmsuspend; bzero(&vmsuspend, sizeof(vmsuspend)); vmsuspend.how = how; return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); } int vm_reinit(struct vmctx *ctx) { return (ioctl(ctx->fd, VM_REINIT, 0)); } int -vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, +vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vm_exception exc; - exc.cpuid = vcpu; exc.vector = vector; exc.error_code = errcode; exc.error_code_valid = errcode_valid; exc.restart_instruction = restart_instruction; - return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); + return (vcpu_ioctl(vcpu, VM_INJECT_EXCEPTION, &exc)); } int vm_apicid2vcpu(struct vmctx *ctx __unused, int apicid) { /* * The apic id associated with the 'vcpu' has the same numerical value * as the 'vcpu' itself. */ return (apicid); } int -vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) +vm_lapic_irq(struct vcpu *vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); - vmirq.cpuid = vcpu; vmirq.vector = vector; - return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); + return (vcpu_ioctl(vcpu, VM_LAPIC_IRQ, &vmirq)); } int -vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector) +vm_lapic_local_irq(struct vcpu *vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); - vmirq.cpuid = vcpu; vmirq.vector = vector; - return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq)); + return (vcpu_ioctl(vcpu, VM_LAPIC_LOCAL_IRQ, &vmirq)); } int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) { struct vm_lapic_msi vmmsi; bzero(&vmmsi, sizeof(vmmsi)); vmmsi.addr = addr; vmmsi.msg = msg; return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); } int vm_ioapic_assert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); } int vm_ioapic_pincount(struct vmctx *ctx, int *pincount) { return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); } int -vm_readwrite_kernemu_device(struct vmctx *ctx, int vcpu, vm_paddr_t gpa, +vm_readwrite_kernemu_device(struct vcpu *vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value) { struct vm_readwrite_kernemu_device irp = { - .vcpuid = vcpu, .access_width = fls(size) - 1, .gpa = gpa, .value = write ? *value : ~0ul, }; long cmd = (write ? 
VM_SET_KERNEMU_DEV : VM_GET_KERNEMU_DEV); int rc; - rc = ioctl(ctx->fd, cmd, &irp); + rc = vcpu_ioctl(vcpu, cmd, &irp); if (rc == 0 && !write) *value = irp.value; return (rc); } int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); } int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); } int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); } int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger) { struct vm_isa_irq_trigger isa_irq_trigger; bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); isa_irq_trigger.atpic_irq = atpic_irq; isa_irq_trigger.trigger = trigger; return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); } int -vm_inject_nmi(struct vmctx *ctx, int vcpu) +vm_inject_nmi(struct vcpu *vcpu) { struct vm_nmi vmnmi; bzero(&vmnmi, sizeof(vmnmi)); - vmnmi.cpuid = vcpu; - return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); + return (vcpu_ioctl(vcpu, VM_INJECT_NMI, &vmnmi)); } static const char *capstrmap[] = { [VM_CAP_HALT_EXIT] = "hlt_exit", [VM_CAP_MTRAP_EXIT] = "mtrap_exit", [VM_CAP_PAUSE_EXIT] = "pause_exit", [VM_CAP_UNRESTRICTED_GUEST] = "unrestricted_guest", [VM_CAP_ENABLE_INVPCID] = "enable_invpcid", [VM_CAP_BPT_EXIT] = "bpt_exit", }; int vm_capability_name2type(const char *capname) { int i; for (i = 0; i < (int)nitems(capstrmap); i++) { if (strcmp(capstrmap[i], capname) == 0) return (i); } return (-1); } const char * vm_capability_type2name(int type) { if (type >= 0 && type < (int)nitems(capstrmap)) return (capstrmap[type]); return (NULL); } int -vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, - int *retval) +vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); - vmcap.cpuid = vcpu; vmcap.captype = cap; - error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); + error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int -vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) +vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); - vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; - return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); + return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap)); } int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); } int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); } int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, 
vm_paddr_t hpa) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; pptmmio.hpa = hpa; return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); } int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; return (ioctl(ctx->fd, VM_UNMAP_PPTDEV_MMIO, &pptmmio)); } int -vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, +vm_setup_pptdev_msi(struct vmctx *ctx, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { struct vm_pptdev_msi pptmsi; bzero(&pptmsi, sizeof(pptmsi)); - pptmsi.vcpu = vcpu; pptmsi.bus = bus; pptmsi.slot = slot; pptmsi.func = func; pptmsi.msg = msg; pptmsi.addr = addr; pptmsi.numvec = numvec; return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } int -vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, +vm_setup_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct vm_pptdev_msix pptmsix; bzero(&pptmsix, sizeof(pptmsix)); - pptmsix.vcpu = vcpu; pptmsix.bus = bus; pptmsix.slot = slot; pptmsix.func = func; pptmsix.idx = idx; pptmsix.msg = msg; pptmsix.addr = addr; pptmsix.vector_control = vector_control; return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev ppt; bzero(&ppt, sizeof(ppt)); ppt.bus = bus; ppt.slot = slot; ppt.func = func; return ioctl(ctx->fd, VM_PPTDEV_DISABLE_MSIX, &ppt); } uint64_t * -vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, +vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv, int *ret_entries) { static _Thread_local uint64_t *stats_buf; static _Thread_local u_int stats_count; uint64_t *new_stats; struct vm_stats vmstats; u_int count, index; bool have_stats; have_stats = false; - vmstats.cpuid = vcpu; count = 0; for (index = 0;; index += nitems(vmstats.statbuf)) { vmstats.index = index; - if (ioctl(ctx->fd, VM_STATS, &vmstats) != 0) + if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0) break; if (stats_count < index + vmstats.num_entries) { new_stats = realloc(stats_buf, (index + vmstats.num_entries) * sizeof(uint64_t)); if (new_stats == NULL) { errno = ENOMEM; return (NULL); } stats_count = index + vmstats.num_entries; stats_buf = new_stats; } memcpy(stats_buf + index, vmstats.statbuf, vmstats.num_entries * sizeof(uint64_t)); count += vmstats.num_entries; have_stats = true; if (vmstats.num_entries != nitems(vmstats.statbuf)) break; } if (have_stats) { if (ret_entries) *ret_entries = count; if (ret_tv) *ret_tv = vmstats.tv; return (stats_buf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } int -vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) +vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); - x2apic.cpuid = vcpu; - error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); + error = vcpu_ioctl(vcpu, VM_GET_X2APIC_STATE, &x2apic); *state = x2apic.state; return (error); } int 
-vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) +vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); - x2apic.cpuid = vcpu; x2apic.state = state; - error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); + error = vcpu_ioctl(vcpu, VM_SET_X2APIC_STATE, &x2apic); return (error); } /* * From Intel Vol 3a: * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT */ int -vcpu_reset(struct vmctx *vmctx, int vcpu) +vcpu_reset(struct vcpu *vcpu) { int error; uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; uint32_t desc_access, desc_limit; uint16_t sel; zero = 0; rflags = 0x2; - error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + error = vm_set_register(vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; rip = 0xfff0; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; /* * According to Intels Software Developer Manual CR0 should be * initialized with CR0_ET | CR0_NW | CR0_CD but that crashes some * guests like Windows. */ cr0 = CR0_NE; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR2, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR2, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR3, zero)) != 0) goto done; cr4 = 0; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; /* * CS: present, r/w, accessed, 16-bit, byte granularity, usable */ desc_base = 0xffff0000; desc_limit = 0xffff; desc_access = 0x0093; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + error = vm_set_desc(vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0xf000; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CS, sel)) != 0) goto done; /* * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0093; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + error = vm_set_desc(vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + error = vm_set_desc(vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + error = vm_set_desc(vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + error = vm_set_desc(vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + error = vm_set_desc(vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_SS, sel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_DS, sel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) 
!= 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_ES, sel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_FS, sel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_GS, sel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_EFER, zero)) != 0) goto done; /* General purpose registers */ rdx = 0xf00; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RAX, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RBX, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RCX, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RDX, rdx)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RSI, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RDI, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RBP, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RSP, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R8, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_R8, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R9, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_R9, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R10, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_R10, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R11, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_R11, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R12, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_R12, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R13, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_R13, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R14, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_R14, zero)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R15, zero)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_R15, zero)) != 0) goto done; /* GDTR, IDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + error = vm_set_desc(vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, + error = vm_set_desc(vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; /* TR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0000008b; - error = vm_set_desc(vmctx, vcpu, 
VM_REG_GUEST_TR, 0, 0, desc_access); + error = vm_set_desc(vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; sel = 0; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_TR, sel)) != 0) goto done; /* LDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x00000082; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, + error = vm_set_desc(vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DR6, + if ((error = vm_set_register(vcpu, VM_REG_GUEST_DR6, 0xffff0ff0)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DR7, 0x400)) != + if ((error = vm_set_register(vcpu, VM_REG_GUEST_DR7, 0x400)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_INTR_SHADOW, + if ((error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, zero)) != 0) goto done; error = 0; done: return (error); } int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) { int error, i; struct vm_gpa_pte gpapte; bzero(&gpapte, sizeof(gpapte)); gpapte.gpa = gpa; error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); if (error == 0) { *num = gpapte.ptenum; for (i = 0; i < gpapte.ptenum; i++) pte[i] = gpapte.pte[i]; } return (error); } int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) { int error; struct vm_hpet_cap cap; bzero(&cap, sizeof(struct vm_hpet_cap)); error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); if (capabilities != NULL) *capabilities = cap.capabilities; return (error); } int -vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); - gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; - error = ioctl(ctx->fd, VM_GLA2GPA, &gg); + error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } int -vm_gla2gpa_nofault(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); - gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; - error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg); + error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } #ifndef min #define min(a,b) (((a) < (b)) ? 
(a) : (b)) #endif int -vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault) { void *va; uint64_t gpa, off; int error, i, n; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; iov[i].iov_len = 0; } while (len) { assert(iovcnt > 0); - error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault); + error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = MIN(len, PAGE_SIZE - off); - va = vm_map_gpa(ctx, gpa, n); + va = vm_map_gpa(vcpu->ctx, gpa, n); if (va == NULL) return (EFAULT); iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; gla += n; len -= n; } return (0); } void vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused) { /* * Intentionally empty. This is used by the instruction * emulation code shared with the kernel. The in-kernel * version of this is non-empty. */ } void vm_copyin(struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; size_t n; dst = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); src = iov->iov_base; bcopy(src, dst, n); iov++; dst += n; len -= n; } } void vm_copyout(const void *vp, struct iovec *iov, size_t len) { const char *src; char *dst; size_t n; src = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); dst = iov->iov_base; bcopy(src, dst, n); iov++; src += n; len -= n; } } static int vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) { struct vm_cpuset vm_cpuset; int error; bzero(&vm_cpuset, sizeof(struct vm_cpuset)); vm_cpuset.which = which; vm_cpuset.cpusetsize = sizeof(cpuset_t); vm_cpuset.cpus = cpus; error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); return (error); } int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); } int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); } int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); } int -vm_activate_cpu(struct vmctx *ctx, int vcpu) +vm_activate_cpu(struct vcpu *vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); - ac.vcpuid = vcpu; - error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); + error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac); return (error); } int -vm_suspend_cpu(struct vmctx *ctx, int vcpu) +vm_suspend_all_cpus(struct vmctx *ctx) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); - ac.vcpuid = vcpu; + ac.vcpuid = -1; error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); return (error); } int -vm_resume_cpu(struct vmctx *ctx, int vcpu) +vm_suspend_cpu(struct vcpu *vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac); + return (error); +} + +int +vm_resume_cpu(struct vcpu *vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac); + return (error); +} + +int +vm_resume_all_cpus(struct vmctx *ctx) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); - ac.vcpuid = vcpu; + ac.vcpuid = -1; error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); return (error); } int -vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) +vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, 
uint64_t *info2) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); - vmii.vcpuid = vcpu; - error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); + error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii); if (error == 0) { *info1 = vmii.info1; *info2 = vmii.info2; } return (error); } int -vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) +vm_set_intinfo(struct vcpu *vcpu, uint64_t info1) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); - vmii.vcpuid = vcpu; vmii.info1 = info1; - error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); + error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii); return (error); } int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; rtcdata.value = value; error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); return (error); } int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); if (error == 0) *retval = rtcdata.value; return (error); } int vm_rtc_settime(struct vmctx *ctx, time_t secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); rtctime.secs = secs; error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); return (error); } int vm_rtc_gettime(struct vmctx *ctx, time_t *secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); if (error == 0) *secs = rtctime.secs; return (error); } int -vm_restart_instruction(struct vmctx *ctx, int vcpu) +vm_restart_instruction(struct vcpu *vcpu) { + int arg; - return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); + return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg)); } int vm_snapshot_req(struct vm_snapshot_meta *meta) { if (ioctl(meta->ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { #ifdef SNAPSHOT_DEBUG fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", __func__, meta->dev_name, errno); #endif return (-1); } return (0); } int vm_restore_time(struct vmctx *ctx) { int dummy; dummy = 0; return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); } int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { struct vm_cpu_topology topology; bzero(&topology, sizeof (struct vm_cpu_topology)); topology.sockets = sockets; topology.cores = cores; topology.threads = threads; topology.maxcpus = maxcpus; return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); } int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { struct vm_cpu_topology topology; int error; bzero(&topology, sizeof (struct vm_cpu_topology)); error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); if (error == 0) { *sockets = topology.sockets; *cores = topology.cores; *threads = topology.threads; *maxcpus = topology.maxcpus; } return (error); } /* Keep in sync with machine/vmm_dev.h. 
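Usage sketch (not part of this diff): the patch splits the old suspend/resume calls into per-vcpu variants that take a handle and vm_suspend_all_cpus()/vm_resume_all_cpus(), which pass a vcpu id of -1 to the kernel. A debugger-style stop, patch, and continue sequence using them is illustrative only:

/* Illustrative sketch: freeze the whole guest, adjust one vcpu's RIP,
 * then let all vcpus run again. */
#include <sys/param.h>
#include <machine/vmm.h>
#include <vmmapi.h>

static int
stop_patch_resume(struct vmctx *ctx, struct vcpu *vcpu, uint64_t new_rip)
{
	int error;

	error = vm_suspend_all_cpus(ctx);	/* vcpuid == -1 in the ioctl */
	if (error == 0)
		error = vm_set_register(vcpu, VM_REG_GUEST_RIP, new_rip);
	if (error == 0)
		error = vm_resume_all_cpus(ctx);
	return (error);
}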
*/ static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_GETNEXT, VM_MUNMAP_MEMSEG, VM_SET_REGISTER, VM_GET_REGISTER, VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR, VM_SET_REGISTER_SET, VM_GET_REGISTER_SET, VM_SET_KERNEMU_DEV, VM_GET_KERNEMU_DEV, VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, VM_PPTDEV_MSIX, VM_UNMAP_PPTDEV_MMIO, VM_PPTDEV_DISABLE_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, VM_GLA2GPA_NOFAULT, VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, VM_SET_INTINFO, VM_GET_INTINFO, VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY, VM_SNAPSHOT_REQ, VM_RESTORE_TIME }; int vm_limit_rights(struct vmctx *ctx) { cap_rights_t rights; size_t ncmds; cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); if (caph_rights_limit(ctx->fd, &rights) != 0) return (-1); ncmds = nitems(vm_ioctl_cmds); if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, ncmds) != 0) return (-1); return (0); } /* * Avoid using in new code. Operations on the fd should be wrapped here so that * capability rights can be kept in sync. */ int vm_get_device_fd(struct vmctx *ctx) { return (ctx->fd); } /* Legacy interface, do not use. */ const cap_ioctl_t * vm_get_ioctls(size_t *len) { cap_ioctl_t *cmds; if (len == NULL) { cmds = malloc(sizeof(vm_ioctl_cmds)); if (cmds == NULL) return (NULL); bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); return (cmds); } *len = nitems(vm_ioctl_cmds); return (NULL); } diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index 1be1f19507a9..322b47e4ee0e 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -1,274 +1,279 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include #include #include #include #include #include /* * API version for out-of-tree consumers like grub-bhyve for making compile * time decisions. */ -#define VMMAPI_VERSION 0104 /* 2 digit major followed by 2 digit minor */ +#define VMMAPI_VERSION 0200 /* 2 digit major followed by 2 digit minor */ struct iovec; +struct vcpu; struct vmctx; struct vm_snapshot_meta; enum x2apic_state; /* * Different styles of mapping the memory assigned to a VM into the address * space of the controlling process. */ enum vm_mmap_style { VM_MMAP_NONE, /* no mapping */ VM_MMAP_ALL, /* fully and statically mapped */ VM_MMAP_SPARSE, /* mappings created on-demand */ }; /* * 'flags' value passed to 'vm_set_memflags()'. */ #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ /* * Identifiers for memory segments: * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. * - the remaining identifiers can be used to create devmem segments. */ enum { VM_SYSMEM, VM_BOOTROM, VM_FRAMEBUFFER, VM_PCIROM, }; __BEGIN_DECLS /* * Get the length and name of the memory segment identified by 'segid'. * Note that system memory segments are identified with a nul name. * * Returns 0 on success and non-zero otherwise. */ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, size_t namesiz); /* * Iterate over the guest address space. This function finds an address range * that starts at an address >= *gpa. * * Returns 0 if the next address range was found and non-zero otherwise. */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size); /* * Create a device memory segment identified by 'segid'. * * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. */ void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len); /* * Map the memory segment identified by 'segid' into the guest address space * at [gpa,gpa+len) with protection 'prot'. 
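Usage sketch (not part of this diff): the devmem declarations above are unchanged by the patch, but they pair with the memseg mapping call that follows. The segment name, size, and guest-physical address below are illustrative.

/* Illustrative sketch: back a 16 MB framebuffer with a devmem segment and
 * map it into the guest at an arbitrary example address. */
#include <sys/param.h>
#include <sys/mman.h>
#include <vmmapi.h>

#define FB_SIZE	(16UL * 1024 * 1024)
#define FB_GPA	0xC0000000UL		/* illustrative address */

static void *
create_fb(struct vmctx *ctx)
{
	void *fb;

	fb = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE);
	if (fb == MAP_FAILED)
		return (NULL);
	if (vm_mmap_memseg(ctx, FB_GPA, VM_FRAMEBUFFER, 0, FB_SIZE,
	    PROT_READ | PROT_WRITE) != 0)
		return (NULL);
	return (fb);
}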
*/ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t segoff, size_t len, int prot); int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len); int vm_create(const char *name); struct vmctx *vm_open(const char *name); void vm_close(struct vmctx *ctx); void vm_destroy(struct vmctx *ctx); int vm_limit_rights(struct vmctx *ctx); +struct vcpu *vm_vcpu_open(struct vmctx *ctx, int vcpuid); +void vm_vcpu_close(struct vcpu *vcpu); +int vcpu_id(struct vcpu *vcpu); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); /* inverse operation to vm_map_gpa - extract guest address from host pointer */ vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); -int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, +int vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); -int vm_gla2gpa_nofault(struct vmctx *, int vcpuid, +int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); const char *vm_get_name(struct vmctx *ctx); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); -int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, +int vm_set_desc(struct vcpu *vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); -int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, +int vm_get_desc(struct vcpu *vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); -int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, - struct seg_desc *seg_desc); -int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); -int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); -int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, +int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *seg_desc); +int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); +int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); +int vm_set_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals); -int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, +int vm_get_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals); -int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); +int vm_run(struct vcpu *vcpu, struct vm_exit *ret_vmexit); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); -int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, +int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); -int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); -int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); +int vm_lapic_irq(struct vcpu *vcpu, int vector); +int vm_lapic_local_irq(struct vcpu *vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); int 
vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); -int vm_readwrite_kernemu_device(struct vmctx *ctx, int vcpu, +int vm_readwrite_kernemu_device(struct vcpu *vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value); int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger); -int vm_inject_nmi(struct vmctx *ctx, int vcpu); +int vm_inject_nmi(struct vcpu *vcpu); int vm_capability_name2type(const char *capname); const char *vm_capability_type2name(int type); -int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, +int vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval); -int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, +int vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val); int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len); -int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, +int vm_setup_pptdev_msi(struct vmctx *ctx, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); -int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, +int vm_setup_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func); -int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); -int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); +int vm_get_intinfo(struct vcpu *vcpu, uint64_t *i1, uint64_t *i2); +int vm_set_intinfo(struct vcpu *vcpu, uint64_t exit_intinfo); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ -uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, +uint64_t *vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv, int *ret_entries); const char *vm_get_stat_desc(struct vmctx *ctx, int index); -int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s); -int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s); +int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *s); +int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. * The 'iovcnt' should be big enough to accommodate all GPA segments. 
* * retval fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Error */ -int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, +int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault); void vm_copyin(struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(const void *host_src, struct iovec *guest_iov, size_t len); void vm_copy_teardown(struct iovec *iov, int iovcnt); /* RTC */ int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); int vm_rtc_settime(struct vmctx *ctx, time_t secs); int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); /* Reset vcpu register state */ -int vcpu_reset(struct vmctx *ctx, int vcpu); +int vcpu_reset(struct vcpu *vcpu); int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); -int vm_activate_cpu(struct vmctx *ctx, int vcpu); -int vm_suspend_cpu(struct vmctx *ctx, int vcpu); -int vm_resume_cpu(struct vmctx *ctx, int vcpu); -int vm_restart_instruction(struct vmctx *vmctx, int vcpu); +int vm_activate_cpu(struct vcpu *vcpu); +int vm_suspend_all_cpus(struct vmctx *ctx); +int vm_suspend_cpu(struct vcpu *vcpu); +int vm_resume_all_cpus(struct vmctx *ctx); +int vm_resume_cpu(struct vcpu *vcpu); +int vm_restart_instruction(struct vcpu *vcpu); /* CPU topology */ int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); /* * FreeBSD specific APIs */ -int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, +int vm_setup_freebsd_registers(struct vcpu *vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp); -int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, +int vm_setup_freebsd_registers_i386(struct vcpu *vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); /* * Save and restore */ int vm_snapshot_req(struct vm_snapshot_meta *meta); int vm_restore_time(struct vmctx *ctx); /* * Deprecated interfaces, do not use them in new code. */ int vm_get_device_fd(struct vmctx *ctx); const cap_ioctl_t *vm_get_ioctls(size_t *len); __END_DECLS #endif /* _VMMAPI_H_ */ diff --git a/lib/libvmmapi/vmmapi_freebsd.c b/lib/libvmmapi/vmmapi_freebsd.c index adb7717d19fe..443a4a18b8aa 100644 --- a/lib/libvmmapi/vmmapi_freebsd.c +++ b/lib/libvmmapi/vmmapi_freebsd.c @@ -1,347 +1,348 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "vmmapi.h" +#include "internal.h" #define I386_TSS_SIZE 104 #define DESC_PRESENT 0x00000080 #define DESC_LONGMODE 0x00002000 #define DESC_DEF32 0x00004000 #define DESC_GRAN 0x00008000 #define DESC_UNUSABLE 0x00010000 #define GUEST_NULL_SEL 0 #define GUEST_CODE_SEL 1 #define GUEST_DATA_SEL 2 #define GUEST_TSS_SEL 3 #define GUEST_GDTR_LIMIT64 (3 * 8 - 1) static struct segment_descriptor i386_gdt[] = { {}, /* NULL */ { .sd_lolimit = 0xffff, .sd_type = SDT_MEMER, /* CODE */ .sd_p = 1, .sd_hilimit = 0xf, .sd_def32 = 1, .sd_gran = 1 }, { .sd_lolimit = 0xffff, .sd_type = SDT_MEMRW, /* DATA */ .sd_p = 1, .sd_hilimit = 0xf, .sd_def32 = 1, .sd_gran = 1 }, { .sd_lolimit = I386_TSS_SIZE - 1, /* TSS */ .sd_type = SDT_SYS386TSS, .sd_p = 1 } }; /* * Setup the 'vcpu' register set such that it will begin execution at * 'eip' in flat mode. */ int -vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, uint32_t eip, +vm_setup_freebsd_registers_i386(struct vcpu *vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp) { uint64_t cr0, rflags, desc_base; uint32_t desc_access, desc_limit, tssbase; uint16_t gsel; struct segment_descriptor *gdt; int error, tmp; /* A 32-bit guest requires unrestricted mode. */ - error = vm_get_capability(vmctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); + error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error) goto done; - error = vm_set_capability(vmctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + error = vm_set_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); if (error) goto done; cr0 = CR0_PE | CR0_NE; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, 0)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR4, 0)) != 0) goto done; /* * Forcing EFER to 0 causes bhyve to clear the "IA-32e guest * mode" entry control. */ - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, 0))) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_EFER, 0))) goto done; - gdt = vm_map_gpa(vmctx, gdtbase, 0x1000); + gdt = vm_map_gpa(vcpu->ctx, gdtbase, 0x1000); if (gdt == NULL) return (EFAULT); memcpy(gdt, i386_gdt, sizeof(i386_gdt)); desc_base = gdtbase; desc_limit = sizeof(i386_gdt) - 1; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + error = vm_set_desc(vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, 0); if (error != 0) goto done; /* Place the TSS one page above the GDT. 
*/ tssbase = gdtbase + 0x1000; gdt[3].sd_lobase = tssbase; rflags = 0x2; - error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + error = vm_set_register(vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; desc_base = 0; desc_limit = 0xffffffff; desc_access = DESC_GRAN | DESC_DEF32 | DESC_PRESENT | SDT_MEMERA; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + error = vm_set_desc(vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); desc_access = DESC_GRAN | DESC_DEF32 | DESC_PRESENT | SDT_MEMRWA; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + error = vm_set_desc(vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + error = vm_set_desc(vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + error = vm_set_desc(vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + error = vm_set_desc(vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + error = vm_set_desc(vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; desc_base = tssbase; desc_limit = I386_TSS_SIZE - 1; desc_access = DESC_PRESENT | SDT_SYS386BSY; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, + error = vm_set_desc(vcpu, VM_REG_GUEST_TR, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0, + error = vm_set_desc(vcpu, VM_REG_GUEST_LDTR, 0, 0, DESC_UNUSABLE); if (error) goto done; gsel = GSEL(GUEST_CODE_SEL, SEL_KPL); - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CS, gsel)) != 0) goto done; gsel = GSEL(GUEST_DATA_SEL, SEL_KPL); - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_DS, gsel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_ES, gsel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_FS, gsel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_GS, gsel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_SS, gsel)) != 0) goto done; gsel = GSEL(GUEST_TSS_SEL, SEL_KPL); - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_TR, gsel)) != 0) goto done; /* LDTR is pointing to the null selector */ - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; /* entry point */ - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, eip)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RIP, eip)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, esp)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RSP, esp)) != 0) goto done; error = 0; done: return (error); } void vm_setup_freebsd_gdt(uint64_t *gdtr) { gdtr[GUEST_NULL_SEL] = 0; gdtr[GUEST_CODE_SEL] = 
0x0020980000000000; gdtr[GUEST_DATA_SEL] = 0x0000900000000000; } /* * Setup the 'vcpu' register set such that it will begin execution at * 'rip' in long mode. */ int -vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu, +vm_setup_freebsd_registers(struct vcpu *vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp) { int error; uint64_t cr0, cr4, efer, rflags, desc_base; uint32_t desc_access, desc_limit; uint16_t gsel; cr0 = CR0_PE | CR0_PG | CR0_NE; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; cr4 = CR4_PAE; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; efer = EFER_LME | EFER_LMA; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer))) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_EFER, efer))) goto done; rflags = 0x2; - error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + error = vm_set_register(vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; desc_base = 0; desc_limit = 0; desc_access = 0x0000209B; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + error = vm_set_desc(vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; desc_access = 0x00000093; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + error = vm_set_desc(vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + error = vm_set_desc(vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + error = vm_set_desc(vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + error = vm_set_desc(vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + error = vm_set_desc(vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; /* * XXX TR is pointing to null selector even though we set the * TSS segment to be usable with a base address and limit of 0. 
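 *
 * For reference, using the access-rights layout described in
 * <machine/vmm.h>: 0x0000209B above encodes a present long-mode code
 * segment (P | S | type 0xB, plus the L bit 0x2000), 0x00000093 encodes
 * a present writable data segment (P | S | type 0x3), and 0x0000008b
 * below encodes a present busy 32-bit TSS (DESC_PRESENT | SDT_SYS386BSY).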
*/ desc_access = 0x0000008b; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); + error = vm_set_desc(vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0, + error = vm_set_desc(vcpu, VM_REG_GUEST_LDTR, 0, 0, DESC_UNUSABLE); if (error) goto done; gsel = GSEL(GUEST_CODE_SEL, SEL_KPL); - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CS, gsel)) != 0) goto done; gsel = GSEL(GUEST_DATA_SEL, SEL_KPL); - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_DS, gsel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_ES, gsel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_FS, gsel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_GS, gsel)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_SS, gsel)) != 0) goto done; /* XXX TR is pointing to the null selector */ - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_TR, 0)) != 0) goto done; /* LDTR is pointing to the null selector */ - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; /* entry point */ - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; /* page table base */ - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR3, cr3)) != 0) goto done; desc_base = gdtbase; desc_limit = GUEST_GDTR_LIMIT64; - error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + error = vm_set_desc(vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, 0); if (error != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0) + if ((error = vm_set_register(vcpu, VM_REG_GUEST_RSP, rsp)) != 0) goto done; error = 0; done: return (error); } diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 4d6242a8134d..c3d1aa4f3866 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -1,825 +1,794 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #include +struct vcpu; struct vm_snapshot_meta; #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_GUEST_DR0, VM_REG_GUEST_DR1, VM_REG_GUEST_DR2, VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, VM_REG_GUEST_ENTRY_INST_LENGTH, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. * A prefix and a suffix are added to the name specified by the user. * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. 
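 *
 * As a worked example of the arithmetic below: FreeBSD 13 has a
 * SPECNAMELEN of 255, giving 255 - 10 - 15 - 1 = 229, while FreeBSD 12
 * has a SPECNAMELEN of 63, giving 63 - 10 - 15 - 1 = 37.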
*/ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MIN_NAMELEN 6 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); -struct vcpu; struct vm; struct vm_exception; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; enum snapshot_req; struct vm_eventinfo { cpuset_t *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vcpui, register_t rip, struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef void * (*vmi_vcpu_init_func_t)(void *vmi, struct vcpu *vcpu, int vcpu_id); typedef void (*vmi_vcpu_cleanup_func_t)(void *vcpui); typedef int (*vmi_get_register_t)(void *vcpui, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vcpui, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vcpui, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vcpui, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vcpui, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vcpui, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vcpui); typedef void (*vmi_vlapic_cleanup)(struct vlapic *vlapic); typedef int (*vmi_snapshot_vcpu_t)(void *vcpui, struct vm_snapshot_meta *meta); typedef int (*vmi_restore_tsc_t)(void *vcpui, uint64_t now); struct vmm_ops { vmm_init_func_t modinit; /* module wide initialization */ vmm_cleanup_func_t modcleanup; vmm_resume_func_t modresume; vmi_init_func_t init; /* vm-specific initialization */ vmi_run_func_t run; vmi_cleanup_func_t cleanup; vmi_vcpu_init_func_t vcpu_init; vmi_vcpu_cleanup_func_t vcpu_cleanup; vmi_get_register_t getreg; vmi_set_register_t setreg; vmi_get_desc_t getdesc; vmi_set_desc_t setdesc; vmi_get_cap_t getcap; vmi_set_cap_t setcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; /* checkpoint operations */ vmi_snapshot_vcpu_t vcpu_snapshot; vmi_restore_tsc_t restore_tsc; }; extern const struct vmm_ops vmm_ops_intel; extern const struct vmm_ops vmm_ops_amd; extern u_int vm_maxcpu; /* maximum virtual cpus */ int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); void vm_slock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); /* * APIs that modify the guest memory map require all vcpus to be frozen. 
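 *
 * A rough in-kernel sketch (hypothetical caller; assumes the exclusive
 * memseg lock below is what provides that freeze and that 'segid' was
 * set up earlier with vm_alloc_memseg()):
 *
 *	vm_xlock_memsegs(vm);
 *	error = vm_mmap_memseg(vm, gpa, segid, 0, len, VM_PROT_RW, 0);
 *	vm_unlock_memsegs(vm);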
*/ void vm_slock_memsegs(struct vm *vm); void vm_xlock_memsegs(struct vm *vm); void vm_unlock_memsegs(struct vm *vm); int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot, int flags); int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); /* * APIs that inspect the guest memory map require only a *single* vcpu to * be frozen. This acts like a read lock on the guest memory map since any * modification requires *all* vcpus to be frozen. */ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, struct vm_object **objptr); vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); void *vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int prot, void **cookie); void *vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int prot, void **cookie); void *vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); bool vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc); int vm_run(struct vcpu *vcpu, struct vm_exit *vme_user); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vcpu *vcpu); int vm_nmi_pending(struct vcpu *vcpu); void vm_nmi_clear(struct vcpu *vcpu); int vm_inject_extint(struct vcpu *vcpu); int vm_extint_pending(struct vcpu *vcpu); void vm_extint_clear(struct vcpu *vcpu); int vcpu_vcpuid(struct vcpu *vcpu); struct vm *vcpu_vm(struct vcpu *vcpu); struct vcpu *vm_vcpu(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vcpu *vcpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vcpu *vcpu, int type, int *val); int vm_set_capability(struct vcpu *vcpu, int type, int val); int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); int vm_restart_instruction(struct vcpu *vcpu); struct vm_exit *vm_exitinfo(struct vcpu *vcpu); void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip); void vm_exit_debug(struct vcpu *vcpu, uint64_t rip); void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip); void vm_exit_astpending(struct vcpu *vcpu, uint64_t rip); void vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip); int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); int vm_restore_time(struct vm *vm); #ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. 
* * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. */ typedef void (*vm_rendezvous_func_t)(struct vcpu *vcpu, void *arg); int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); cpuset_t vm_start_cpus(struct vm *vm, const cpuset_t *tostart); void vm_await_start(struct vm *vm, const cpuset_t *waiting); #endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vcpu *vcpu, struct vm_eventinfo *info) { /* * This check isn't done with atomic operations or under a lock because * there's no need to. If the vcpuid bit is set, the vcpu is part of a * rendezvous and the bit won't be cleared until the vcpu enters the * rendezvous. On rendezvous exit, the cpuset is cleared and the vcpu * will see an empty cpuset. So, the races are harmless. */ return (CPU_ISSET(vcpu_vcpuid(vcpu), info->rptr)); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } static __inline int vcpu_reqidle(struct vm_eventinfo *info) { return (*info->iptr); } int vcpu_debugged(struct vcpu *vcpu); /* * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return false otherwise. */ bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vcpu *vcpu, int *hostcpu) { return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vcpu *vcpu) { struct thread *td; td = curthread; return (td->td_ast != 0 || td->td_owepreempt != 0); } #endif void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vcpu *vcpu, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. 
* * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vcpu *vcpu, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. */ int vm_entry_intinfo(struct vcpu *vcpu, uint64_t *info); int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2); /* * Function used to keep track of the guest's TSC offset. The * offset is used by the virutalization extensions to provide a consistent * value for the Time Stamp Counter to the guest. */ void vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * retval is_fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vcpu *vcpu); int vcpu_trap_wbinvd(struct vcpu *vcpu); #endif /* KERNEL */ /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_BPT_EXIT, VM_CAP_RDPID, VM_CAP_RDTSCP, VM_CAP_IPI_EXIT, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, PAGING_MODE_64_LA57, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. 
*/ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. MOV) */ uint16_t op_flags; }; _Static_assert(sizeof(struct vie_op) == 4, "ABI"); _Static_assert(_Alignof(struct vie_op) == 2, "ABI"); #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ /* The following fields are all zeroed upon restart. */ #define vie_startzero num_processed uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ vex_present:1, /* VEX prefixed */ vex_l:1, /* L bit */ index:4, /* SIB byte */ base:4; /* SIB byte */ uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; uint8_t vex_reg:4, /* vvvv: first source register specifier */ vex_pp:2, /* pp */ _sparebits:2; uint8_t _sparebytes[2]; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ uint8_t _sparebyte; struct vie_op op; /* opcode description */ }; _Static_assert(sizeof(struct vie) == 64, "ABI"); _Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI"); _Static_assert(__offsetof(struct vie, scale) == 24, "ABI"); _Static_assert(__offsetof(struct vie, base_register) == 28, "ABI"); enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, VM_EXITCODE_BPT, VM_EXITCODE_IPI, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. 
Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. */ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; struct { int inst_length; } bpt; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; uint64_t intr_status; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct { uint32_t mode; uint8_t vector; cpuset_t dmask; } ipi; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ -#ifdef _KERNEL void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_UD, 0, 0); } static __inline void vm_inject_gp(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_GP, 1, 0); } static __inline void vm_inject_ac(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_SS, 1, errcode); } void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2); -#else -void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, - int errcode); - -static __inline void -vm_inject_ud(void *vm, int vcpuid) -{ - vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); -} - -static __inline void -vm_inject_gp(void *vm, int vcpuid) -{ - vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); -} - -static __inline void -vm_inject_ac(void *vm, int vcpuid, int errcode) -{ - vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); -} - -static __inline void -vm_inject_ss(void *vm, int vcpuid, int errcode) -{ - vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); -} - -void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); -#endif #endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h index 082405971fe4..49f09625ab8e 100644 --- a/sys/amd64/include/vmm_instruction_emul.h +++ b/sys/amd64/include/vmm_instruction_emul.h @@ -1,148 +1,135 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_INSTRUCTION_EMUL_H_ #define _VMM_INSTRUCTION_EMUL_H_ #include -/* - * Allow for different arguments to identify vCPUs in userspace vs the - * kernel. Eventually we should add struct vcpu in userland and - * always use the kernel arguments removing these macros. - */ -#ifdef _KERNEL -#define VCPU_DECL struct vcpu *vcpu -#define VCPU_ARGS vcpu -#else -#define VCPU_DECL void *vm, int vcpuid -#define VCPU_ARGS vm, vcpuid -#endif - /* * Callback functions to read and write memory regions. */ -typedef int (*mem_region_read_t)(VCPU_DECL, uint64_t gpa, +typedef int (*mem_region_read_t)(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int rsize, void *arg); -typedef int (*mem_region_write_t)(VCPU_DECL, uint64_t gpa, +typedef int (*mem_region_write_t)(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int wsize, void *arg); /* * Emulate the decoded 'vie' instruction. * * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the * callback functions. * * 'void *vm' should be 'struct vm *' when called from kernel context and * 'struct vmctx *' when called from user context. * s */ -int vmm_emulate_instruction(VCPU_DECL, uint64_t gpa, struct vie *vie, +int vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t mrr, mem_region_write_t mrw, void *mrarg); -int vie_update_register(VCPU_DECL, enum vm_reg_name reg, +int vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t val, int size); /* * Returns 1 if an alignment check exception should be injected and 0 otherwise. */ int vie_alignment_check(int cpl, int operand_size, uint64_t cr0, uint64_t rflags, uint64_t gla); /* Returns 1 if the 'gla' is not canonical and 0 otherwise. */ int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); uint64_t vie_size2mask(int size); int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot, uint64_t *gla); #ifdef _KERNEL /* * APIs to fetch and decode the instruction from nested page fault handler. * * 'vie' must be initialized before calling 'vmm_fetch_instruction()' */ int vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *guest_paging, uint64_t rip, int inst_length, struct vie *vie, int *is_fault); /* * Translate the guest linear address 'gla' to a guest physical address. * * retval is_fault Interpretation * 0 0 'gpa' contains result of the translation * 0 1 An exception was injected into the guest * EFAULT N/A An unrecoverable hypervisor error occurred */ int vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *is_fault); /* * Like vm_gla2gpa, but no exceptions are injected into the guest and * PTEs are not changed. 
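 *
 * Calling-convention sketch for the nofault variant declared below
 * (hypothetical caller):
 *
 *	uint64_t gpa;
 *	int error, fault;
 *
 *	error = vm_gla2gpa_nofault(vcpu, &paging, gla, PROT_READ,
 *	    &gpa, &fault);
 *	if (error)
 *		return (error);		// unrecoverable hypervisor error
 *	if (fault)
 *		return (0);		// translation failed; nothing was injected
 *	// success: 'gpa' holds the translation of 'gla'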
*/ int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *is_fault); #endif /* _KERNEL */ void vie_restart(struct vie *vie); void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); /* * Decode the instruction fetched into 'vie' so it can be emulated. * * 'gla' is the guest linear address provided by the hardware assist * that caused the nested page table fault. It is used to verify that * the software instruction decoding is in agreement with the hardware. * * Some hardware assists do not provide the 'gla' to the hypervisor. * To skip the 'gla' verification for this or any other reason pass * in VIE_INVALID_GLA instead. */ #ifdef _KERNEL #define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ int vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla, enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); #else /* !_KERNEL */ /* * Permit instruction decoding logic to be compiled outside of the kernel for * rapid iteration and validation. No GLA validation is performed, obviously. */ int vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); #endif /* _KERNEL */ #endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 32791762c9fe..de24d4bf550a 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -1,2944 +1,2944 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Sandvine, Inc. * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include #include #include #include #include #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #include #include #include #include #include #include #include #define __diagused #define KASSERT(exp,msg) assert((exp)) #define panic(...) 
errx(4, __VA_ARGS__) #endif /* _KERNEL */ #include #include #include /* struct vie_op.op_type */ enum { VIE_OP_TYPE_NONE = 0, VIE_OP_TYPE_MOV, VIE_OP_TYPE_MOVSX, VIE_OP_TYPE_MOVZX, VIE_OP_TYPE_AND, VIE_OP_TYPE_OR, VIE_OP_TYPE_SUB, VIE_OP_TYPE_TWO_BYTE, VIE_OP_TYPE_PUSH, VIE_OP_TYPE_CMP, VIE_OP_TYPE_POP, VIE_OP_TYPE_MOVS, VIE_OP_TYPE_GROUP1, VIE_OP_TYPE_STOS, VIE_OP_TYPE_BITTEST, VIE_OP_TYPE_TWOB_GRP15, VIE_OP_TYPE_ADD, VIE_OP_TYPE_TEST, VIE_OP_TYPE_BEXTR, VIE_OP_TYPE_LAST }; /* struct vie_op.op_flags */ #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ #define VIE_OP_F_NO_MODRM (1 << 3) #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) static const struct vie_op three_byte_opcodes_0f38[256] = { [0xF7] = { .op_byte = 0xF7, .op_type = VIE_OP_TYPE_BEXTR, }, }; static const struct vie_op two_byte_opcodes[256] = { [0xAE] = { .op_byte = 0xAE, .op_type = VIE_OP_TYPE_TWOB_GRP15, }, [0xB6] = { .op_byte = 0xB6, .op_type = VIE_OP_TYPE_MOVZX, }, [0xB7] = { .op_byte = 0xB7, .op_type = VIE_OP_TYPE_MOVZX, }, [0xBA] = { .op_byte = 0xBA, .op_type = VIE_OP_TYPE_BITTEST, .op_flags = VIE_OP_F_IMM8, }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, }, }; static const struct vie_op one_byte_opcodes[256] = { [0x03] = { .op_byte = 0x03, .op_type = VIE_OP_TYPE_ADD, }, [0x0F] = { .op_byte = 0x0F, .op_type = VIE_OP_TYPE_TWO_BYTE }, [0x0B] = { .op_byte = 0x0B, .op_type = VIE_OP_TYPE_OR, }, [0x2B] = { .op_byte = 0x2B, .op_type = VIE_OP_TYPE_SUB, }, [0x39] = { .op_byte = 0x39, .op_type = VIE_OP_TYPE_CMP, }, [0x3B] = { .op_byte = 0x3B, .op_type = VIE_OP_TYPE_CMP, }, [0x88] = { .op_byte = 0x88, .op_type = VIE_OP_TYPE_MOV, }, [0x89] = { .op_byte = 0x89, .op_type = VIE_OP_TYPE_MOV, }, [0x8A] = { .op_byte = 0x8A, .op_type = VIE_OP_TYPE_MOV, }, [0x8B] = { .op_byte = 0x8B, .op_type = VIE_OP_TYPE_MOV, }, [0xA1] = { .op_byte = 0xA1, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, [0xA3] = { .op_byte = 0xA3, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, [0xA4] = { .op_byte = 0xA4, .op_type = VIE_OP_TYPE_MOVS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xA5] = { .op_byte = 0xA5, .op_type = VIE_OP_TYPE_MOVS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xAA] = { .op_byte = 0xAA, .op_type = VIE_OP_TYPE_STOS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xAB] = { .op_byte = 0xAB, .op_type = VIE_OP_TYPE_STOS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xC6] = { /* XXX Group 11 extended opcode - not just MOV */ .op_byte = 0xC6, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_IMM8, }, [0xC7] = { .op_byte = 0xC7, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_IMM, }, [0x23] = { .op_byte = 0x23, .op_type = VIE_OP_TYPE_AND, }, [0x80] = { /* Group 1 extended opcode */ .op_byte = 0x80, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, }, [0x81] = { /* Group 1 extended opcode */ .op_byte = 0x81, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM, }, [0x83] = { /* Group 1 extended opcode */ .op_byte = 0x83, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, }, [0x8F] = { /* XXX Group 1A extended opcode - not just POP */ .op_byte = 0x8F, .op_type = VIE_OP_TYPE_POP, }, [0xF7] = { /* XXX Group 3 extended opcode - not just TEST */ .op_byte = 0xF7, .op_type = VIE_OP_TYPE_TEST, .op_flags = VIE_OP_F_IMM, }, [0xFF] = { /* XXX 
Group 5 extended opcode - not just PUSH */ .op_byte = 0xFF, .op_type = VIE_OP_TYPE_PUSH, } }; /* struct vie.mod */ #define VIE_MOD_INDIRECT 0 #define VIE_MOD_INDIRECT_DISP8 1 #define VIE_MOD_INDIRECT_DISP32 2 #define VIE_MOD_DIRECT 3 /* struct vie.rm */ #define VIE_RM_SIB 4 #define VIE_RM_DISP32 5 #define GB (1024 * 1024 * 1024) static enum vm_reg_name gpr_map[16] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RBX, VM_REG_GUEST_RSP, VM_REG_GUEST_RBP, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15 }; static uint64_t size2mask[] = { [1] = 0xff, [2] = 0xffff, [4] = 0xffffffff, [8] = 0xffffffffffffffff, }; static int -vie_read_register(VCPU_DECL, enum vm_reg_name reg, uint64_t *rval) +vie_read_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t *rval) { int error; - error = vm_get_register(VCPU_ARGS, reg, rval); + error = vm_get_register(vcpu, reg, rval); return (error); } static void vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) { *lhbr = 0; *reg = gpr_map[vie->reg]; /* * 64-bit mode imposes limitations on accessing legacy high byte * registers (lhbr). * * The legacy high-byte registers cannot be addressed if the REX * prefix is present. In this case the values 4, 5, 6 and 7 of the * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. * * If the REX prefix is not present then the values 4, 5, 6 and 7 * of the 'ModRM:reg' field address the legacy high-byte registers, * %ah, %ch, %dh and %bh respectively. */ if (!vie->rex_present) { if (vie->reg & 0x4) { *lhbr = 1; *reg = gpr_map[vie->reg & 0x3]; } } } static int -vie_read_bytereg(VCPU_DECL, struct vie *vie, uint8_t *rval) +vie_read_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t *rval) { uint64_t val; int error, lhbr; enum vm_reg_name reg; vie_calc_bytereg(vie, ®, &lhbr); - error = vm_get_register(VCPU_ARGS, reg, &val); + error = vm_get_register(vcpu, reg, &val); /* * To obtain the value of a legacy high byte register shift the * base register right by 8 bits (%ah = %rax >> 8). */ if (lhbr) *rval = val >> 8; else *rval = val; return (error); } static int -vie_write_bytereg(VCPU_DECL, struct vie *vie, uint8_t byte) +vie_write_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t byte) { uint64_t origval, val, mask; int error, lhbr; enum vm_reg_name reg; vie_calc_bytereg(vie, ®, &lhbr); - error = vm_get_register(VCPU_ARGS, reg, &origval); + error = vm_get_register(vcpu, reg, &origval); if (error == 0) { val = byte; mask = 0xff; if (lhbr) { /* * Shift left by 8 to store 'byte' in a legacy high * byte register. 
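 *
 * For example (hypothetical values): writing byte 0x5A to %ah while
 * %rax is 0x1122334455667788 gives val = 0x5A00 and mask = 0xff00,
 * and the merge below leaves %rax = 0x1122334455665A88.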
*/ val <<= 8; mask <<= 8; } val |= origval & ~mask; - error = vm_set_register(VCPU_ARGS, reg, val); + error = vm_set_register(vcpu, reg, val); } return (error); } int -vie_update_register(VCPU_DECL, enum vm_reg_name reg, +vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t val, int size) { int error; uint64_t origval; switch (size) { case 1: case 2: - error = vie_read_register(VCPU_ARGS, reg, &origval); + error = vie_read_register(vcpu, reg, &origval); if (error) return (error); val &= size2mask[size]; val |= origval & ~size2mask[size]; break; case 4: val &= 0xffffffffUL; break; case 8: break; default: return (EINVAL); } - error = vm_set_register(VCPU_ARGS, reg, val); + error = vm_set_register(vcpu, reg, val); return (error); } #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) /* * Return the status flags that would result from doing (x - y). */ #define GETCC(sz) \ static u_long \ getcc##sz(uint##sz##_t x, uint##sz##_t y) \ { \ u_long rflags; \ \ __asm __volatile("sub %2,%1; pushfq; popq %0" : \ "=r" (rflags), "+r" (x) : "m" (y)); \ return (rflags); \ } struct __hack GETCC(8); GETCC(16); GETCC(32); GETCC(64); static u_long getcc(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, ("getcc: invalid operand size %d", opsize)); if (opsize == 1) return (getcc8(x, y)); else if (opsize == 2) return (getcc16(x, y)); else if (opsize == 4) return (getcc32(x, y)); else return (getcc64(x, y)); } /* * Macro creation of functions getaddflags{8,16,32,64} */ #define GETADDFLAGS(sz) \ static u_long \ getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ { \ u_long rflags; \ \ __asm __volatile("add %2,%1; pushfq; popq %0" : \ "=r" (rflags), "+r" (x) : "m" (y)); \ return (rflags); \ } struct __hack GETADDFLAGS(8); GETADDFLAGS(16); GETADDFLAGS(32); GETADDFLAGS(64); static u_long getaddflags(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, ("getaddflags: invalid operand size %d", opsize)); if (opsize == 1) return (getaddflags8(x, y)); else if (opsize == 2) return (getaddflags16(x, y)); else if (opsize == 4) return (getaddflags32(x, y)); else return (getaddflags64(x, y)); } /* * Return the status flags that would result from doing (x & y). 
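 *
 * As with getcc() and getaddflags() above, callers are only interested in
 * the RFLAGS_STATUS_BITS portion of the result; a typical emulation path
 * merges it into the guest RFLAGS roughly as follows (sketch):
 *
 *	rflags &= ~RFLAGS_STATUS_BITS;
 *	rflags |= getandflags(size, x, y) & RFLAGS_STATUS_BITS;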
*/ #define GETANDFLAGS(sz) \ static u_long \ getandflags##sz(uint##sz##_t x, uint##sz##_t y) \ { \ u_long rflags; \ \ __asm __volatile("and %2,%1; pushfq; popq %0" : \ "=r" (rflags), "+r" (x) : "m" (y)); \ return (rflags); \ } struct __hack GETANDFLAGS(8); GETANDFLAGS(16); GETANDFLAGS(32); GETANDFLAGS(64); static u_long getandflags(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, ("getandflags: invalid operand size %d", opsize)); if (opsize == 1) return (getandflags8(x, y)); else if (opsize == 2) return (getandflags16(x, y)); else if (opsize == 4) return (getandflags32(x, y)); else return (getandflags64(x, y)); } static int -emulate_mov(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_mov(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint8_t byte; uint64_t val; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x88: /* * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) * 88/r: mov r/m8, r8 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) */ size = 1; /* override for byte operation */ - error = vie_read_bytereg(VCPU_ARGS, vie, &byte); + error = vie_read_bytereg(vcpu, vie, &byte); if (error == 0) - error = memwrite(VCPU_ARGS, gpa, byte, size, arg); + error = memwrite(vcpu, gpa, byte, size, arg); break; case 0x89: /* * MOV from reg (ModRM:reg) to mem (ModRM:r/m) * 89/r: mov r/m16, r16 * 89/r: mov r/m32, r32 * REX.W + 89/r mov r/m64, r64 */ reg = gpr_map[vie->reg]; - error = vie_read_register(VCPU_ARGS, reg, &val); + error = vie_read_register(vcpu, reg, &val); if (error == 0) { val &= size2mask[size]; - error = memwrite(VCPU_ARGS, gpa, val, size, arg); + error = memwrite(vcpu, gpa, val, size, arg); } break; case 0x8A: /* * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) * 8A/r: mov r8, r/m8 * REX + 8A/r: mov r8, r/m8 */ size = 1; /* override for byte operation */ - error = memread(VCPU_ARGS, gpa, &val, size, arg); + error = memread(vcpu, gpa, &val, size, arg); if (error == 0) - error = vie_write_bytereg(VCPU_ARGS, vie, val); + error = vie_write_bytereg(vcpu, vie, val); break; case 0x8B: /* * MOV from mem (ModRM:r/m) to reg (ModRM:reg) * 8B/r: mov r16, r/m16 * 8B/r: mov r32, r/m32 * REX.W 8B/r: mov r64, r/m64 */ - error = memread(VCPU_ARGS, gpa, &val, size, arg); + error = memread(vcpu, gpa, &val, size, arg); if (error == 0) { reg = gpr_map[vie->reg]; - error = vie_update_register(VCPU_ARGS, reg, val, size); + error = vie_update_register(vcpu, reg, val, size); } break; case 0xA1: /* * MOV from seg:moffset to AX/EAX/RAX * A1: mov AX, moffs16 * A1: mov EAX, moffs32 * REX.W + A1: mov RAX, moffs64 */ - error = memread(VCPU_ARGS, gpa, &val, size, arg); + error = memread(vcpu, gpa, &val, size, arg); if (error == 0) { reg = VM_REG_GUEST_RAX; - error = vie_update_register(VCPU_ARGS, reg, val, size); + error = vie_update_register(vcpu, reg, val, size); } break; case 0xA3: /* * MOV from AX/EAX/RAX to seg:moffset * A3: mov moffs16, AX * A3: mov moffs32, EAX * REX.W + A3: mov moffs64, RAX */ - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RAX, &val); + error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val); if (error == 0) { val &= size2mask[size]; - error = memwrite(VCPU_ARGS, gpa, val, size, arg); + error = memwrite(vcpu, gpa, val, size, arg); } break; case 0xC6: /* * MOV from imm8 to mem (ModRM:r/m) * C6/0 mov r/m8, imm8 * REX + C6/0 mov r/m8, imm8 */ size = 1; /* override for byte operation */ - 
error = memwrite(VCPU_ARGS, gpa, vie->immediate, size, arg); + error = memwrite(vcpu, gpa, vie->immediate, size, arg); break; case 0xC7: /* * MOV from imm16/imm32 to mem (ModRM:r/m) * C7/0 mov r/m16, imm16 * C7/0 mov r/m32, imm32 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) */ val = vie->immediate & size2mask[size]; - error = memwrite(VCPU_ARGS, gpa, val, size, arg); + error = memwrite(vcpu, gpa, val, size, arg); break; default: break; } return (error); } static int -emulate_movx(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_movx(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { int error, size; enum vm_reg_name reg; uint64_t val; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0xB6: /* * MOV and zero extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F B6/r movzx r16, r/m8 * 0F B6/r movzx r32, r/m8 * REX.W + 0F B6/r movzx r64, r/m8 */ /* get the first operand */ - error = memread(VCPU_ARGS, gpa, &val, 1, arg); + error = memread(vcpu, gpa, &val, 1, arg); if (error) break; /* get the second operand */ reg = gpr_map[vie->reg]; /* zero-extend byte */ val = (uint8_t)val; /* write the result */ - error = vie_update_register(VCPU_ARGS, reg, val, size); + error = vie_update_register(vcpu, reg, val, size); break; case 0xB7: /* * MOV and zero extend word from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F B7/r movzx r32, r/m16 * REX.W + 0F B7/r movzx r64, r/m16 */ - error = memread(VCPU_ARGS, gpa, &val, 2, arg); + error = memread(vcpu, gpa, &val, 2, arg); if (error) return (error); reg = gpr_map[vie->reg]; /* zero-extend word */ val = (uint16_t)val; - error = vie_update_register(VCPU_ARGS, reg, val, size); + error = vie_update_register(vcpu, reg, val, size); break; case 0xBE: /* * MOV and sign extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F BE/r movsx r16, r/m8 * 0F BE/r movsx r32, r/m8 * REX.W + 0F BE/r movsx r64, r/m8 */ /* get the first operand */ - error = memread(VCPU_ARGS, gpa, &val, 1, arg); + error = memread(vcpu, gpa, &val, 1, arg); if (error) break; /* get the second operand */ reg = gpr_map[vie->reg]; /* sign extend byte */ val = (int8_t)val; /* write the result */ - error = vie_update_register(VCPU_ARGS, reg, val, size); + error = vie_update_register(vcpu, reg, val, size); break; default: break; } return (error); } /* * Helper function to calculate and validate a linear address. 
*/ static int -get_gla(VCPU_DECL, struct vie *vie __unused, +get_gla(struct vcpu *vcpu, struct vie *vie __unused, struct vm_guest_paging *paging, int opsize, int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault) { struct seg_desc desc; uint64_t cr0, val, rflags; int error __diagused; - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_CR0, &cr0); + error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); - error = vm_get_seg_desc(VCPU_ARGS, seg, &desc); + error = vm_get_seg_desc(vcpu, seg, &desc); KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", __func__, error, seg)); - error = vie_read_register(VCPU_ARGS, gpr, &val); + error = vie_read_register(vcpu, gpr, &val); KASSERT(error == 0, ("%s: error %d getting register %d", __func__, error, gpr)); if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, addrsize, prot, gla)) { if (seg == VM_REG_GUEST_SS) - vm_inject_ss(VCPU_ARGS, 0); + vm_inject_ss(vcpu, 0); else - vm_inject_gp(VCPU_ARGS); + vm_inject_gp(vcpu); goto guest_fault; } if (vie_canonical_check(paging->cpu_mode, *gla)) { if (seg == VM_REG_GUEST_SS) - vm_inject_ss(VCPU_ARGS, 0); + vm_inject_ss(vcpu, 0); else - vm_inject_gp(VCPU_ARGS); + vm_inject_gp(vcpu); goto guest_fault; } if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { - vm_inject_ac(VCPU_ARGS, 0); + vm_inject_ac(vcpu, 0); goto guest_fault; } *fault = 0; return (0); guest_fault: *fault = 1; return (0); } static int -emulate_movs(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_movs(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { #ifdef _KERNEL struct vm_copyinfo copyinfo[2]; #else struct iovec copyinfo[2]; #endif uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; uint64_t rcx, rdi, rsi, rflags; int error, fault, opsize, seg, repeat; opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; val = 0; error = 0; /* * XXX although the MOVS instruction is only supposed to be used with * the "rep" prefix some guests like FreeBSD will use "repnz" instead. * * Empirically the "repnz" prefix has identical behavior to "rep" * and the zero flag does not make a difference. */ repeat = vie->repz_present | vie->repnz_present; if (repeat) { - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RCX, &rcx); + error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx); KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); /* * The count register is %rcx, %ecx or %cx depending on the * address size of the instruction. */ if ((rcx & vie_size2mask(vie->addrsize)) == 0) { error = 0; goto done; } } /* * Source Destination Comments * -------------------------------------------- * (1) memory memory n/a * (2) memory mmio emulated * (3) mmio memory emulated * (4) mmio mmio emulated * * At this point we don't have sufficient information to distinguish * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this * out because it will succeed only when operating on regular memory. * * XXX the emulation doesn't properly handle the case where 'gpa' * is straddling the boundary between the normal memory and MMIO. */ seg = vie->segment_override ? 
vie->segment_register : VM_REG_GUEST_DS; - error = get_gla(VCPU_ARGS, vie, paging, opsize, vie->addrsize, + error = get_gla(vcpu, vie, paging, opsize, vie->addrsize, PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); if (error || fault) goto done; - error = vm_copy_setup(VCPU_ARGS, paging, srcaddr, opsize, PROT_READ, + error = vm_copy_setup(vcpu, paging, srcaddr, opsize, PROT_READ, copyinfo, nitems(copyinfo), &fault); if (error == 0) { if (fault) goto done; /* Resume guest to handle fault */ /* * case (2): read from system memory and write to mmio. */ vm_copyin(copyinfo, &val, opsize); vm_copy_teardown(copyinfo, nitems(copyinfo)); - error = memwrite(VCPU_ARGS, gpa, val, opsize, arg); + error = memwrite(vcpu, gpa, val, opsize, arg); if (error) goto done; } else { /* * 'vm_copy_setup()' is expected to fail for cases (3) and (4) * if 'srcaddr' is in the mmio space. */ - error = get_gla(VCPU_ARGS, vie, paging, opsize, vie->addrsize, + error = get_gla(vcpu, vie, paging, opsize, vie->addrsize, PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, &fault); if (error || fault) goto done; - error = vm_copy_setup(VCPU_ARGS, paging, dstaddr, opsize, + error = vm_copy_setup(vcpu, paging, dstaddr, opsize, PROT_WRITE, copyinfo, nitems(copyinfo), &fault); if (error == 0) { if (fault) goto done; /* Resume guest to handle fault */ /* * case (3): read from MMIO and write to system memory. * * A MMIO read can have side-effects so we * commit to it only after vm_copy_setup() is * successful. If a page-fault needs to be * injected into the guest then it will happen * before the MMIO read is attempted. */ - error = memread(VCPU_ARGS, gpa, &val, opsize, arg); + error = memread(vcpu, gpa, &val, opsize, arg); if (error) goto done; vm_copyout(&val, copyinfo, opsize); vm_copy_teardown(copyinfo, nitems(copyinfo)); } else { /* * Case (4): read from and write to mmio. * * Commit to the MMIO read/write (with potential * side-effects) only after we are sure that the * instruction is not going to be restarted due * to address translation faults. 
*/ - error = vm_gla2gpa(VCPU_ARGS, paging, srcaddr, + error = vm_gla2gpa(vcpu, paging, srcaddr, PROT_READ, &srcgpa, &fault); if (error || fault) goto done; - error = vm_gla2gpa(VCPU_ARGS, paging, dstaddr, + error = vm_gla2gpa(vcpu, paging, dstaddr, PROT_WRITE, &dstgpa, &fault); if (error || fault) goto done; - error = memread(VCPU_ARGS, srcgpa, &val, opsize, arg); + error = memread(vcpu, srcgpa, &val, opsize, arg); if (error) goto done; - error = memwrite(VCPU_ARGS, dstgpa, val, opsize, arg); + error = memwrite(vcpu, dstgpa, val, opsize, arg); if (error) goto done; } } - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RSI, &rsi); + error = vie_read_register(vcpu, VM_REG_GUEST_RSI, &rsi); KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RDI, &rdi); + error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi); KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); if (rflags & PSL_D) { rsi -= opsize; rdi -= opsize; } else { rsi += opsize; rdi += opsize; } - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RSI, rsi, + error = vie_update_register(vcpu, VM_REG_GUEST_RSI, rsi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RDI, rdi, + error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); if (repeat) { rcx = rcx - 1; - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RCX, + error = vie_update_register(vcpu, VM_REG_GUEST_RCX, rcx, vie->addrsize); KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); /* * Repeat the instruction if the count register is not zero. */ if ((rcx & vie_size2mask(vie->addrsize)) != 0) - vm_restart_instruction(VCPU_ARGS); + vm_restart_instruction(vcpu); } done: KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", __func__, error)); return (error); } static int -emulate_stos(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_stos(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused, mem_region_write_t memwrite, void *arg) { int error, opsize, repeat; uint64_t val; uint64_t rcx, rdi, rflags; opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; repeat = vie->repz_present | vie->repnz_present; if (repeat) { - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RCX, &rcx); + error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx); KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); /* * The count register is %rcx, %ecx or %cx depending on the * address size of the instruction. 
*/ if ((rcx & vie_size2mask(vie->addrsize)) == 0) return (0); } - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RAX, &val); + error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val); KASSERT(!error, ("%s: error %d getting rax", __func__, error)); - error = memwrite(VCPU_ARGS, gpa, val, opsize, arg); + error = memwrite(vcpu, gpa, val, opsize, arg); if (error) return (error); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RDI, &rdi); + error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi); KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); if (rflags & PSL_D) rdi -= opsize; else rdi += opsize; - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RDI, rdi, + error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); if (repeat) { rcx = rcx - 1; - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RCX, + error = vie_update_register(vcpu, VM_REG_GUEST_RCX, rcx, vie->addrsize); KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); /* * Repeat the instruction if the count register is not zero. */ if ((rcx & vie_size2mask(vie->addrsize)) != 0) - vm_restart_instruction(VCPU_ARGS); + vm_restart_instruction(vcpu); } return (0); } static int -emulate_and(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_and(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t result, rflags, rflags2, val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x23: /* * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the * result in reg. * * 23/r and r16, r/m16 * 23/r and r32, r/m32 * REX.W + 23/r and r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(VCPU_ARGS, reg, &val1); + error = vie_read_register(vcpu, reg, &val1); if (error) break; /* get the second operand */ - error = memread(VCPU_ARGS, gpa, &val2, size, arg); + error = memread(vcpu, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ result = val1 & val2; - error = vie_update_register(VCPU_ARGS, reg, result, size); + error = vie_update_register(vcpu, reg, result, size); break; case 0x81: case 0x83: /* * AND mem (ModRM:r/m) with immediate and store the * result in mem. * * 81 /4 and r/m16, imm16 * 81 /4 and r/m32, imm32 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 * * 83 /4 and r/m16, imm8 sign-extended to 16 * 83 /4 and r/m32, imm8 sign-extended to 32 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 */ /* get the first operand */ - error = memread(VCPU_ARGS, gpa, &val1, size, arg); + error = memread(vcpu, gpa, &val1, size, arg); if (error) break; /* * perform the operation with the pre-fetched immediate * operand and write the result */ result = val1 & vie->immediate; - error = memwrite(VCPU_ARGS, gpa, result, size, arg); + error = memwrite(vcpu, gpa, result, size, arg); break; default: break; } if (error) return (error); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. 
* * The updated status flags are obtained by subtracting 0 from 'result'. */ rflags2 = getcc(size, result, 0); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, 8); + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int -emulate_or(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_or(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t result, rflags, rflags2, val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x0B: /* * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the * result in reg. * * 0b/r or r16, r/m16 * 0b/r or r32, r/m32 * REX.W + 0b/r or r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(VCPU_ARGS, reg, &val1); + error = vie_read_register(vcpu, reg, &val1); if (error) break; /* get the second operand */ - error = memread(VCPU_ARGS, gpa, &val2, size, arg); + error = memread(vcpu, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ result = val1 | val2; - error = vie_update_register(VCPU_ARGS, reg, result, size); + error = vie_update_register(vcpu, reg, result, size); break; case 0x81: case 0x83: /* * OR mem (ModRM:r/m) with immediate and store the * result in mem. * * 81 /1 or r/m16, imm16 * 81 /1 or r/m32, imm32 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 * * 83 /1 or r/m16, imm8 sign-extended to 16 * 83 /1 or r/m32, imm8 sign-extended to 32 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 */ /* get the first operand */ - error = memread(VCPU_ARGS, gpa, &val1, size, arg); + error = memread(vcpu, gpa, &val1, size, arg); if (error) break; /* * perform the operation with the pre-fetched immediate * operand and write the result */ result = val1 | vie->immediate; - error = memwrite(VCPU_ARGS, gpa, result, size, arg); + error = memwrite(vcpu, gpa, result, size, arg); break; default: break; } if (error) return (error); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. * * The updated status flags are obtained by subtracting 0 from 'result'. */ rflags2 = getcc(size, result, 0); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, 8); + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int -emulate_cmp(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_cmp(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { int error, size; uint64_t regop, memop, op1, op2, rflags, rflags2; enum vm_reg_name reg; size = vie->opsize; switch (vie->op.op_byte) { case 0x39: case 0x3B: /* * 39/r CMP r/m16, r16 * 39/r CMP r/m32, r32 * REX.W 39/r CMP r/m64, r64 * * 3B/r CMP r16, r/m16 * 3B/r CMP r32, r/m32 * REX.W + 3B/r CMP r64, r/m64 * * Compare the first operand with the second operand and * set status flags in EFLAGS register. The comparison is * performed by subtracting the second operand from the first * operand and then setting the status flags. 
*/ /* Get the register operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(VCPU_ARGS, reg, ®op); + error = vie_read_register(vcpu, reg, ®op); if (error) return (error); /* Get the memory operand */ - error = memread(VCPU_ARGS, gpa, &memop, size, arg); + error = memread(vcpu, gpa, &memop, size, arg); if (error) return (error); if (vie->op.op_byte == 0x3B) { op1 = regop; op2 = memop; } else { op1 = memop; op2 = regop; } rflags2 = getcc(size, op1, op2); break; case 0x80: case 0x81: case 0x83: /* * 80 /7 cmp r/m8, imm8 * REX + 80 /7 cmp r/m8, imm8 * * 81 /7 cmp r/m16, imm16 * 81 /7 cmp r/m32, imm32 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 * * 83 /7 cmp r/m16, imm8 sign-extended to 16 * 83 /7 cmp r/m32, imm8 sign-extended to 32 * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 * * Compare mem (ModRM:r/m) with immediate and set * status flags according to the results. The * comparison is performed by subtracting the * immediate from the first operand and then setting * the status flags. * */ if (vie->op.op_byte == 0x80) size = 1; /* get the first operand */ - error = memread(VCPU_ARGS, gpa, &op1, size, arg); + error = memread(vcpu, gpa, &op1, size, arg); if (error) return (error); rflags2 = getcc(size, op1, vie->immediate); break; default: return (EINVAL); } - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, 8); + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int -emulate_test(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_test(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { int error, size; uint64_t op1, rflags, rflags2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0xF7: /* * F7 /0 test r/m16, imm16 * F7 /0 test r/m32, imm32 * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64 * * Test mem (ModRM:r/m) with immediate and set status * flags according to the results. The comparison is * performed by anding the immediate from the first * operand and then setting the status flags. */ if ((vie->reg & 7) != 0) return (EINVAL); - error = memread(VCPU_ARGS, gpa, &op1, size, arg); + error = memread(vcpu, gpa, &op1, size, arg); if (error) return (error); rflags2 = getandflags(size, op1, vie->immediate); break; default: return (EINVAL); } - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. 
*/ rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, 8); + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int -emulate_bextr(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_bextr(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { uint64_t src1, src2, dst, rflags; unsigned start, len, size; int error; size = vie->opsize; error = EINVAL; /* * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b * * Destination operand is ModRM:reg. Source operands are ModRM:r/m and * Vex.vvvv. * * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored). */ if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT) size = 4; /* * Extracts contiguous bits from the first /source/ operand (second * operand) using an index and length specified in the second /source/ * operand (third operand). */ - error = memread(VCPU_ARGS, gpa, &src1, size, arg); + error = memread(vcpu, gpa, &src1, size, arg); if (error) return (error); - error = vie_read_register(VCPU_ARGS, gpr_map[vie->vex_reg], &src2); + error = vie_read_register(vcpu, gpr_map[vie->vex_reg], &src2); if (error) return (error); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); start = (src2 & 0xff); len = (src2 & 0xff00) >> 8; /* If no bits are extracted, the destination register is cleared. */ dst = 0; /* If START exceeds the operand size, no bits are extracted. */ if (start > size * 8) goto done; /* Length is bounded by both the destination size and start offset. */ if (start + len > size * 8) len = (size * 8) - start; if (len == 0) goto done; if (start > 0) src1 = (src1 >> start); if (len < 64) src1 = src1 & ((1ull << len) - 1); dst = src1; done: - error = vie_update_register(VCPU_ARGS, gpr_map[vie->reg], dst, size); + error = vie_update_register(vcpu, gpr_map[vie->reg], dst, size); if (error) return (error); /* * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result. * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared. 
*/ rflags &= ~RFLAGS_STATUS_BITS; if (dst == 0) rflags |= PSL_Z; - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int -emulate_add(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_add(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { int error, size; uint64_t nval, rflags, rflags2, val1, val2; enum vm_reg_name reg; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x03: /* * ADD r/m to r and store the result in r * * 03/r ADD r16, r/m16 * 03/r ADD r32, r/m32 * REX.W + 03/r ADD r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(VCPU_ARGS, reg, &val1); + error = vie_read_register(vcpu, reg, &val1); if (error) break; /* get the second operand */ - error = memread(VCPU_ARGS, gpa, &val2, size, arg); + error = memread(vcpu, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ nval = val1 + val2; - error = vie_update_register(VCPU_ARGS, reg, nval, size); + error = vie_update_register(vcpu, reg, nval, size); break; default: break; } if (!error) { rflags2 = getaddflags(size, val1, val2); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); } return (error); } static int -emulate_sub(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_sub(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { int error, size; uint64_t nval, rflags, rflags2, val1, val2; enum vm_reg_name reg; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x2B: /* * SUB r/m from r and store the result in r * * 2B/r SUB r16, r/m16 * 2B/r SUB r32, r/m32 * REX.W + 2B/r SUB r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(VCPU_ARGS, reg, &val1); + error = vie_read_register(vcpu, reg, &val1); if (error) break; /* get the second operand */ - error = memread(VCPU_ARGS, gpa, &val2, size, arg); + error = memread(vcpu, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ nval = val1 - val2; - error = vie_update_register(VCPU_ARGS, reg, nval, size); + error = vie_update_register(vcpu, reg, nval, size); break; default: break; } if (!error) { rflags2 = getcc(size, val1, val2); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); } return (error); } static int -emulate_stack_op(VCPU_DECL, uint64_t mmio_gpa, struct vie *vie, +emulate_stack_op(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { #ifdef _KERNEL struct vm_copyinfo copyinfo[2]; #else struct iovec copyinfo[2]; #endif struct seg_desc ss_desc; uint64_t cr0, rflags, rsp, stack_gla, val; int error, fault, size, stackaddrsize, pushop; val 
= 0; size = vie->opsize; pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0; /* * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 */ if (paging->cpu_mode == CPU_MODE_REAL) { stackaddrsize = 2; } else if (paging->cpu_mode == CPU_MODE_64BIT) { /* * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 * - Stack pointer size is always 64-bits. * - PUSH/POP of 32-bit values is not possible in 64-bit mode. * - 16-bit PUSH/POP is supported by using the operand size * override prefix (66H). */ stackaddrsize = 8; size = vie->opsize_override ? 2 : 8; } else { /* * In protected or compatibility mode the 'B' flag in the * stack-segment descriptor determines the size of the * stack pointer. */ - error = vm_get_seg_desc(VCPU_ARGS, VM_REG_GUEST_SS, &ss_desc); + error = vm_get_seg_desc(vcpu, VM_REG_GUEST_SS, &ss_desc); KASSERT(error == 0, ("%s: error %d getting SS descriptor", __func__, error)); if (SEG_DESC_DEF32(ss_desc.access)) stackaddrsize = 4; else stackaddrsize = 2; } - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_CR0, &cr0); + error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RSP, &rsp); + error = vie_read_register(vcpu, VM_REG_GUEST_RSP, &rsp); KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); if (pushop) { rsp -= size; } if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, &stack_gla)) { - vm_inject_ss(VCPU_ARGS, 0); + vm_inject_ss(vcpu, 0); return (0); } if (vie_canonical_check(paging->cpu_mode, stack_gla)) { - vm_inject_ss(VCPU_ARGS, 0); + vm_inject_ss(vcpu, 0); return (0); } if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { - vm_inject_ac(VCPU_ARGS, 0); + vm_inject_ac(vcpu, 0); return (0); } - error = vm_copy_setup(VCPU_ARGS, paging, stack_gla, size, + error = vm_copy_setup(vcpu, paging, stack_gla, size, pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), &fault); if (error || fault) return (error); if (pushop) { - error = memread(VCPU_ARGS, mmio_gpa, &val, size, arg); + error = memread(vcpu, mmio_gpa, &val, size, arg); if (error == 0) vm_copyout(&val, copyinfo, size); } else { vm_copyin(copyinfo, &val, size); - error = memwrite(VCPU_ARGS, mmio_gpa, val, size, arg); + error = memwrite(vcpu, mmio_gpa, val, size, arg); rsp += size; } vm_copy_teardown(copyinfo, nitems(copyinfo)); if (error == 0) { - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RSP, rsp, + error = vie_update_register(vcpu, VM_REG_GUEST_RSP, rsp, stackaddrsize); KASSERT(error == 0, ("error %d updating rsp", error)); } return (error); } static int -emulate_push(VCPU_DECL, uint64_t mmio_gpa, struct vie *vie, +emulate_push(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error; /* * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. * * PUSH is part of the group 5 extended opcodes and is identified * by ModRM:reg = b110. 
*/ if ((vie->reg & 7) != 6) return (EINVAL); - error = emulate_stack_op(VCPU_ARGS, mmio_gpa, vie, paging, memread, + error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread, memwrite, arg); return (error); } static int -emulate_pop(VCPU_DECL, uint64_t mmio_gpa, struct vie *vie, +emulate_pop(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error; /* * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. * * POP is part of the group 1A extended opcodes and is identified * by ModRM:reg = b000. */ if ((vie->reg & 7) != 0) return (EINVAL); - error = emulate_stack_op(VCPU_ARGS, mmio_gpa, vie, paging, memread, + error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread, memwrite, arg); return (error); } static int -emulate_group1(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_group1(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging __unused, mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) { int error; switch (vie->reg & 7) { case 0x1: /* OR */ - error = emulate_or(VCPU_ARGS, gpa, vie, + error = emulate_or(vcpu, gpa, vie, memread, memwrite, memarg); break; case 0x4: /* AND */ - error = emulate_and(VCPU_ARGS, gpa, vie, + error = emulate_and(vcpu, gpa, vie, memread, memwrite, memarg); break; case 0x7: /* CMP */ - error = emulate_cmp(VCPU_ARGS, gpa, vie, + error = emulate_cmp(vcpu, gpa, vie, memread, memwrite, memarg); break; default: error = EINVAL; break; } return (error); } static int -emulate_bittest(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_bittest(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *memarg) { uint64_t val, rflags; int error, bitmask, bitoff; /* * 0F BA is a Group 8 extended opcode. * * Currently we only emulate the 'Bit Test' instruction which is * identified by a ModR/M:reg encoding of 100b. */ if ((vie->reg & 7) != 4) return (EINVAL); - error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); - error = memread(VCPU_ARGS, gpa, &val, vie->opsize, memarg); + error = memread(vcpu, gpa, &val, vie->opsize, memarg); if (error) return (error); /* * Intel SDM, Vol 2, Table 3-2: * "Range of Bit Positions Specified by Bit Offset Operands" */ bitmask = vie->opsize * 8 - 1; bitoff = vie->immediate & bitmask; /* Copy the bit into the Carry flag in %rflags */ if (val & (1UL << bitoff)) rflags |= PSL_C; else rflags &= ~PSL_C; - error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, 8); + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); return (0); } static int -emulate_twob_group15(VCPU_DECL, uint64_t gpa, struct vie *vie, +emulate_twob_group15(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *memarg) { int error; uint64_t buf; switch (vie->reg & 7) { case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ if (vie->mod == 0x3) { /* * SFENCE. Ignore it, VM exit provides enough * barriers on its own. */ error = 0; } else { /* * CLFLUSH, CLFLUSHOPT. Only check for access * rights. 
*/ - error = memread(VCPU_ARGS, gpa, &buf, 1, memarg); + error = memread(vcpu, gpa, &buf, 1, memarg); } break; default: error = EINVAL; break; } return (error); } int -vmm_emulate_instruction(VCPU_DECL, uint64_t gpa, struct vie *vie, +vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) { int error; if (!vie->decoded) return (EINVAL); switch (vie->op.op_type) { case VIE_OP_TYPE_GROUP1: - error = emulate_group1(VCPU_ARGS, gpa, vie, paging, memread, + error = emulate_group1(vcpu, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_POP: - error = emulate_pop(VCPU_ARGS, gpa, vie, paging, memread, + error = emulate_pop(vcpu, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_PUSH: - error = emulate_push(VCPU_ARGS, gpa, vie, paging, memread, + error = emulate_push(vcpu, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_CMP: - error = emulate_cmp(VCPU_ARGS, gpa, vie, + error = emulate_cmp(vcpu, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOV: - error = emulate_mov(VCPU_ARGS, gpa, vie, + error = emulate_mov(vcpu, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOVSX: case VIE_OP_TYPE_MOVZX: - error = emulate_movx(VCPU_ARGS, gpa, vie, + error = emulate_movx(vcpu, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOVS: - error = emulate_movs(VCPU_ARGS, gpa, vie, paging, memread, + error = emulate_movs(vcpu, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_STOS: - error = emulate_stos(VCPU_ARGS, gpa, vie, paging, memread, + error = emulate_stos(vcpu, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_AND: - error = emulate_and(VCPU_ARGS, gpa, vie, + error = emulate_and(vcpu, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_OR: - error = emulate_or(VCPU_ARGS, gpa, vie, + error = emulate_or(vcpu, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_SUB: - error = emulate_sub(VCPU_ARGS, gpa, vie, + error = emulate_sub(vcpu, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_BITTEST: - error = emulate_bittest(VCPU_ARGS, gpa, vie, + error = emulate_bittest(vcpu, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_TWOB_GRP15: - error = emulate_twob_group15(VCPU_ARGS, gpa, vie, + error = emulate_twob_group15(vcpu, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_ADD: - error = emulate_add(VCPU_ARGS, gpa, vie, memread, + error = emulate_add(vcpu, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_TEST: - error = emulate_test(VCPU_ARGS, gpa, vie, + error = emulate_test(vcpu, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_BEXTR: - error = emulate_bextr(VCPU_ARGS, gpa, vie, paging, + error = emulate_bextr(vcpu, gpa, vie, paging, memread, memwrite, memarg); break; default: error = EINVAL; break; } return (error); } int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) { KASSERT(size == 1 || size == 2 || size == 4 || size == 8, ("%s: invalid size %d", __func__, size)); KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) return (0); return ((gla & (size - 1)) ? 
1 : 0); } int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) { uint64_t mask; if (cpu_mode != CPU_MODE_64BIT) return (0); /* * The value of the bit 47 in the 'gla' should be replicated in the * most significant 16 bits. */ mask = ~((1UL << 48) - 1); if (gla & (1UL << 47)) return ((gla & mask) != mask); else return ((gla & mask) != 0); } uint64_t vie_size2mask(int size) { KASSERT(size == 1 || size == 2 || size == 4 || size == 8, ("vie_size2mask: invalid size %d", size)); return (size2mask[size]); } int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, struct seg_desc *desc, uint64_t offset, int length, int addrsize, int prot, uint64_t *gla) { uint64_t firstoff, low_limit, high_limit, segbase; int glasize, type; KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, ("%s: invalid segment %d", __func__, seg)); KASSERT(length == 1 || length == 2 || length == 4 || length == 8, ("%s: invalid operand size %d", __func__, length)); KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, ("%s: invalid prot %#x", __func__, prot)); firstoff = offset; if (cpu_mode == CPU_MODE_64BIT) { KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); glasize = 8; } else { KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); glasize = 4; /* * If the segment selector is loaded with a NULL selector * then the descriptor is unusable and attempting to use * it results in a #GP(0). */ if (SEG_DESC_UNUSABLE(desc->access)) return (-1); /* * The processor generates a #NP exception when a segment * register is loaded with a selector that points to a * descriptor that is not present. If this was the case then * it would have been checked before the VM-exit. */ KASSERT(SEG_DESC_PRESENT(desc->access), ("segment %d not present: %#x", seg, desc->access)); /* * The descriptor type must indicate a code/data segment. */ type = SEG_DESC_TYPE(desc->access); KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " "descriptor type %#x", seg, type)); if (prot & PROT_READ) { /* #GP on a read access to a exec-only code segment */ if ((type & 0xA) == 0x8) return (-1); } if (prot & PROT_WRITE) { /* * #GP on a write access to a code segment or a * read-only data segment. */ if (type & 0x8) /* code segment */ return (-1); if ((type & 0xA) == 0) /* read-only data seg */ return (-1); } /* * 'desc->limit' is fully expanded taking granularity into * account. */ if ((type & 0xC) == 0x4) { /* expand-down data segment */ low_limit = desc->limit + 1; high_limit = SEG_DESC_DEF32(desc->access) ? 0xffffffff : 0xffff; } else { /* code segment or expand-up data segment */ low_limit = 0; high_limit = desc->limit; } while (length > 0) { offset &= vie_size2mask(addrsize); if (offset < low_limit || offset > high_limit) return (-1); offset++; length--; } } /* * In 64-bit mode all segments except %fs and %gs have a segment * base address of 0. */ if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { segbase = 0; } else { segbase = desc->base; } /* * Truncate 'firstoff' to the effective address size before adding * it to the segment base. */ firstoff &= vie_size2mask(addrsize); *gla = (segbase + firstoff) & vie_size2mask(glasize); return (0); } /* * Prepare a partially decoded vie for a 2nd attempt. 
*/ void vie_restart(struct vie *vie) { _Static_assert( offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) && offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero), "restart should not erase instruction length or contents"); memset((char *)vie + offsetof(struct vie, vie_startzero), 0, sizeof(*vie) - offsetof(struct vie, vie_startzero)); vie->base_register = VM_REG_LAST; vie->index_register = VM_REG_LAST; vie->segment_register = VM_REG_LAST; } void vie_init(struct vie *vie, const char *inst_bytes, int inst_length) { KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, ("%s: invalid instruction length (%d)", __func__, inst_length)); vie_restart(vie); memset(vie->inst, 0, sizeof(vie->inst)); if (inst_length != 0) memcpy(vie->inst, inst_bytes, inst_length); vie->num_valid = inst_length; } #ifdef _KERNEL static int pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) { int error_code = 0; if (pte & PG_V) error_code |= PGEX_P; if (prot & VM_PROT_WRITE) error_code |= PGEX_W; if (usermode) error_code |= PGEX_U; if (rsvd) error_code |= PGEX_RSV; if (prot & VM_PROT_EXECUTE) error_code |= PGEX_I; return (error_code); } static void ptp_release(void **cookie) { if (*cookie != NULL) { vm_gpa_release(*cookie); *cookie = NULL; } } static void * ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) { void *ptr; ptp_release(cookie); ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie); return (ptr); } static int _vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) { int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; u_int retries; uint64_t *ptpbase, ptpphys, pte, pgsize; uint32_t *ptpbase32, pte32; void *cookie; *guest_fault = 0; usermode = (paging->cpl == 3 ? 1 : 0); writable = prot & VM_PROT_WRITE; cookie = NULL; retval = 0; retries = 0; restart: ptpphys = paging->cr3; /* root of the page tables */ ptp_release(&cookie); if (retries++ > 0) maybe_yield(); if (vie_canonical_check(paging->cpu_mode, gla)) { /* * XXX assuming a non-stack reference otherwise a stack fault * should be generated. */ if (!check_only) vm_inject_gp(vcpu); goto fault; } if (paging->paging_mode == PAGING_MODE_FLAT) { *gpa = gla; goto done; } if (paging->paging_mode == PAGING_MODE_32) { nlevels = 2; while (--nlevels >= 0) { /* Zero out the lower 12 bits. */ ptpphys &= ~0xfff; ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie); if (ptpbase32 == NULL) goto error; ptpshift = PAGE_SHIFT + nlevels * 10; ptpindex = (gla >> ptpshift) & 0x3FF; pgsize = 1UL << ptpshift; pte32 = ptpbase32[ptpindex]; if ((pte32 & PG_V) == 0 || (usermode && (pte32 & PG_U) == 0) || (writable && (pte32 & PG_RW) == 0)) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 0, pte32); vm_inject_pf(vcpu, pfcode, gla); } goto fault; } /* * Emulate the x86 MMU's management of the accessed * and dirty flags. While the accessed flag is set * at every level of the page table, the dirty flag * is only set at the last level providing the guest * physical address. 
*/ if (!check_only && (pte32 & PG_A) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_A) == 0) { goto restart; } } /* XXX must be ignored if CR4.PSE=0 */ if (nlevels > 0 && (pte32 & PG_PS) != 0) break; ptpphys = pte32; } /* Set the dirty bit in the page table entry if necessary */ if (!check_only && writable && (pte32 & PG_M) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_M) == 0) { goto restart; } } /* Zero out the lower 'ptpshift' bits */ pte32 >>= ptpshift; pte32 <<= ptpshift; *gpa = pte32 | (gla & (pgsize - 1)); goto done; } if (paging->paging_mode == PAGING_MODE_PAE) { /* Zero out the lower 5 bits and the upper 32 bits */ ptpphys &= 0xffffffe0UL; ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4, &cookie); if (ptpbase == NULL) goto error; ptpindex = (gla >> 30) & 0x3; pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vcpu, pfcode, gla); } goto fault; } ptpphys = pte; nlevels = 2; } else if (paging->paging_mode == PAGING_MODE_64_LA57) { nlevels = 5; } else { nlevels = 4; } while (--nlevels >= 0) { /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie); if (ptpbase == NULL) goto error; ptpshift = PAGE_SHIFT + nlevels * 9; ptpindex = (gla >> ptpshift) & 0x1FF; pgsize = 1UL << ptpshift; pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0 || (usermode && (pte & PG_U) == 0) || (writable && (pte & PG_RW) == 0)) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vcpu, pfcode, gla); } goto fault; } /* Set the accessed bit in the page table entry */ if (!check_only && (pte & PG_A) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_A) == 0) { goto restart; } } if (nlevels > 0 && (pte & PG_PS) != 0) { if (pgsize > 1 * GB) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 1, pte); vm_inject_pf(vcpu, pfcode, gla); } goto fault; } break; } ptpphys = pte; } /* Set the dirty bit in the page table entry if necessary */ if (!check_only && writable && (pte & PG_M) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) goto restart; } /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; *gpa = pte | (gla & (pgsize - 1)); done: ptp_release(&cookie); KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", __func__, retval)); return (retval); error: retval = EFAULT; goto done; fault: *guest_fault = 1; goto done; } int vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) { return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault, false)); } int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) { return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault, true)); } int vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t rip, int inst_length, struct vie *vie, int *faultptr) { struct vm_copyinfo copyinfo[2]; int error, prot; if (inst_length > VIE_INST_SIZE) panic("vmm_fetch_instruction: invalid length %d", inst_length); prot = PROT_READ | PROT_EXEC; error = vm_copy_setup(vcpu, paging, rip, inst_length, prot, copyinfo, nitems(copyinfo), faultptr); if (error || *faultptr) return (error); vm_copyin(copyinfo, vie->inst, inst_length); vm_copy_teardown(copyinfo, 
nitems(copyinfo)); vie->num_valid = inst_length; return (0); } #endif /* _KERNEL */ static int vie_peek(struct vie *vie, uint8_t *x) { if (vie->num_processed < vie->num_valid) { *x = vie->inst[vie->num_processed]; return (0); } else return (-1); } static void vie_advance(struct vie *vie) { vie->num_processed++; } static bool segment_override(uint8_t x, int *seg) { switch (x) { case 0x2E: *seg = VM_REG_GUEST_CS; break; case 0x36: *seg = VM_REG_GUEST_SS; break; case 0x3E: *seg = VM_REG_GUEST_DS; break; case 0x26: *seg = VM_REG_GUEST_ES; break; case 0x64: *seg = VM_REG_GUEST_FS; break; case 0x65: *seg = VM_REG_GUEST_GS; break; default: return (false); } return (true); } static int decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) { uint8_t x; while (1) { if (vie_peek(vie, &x)) return (-1); if (x == 0x66) vie->opsize_override = 1; else if (x == 0x67) vie->addrsize_override = 1; else if (x == 0xF3) vie->repz_present = 1; else if (x == 0xF2) vie->repnz_present = 1; else if (segment_override(x, &vie->segment_register)) vie->segment_override = 1; else break; vie_advance(vie); } /* * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: * - Only one REX prefix is allowed per instruction. * - The REX prefix must immediately precede the opcode byte or the * escape opcode byte. * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) * the mandatory prefix must come before the REX prefix. */ if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { vie->rex_present = 1; vie->rex_w = x & 0x8 ? 1 : 0; vie->rex_r = x & 0x4 ? 1 : 0; vie->rex_x = x & 0x2 ? 1 : 0; vie->rex_b = x & 0x1 ? 1 : 0; vie_advance(vie); } /* * § 2.3.5, "The VEX Prefix", SDM Vol 2. */ if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) { const struct vie_op *optab; /* 3-byte VEX prefix. */ vie->vex_present = 1; vie_advance(vie); if (vie_peek(vie, &x)) return (-1); /* * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted * relative to REX encoding. */ vie->rex_r = x & 0x80 ? 0 : 1; vie->rex_x = x & 0x40 ? 0 : 1; vie->rex_b = x & 0x20 ? 0 : 1; switch (x & 0x1F) { case 0x2: /* 0F 38. */ optab = three_byte_opcodes_0f38; break; case 0x1: /* 0F class - nothing handled here yet. */ /* FALLTHROUGH */ case 0x3: /* 0F 3A class - nothing handled here yet. */ /* FALLTHROUGH */ default: /* Reserved (#UD). */ return (-1); } vie_advance(vie); if (vie_peek(vie, &x)) return (-1); /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */ vie->rex_w = x & 0x80 ? 1 : 0; vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3); vie->vex_l = !!(x & 0x4); vie->vex_pp = (x & 0x3); /* PP: 1=66 2=F3 3=F2 prefixes. */ switch (vie->vex_pp) { case 0x1: vie->opsize_override = 1; break; case 0x2: vie->repz_present = 1; break; case 0x3: vie->repnz_present = 1; break; } vie_advance(vie); /* Opcode, sans literal prefix prefix. */ if (vie_peek(vie, &x)) return (-1); vie->op = optab[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); } /* * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 */ if (cpu_mode == CPU_MODE_64BIT) { /* * Default address size is 64-bits and default operand size * is 32-bits. */ vie->addrsize = vie->addrsize_override ? 4 : 8; if (vie->rex_w) vie->opsize = 8; else if (vie->opsize_override) vie->opsize = 2; else vie->opsize = 4; } else if (cs_d) { /* Default address and operand sizes are 32-bits */ vie->addrsize = vie->addrsize_override ? 2 : 4; vie->opsize = vie->opsize_override ? 
2 : 4; } else { /* Default address and operand sizes are 16-bits */ vie->addrsize = vie->addrsize_override ? 4 : 2; vie->opsize = vie->opsize_override ? 4 : 2; } return (0); } static int decode_two_byte_opcode(struct vie *vie) { uint8_t x; if (vie_peek(vie, &x)) return (-1); vie->op = two_byte_opcodes[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); return (0); } static int decode_opcode(struct vie *vie) { uint8_t x; if (vie_peek(vie, &x)) return (-1); /* Already did this via VEX prefix. */ if (vie->op.op_type != VIE_OP_TYPE_NONE) return (0); vie->op = one_byte_opcodes[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) return (decode_two_byte_opcode(vie)); return (0); } static int decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) { uint8_t x; if (vie->op.op_flags & VIE_OP_F_NO_MODRM) return (0); if (cpu_mode == CPU_MODE_REAL) return (-1); if (vie_peek(vie, &x)) return (-1); vie->mod = (x >> 6) & 0x3; vie->rm = (x >> 0) & 0x7; vie->reg = (x >> 3) & 0x7; /* * A direct addressing mode makes no sense in the context of an EPT * fault. There has to be a memory access involved to cause the * EPT fault. */ if (vie->mod == VIE_MOD_DIRECT) return (-1); if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { /* * Table 2-5: Special Cases of REX Encodings * * mod=0, r/m=5 is used in the compatibility mode to * indicate a disp32 without a base register. * * mod!=3, r/m=4 is used in the compatibility mode to * indicate that the SIB byte is present. * * The 'b' bit in the REX prefix is don't care in * this case. */ } else { vie->rm |= (vie->rex_b << 3); } vie->reg |= (vie->rex_r << 3); /* SIB */ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) goto done; vie->base_register = gpr_map[vie->rm]; switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; break; case VIE_MOD_INDIRECT_DISP32: vie->disp_bytes = 4; break; case VIE_MOD_INDIRECT: if (vie->rm == VIE_RM_DISP32) { vie->disp_bytes = 4; /* * Table 2-7. RIP-Relative Addressing * * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 * whereas in compatibility mode it just implies disp32. */ if (cpu_mode == CPU_MODE_64BIT) vie->base_register = VM_REG_GUEST_RIP; else vie->base_register = VM_REG_LAST; } break; } done: vie_advance(vie); return (0); } static int decode_sib(struct vie *vie) { uint8_t x; /* Proceed only if SIB byte is present */ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) return (0); if (vie_peek(vie, &x)) return (-1); /* De-construct the SIB byte */ vie->ss = (x >> 6) & 0x3; vie->index = (x >> 3) & 0x7; vie->base = (x >> 0) & 0x7; /* Apply the REX prefix modifiers */ vie->index |= vie->rex_x << 3; vie->base |= vie->rex_b << 3; switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; break; case VIE_MOD_INDIRECT_DISP32: vie->disp_bytes = 4; break; } if (vie->mod == VIE_MOD_INDIRECT && (vie->base == 5 || vie->base == 13)) { /* * Special case when base register is unused if mod = 0 * and base = %rbp or %r13. * * Documented in: * Table 2-3: 32-bit Addressing Forms with the SIB Byte * Table 2-5: Special Cases of REX Encodings */ vie->disp_bytes = 4; } else { vie->base_register = gpr_map[vie->base]; } /* * All encodings of 'index' are valid except for %rsp (4). 
* * Documented in: * Table 2-3: 32-bit Addressing Forms with the SIB Byte * Table 2-5: Special Cases of REX Encodings */ if (vie->index != 4) vie->index_register = gpr_map[vie->index]; /* 'scale' makes sense only in the context of an index register */ if (vie->index_register < VM_REG_LAST) vie->scale = 1 << vie->ss; vie_advance(vie); return (0); } static int decode_displacement(struct vie *vie) { int n, i; uint8_t x; union { char buf[4]; int8_t signed8; int32_t signed32; } u; if ((n = vie->disp_bytes) == 0) return (0); if (n != 1 && n != 4) panic("decode_displacement: invalid disp_bytes %d", n); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } if (n == 1) vie->displacement = u.signed8; /* sign-extended */ else vie->displacement = u.signed32; /* sign-extended */ return (0); } static int decode_immediate(struct vie *vie) { int i, n; uint8_t x; union { char buf[4]; int8_t signed8; int16_t signed16; int32_t signed32; } u; /* Figure out immediate operand size (if any) */ if (vie->op.op_flags & VIE_OP_F_IMM) { /* * Section 2.2.1.5 "Immediates", Intel SDM: * In 64-bit mode the typical size of immediate operands * remains 32-bits. When the operand size if 64-bits, the * processor sign-extends all immediates to 64-bits prior * to their use. */ if (vie->opsize == 4 || vie->opsize == 8) vie->imm_bytes = 4; else vie->imm_bytes = 2; } else if (vie->op.op_flags & VIE_OP_F_IMM8) { vie->imm_bytes = 1; } if ((n = vie->imm_bytes) == 0) return (0); KASSERT(n == 1 || n == 2 || n == 4, ("%s: invalid number of immediate bytes: %d", __func__, n)); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } /* sign-extend the immediate value before use */ if (n == 1) vie->immediate = u.signed8; else if (n == 2) vie->immediate = u.signed16; else vie->immediate = u.signed32; return (0); } static int decode_moffset(struct vie *vie) { int i, n; uint8_t x; union { char buf[8]; uint64_t u64; } u; if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) return (0); /* * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: * The memory offset size follows the address-size of the instruction. */ n = vie->addrsize; KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); u.u64 = 0; for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } vie->displacement = u.u64; return (0); } #ifdef _KERNEL /* * Verify that the 'guest linear address' provided as collateral of the nested * page table fault matches with our instruction decoding. 
*/ static int verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie, enum vm_cpu_mode cpu_mode) { int error; uint64_t base, segbase, idx, gla2; enum vm_reg_name seg; struct seg_desc desc; /* Skip 'gla' verification */ if (gla == VIE_INVALID_GLA) return (0); base = 0; if (vie->base_register != VM_REG_LAST) { error = vm_get_register(vcpu, vie->base_register, &base); if (error) { printf("verify_gla: error %d getting base reg %d\n", error, vie->base_register); return (-1); } /* * RIP-relative addressing starts from the following * instruction */ if (vie->base_register == VM_REG_GUEST_RIP) base += vie->num_processed; } idx = 0; if (vie->index_register != VM_REG_LAST) { error = vm_get_register(vcpu, vie->index_register, &idx); if (error) { printf("verify_gla: error %d getting index reg %d\n", error, vie->index_register); return (-1); } } /* * From "Specifying a Segment Selector", Intel SDM, Vol 1 * * In 64-bit mode, segmentation is generally (but not * completely) disabled. The exceptions are the FS and GS * segments. * * In legacy IA-32 mode, when the ESP or EBP register is used * as the base, the SS segment is the default segment. For * other data references, except when relative to stack or * string destination the DS segment is the default. These * can be overridden to allow other segments to be accessed. */ if (vie->segment_override) seg = vie->segment_register; else if (vie->base_register == VM_REG_GUEST_RSP || vie->base_register == VM_REG_GUEST_RBP) seg = VM_REG_GUEST_SS; else seg = VM_REG_GUEST_DS; if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { segbase = 0; } else { error = vm_get_seg_desc(vcpu, seg, &desc); if (error) { printf("verify_gla: error %d getting segment" " descriptor %d", error, vie->segment_register); return (-1); } segbase = desc.base; } gla2 = segbase + base + vie->scale * idx + vie->displacement; gla2 &= size2mask[vie->addrsize]; if (gla != gla2) { printf("verify_gla mismatch: segbase(0x%0lx)" "base(0x%0lx), scale(%d), index(0x%0lx), " "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", segbase, base, vie->scale, idx, vie->displacement, gla, gla2); return (-1); } return (0); } #endif /* _KERNEL */ int #ifdef _KERNEL vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla, enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) #else vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) #endif { if (decode_prefixes(vie, cpu_mode, cs_d)) return (-1); if (decode_opcode(vie)) return (-1); if (decode_modrm(vie, cpu_mode)) return (-1); if (decode_sib(vie)) return (-1); if (decode_displacement(vie)) return (-1); if (decode_immediate(vie)) return (-1); if (decode_moffset(vie)) return (-1); #ifdef _KERNEL if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { if (verify_gla(vcpu, gla, vie, cpu_mode)) return (-1); } #endif vie->decoded = 1; /* success */ return (0); } diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index dd30eb4405ef..57d8dd7aea60 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1626 +1,1627 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "bootrom.h" #include "config.h" #include "inout.h" #include "debug.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "kernemu_dev.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "qemu_fwcfg.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = 
"VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", [EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; -typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); +typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_exit *); int guest_ncpus; uint16_t cpu_cores, cpu_sockets, cpu_threads; int raw_stdio = 0; static char *progname; static const int BSP = 0; static cpuset_t cpumask; -static void vm_loop(struct vmctx *ctx, int vcpu); +static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); static struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; -static struct mt_vmm_info { - pthread_t mt_thr; - struct vmctx *mt_ctx; - int mt_vcpu; -} *mt_vmm_info; +static struct vcpu_info { + struct vmctx *ctx; + struct vcpu *vcpu; + int vcpuid; +} *vcpu_info; static cpuset_t **vcpumap; static void usage(int code) { fprintf(stderr, "Usage: %s [-AaCDeHhPSuWwxY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" " -A: create ACPI tables\n" " -a: local apic is in xAPIC mode (deprecated)\n" " -C: include guest memory in core file\n" " -c: number of CPUs and/or topology specification\n" " -D: destroy on power-off\n" " -e: exit on unhandled I/O access\n" " -G: start a debug server\n" " -H: vmexit from the guest on HLT\n" " -h: help\n" " -k: key=value flat config file\n" " -K: PS2 keyboard layout\n" " -l: LPC device configuration\n" " -m: memory size\n" " -o: set config 'var' to 'value'\n" " -P: vmexit from the guest on pause\n" " -p: pin 'vcpu' to 'hostcpu'\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" #endif " -S: guest memory cannot be swapped\n" " -s: PCI slot config\n" " -U: UUID\n" " -u: RTC keeps UTC time\n" " -W: force virtio to use single-vector MSI\n" " -w: ignore unimplemented MSRs\n" " -x: local APIC is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. 
* * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. */ static int topology_parse(const char *opt) { char *cp, *str, *tofree; if (*opt == '\0') { set_config_value("sockets", "1"); set_config_value("cores", "1"); set_config_value("threads", "1"); set_config_value("cpus", "1"); return (0); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) set_config_value("cpus", cp + strlen("cpus=")); else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) set_config_value("sockets", cp + strlen("sockets=")); else if (strncmp(cp, "cores=", strlen("cores=")) == 0) set_config_value("cores", cp + strlen("cores=")); else if (strncmp(cp, "threads=", strlen("threads=")) == 0) set_config_value("threads", cp + strlen("threads=")); #ifdef notyet /* Do not expose this until vmm.ko implements it */ else if (strncmp(cp, "maxcpus=", strlen("maxcpus=")) == 0) set_config_value("maxcpus", cp + strlen("maxcpus=")); #endif else if (strchr(cp, '=') != NULL) goto out; else set_config_value("cpus", cp); } free(tofree); return (0); out: free(tofree); return (-1); } static int parse_int_value(const char *key, const char *value, int minval, int maxval) { char *cp; long lval; errno = 0; lval = strtol(value, &cp, 0); if (errno != 0 || *cp != '\0' || cp == value || lval < minval || lval > maxval) errx(4, "Invalid value for %s: '%s'", key, value); return (lval); } /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. * * The limits of UINT16_MAX are due to the types passed to * vm_set_topology(). vmm.ko may enforce tighter limits. */ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. */ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } static int pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? 
value : "", value != NULL ? "," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } void -vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, +vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { - struct vmctx *ctx; int error, restart_instruction; - ctx = arg; restart_instruction = 1; - error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, + error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { return (get_config_bool_default("virtio_msix", true)); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; - struct mt_vmm_info *mtp; - int error, vcpu; - - mtp = param; - vcpu = mtp->mt_vcpu; + struct vcpu_info *vi = param; + int error; - snprintf(tname, sizeof(tname), "vcpu %d", vcpu); - pthread_set_name_np(mtp->mt_thr, tname); + snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); + pthread_set_name_np(pthread_self(), tname); - if (vcpumap[vcpu] != NULL) { - error = pthread_setaffinity_np(mtp->mt_thr, sizeof(cpuset_t), - vcpumap[vcpu]); + if (vcpumap[vi->vcpuid] != NULL) { + error = pthread_setaffinity_np(pthread_self(), + sizeof(cpuset_t), vcpumap[vi->vcpuid]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT - checkpoint_cpu_add(vcpu); + checkpoint_cpu_add(vi->vcpuid); #endif - gdb_cpu_add(vcpu); + gdb_cpu_add(vi->vcpu); - vm_loop(mtp->mt_ctx, vcpu); + vm_loop(vi->ctx, vi->vcpu); /* not reached */ exit(1); return (NULL); } static void -fbsdrun_addcpu(struct vmctx *ctx, int newcpu) +fbsdrun_addcpu(struct vcpu_info *vi) { + pthread_t thr; int error; - error = vm_activate_cpu(ctx, newcpu); + error = vm_activate_cpu(vi->vcpu); if (error != 0) - err(EX_OSERR, "could not activate CPU %d", newcpu); - - CPU_SET_ATOMIC(newcpu, &cpumask); + err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); - vm_suspend_cpu(ctx, newcpu); + 
CPU_SET_ATOMIC(vi->vcpuid, &cpumask); - mt_vmm_info[newcpu].mt_ctx = ctx; - mt_vmm_info[newcpu].mt_vcpu = newcpu; + vm_suspend_cpu(vi->vcpu); - error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, - fbsdrun_start_thread, &mt_vmm_info[newcpu]); + error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); assert(error == 0); } static int fbsdrun_deletecpu(int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int -vmexit_handle_notify(struct vmctx *ctx __unused, struct vm_exit *vme __unused, - int *pvcpu __unused, uint32_t eax __unused) +vmexit_handle_notify(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, + struct vm_exit *vme __unused, uint32_t eax __unused) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int -vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme) { int error; int bytes, port, in, out; - int vcpu; - - vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { - error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); + error = vmexit_handle_notify(ctx, vcpu, vme, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vme->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int -vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) { uint64_t val; uint32_t eax, edx; int error; val = 0; - error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); + error = emulate_rdmsr(vcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", - vme->u.msr.code, *pvcpu); + vme->u.msr.code, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { - vm_inject_gp(ctx, *pvcpu); + vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } eax = val; - error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); + error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; - error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); + error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int -vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) { int error; - error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); + error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", - vme->u.msr.code, vme->u.msr.wval, *pvcpu); + vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { - vm_inject_gp(ctx, *pvcpu); + vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= 
nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int -vmexit_vmx(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme) { - fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason, vmexit_vmx_desc(vme->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vme->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { - vm_get_register(ctx, *pvcpu, + vm_get_register(vcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int -vmexit_svm(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu) +vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) { - fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int -vmexit_bogus(struct vmctx *ctx __unused, struct vm_exit *vme, - int *pvcpu __unused) +vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, + struct vm_exit *vme) { assert(vme->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int -vmexit_reqidle(struct vmctx *ctx __unused, struct vm_exit *vme, - int *pvcpu __unused) +vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, + struct vm_exit *vme) { assert(vme->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int -vmexit_hlt(struct vmctx *ctx __unused, struct vm_exit *vme __unused, - int *pvcpu __unused) +vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, + struct vm_exit *vme __unused) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. 
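A small aside on the RDMSR/WRMSR handlers above: RDMSR returns its result in EDX:EAX, so the emulated 64-bit value has to be split across the two registers, while WRMSR delivers the already-combined value in vme->u.msr.wval. A standalone check of the split/recombine arithmetic (illustration only, not part of the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t val = 0x123456789abcdef0ULL;	/* example MSR contents */
	uint32_t eax, edx;

	/* RDMSR: low half in EAX, high half in EDX. */
	eax = (uint32_t)val;
	edx = (uint32_t)(val >> 32);

	/* WRMSR hands back the same value, recombined. */
	assert(((uint64_t)edx << 32 | eax) == val);
	printf("val %#jx -> edx %#x, eax %#x\n", (uintmax_t)val, edx, eax);
	return (0);
}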
*/ return (VMEXIT_CONTINUE); } static int -vmexit_pause(struct vmctx *ctx __unused, struct vm_exit *vme __unused, - int *pvcpu __unused) +vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, + struct vm_exit *vme __unused) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int -vmexit_mtrap(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu) +vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu, + struct vm_exit *vme) { assert(vme->inst_length == 0); stats.vmexit_mtrap++; #ifdef BHYVE_SNAPSHOT - checkpoint_cpu_suspend(*pvcpu); + checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif - gdb_cpu_mtrap(*pvcpu); + gdb_cpu_mtrap(vcpu); #ifdef BHYVE_SNAPSHOT - checkpoint_cpu_resume(*pvcpu); + checkpoint_cpu_resume(vcpu_id(vcpu)); #endif return (VMEXIT_CONTINUE); } static int -vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, + struct vm_exit *vme) { int err, i, cs_d; struct vie *vie; enum vm_cpu_mode mode; stats.vmexit_inst_emul++; vie = &vme->u.inst_emul.vie; if (!vie->decoded) { /* * Attempt to decode in userspace as a fallback. This allows * updating instruction decode in bhyve without rebooting the * kernel (rapid prototyping), albeit with much slower * emulation. */ vie_restart(vie); mode = vme->u.inst_emul.paging.cpu_mode; cs_d = vme->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; - if (vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RIP, + if (vm_set_register(vcpu, VM_REG_GUEST_RIP, vme->rip + vie->num_processed) != 0) goto fail; } - err = emulate_mem(ctx, *pvcpu, vme->u.inst_emul.gpa, - vie, &vme->u.inst_emul.paging); + err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, + &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vme->rip); return (VMEXIT_ABORT); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int -vmexit_suspend(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme) { enum vm_suspend_how how; + int vcpuid = vcpu_id(vcpu); how = vme->u.suspended.how; - fbsdrun_deletecpu(*pvcpu); + fbsdrun_deletecpu(vcpuid); - if (*pvcpu != BSP) { + if (vcpuid != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int -vmexit_debug(struct vmctx *ctx __unused, struct vm_exit *vme __unused, - int *pvcpu) +vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu, + struct vm_exit *vme __unused) { #ifdef BHYVE_SNAPSHOT - checkpoint_cpu_suspend(*pvcpu); + checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif - gdb_cpu_suspend(*pvcpu); + 
gdb_cpu_suspend(vcpu); #ifdef BHYVE_SNAPSHOT - checkpoint_cpu_resume(*pvcpu); + checkpoint_cpu_resume(vcpu_id(vcpu)); #endif /* * XXX-MJ sleep for a short period to avoid chewing up the CPU in the * window between activation of the vCPU thread and the STARTUP IPI. */ usleep(1000); return (VMEXIT_CONTINUE); } static int -vmexit_breakpoint(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu) +vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu, + struct vm_exit *vme) { - gdb_cpu_breakpoint(*pvcpu, vme); + gdb_cpu_breakpoint(vcpu, vme); return (VMEXIT_CONTINUE); } static int -vmexit_ipi(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu __unused) +vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, + struct vm_exit *vme) { int error = -1; int i; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) { - error = vm_suspend_cpu(ctx, i); + error = vm_suspend_cpu(vcpu_info[i].vcpu); if (error) { warnx("%s: failed to suspend cpu %d\n", __func__, i); break; } } break; case APIC_DELMODE_STARTUP: CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) { - spinup_ap(ctx, i, vme->u.ipi.vector << PAGE_SHIFT); + spinup_ap(vcpu_info[i].vcpu, + vme->u.ipi.vector << PAGE_SHIFT); } error = 0; break; default: break; } return (error); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, [VM_EXITCODE_IPI] = vmexit_ipi, }; static void -vm_loop(struct vmctx *ctx, int vcpu) +vm_loop(struct vmctx *ctx, struct vcpu *vcpu) { struct vm_exit vme; int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; error = vm_active_cpus(ctx, &active_cpus); - assert(CPU_ISSET(vcpu, &active_cpus)); + assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); while (1) { - error = vm_run(ctx, vcpu, &vme); + error = vm_run(vcpu, &vme); if (error != 0) break; exitcode = vme.exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } - rc = (*handler[exitcode])(ctx, &vme, &vcpu); + rc = (*handler[exitcode])(ctx, vcpu, &vme); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int -num_vcpus_allowed(struct vmctx *ctx) +num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. 
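The STARTUP-IPI case in vmexit_ipi above turns the 8-bit SIPI vector into the guest-physical address where the AP begins executing: the vector is a page number, so shifting it left by PAGE_SHIFT (12 for 4 KB x86 pages) yields a page-aligned address below 1 MB. A standalone illustration of that arithmetic (not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12	/* 4 KB pages on x86 */

int
main(void)
{
	uint8_t vector = 0x9f;	/* example vector carried by a STARTUP IPI */
	uint64_t start = (uint64_t)vector << PAGE_SHIFT;

	printf("SIPI vector %#x -> AP starts at guest physical %#jx\n",
	    vector, (uintmax_t)start);	/* 0x9f000 */
	return (0);
}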
*/ - error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); + error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static void -fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) +fbsdrun_set_capabilities(struct vcpu *vcpu, bool bsp) { int err, tmp; if (get_config_bool_default("x86.vmexit_on_hlt", false)) { - err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); + err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } - vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); - if (cpu == BSP) + vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1); + if (bsp) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ - err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); + err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } - vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); - if (cpu == BSP) + vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1); + if (bsp) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (get_config_bool_default("x86.x2apic", false)) - err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); + err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED); else - err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); + err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } - vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); + vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1); - err = vm_set_capability(ctx, cpu, VM_CAP_IPI_EXIT, 1); + err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1); assert(err == 0); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0 /* maxcpus, unimplemented */); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } static void -spinup_vcpu(struct vmctx *ctx, int vcpu) +spinup_vcpu(struct vcpu_info *vi, bool bsp) { int error; - if (vcpu != BSP) { - fbsdrun_set_capabilities(ctx, vcpu); + if (!bsp) { + fbsdrun_set_capabilities(vi->vcpu, false); /* * Enable the 'unrestricted guest' mode for APs. * * APs startup in power-on 16-bit mode. 
*/ - error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + error = vm_set_capability(vi->vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); } - fbsdrun_addcpu(ctx, vcpu); + fbsdrun_addcpu(vi); } static bool parse_config_option(const char *option) { const char *value; char *path; value = strchr(option, '='); if (value == NULL || value[1] == '\0') return (false); path = strndup(option, value - option); if (path == NULL) err(4, "Failed to allocate memory"); set_config_value(path, value + 1); return (true); } static void parse_simple_config_file(const char *path) { FILE *fp; char *line, *cp; size_t linecap; unsigned int lineno; fp = fopen(path, "r"); if (fp == NULL) err(4, "Failed to open configuration file %s", path); line = NULL; linecap = 0; lineno = 1; for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { if (*line == '#' || *line == '\n') continue; cp = strchr(line, '\n'); if (cp != NULL) *cp = '\0'; if (!parse_config_option(line)) errx(4, "%s line %u: invalid config option '%s'", path, lineno, line); } free(line); fclose(fp); } static void parse_gdb_options(const char *opt) { const char *sport; char *colon; if (opt[0] == 'w') { set_config_bool("gdb.wait", true); opt++; } colon = strrchr(opt, ':'); if (colon == NULL) { sport = opt; } else { *colon = '\0'; colon++; sport = colon; set_config_value("gdb.address", opt); } set_config_value("gdb.port", sport); } static void set_defaults(void) { set_config_bool("acpi_tables", false); set_config_value("memory.size", "256M"); set_config_bool("x86.strictmsr", true); set_config_value("lpc.fwcfg", "bhyve"); } int main(int argc, char *argv[]) { int c, error; int max_vcpus, memflags; + struct vcpu *bsp; struct vmctx *ctx; uint64_t rip; size_t memsize; const char *optstr, *value, *vmname; #ifdef BHYVE_SNAPSHOT char *restore_file; struct restore_state rstate; restore_file = NULL; #endif init_config(); set_defaults(); progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:K:U:r:"; #else optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:K:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': set_config_bool("x86.x2apic", false); break; case 'A': set_config_bool("acpi_tables", true); break; case 'D': set_config_bool("destroy_on_poweroff", true); break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': set_config_bool("memory.guest_in_core", true); break; case 'G': parse_gdb_options(optarg); break; case 'k': parse_simple_config_file(optarg); break; case 'K': set_config_value("keyboard.layout", optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': set_config_bool("memory.wired", true); break; case 'm': set_config_value("memory.size", optarg); break; case 'o': if (!parse_config_option(optarg)) errx(EX_USAGE, "invalid configuration option '%s'", optarg); break; case 'H': set_config_bool("x86.vmexit_on_hlt", true); break; case 'I': /* * The "-I" option was used to 
add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': set_config_bool("x86.vmexit_on_pause", true); break; case 'e': set_config_bool("x86.strictio", true); break; case 'u': set_config_bool("rtc.use_localtime", false); break; case 'U': set_config_value("uuid", optarg); break; case 'w': set_config_bool("x86.strictmsr", false); break; case 'W': set_config_bool("virtio_msix", false); break; case 'x': set_config_bool("x86.x2apic", true); break; case 'Y': set_config_bool("x86.mptable", false); break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc > 1) usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif - max_vcpus = num_vcpus_allowed(ctx); + bsp = vm_vcpu_open(ctx, BSP); + max_vcpus = num_vcpus_allowed(ctx, bsp); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } - fbsdrun_set_capabilities(ctx, BSP); + fbsdrun_set_capabilities(bsp, true); + + /* Allocate per-VCPU resources. */ + vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); + for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { + vcpu_info[vcpuid].ctx = ctx; + vcpu_info[vcpuid].vcpuid = vcpuid; + if (vcpuid == BSP) + vcpu_info[vcpuid].vcpu = bsp; + else + vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); + } memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(guest_ncpus); init_inout(); kernemu_dev_init(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx); sci_init(ctx); if (qemu_fwcfg_init(ctx) != 0) { fprintf(stderr, "qemu fwcfg initialization error"); exit(4); } if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), &guest_ncpus) != 0) { fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu"); exit(4); } /* * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. 
*/ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); init_gdb(ctx); if (lpc_bootrom()) { - if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { + if (vm_set_capability(bsp, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } - error = vcpu_reset(ctx, BSP); + error = vcpu_reset(bsp); assert(error == 0); } - /* Allocate per-VCPU resources. */ - mt_vmm_info = calloc(guest_ncpus, sizeof(*mt_vmm_info)); - /* * Add all vCPUs. */ - for (int vcpu = 0; vcpu < guest_ncpus; vcpu++) { - spinup_vcpu(ctx, vcpu); - } + for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) + spinup_vcpu(&vcpu_info[vcpuid], vcpuid == BSP); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { fprintf(stdout, "Pausing pci devs...\r\n"); if (vm_pause_user_devs() != 0) { fprintf(stderr, "Failed to pause PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring vm mem...\r\n"); if (restore_vm_mem(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore VM memory.\n"); exit(1); } fprintf(stdout, "Restoring pci devs...\r\n"); if (vm_restore_user_devs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring kernel structs...\r\n"); if (vm_restore_kern_structs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore kernel structs.\n"); exit(1); } fprintf(stdout, "Resuming pci devs...\r\n"); if (vm_resume_user_devs() != 0) { fprintf(stderr, "Failed to resume PCI device state.\n"); exit(1); } } #endif - error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); + error = vm_get_register(bsp, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); if (error != 0) exit(4); if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) { fwctl_init(); } /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifdef BHYVE_SNAPSHOT /* initialize mutex/cond variables */ init_snapshot(); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) != 0) errx(EX_OSERR, "Failed to start checkpoint thread"); #endif #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { destroy_restore_state(&rstate); if (vm_restore_time(ctx) < 0) err(EX_OSERR, "Unable to restore time"); - for (int i = 0; i < guest_ncpus; i++) { - if (i == BSP) - continue; - vm_resume_cpu(ctx, i); - } - } + for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) + vm_resume_cpu(vcpu_info[vcpuid].vcpu); + } else #endif - vm_resume_cpu(ctx, BSP); + vm_resume_cpu(bsp); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h index d3eb8c8b23da..dfc7d7463519 100644 --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -1,52 +1,53 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _FBSDRUN_H_ #define _FBSDRUN_H_ #define VMEXIT_CONTINUE (0) #define VMEXIT_ABORT (-1) extern int guest_ncpus; extern uint16_t cpu_cores, cpu_sockets, cpu_threads; +struct vcpu; struct vmctx; struct vm_exit; void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr); #endif int fbsdrun_virtio_msix(void); -int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); +int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_exit *); #endif diff --git a/usr.sbin/bhyve/bootrom.c b/usr.sbin/bhyve/bootrom.c index b851d39b485c..e671b32286db 100644 --- a/usr.sbin/bhyve/bootrom.c +++ b/usr.sbin/bhyve/bootrom.c @@ -1,316 +1,315 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "bootrom.h" #include "debug.h" #include "mem.h" #define BOOTROM_SIZE (16 * 1024 * 1024) /* 16 MB */ /* * ROM region is 16 MB at the top of 4GB ("low") memory. * * The size is limited so it doesn't encroach into reserved MMIO space (e.g., * APIC, HPET, MSI). * * It is allocated in page-multiple blocks on a first-come first-serve basis, * from high to low, during initialization, and does not change at runtime. */ static char *romptr; /* Pointer to userspace-mapped bootrom region. */ static vm_paddr_t gpa_base; /* GPA of low end of region. */ static vm_paddr_t gpa_allocbot; /* Low GPA of free region. */ static vm_paddr_t gpa_alloctop; /* High GPA, minus 1, of free region. */ #define CFI_BCS_WRITE_BYTE 0x10 #define CFI_BCS_CLEAR_STATUS 0x50 #define CFI_BCS_READ_STATUS 0x70 #define CFI_BCS_READ_ARRAY 0xff static struct bootrom_var_state { uint8_t *mmap; uint64_t gpa; off_t size; uint8_t cmd; } var = { NULL, 0, 0, CFI_BCS_READ_ARRAY }; /* * Emulate just those CFI basic commands that will convince EDK II * that the Firmware Volume area is writable and persistent. */ static int -bootrom_var_mem_handler(struct vmctx *ctx __unused, int vcpu __unused, int dir, - uint64_t addr, int size, uint64_t *val, void *arg1 __unused, - long arg2 __unused) +bootrom_var_mem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1 __unused, long arg2 __unused) { off_t offset; offset = addr - var.gpa; if (offset + size > var.size || offset < 0 || offset + size <= offset) return (EINVAL); if (dir == MEM_F_WRITE) { switch (var.cmd) { case CFI_BCS_WRITE_BYTE: memcpy(var.mmap + offset, val, size); var.cmd = CFI_BCS_READ_ARRAY; break; default: var.cmd = *(uint8_t *)val; } } else { switch (var.cmd) { case CFI_BCS_CLEAR_STATUS: case CFI_BCS_READ_STATUS: memset(val, 0, size); var.cmd = CFI_BCS_READ_ARRAY; break; default: memcpy(val, var.mmap + offset, size); break; } } return (0); } void init_bootrom(struct vmctx *ctx) { romptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", BOOTROM_SIZE); if (romptr == MAP_FAILED) err(4, "%s: vm_create_devmem", __func__); gpa_base = (1ULL << 32) - BOOTROM_SIZE; gpa_allocbot = gpa_base; gpa_alloctop = (1ULL << 32) - 1; } int bootrom_alloc(struct vmctx *ctx, size_t len, int prot, int flags, char **region_out, uint64_t *gpa_out) { static const int bootrom_valid_flags = BOOTROM_ALLOC_TOP; vm_paddr_t gpa; vm_ooffset_t segoff; if (flags & ~bootrom_valid_flags) { warnx("%s: Invalid flags: %x", __func__, flags & ~bootrom_valid_flags); return (EINVAL); } if (prot & ~_PROT_ALL) { warnx("%s: Invalid protection: %x", __func__, prot & ~_PROT_ALL); return (EINVAL); } if (len == 0 || len > BOOTROM_SIZE) { warnx("ROM size %zu is invalid", len); return (EINVAL); } if (len & PAGE_MASK) { warnx("ROM size %zu is not a multiple of the page size", len); return (EINVAL); } if (flags & BOOTROM_ALLOC_TOP) { gpa = (gpa_alloctop - len) + 1; if (gpa < gpa_allocbot) { warnx("No room for %zu ROM in bootrom region", len); return (ENOMEM); } } else { gpa = gpa_allocbot; if (gpa > (gpa_alloctop - len) + 1) { warnx("No room for %zu ROM in bootrom region", len); return (ENOMEM); } } segoff = gpa - gpa_base; if (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, segoff, len, prot) != 0) { int serrno = errno; warn("%s: vm_mmap_mapseg", __func__); return (serrno); } if (flags & BOOTROM_ALLOC_TOP) gpa_alloctop = gpa - 1; 
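	/*
	 * Editor's illustration, not part of the patch: with BOOTROM_SIZE of
	 * 16 MB the region spans [0xff000000, 0xffffffff].  A 1 MB ROM
	 * allocated with BOOTROM_ALLOC_TOP gets
	 * gpa = (0xffffffff - 0x100000) + 1 = 0xfff00000 and drops
	 * gpa_alloctop to 0xffefffff, while a 1 MB allocation without the
	 * flag would start at gpa_allocbot = 0xff000000 and advance it to
	 * 0xff100000.
	 */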
else gpa_allocbot = gpa + len; *region_out = romptr + segoff; if (gpa_out != NULL) *gpa_out = gpa; return (0); } int bootrom_loadrom(struct vmctx *ctx, const nvlist_t *nvl) { struct stat sbuf; ssize_t rlen; off_t rom_size, var_size, total_size; char *ptr, *romfile; int fd, varfd, i, rv; const char *bootrom, *varfile; rv = -1; varfd = -1; bootrom = get_config_value_node(nvl, "bootrom"); if (bootrom == NULL) { return (-1); } /* * get_config_value_node may use a thread local buffer to return * variables. So, when we query the second variable, the first variable * might get overwritten. For that reason, the bootrom should be * duplicated. */ romfile = strdup(bootrom); if (romfile == NULL) { return (-1); } fd = open(romfile, O_RDONLY); if (fd < 0) { EPRINTLN("Error opening bootrom \"%s\": %s", romfile, strerror(errno)); goto done; } if (fstat(fd, &sbuf) < 0) { EPRINTLN("Could not fstat bootrom file \"%s\": %s", romfile, strerror(errno)); goto done; } rom_size = sbuf.st_size; varfile = get_config_value_node(nvl, "bootvars"); var_size = 0; if (varfile != NULL) { varfd = open(varfile, O_RDWR); if (varfd < 0) { fprintf(stderr, "Error opening bootrom variable file " "\"%s\": %s\n", varfile, strerror(errno)); goto done; } if (fstat(varfd, &sbuf) < 0) { fprintf(stderr, "Could not fstat bootrom variable file \"%s\": %s\n", varfile, strerror(errno)); goto done; } var_size = sbuf.st_size; } if (var_size > BOOTROM_SIZE || (var_size != 0 && var_size < PAGE_SIZE)) { fprintf(stderr, "Invalid bootrom variable size %ld\n", var_size); goto done; } total_size = rom_size + var_size; if (total_size > BOOTROM_SIZE) { fprintf(stderr, "Invalid bootrom and variable aggregate size " "%ld\n", total_size); goto done; } /* Map the bootrom into the guest address space */ if (bootrom_alloc(ctx, rom_size, PROT_READ | PROT_EXEC, BOOTROM_ALLOC_TOP, &ptr, NULL) != 0) { goto done; } /* Read 'romfile' into the guest address space */ for (i = 0; i < rom_size / PAGE_SIZE; i++) { rlen = read(fd, ptr + i * PAGE_SIZE, PAGE_SIZE); if (rlen != PAGE_SIZE) { EPRINTLN("Incomplete read of page %d of bootrom " "file %s: %ld bytes", i, romfile, rlen); goto done; } } if (varfd >= 0) { var.mmap = mmap(NULL, var_size, PROT_READ | PROT_WRITE, MAP_SHARED, varfd, 0); if (var.mmap == MAP_FAILED) goto done; var.size = var_size; var.gpa = (gpa_alloctop - var_size) + 1; gpa_alloctop = var.gpa - 1; rv = register_mem(&(struct mem_range){ .name = "bootrom variable", .flags = MEM_F_RW, .handler = bootrom_var_mem_handler, .base = var.gpa, .size = var.size, }); if (rv != 0) goto done; } rv = 0; done: if (varfd >= 0) close(varfd); if (fd >= 0) close(fd); free(romfile); return (rv); } diff --git a/usr.sbin/bhyve/gdb.c b/usr.sbin/bhyve/gdb.c index 6368d5cc79a9..9e50602e92be 100644 --- a/usr.sbin/bhyve/gdb.c +++ b/usr.sbin/bhyve/gdb.c @@ -1,1906 +1,1917 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017-2018 John H. Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "gdb.h" #include "mem.h" #include "mevent.h" /* * GDB_SIGNAL_* numbers are part of the GDB remote protocol. Most stops * use SIGTRAP. */ #define GDB_SIGNAL_TRAP 5 static void gdb_resume_vcpus(void); static void check_command(int fd); static struct mevent *read_event, *write_event; static cpuset_t vcpus_active, vcpus_suspended, vcpus_waiting; static pthread_mutex_t gdb_lock; static pthread_cond_t idle_vcpus; static bool first_stop, report_next_stop, swbreak_enabled; /* * An I/O buffer contains 'capacity' bytes of room at 'data'. For a * read buffer, 'start' is unused and 'len' contains the number of * valid bytes in the buffer. For a write buffer, 'start' is set to * the index of the next byte in 'data' to send, and 'len' contains * the remaining number of valid bytes to send. */ struct io_buffer { uint8_t *data; size_t capacity; size_t start; size_t len; }; struct breakpoint { uint64_t gpa; uint8_t shadow_inst; TAILQ_ENTRY(breakpoint) link; }; /* * When a vCPU stops to due to an event that should be reported to the * debugger, information about the event is stored in this structure. * The vCPU thread then sets 'stopped_vcpu' if it is not already set * and stops other vCPUs so the event can be reported. The * report_stop() function reports the event for the 'stopped_vcpu' * vCPU. When the debugger resumes execution via continue or step, * the event for 'stopped_vcpu' is cleared. vCPUs will loop in their * event handlers until the associated event is reported or disabled. * * An idle vCPU will have all of the boolean fields set to false. * * When a vCPU is stepped, 'stepping' is set to true when the vCPU is * released to execute the stepped instruction. When the vCPU reports * the stepping trap, 'stepped' is set. * * When a vCPU hits a breakpoint set by the debug server, * 'hit_swbreak' is set to true. 
*/ struct vcpu_state { bool stepping; bool stepped; bool hit_swbreak; }; static struct io_buffer cur_comm, cur_resp; static uint8_t cur_csum; static struct vmctx *ctx; static int cur_fd = -1; static TAILQ_HEAD(, breakpoint) breakpoints; static struct vcpu_state *vcpu_state; +static struct vcpu **vcpus; static int cur_vcpu, stopped_vcpu; static bool gdb_active = false; static const int gdb_regset[] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_RSP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_ES, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; static const int gdb_regsize[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4 }; #ifdef GDB_LOG #include #include static void __printflike(1, 2) debug(const char *fmt, ...) { static FILE *logfile; va_list ap; if (logfile == NULL) { logfile = fopen("/tmp/bhyve_gdb.log", "w"); if (logfile == NULL) return; #ifndef WITHOUT_CAPSICUM if (caph_limit_stream(fileno(logfile), CAPH_WRITE) == -1) { fclose(logfile); logfile = NULL; return; } #endif setlinebuf(logfile); } va_start(ap, fmt); vfprintf(logfile, fmt, ap); va_end(ap); } #else #define debug(...) #endif static void remove_all_sw_breakpoints(void); static int -guest_paging_info(int vcpu, struct vm_guest_paging *paging) +guest_paging_info(struct vcpu *vcpu, struct vm_guest_paging *paging) { uint64_t regs[4]; const int regset[4] = { VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_EFER }; - if (vm_get_register_set(ctx, vcpu, nitems(regset), regset, regs) == -1) + if (vm_get_register_set(vcpu, nitems(regset), regset, regs) == -1) return (-1); /* * For the debugger, always pretend to be the kernel (CPL 0), * and if long-mode is enabled, always parse addresses as if * in 64-bit mode. */ paging->cr3 = regs[1]; paging->cpl = 0; if (regs[3] & EFER_LMA) paging->cpu_mode = CPU_MODE_64BIT; else if (regs[0] & CR0_PE) paging->cpu_mode = CPU_MODE_PROTECTED; else paging->cpu_mode = CPU_MODE_REAL; if (!(regs[0] & CR0_PG)) paging->paging_mode = PAGING_MODE_FLAT; else if (!(regs[2] & CR4_PAE)) paging->paging_mode = PAGING_MODE_32; else if (regs[3] & EFER_LME) paging->paging_mode = (regs[2] & CR4_LA57) ? PAGING_MODE_64_LA57 : PAGING_MODE_64; else paging->paging_mode = PAGING_MODE_PAE; return (0); } /* * Map a guest virtual address to a physical address (for a given vcpu). * If a guest virtual address is valid, return 1. If the address is * not valid, return 0. If an error occurs obtaining the mapping, * return -1. */ static int -guest_vaddr2paddr(int vcpu, uint64_t vaddr, uint64_t *paddr) +guest_vaddr2paddr(struct vcpu *vcpu, uint64_t vaddr, uint64_t *paddr) { struct vm_guest_paging paging; int fault; if (guest_paging_info(vcpu, &paging) == -1) return (-1); /* * Always use PROT_READ. We really care if the VA is * accessible, not if the current vCPU can write. */ - if (vm_gla2gpa_nofault(ctx, vcpu, &paging, vaddr, PROT_READ, paddr, + if (vm_gla2gpa_nofault(vcpu, &paging, vaddr, PROT_READ, paddr, &fault) == -1) return (-1); if (fault) return (0); return (1); } static void io_buffer_reset(struct io_buffer *io) { io->start = 0; io->len = 0; } /* Available room for adding data. 
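The gdb_regset/gdb_regsize tables above fix the layout of the debugger's register block: seventeen 8-byte registers (RAX through R15, plus RIP) followed by seven 4-byte ones (RFLAGS and the segment selectors). Since the stub answers the GDB 'g' request with those bytes rendered as hex, the reply size follows directly; a quick standalone check (illustration only, not part of the patch):

#include <stdio.h>

int
main(void)
{
	/* Mirrors gdb_regsize above: 17 eight-byte and 7 four-byte registers. */
	static const int regsize[] = {
		8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
		4, 4, 4, 4, 4, 4, 4
	};
	int i, bytes = 0;

	for (i = 0; i < (int)(sizeof(regsize) / sizeof(regsize[0])); i++)
		bytes += regsize[i];
	printf("%d register bytes -> %d hex characters per 'g' reply\n",
	    bytes, bytes * 2);	/* 164 -> 328 */
	return (0);
}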
*/ static size_t io_buffer_avail(struct io_buffer *io) { return (io->capacity - (io->start + io->len)); } static uint8_t * io_buffer_head(struct io_buffer *io) { return (io->data + io->start); } static uint8_t * io_buffer_tail(struct io_buffer *io) { return (io->data + io->start + io->len); } static void io_buffer_advance(struct io_buffer *io, size_t amount) { assert(amount <= io->len); io->start += amount; io->len -= amount; } static void io_buffer_consume(struct io_buffer *io, size_t amount) { io_buffer_advance(io, amount); if (io->len == 0) { io->start = 0; return; } /* * XXX: Consider making this move optional and compacting on a * future read() before realloc(). */ memmove(io->data, io_buffer_head(io), io->len); io->start = 0; } static void io_buffer_grow(struct io_buffer *io, size_t newsize) { uint8_t *new_data; size_t avail, new_cap; avail = io_buffer_avail(io); if (newsize <= avail) return; new_cap = io->capacity + (newsize - avail); new_data = realloc(io->data, new_cap); if (new_data == NULL) err(1, "Failed to grow GDB I/O buffer"); io->data = new_data; io->capacity = new_cap; } static bool response_pending(void) { if (cur_resp.start == 0 && cur_resp.len == 0) return (false); if (cur_resp.start + cur_resp.len == 1 && cur_resp.data[0] == '+') return (false); return (true); } static void close_connection(void) { /* * XXX: This triggers a warning because mevent does the close * before the EV_DELETE. */ pthread_mutex_lock(&gdb_lock); mevent_delete(write_event); mevent_delete_close(read_event); write_event = NULL; read_event = NULL; io_buffer_reset(&cur_comm); io_buffer_reset(&cur_resp); cur_fd = -1; remove_all_sw_breakpoints(); /* Clear any pending events. */ memset(vcpu_state, 0, guest_ncpus * sizeof(*vcpu_state)); /* Resume any stopped vCPUs. */ gdb_resume_vcpus(); pthread_mutex_unlock(&gdb_lock); } static uint8_t hex_digit(uint8_t nibble) { if (nibble <= 9) return (nibble + '0'); else return (nibble + 'a' - 10); } static uint8_t parse_digit(uint8_t v) { if (v >= '0' && v <= '9') return (v - '0'); if (v >= 'a' && v <= 'f') return (v - 'a' + 10); if (v >= 'A' && v <= 'F') return (v - 'A' + 10); return (0xF); } /* Parses big-endian hexadecimal. */ static uintmax_t parse_integer(const uint8_t *p, size_t len) { uintmax_t v; v = 0; while (len > 0) { v <<= 4; v |= parse_digit(*p); p++; len--; } return (v); } static uint8_t parse_byte(const uint8_t *p) { return (parse_digit(p[0]) << 4 | parse_digit(p[1])); } static void send_pending_data(int fd) { ssize_t nwritten; if (cur_resp.len == 0) { mevent_disable(write_event); return; } nwritten = write(fd, io_buffer_head(&cur_resp), cur_resp.len); if (nwritten == -1) { warn("Write to GDB socket failed"); close_connection(); } else { io_buffer_advance(&cur_resp, nwritten); if (cur_resp.len == 0) mevent_disable(write_event); else mevent_enable(write_event); } } /* Append a single character to the output buffer. */ static void send_char(uint8_t data) { io_buffer_grow(&cur_resp, 1); *io_buffer_tail(&cur_resp) = data; cur_resp.len++; } /* Append an array of bytes to the output buffer. */ static void send_data(const uint8_t *data, size_t len) { io_buffer_grow(&cur_resp, len); memcpy(io_buffer_tail(&cur_resp), data, len); cur_resp.len += len; } static void format_byte(uint8_t v, uint8_t *buf) { buf[0] = hex_digit(v >> 4); buf[1] = hex_digit(v & 0xf); } /* * Append a single byte (formatted as two hex characters) to the * output buffer. 
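 * For example, send_byte(0x2a) appends the two ASCII characters
 * '2' and 'a'.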
*/ static void send_byte(uint8_t v) { uint8_t buf[2]; format_byte(v, buf); send_data(buf, sizeof(buf)); } static void start_packet(void) { send_char('$'); cur_csum = 0; } static void finish_packet(void) { send_char('#'); send_byte(cur_csum); debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); } /* * Append a single character (for the packet payload) and update the * checksum. */ static void append_char(uint8_t v) { send_char(v); cur_csum += v; } /* * Append an array of bytes (for the packet payload) and update the * checksum. */ static void append_packet_data(const uint8_t *data, size_t len) { send_data(data, len); while (len > 0) { cur_csum += *data; data++; len--; } } static void append_string(const char *str) { append_packet_data(str, strlen(str)); } static void append_byte(uint8_t v) { uint8_t buf[2]; format_byte(v, buf); append_packet_data(buf, sizeof(buf)); } static void append_unsigned_native(uintmax_t value, size_t len) { size_t i; for (i = 0; i < len; i++) { append_byte(value); value >>= 8; } } static void append_unsigned_be(uintmax_t value, size_t len) { char buf[len * 2]; size_t i; for (i = 0; i < len; i++) { format_byte(value, buf + (len - i - 1) * 2); value >>= 8; } append_packet_data(buf, sizeof(buf)); } static void append_integer(unsigned int value) { if (value == 0) append_char('0'); else append_unsigned_be(value, (fls(value) + 7) / 8); } static void append_asciihex(const char *str) { while (*str != '\0') { append_byte(*str); str++; } } static void send_empty_response(void) { start_packet(); finish_packet(); } static void send_error(int error) { start_packet(); append_char('E'); append_byte(error); finish_packet(); } static void send_ok(void) { start_packet(); append_string("OK"); finish_packet(); } static int parse_threadid(const uint8_t *data, size_t len) { if (len == 1 && *data == '0') return (0); if (len == 2 && memcmp(data, "-1", 2) == 0) return (-1); if (len == 0) return (-2); return (parse_integer(data, len)); } /* * Report the current stop event to the debugger. If the stop is due * to an event triggered on a specific vCPU such as a breakpoint or * stepping trap, stopped_vcpu will be set to the vCPU triggering the * stop. If 'set_cur_vcpu' is true, then cur_vcpu will be updated to * the reporting vCPU for vCPU events. */ static void report_stop(bool set_cur_vcpu) { struct vcpu_state *vs; start_packet(); if (stopped_vcpu == -1) { append_char('S'); append_byte(GDB_SIGNAL_TRAP); } else { vs = &vcpu_state[stopped_vcpu]; if (set_cur_vcpu) cur_vcpu = stopped_vcpu; append_char('T'); append_byte(GDB_SIGNAL_TRAP); append_string("thread:"); append_integer(stopped_vcpu + 1); append_char(';'); if (vs->hit_swbreak) { debug("$vCPU %d reporting swbreak\n", stopped_vcpu); if (swbreak_enabled) append_string("swbreak:;"); } else if (vs->stepped) debug("$vCPU %d reporting step\n", stopped_vcpu); else debug("$vCPU %d reporting ???\n", stopped_vcpu); } finish_packet(); report_next_stop = false; } /* * If this stop is due to a vCPU event, clear that event to mark it as * acknowledged. 
*/ static void discard_stop(void) { struct vcpu_state *vs; if (stopped_vcpu != -1) { vs = &vcpu_state[stopped_vcpu]; vs->hit_swbreak = false; vs->stepped = false; stopped_vcpu = -1; } report_next_stop = true; } static void gdb_finish_suspend_vcpus(void) { if (first_stop) { first_stop = false; stopped_vcpu = -1; } else if (report_next_stop) { assert(!response_pending()); report_stop(true); send_pending_data(cur_fd); } } /* * vCPU threads invoke this function whenever the vCPU enters the * debug server to pause or report an event. vCPU threads wait here * as long as the debug server keeps them suspended. */ static void -_gdb_cpu_suspend(int vcpu, bool report_stop) +_gdb_cpu_suspend(struct vcpu *vcpu, bool report_stop) { + int vcpuid = vcpu_id(vcpu); - debug("$vCPU %d suspending\n", vcpu); - CPU_SET(vcpu, &vcpus_waiting); + debug("$vCPU %d suspending\n", vcpuid); + CPU_SET(vcpuid, &vcpus_waiting); if (report_stop && CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) gdb_finish_suspend_vcpus(); - while (CPU_ISSET(vcpu, &vcpus_suspended)) + while (CPU_ISSET(vcpuid, &vcpus_suspended)) pthread_cond_wait(&idle_vcpus, &gdb_lock); - CPU_CLR(vcpu, &vcpus_waiting); - debug("$vCPU %d resuming\n", vcpu); + CPU_CLR(vcpuid, &vcpus_waiting); + debug("$vCPU %d resuming\n", vcpuid); } /* * Invoked at the start of a vCPU thread's execution to inform the * debug server about the new thread. */ void -gdb_cpu_add(int vcpu) +gdb_cpu_add(struct vcpu *vcpu) { + int vcpuid; if (!gdb_active) return; - debug("$vCPU %d starting\n", vcpu); + vcpuid = vcpu_id(vcpu); + debug("$vCPU %d starting\n", vcpuid); pthread_mutex_lock(&gdb_lock); - assert(vcpu < guest_ncpus); - CPU_SET(vcpu, &vcpus_active); + assert(vcpuid < guest_ncpus); + assert(vcpus[vcpuid] == NULL); + vcpus[vcpuid] = vcpu; + CPU_SET(vcpuid, &vcpus_active); if (!TAILQ_EMPTY(&breakpoints)) { - vm_set_capability(ctx, vcpu, VM_CAP_BPT_EXIT, 1); + vm_set_capability(vcpu, VM_CAP_BPT_EXIT, 1); debug("$vCPU %d enabled breakpoint exits\n", vcpu); } /* * If a vcpu is added while vcpus are stopped, suspend the new * vcpu so that it will pop back out with a debug exit before * executing the first instruction. */ if (!CPU_EMPTY(&vcpus_suspended)) { - CPU_SET(vcpu, &vcpus_suspended); + CPU_SET(vcpuid, &vcpus_suspended); _gdb_cpu_suspend(vcpu, false); } pthread_mutex_unlock(&gdb_lock); } /* * Invoked by vCPU before resuming execution. This enables stepping * if the vCPU is marked as stepping. */ static void -gdb_cpu_resume(int vcpu) +gdb_cpu_resume(struct vcpu *vcpu) { struct vcpu_state *vs; int error; - vs = &vcpu_state[vcpu]; + vs = &vcpu_state[vcpu_id(vcpu)]; /* * Any pending event should already be reported before * resuming. */ assert(vs->hit_swbreak == false); assert(vs->stepped == false); if (vs->stepping) { - error = vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 1); + error = vm_set_capability(vcpu, VM_CAP_MTRAP_EXIT, 1); assert(error == 0); } } /* * Handler for VM_EXITCODE_DEBUG used to suspend a vCPU when the guest * has been suspended due to an event on different vCPU or in response * to a guest-wide suspend such as Ctrl-C or the stop on attach. 
*/ void -gdb_cpu_suspend(int vcpu) +gdb_cpu_suspend(struct vcpu *vcpu) { if (!gdb_active) return; pthread_mutex_lock(&gdb_lock); _gdb_cpu_suspend(vcpu, true); gdb_cpu_resume(vcpu); pthread_mutex_unlock(&gdb_lock); } static void gdb_suspend_vcpus(void) { assert(pthread_mutex_isowned_np(&gdb_lock)); debug("suspending all CPUs\n"); vcpus_suspended = vcpus_active; - vm_suspend_cpu(ctx, -1); + vm_suspend_all_cpus(ctx); if (CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) gdb_finish_suspend_vcpus(); } /* * Handler for VM_EXITCODE_MTRAP reported when a vCPU single-steps via * the VT-x-specific MTRAP exit. */ void -gdb_cpu_mtrap(int vcpu) +gdb_cpu_mtrap(struct vcpu *vcpu) { struct vcpu_state *vs; + int vcpuid; if (!gdb_active) return; - debug("$vCPU %d MTRAP\n", vcpu); + vcpuid = vcpu_id(vcpu); + debug("$vCPU %d MTRAP\n", vcpuid); pthread_mutex_lock(&gdb_lock); - vs = &vcpu_state[vcpu]; + vs = &vcpu_state[vcpuid]; if (vs->stepping) { vs->stepping = false; vs->stepped = true; - vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 0); + vm_set_capability(vcpu, VM_CAP_MTRAP_EXIT, 0); while (vs->stepped) { if (stopped_vcpu == -1) { - debug("$vCPU %d reporting step\n", vcpu); - stopped_vcpu = vcpu; + debug("$vCPU %d reporting step\n", vcpuid); + stopped_vcpu = vcpuid; gdb_suspend_vcpus(); } _gdb_cpu_suspend(vcpu, true); } gdb_cpu_resume(vcpu); } pthread_mutex_unlock(&gdb_lock); } static struct breakpoint * find_breakpoint(uint64_t gpa) { struct breakpoint *bp; TAILQ_FOREACH(bp, &breakpoints, link) { if (bp->gpa == gpa) return (bp); } return (NULL); } void -gdb_cpu_breakpoint(int vcpu, struct vm_exit *vmexit) +gdb_cpu_breakpoint(struct vcpu *vcpu, struct vm_exit *vmexit) { struct breakpoint *bp; struct vcpu_state *vs; uint64_t gpa; - int error; + int error, vcpuid; if (!gdb_active) { fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); exit(4); } + vcpuid = vcpu_id(vcpu); pthread_mutex_lock(&gdb_lock); error = guest_vaddr2paddr(vcpu, vmexit->rip, &gpa); assert(error == 1); bp = find_breakpoint(gpa); if (bp != NULL) { - vs = &vcpu_state[vcpu]; + vs = &vcpu_state[vcpuid]; assert(vs->stepping == false); assert(vs->stepped == false); assert(vs->hit_swbreak == false); vs->hit_swbreak = true; - vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, vmexit->rip); + vm_set_register(vcpu, VM_REG_GUEST_RIP, vmexit->rip); for (;;) { if (stopped_vcpu == -1) { - debug("$vCPU %d reporting breakpoint at rip %#lx\n", vcpu, - vmexit->rip); - stopped_vcpu = vcpu; + debug("$vCPU %d reporting breakpoint at rip %#lx\n", + vcpuid, vmexit->rip); + stopped_vcpu = vcpuid; gdb_suspend_vcpus(); } _gdb_cpu_suspend(vcpu, true); if (!vs->hit_swbreak) { /* Breakpoint reported. */ break; } bp = find_breakpoint(gpa); if (bp == NULL) { /* Breakpoint was removed. 
*/ vs->hit_swbreak = false; break; } } gdb_cpu_resume(vcpu); } else { - debug("$vCPU %d injecting breakpoint at rip %#lx\n", vcpu, + debug("$vCPU %d injecting breakpoint at rip %#lx\n", vcpuid, vmexit->rip); - error = vm_set_register(ctx, vcpu, - VM_REG_GUEST_ENTRY_INST_LENGTH, vmexit->u.bpt.inst_length); + error = vm_set_register(vcpu, VM_REG_GUEST_ENTRY_INST_LENGTH, + vmexit->u.bpt.inst_length); assert(error == 0); - error = vm_inject_exception(ctx, vcpu, IDT_BP, 0, 0, 0); + error = vm_inject_exception(vcpu, IDT_BP, 0, 0, 0); assert(error == 0); } pthread_mutex_unlock(&gdb_lock); } static bool -gdb_step_vcpu(int vcpu) +gdb_step_vcpu(struct vcpu *vcpu) { - int error, val; + int error, val, vcpuid; - debug("$vCPU %d step\n", vcpu); - error = vm_get_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, &val); + vcpuid = vcpu_id(vcpu); + debug("$vCPU %d step\n", vcpuid); + error = vm_get_capability(vcpu, VM_CAP_MTRAP_EXIT, &val); if (error < 0) return (false); discard_stop(); - vcpu_state[vcpu].stepping = true; - vm_resume_cpu(ctx, vcpu); - CPU_CLR(vcpu, &vcpus_suspended); + vcpu_state[vcpuid].stepping = true; + vm_resume_cpu(vcpu); + CPU_CLR(vcpuid, &vcpus_suspended); pthread_cond_broadcast(&idle_vcpus); return (true); } static void gdb_resume_vcpus(void) { assert(pthread_mutex_isowned_np(&gdb_lock)); - vm_resume_cpu(ctx, -1); + vm_resume_all_cpus(ctx); debug("resuming all CPUs\n"); CPU_ZERO(&vcpus_suspended); pthread_cond_broadcast(&idle_vcpus); } static void gdb_read_regs(void) { uint64_t regvals[nitems(gdb_regset)]; - if (vm_get_register_set(ctx, cur_vcpu, nitems(gdb_regset), + if (vm_get_register_set(vcpus[cur_vcpu], nitems(gdb_regset), gdb_regset, regvals) == -1) { send_error(errno); return; } start_packet(); for (size_t i = 0; i < nitems(regvals); i++) append_unsigned_native(regvals[i], gdb_regsize[i]); finish_packet(); } static void gdb_read_mem(const uint8_t *data, size_t len) { uint64_t gpa, gva, val; uint8_t *cp; size_t resid, todo, bytes; bool started; int error; /* Skip 'm' */ data += 1; len -= 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse length. */ resid = parse_integer(data, len); started = false; while (resid > 0) { - error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + error = guest_vaddr2paddr(vcpus[cur_vcpu], gva, &gpa); if (error == -1) { if (started) finish_packet(); else send_error(errno); return; } if (error == 0) { if (started) finish_packet(); else send_error(EFAULT); return; } /* Read bytes from current page. */ todo = getpagesize() - gpa % getpagesize(); if (todo > resid) todo = resid; cp = paddr_guest2host(ctx, gpa, todo); if (cp != NULL) { /* * If this page is guest RAM, read it a byte * at a time. */ if (!started) { start_packet(); started = true; } while (todo > 0) { append_byte(*cp); cp++; gpa++; gva++; resid--; todo--; } } else { /* * If this page isn't guest RAM, try to handle * it via MMIO. For MMIO requests, use * aligned reads of words when possible. 
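 * For example, a 7-byte read starting at an odd guest physical
 * address is issued as a 1-byte, then a 2-byte, then a 4-byte access.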
*/ while (todo > 0) { if (gpa & 1 || todo == 1) bytes = 1; else if (gpa & 2 || todo == 2) bytes = 2; else bytes = 4; - error = read_mem(ctx, cur_vcpu, gpa, &val, + error = read_mem(vcpus[cur_vcpu], gpa, &val, bytes); if (error == 0) { if (!started) { start_packet(); started = true; } gpa += bytes; gva += bytes; resid -= bytes; todo -= bytes; while (bytes > 0) { append_byte(val); val >>= 8; bytes--; } } else { if (started) finish_packet(); else send_error(EFAULT); return; } } } assert(resid == 0 || gpa % getpagesize() == 0); } if (!started) start_packet(); finish_packet(); } static void gdb_write_mem(const uint8_t *data, size_t len) { uint64_t gpa, gva, val; uint8_t *cp; size_t resid, todo, bytes; int error; /* Skip 'M' */ data += 1; len -= 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse and consume length. */ cp = memchr(data, ':', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } resid = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Verify the available bytes match the length. */ if (len != resid * 2) { send_error(EINVAL); return; } while (resid > 0) { - error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + error = guest_vaddr2paddr(vcpus[cur_vcpu], gva, &gpa); if (error == -1) { send_error(errno); return; } if (error == 0) { send_error(EFAULT); return; } /* Write bytes to current page. */ todo = getpagesize() - gpa % getpagesize(); if (todo > resid) todo = resid; cp = paddr_guest2host(ctx, gpa, todo); if (cp != NULL) { /* * If this page is guest RAM, write it a byte * at a time. */ while (todo > 0) { assert(len >= 2); *cp = parse_byte(data); data += 2; len -= 2; cp++; gpa++; gva++; resid--; todo--; } } else { /* * If this page isn't guest RAM, try to handle * it via MMIO. For MMIO requests, use * aligned writes of words when possible. */ while (todo > 0) { if (gpa & 1 || todo == 1) { bytes = 1; val = parse_byte(data); } else if (gpa & 2 || todo == 2) { bytes = 2; val = be16toh(parse_integer(data, 4)); } else { bytes = 4; val = be32toh(parse_integer(data, 8)); } - error = write_mem(ctx, cur_vcpu, gpa, val, + error = write_mem(vcpus[cur_vcpu], gpa, val, bytes); if (error == 0) { gpa += bytes; gva += bytes; resid -= bytes; todo -= bytes; data += 2 * bytes; len -= 2 * bytes; } else { send_error(EFAULT); return; } } } assert(resid == 0 || gpa % getpagesize() == 0); } assert(len == 0); send_ok(); } static bool set_breakpoint_caps(bool enable) { cpuset_t mask; int vcpu; mask = vcpus_active; while (!CPU_EMPTY(&mask)) { vcpu = CPU_FFS(&mask) - 1; CPU_CLR(vcpu, &mask); - if (vm_set_capability(ctx, vcpu, VM_CAP_BPT_EXIT, + if (vm_set_capability(vcpus[vcpu], VM_CAP_BPT_EXIT, enable ? 1 : 0) < 0) return (false); debug("$vCPU %d %sabled breakpoint exits\n", vcpu, enable ? 
"en" : "dis"); } return (true); } static void remove_all_sw_breakpoints(void) { struct breakpoint *bp, *nbp; uint8_t *cp; if (TAILQ_EMPTY(&breakpoints)) return; TAILQ_FOREACH_SAFE(bp, &breakpoints, link, nbp) { debug("remove breakpoint at %#lx\n", bp->gpa); cp = paddr_guest2host(ctx, bp->gpa, 1); *cp = bp->shadow_inst; TAILQ_REMOVE(&breakpoints, bp, link); free(bp); } TAILQ_INIT(&breakpoints); set_breakpoint_caps(false); } static void update_sw_breakpoint(uint64_t gva, int kind, bool insert) { struct breakpoint *bp; uint64_t gpa; uint8_t *cp; int error; if (kind != 1) { send_error(EINVAL); return; } - error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + error = guest_vaddr2paddr(vcpus[cur_vcpu], gva, &gpa); if (error == -1) { send_error(errno); return; } if (error == 0) { send_error(EFAULT); return; } cp = paddr_guest2host(ctx, gpa, 1); /* Only permit breakpoints in guest RAM. */ if (cp == NULL) { send_error(EFAULT); return; } /* Find any existing breakpoint. */ bp = find_breakpoint(gpa); /* * Silently ignore duplicate commands since the protocol * requires these packets to be idempotent. */ if (insert) { if (bp == NULL) { if (TAILQ_EMPTY(&breakpoints) && !set_breakpoint_caps(true)) { send_empty_response(); return; } bp = malloc(sizeof(*bp)); bp->gpa = gpa; bp->shadow_inst = *cp; *cp = 0xcc; /* INT 3 */ TAILQ_INSERT_TAIL(&breakpoints, bp, link); debug("new breakpoint at %#lx\n", gpa); } } else { if (bp != NULL) { debug("remove breakpoint at %#lx\n", gpa); *cp = bp->shadow_inst; TAILQ_REMOVE(&breakpoints, bp, link); free(bp); if (TAILQ_EMPTY(&breakpoints)) set_breakpoint_caps(false); } } send_ok(); } static void parse_breakpoint(const uint8_t *data, size_t len) { uint64_t gva; uint8_t *cp; bool insert; int kind, type; insert = data[0] == 'Z'; /* Skip 'Z/z' */ data += 1; len -= 1; /* Parse and consume type. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } type = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse and consume kind. */ cp = memchr(data, ';', len); if (cp == data) { send_error(EINVAL); return; } if (cp != NULL) { /* * We do not advertise support for either the * ConditionalBreakpoints or BreakpointCommands * features, so we should not be getting conditions or * commands from the remote end. */ send_empty_response(); return; } kind = parse_integer(data, len); data += len; len = 0; switch (type) { case 0: update_sw_breakpoint(gva, kind, insert); break; default: send_empty_response(); break; } } static bool command_equals(const uint8_t *data, size_t len, const char *cmd) { if (strlen(cmd) > len) return (false); return (memcmp(data, cmd, strlen(cmd)) == 0); } static void check_features(const uint8_t *data, size_t len) { char *feature, *next_feature, *str, *value; bool supported; str = malloc(len + 1); memcpy(str, data, len); str[len] = '\0'; next_feature = str; while ((feature = strsep(&next_feature, ";")) != NULL) { /* * Null features shouldn't exist, but skip if they * do. */ if (strcmp(feature, "") == 0) continue; /* * Look for the value or supported / not supported * flag. 
*/ value = strchr(feature, '='); if (value != NULL) { *value = '\0'; value++; supported = true; } else { value = feature + strlen(feature) - 1; switch (*value) { case '+': supported = true; break; case '-': supported = false; break; default: /* * This is really a protocol error, * but we just ignore malformed * features for ease of * implementation. */ continue; } value = NULL; } if (strcmp(feature, "swbreak") == 0) swbreak_enabled = supported; } free(str); start_packet(); /* This is an arbitrary limit. */ append_string("PacketSize=4096"); append_string(";swbreak+"); finish_packet(); } static void gdb_query(const uint8_t *data, size_t len) { /* * TODO: * - qSearch */ if (command_equals(data, len, "qAttached")) { start_packet(); append_char('1'); finish_packet(); } else if (command_equals(data, len, "qC")) { start_packet(); append_string("QC"); append_integer(cur_vcpu + 1); finish_packet(); } else if (command_equals(data, len, "qfThreadInfo")) { cpuset_t mask; bool first; int vcpu; if (CPU_EMPTY(&vcpus_active)) { send_error(EINVAL); return; } mask = vcpus_active; start_packet(); append_char('m'); first = true; while (!CPU_EMPTY(&mask)) { vcpu = CPU_FFS(&mask) - 1; CPU_CLR(vcpu, &mask); if (first) first = false; else append_char(','); append_integer(vcpu + 1); } finish_packet(); } else if (command_equals(data, len, "qsThreadInfo")) { start_packet(); append_char('l'); finish_packet(); } else if (command_equals(data, len, "qSupported")) { data += strlen("qSupported"); len -= strlen("qSupported"); check_features(data, len); } else if (command_equals(data, len, "qThreadExtraInfo")) { char buf[16]; int tid; data += strlen("qThreadExtraInfo"); len -= strlen("qThreadExtraInfo"); if (*data != ',') { send_error(EINVAL); return; } tid = parse_threadid(data + 1, len - 1); if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { send_error(EINVAL); return; } snprintf(buf, sizeof(buf), "vCPU %d", tid - 1); start_packet(); append_asciihex(buf); finish_packet(); } else send_empty_response(); } static void handle_command(const uint8_t *data, size_t len) { /* Reject packets with a sequence-id. */ if (len >= 3 && data[0] >= '0' && data[0] <= '9' && data[0] >= '0' && data[0] <= '9' && data[2] == ':') { send_empty_response(); return; } switch (*data) { case 'c': if (len != 1) { send_error(EINVAL); break; } discard_stop(); gdb_resume_vcpus(); break; case 'D': send_ok(); /* TODO: Resume any stopped CPUs. */ break; case 'g': { gdb_read_regs(); break; } case 'H': { int tid; if (data[1] != 'g' && data[1] != 'c') { send_error(EINVAL); break; } tid = parse_threadid(data + 2, len - 2); if (tid == -2) { send_error(EINVAL); break; } if (CPU_EMPTY(&vcpus_active)) { send_error(EINVAL); break; } if (tid == -1 || tid == 0) cur_vcpu = CPU_FFS(&vcpus_active) - 1; else if (CPU_ISSET(tid - 1, &vcpus_active)) cur_vcpu = tid - 1; else { send_error(EINVAL); break; } send_ok(); break; } case 'm': gdb_read_mem(data, len); break; case 'M': gdb_write_mem(data, len); break; case 'T': { int tid; tid = parse_threadid(data + 1, len - 1); if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { send_error(EINVAL); return; } send_ok(); break; } case 'q': gdb_query(data, len); break; case 's': if (len != 1) { send_error(EINVAL); break; } /* Don't send a reply until a stop occurs. 
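 * (The stop reply is built later by report_stop() from
 * gdb_finish_suspend_vcpus() once the stepped vCPU reports its trap.)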
*/ - if (!gdb_step_vcpu(cur_vcpu)) { + if (!gdb_step_vcpu(vcpus[cur_vcpu])) { send_error(EOPNOTSUPP); break; } break; case 'z': case 'Z': parse_breakpoint(data, len); break; case '?': report_stop(false); break; case 'G': /* TODO */ case 'v': /* Handle 'vCont' */ /* 'vCtrlC' */ case 'p': /* TODO */ case 'P': /* TODO */ case 'Q': /* TODO */ case 't': /* TODO */ case 'X': /* TODO */ default: send_empty_response(); } } /* Check for a valid packet in the command buffer. */ static void check_command(int fd) { uint8_t *head, *hash, *p, sum; size_t avail, plen; for (;;) { avail = cur_comm.len; if (avail == 0) return; head = io_buffer_head(&cur_comm); switch (*head) { case 0x03: debug("<- Ctrl-C\n"); io_buffer_consume(&cur_comm, 1); gdb_suspend_vcpus(); break; case '+': /* ACK of previous response. */ debug("<- +\n"); if (response_pending()) io_buffer_reset(&cur_resp); io_buffer_consume(&cur_comm, 1); if (stopped_vcpu != -1 && report_next_stop) { report_stop(true); send_pending_data(fd); } break; case '-': /* NACK of previous response. */ debug("<- -\n"); if (response_pending()) { cur_resp.len += cur_resp.start; cur_resp.start = 0; if (cur_resp.data[0] == '+') io_buffer_advance(&cur_resp, 1); debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); } io_buffer_consume(&cur_comm, 1); send_pending_data(fd); break; case '$': /* Packet. */ if (response_pending()) { warnx("New GDB command while response in " "progress"); io_buffer_reset(&cur_resp); } /* Is packet complete? */ hash = memchr(head, '#', avail); if (hash == NULL) return; plen = (hash - head + 1) + 2; if (avail < plen) return; debug("<- %.*s\n", (int)plen, head); /* Verify checksum. */ for (sum = 0, p = head + 1; p < hash; p++) sum += *p; if (sum != parse_byte(hash + 1)) { io_buffer_consume(&cur_comm, plen); debug("-> -\n"); send_char('-'); send_pending_data(fd); break; } send_char('+'); handle_command(head + 1, hash - (head + 1)); io_buffer_consume(&cur_comm, plen); if (!response_pending()) debug("-> +\n"); send_pending_data(fd); break; default: /* XXX: Possibly drop connection instead. */ debug("-> %02x\n", *head); io_buffer_consume(&cur_comm, 1); break; } } } static void gdb_readable(int fd, enum ev_type event __unused, void *arg __unused) { size_t pending; ssize_t nread; int n; if (ioctl(fd, FIONREAD, &n) == -1) { warn("FIONREAD on GDB socket"); return; } assert(n >= 0); pending = n; /* * 'pending' might be zero due to EOF. We need to call read * with a non-zero length to detect EOF. */ if (pending == 0) pending = 1; /* Ensure there is room in the command buffer. */ io_buffer_grow(&cur_comm, pending); assert(io_buffer_avail(&cur_comm) >= pending); nread = read(fd, io_buffer_tail(&cur_comm), io_buffer_avail(&cur_comm)); if (nread == 0) { close_connection(); } else if (nread == -1) { if (errno == EAGAIN) return; warn("Read from GDB socket"); close_connection(); } else { cur_comm.len += nread; pthread_mutex_lock(&gdb_lock); check_command(fd); pthread_mutex_unlock(&gdb_lock); } } static void gdb_writable(int fd, enum ev_type event __unused, void *arg __unused) { send_pending_data(fd); } static void new_connection(int fd, enum ev_type event __unused, void *arg) { int optval, s; s = accept4(fd, NULL, NULL, SOCK_NONBLOCK); if (s == -1) { if (arg != NULL) err(1, "Failed accepting initial GDB connection"); /* Silently ignore errors post-startup. 
*/ return; } optval = 1; if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &optval, sizeof(optval)) == -1) { warn("Failed to disable SIGPIPE for GDB connection"); close(s); return; } pthread_mutex_lock(&gdb_lock); if (cur_fd != -1) { close(s); warnx("Ignoring additional GDB connection."); } read_event = mevent_add(s, EVF_READ, gdb_readable, NULL); if (read_event == NULL) { if (arg != NULL) err(1, "Failed to setup initial GDB connection"); pthread_mutex_unlock(&gdb_lock); return; } write_event = mevent_add(s, EVF_WRITE, gdb_writable, NULL); if (write_event == NULL) { if (arg != NULL) err(1, "Failed to setup initial GDB connection"); mevent_delete_close(read_event); read_event = NULL; } cur_fd = s; cur_vcpu = 0; stopped_vcpu = -1; /* Break on attach. */ first_stop = true; report_next_stop = false; gdb_suspend_vcpus(); pthread_mutex_unlock(&gdb_lock); } #ifndef WITHOUT_CAPSICUM static void limit_gdb_socket(int s) { cap_rights_t rights; unsigned long ioctls[] = { FIONREAD }; cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE, CAP_SETSOCKOPT, CAP_IOCTL); if (caph_rights_limit(s, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(s, ioctls, nitems(ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); } #endif void init_gdb(struct vmctx *_ctx) { int error, flags, optval, s; struct addrinfo hints; struct addrinfo *gdbaddr; const char *saddr, *value; char *sport; bool wait; value = get_config_value("gdb.port"); if (value == NULL) return; sport = strdup(value); if (sport == NULL) errx(4, "Failed to allocate memory"); wait = get_config_bool_default("gdb.wait", false); saddr = get_config_value("gdb.address"); if (saddr == NULL) { saddr = "localhost"; } debug("==> starting on %s:%s, %swaiting\n", saddr, sport, wait ? "" : "not "); error = pthread_mutex_init(&gdb_lock, NULL); if (error != 0) errc(1, error, "gdb mutex init"); error = pthread_cond_init(&idle_vcpus, NULL); if (error != 0) errc(1, error, "gdb cv init"); memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_NUMERICSERV | AI_PASSIVE; error = getaddrinfo(saddr, sport, &hints, &gdbaddr); if (error != 0) errx(1, "gdb address resolution: %s", gai_strerror(error)); ctx = _ctx; s = socket(gdbaddr->ai_family, gdbaddr->ai_socktype, 0); if (s < 0) err(1, "gdb socket create"); optval = 1; (void)setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)); if (bind(s, gdbaddr->ai_addr, gdbaddr->ai_addrlen) < 0) err(1, "gdb socket bind"); if (listen(s, 1) < 0) err(1, "gdb socket listen"); stopped_vcpu = -1; TAILQ_INIT(&breakpoints); + vcpus = calloc(guest_ncpus, sizeof(*vcpus)); vcpu_state = calloc(guest_ncpus, sizeof(*vcpu_state)); if (wait) { /* * Set vcpu 0 in vcpus_suspended. This will trigger the * logic in gdb_cpu_add() to suspend the first vcpu before * it starts execution. The vcpu will remain suspended * until a debugger connects. */ CPU_SET(0, &vcpus_suspended); stopped_vcpu = 0; } flags = fcntl(s, F_GETFL); if (fcntl(s, F_SETFL, flags | O_NONBLOCK) == -1) err(1, "Failed to mark gdb socket non-blocking"); #ifndef WITHOUT_CAPSICUM limit_gdb_socket(s); #endif mevent_add(s, EVF_READ, new_connection, NULL); gdb_active = true; freeaddrinfo(gdbaddr); free(sport); } diff --git a/usr.sbin/bhyve/gdb.h b/usr.sbin/bhyve/gdb.h index c5fa522c63e0..f132707cce3f 100644 --- a/usr.sbin/bhyve/gdb.h +++ b/usr.sbin/bhyve/gdb.h @@ -1,39 +1,39 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017 John H. 
Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __GDB_H__ #define __GDB_H__ -void gdb_cpu_add(int vcpu); -void gdb_cpu_breakpoint(int vcpu, struct vm_exit *vmexit); -void gdb_cpu_mtrap(int vcpu); -void gdb_cpu_suspend(int vcpu); +void gdb_cpu_add(struct vcpu *vcpu); +void gdb_cpu_breakpoint(struct vcpu *vcpu, struct vm_exit *vmexit); +void gdb_cpu_mtrap(struct vcpu *vcpu); +void gdb_cpu_suspend(struct vcpu *vcpu); void init_gdb(struct vmctx *ctx); #endif /* !__GDB_H__ */ diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c index 225697906393..043fd5eef7af 100644 --- a/usr.sbin/bhyve/inout.c +++ b/usr.sbin/bhyve/inout.c @@ -1,301 +1,301 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "inout.h" SET_DECLARE(inout_port_set, struct inout_port); #define MAX_IOPORTS (1 << 16) #define VERIFY_IOPORT(port, size) \ assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS) static struct { const char *name; int flags; inout_func_t handler; void *arg; } inout_handlers[MAX_IOPORTS]; static int default_inout(struct vmctx *ctx __unused, int in, int port __unused, int bytes, uint32_t *eax, void *arg __unused) { if (in) { switch (bytes) { case 4: *eax = 0xffffffff; break; case 2: *eax = 0xffff; break; case 1: *eax = 0xff; break; } } return (0); } static void register_default_iohandler(int start, int size) { struct inout_port iop; VERIFY_IOPORT(start, size); bzero(&iop, sizeof(iop)); iop.name = "default"; iop.port = start; iop.size = size; iop.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT; iop.handler = default_inout; register_inout(&iop); } int -emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) +emulate_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit) { int addrsize, bytes, flags, in, port, prot, rep; uint32_t eax, val; inout_func_t handler; void *arg; int error, fault, retval; enum vm_reg_name idxreg; uint64_t gla, index, iterations, count; struct vm_inout_str *vis; struct iovec iov[2]; bytes = vmexit->u.inout.bytes; in = vmexit->u.inout.in; port = vmexit->u.inout.port; assert(port < MAX_IOPORTS); assert(bytes == 1 || bytes == 2 || bytes == 4); handler = inout_handlers[port].handler; if (handler == default_inout && get_config_bool_default("x86.strictio", false)) return (-1); flags = inout_handlers[port].flags; arg = inout_handlers[port].arg; if (in) { if (!(flags & IOPORT_F_IN)) return (-1); } else { if (!(flags & IOPORT_F_OUT)) return (-1); } retval = 0; if (vmexit->u.inout.string) { vis = &vmexit->u.inout_str; rep = vis->inout.rep; addrsize = vis->addrsize; prot = in ? PROT_WRITE : PROT_READ; assert(addrsize == 2 || addrsize == 4 || addrsize == 8); /* Index register */ idxreg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; index = vis->index & vie_size2mask(addrsize); /* Count register */ count = vis->count & vie_size2mask(addrsize); /* Limit number of back-to-back in/out emulations to 16 */ iterations = MIN(count, 16); while (iterations > 0) { assert(retval == 0); if (vie_calculate_gla(vis->paging.cpu_mode, vis->seg_name, &vis->seg_desc, index, bytes, addrsize, prot, &gla)) { - vm_inject_gp(ctx, vcpu); + vm_inject_gp(vcpu); break; } - error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, + error = vm_copy_setup(vcpu, &vis->paging, gla, bytes, prot, iov, nitems(iov), &fault); if (error) { retval = -1; /* Unrecoverable error */ break; } else if (fault) { retval = 0; /* Resume guest to handle fault */ break; } if (vie_alignment_check(vis->paging.cpl, bytes, vis->cr0, vis->rflags, gla)) { - vm_inject_ac(ctx, vcpu, 0); + vm_inject_ac(vcpu, 0); break; } val = 0; if (!in) vm_copyin(iov, &val, bytes); retval = handler(ctx, in, port, bytes, &val, arg); if (retval != 0) break; if (in) vm_copyout(&val, iov, bytes); /* Update index */ if (vis->rflags & PSL_D) index -= bytes; else index += bytes; count--; iterations--; } /* Update index register */ - error = vie_update_register(ctx, vcpu, idxreg, index, addrsize); + error = vie_update_register(vcpu, idxreg, index, addrsize); assert(error == 0); /* * Update count register only if the instruction had a repeat * prefix. */ if (rep) { - error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX, + error = vie_update_register(vcpu, VM_REG_GUEST_RCX, count, addrsize); assert(error == 0); } /* Restart the instruction if more iterations remain */ if (retval == 0 && count != 0) { - error = vm_restart_instruction(ctx, vcpu); + error = vm_restart_instruction(vcpu); assert(error == 0); } } else { eax = vmexit->u.inout.eax; val = eax & vie_size2mask(bytes); retval = handler(ctx, in, port, bytes, &val, arg); if (retval == 0 && in) { eax &= ~vie_size2mask(bytes); eax |= val & vie_size2mask(bytes); - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, + error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); } } return (retval); } void init_inout(void) { struct inout_port **iopp, *iop; /* * Set up the default handler for all ports */ register_default_iohandler(0, MAX_IOPORTS); /* * Overwrite with specified handlers */ SET_FOREACH(iopp, inout_port_set) { iop = *iopp; assert(iop->port < MAX_IOPORTS); inout_handlers[iop->port].name = iop->name; inout_handlers[iop->port].flags = iop->flags; inout_handlers[iop->port].handler = iop->handler; inout_handlers[iop->port].arg = NULL; } } int register_inout(struct inout_port *iop) { int i; VERIFY_IOPORT(iop->port, iop->size); /* * Verify that the new registration is not overwriting an already * allocated i/o range. 
*/ if ((iop->flags & IOPORT_F_DEFAULT) == 0) { for (i = iop->port; i < iop->port + iop->size; i++) { if ((inout_handlers[i].flags & IOPORT_F_DEFAULT) == 0) return (-1); } } for (i = iop->port; i < iop->port + iop->size; i++) { inout_handlers[i].name = iop->name; inout_handlers[i].flags = iop->flags; inout_handlers[i].handler = iop->handler; inout_handlers[i].arg = iop->arg; } return (0); } int unregister_inout(struct inout_port *iop) { VERIFY_IOPORT(iop->port, iop->size); assert(inout_handlers[iop->port].name == iop->name); register_default_iohandler(iop->port, iop->size); return (0); } diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h index a3a37e03ff68..e21bfe7fba19 100644 --- a/usr.sbin/bhyve/inout.h +++ b/usr.sbin/bhyve/inout.h @@ -1,79 +1,80 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _INOUT_H_ #define _INOUT_H_ #include +struct vcpu; struct vmctx; struct vm_exit; /* * inout emulation handlers return 0 on success and -1 on failure. */ typedef int (*inout_func_t)(struct vmctx *ctx, int in, int port, int bytes, uint32_t *eax, void *arg); struct inout_port { const char *name; int port; int size; int flags; inout_func_t handler; void *arg; }; #define IOPORT_F_IN 0x1 #define IOPORT_F_OUT 0x2 #define IOPORT_F_INOUT (IOPORT_F_IN | IOPORT_F_OUT) /* * The following flags are used internally and must not be used by * device models. 
*/ #define IOPORT_F_DEFAULT 0x80000000 /* claimed by default handler */ #define INOUT_PORT(name, port, flags, handler) \ static struct inout_port __CONCAT(__inout_port, __LINE__) = { \ #name, \ (port), \ 1, \ (flags), \ (handler), \ 0 \ }; \ DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__)) void init_inout(void); -int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit); +int emulate_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit); int register_inout(struct inout_port *iop); int unregister_inout(struct inout_port *iop); #endif /* _INOUT_H_ */ diff --git a/usr.sbin/bhyve/kernemu_dev.c b/usr.sbin/bhyve/kernemu_dev.c index 2fa0c3dc1f35..84f096a16c3a 100644 --- a/usr.sbin/bhyve/kernemu_dev.c +++ b/usr.sbin/bhyve/kernemu_dev.c @@ -1,98 +1,98 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright 2020 Conrad Meyer . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include struct vm; struct vm_hpet_cap; #include #include #include #include #include #include "kernemu_dev.h" #include "mem.h" static int -apic_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, +apic_handler(struct vcpu *vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1 __unused, long arg2 __unused) { - if (vm_readwrite_kernemu_device(ctx, vcpu, addr, (dir == MEM_F_WRITE), + if (vm_readwrite_kernemu_device(vcpu, addr, (dir == MEM_F_WRITE), size, val) != 0) return (errno); return (0); } static struct mem_range lapic_mmio = { .name = "kern-lapic-mmio", .base = DEFAULT_APIC_BASE, .size = PAGE_SIZE, .flags = MEM_F_RW | MEM_F_IMMUTABLE, .handler = apic_handler, }; static struct mem_range ioapic_mmio = { .name = "kern-ioapic-mmio", .base = VIOAPIC_BASE, .size = VIOAPIC_SIZE, .flags = MEM_F_RW | MEM_F_IMMUTABLE, .handler = apic_handler, }; static struct mem_range hpet_mmio = { .name = "kern-hpet-mmio", .base = VHPET_BASE, .size = VHPET_SIZE, .flags = MEM_F_RW | MEM_F_IMMUTABLE, .handler = apic_handler, }; void kernemu_dev_init(void) { int rc; rc = register_mem(&lapic_mmio); if (rc != 0) errc(4, rc, "register_mem: LAPIC (0x%08x)", (unsigned)lapic_mmio.base); rc = register_mem(&ioapic_mmio); if (rc != 0) errc(4, rc, "register_mem: IOAPIC (0x%08x)", (unsigned)ioapic_mmio.base); rc = register_mem(&hpet_mmio); if (rc != 0) errc(4, rc, "register_mem: HPET (0x%08x)", (unsigned)hpet_mmio.base); } diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c index 10cc7bbaa7fc..ccb0c69c5a8e 100644 --- a/usr.sbin/bhyve/mem.c +++ b/usr.sbin/bhyve/mem.c @@ -1,379 +1,377 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Memory ranges are represented with an RB tree. On insertion, the range * is checked for overlaps. On lookup, the key has the same base and limit * so it can be searched within the range. 
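 * For example, a lookup of guest physical address A builds a key with
 * mr_base == mr_end == A; because the comparator treats any overlap as
 * equality, RB_FIND returns the registered range containing A, if any.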
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include +#include #include "mem.h" struct mmio_rb_range { RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ struct mem_range mr_param; uint64_t mr_base; uint64_t mr_end; }; struct mmio_rb_tree; RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); static RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback; /* * Per-vCPU cache. Since most accesses from a vCPU will be to * consecutive addresses in a range, it makes sense to cache the * result of a lookup. */ static struct mmio_rb_range **mmio_hint; static int mmio_ncpu; static pthread_rwlock_t mmio_rwlock; static int mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b) { if (a->mr_end < b->mr_base) return (-1); else if (a->mr_base > b->mr_end) return (1); return (0); } static int mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr, struct mmio_rb_range **entry) { struct mmio_rb_range find, *res; find.mr_base = find.mr_end = addr; res = RB_FIND(mmio_rb_tree, rbt, &find); if (res != NULL) { *entry = res; return (0); } return (ENOENT); } static int mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) { struct mmio_rb_range *overlap; overlap = RB_INSERT(mmio_rb_tree, rbt, new); if (overlap != NULL) { #ifdef RB_DEBUG printf("overlap detected: new %lx:%lx, tree %lx:%lx, '%s' " "claims region already claimed for '%s'\n", new->mr_base, new->mr_end, overlap->mr_base, overlap->mr_end, new->mr_param.name, overlap->mr_param.name); #endif return (EEXIST); } return (0); } #if 0 static void mmio_rb_dump(struct mmio_rb_tree *rbt) { int perror; struct mmio_rb_range *np; pthread_rwlock_rdlock(&mmio_rwlock); RB_FOREACH(np, mmio_rb_tree, rbt) { printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, np->mr_param.name); } perror = pthread_rwlock_unlock(&mmio_rwlock); assert(perror == 0); } #endif RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); -typedef int (mem_cb_t)(struct vmctx *ctx, int vcpu, uint64_t gpa, - struct mem_range *mr, void *arg); +typedef int (mem_cb_t)(struct vcpu *vcpu, uint64_t gpa, struct mem_range *mr, + void *arg); static int -mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) +mem_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) { int error; struct mem_range *mr = arg; - error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size, - rval, mr->arg1, mr->arg2); + error = (*mr->handler)(vcpu, MEM_F_READ, gpa, size, rval, mr->arg1, + mr->arg2); return (error); } static int -mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) +mem_write(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) { int error; struct mem_range *mr = arg; - error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size, - &wval, mr->arg1, mr->arg2); + error = (*mr->handler)(vcpu, MEM_F_WRITE, gpa, size, &wval, mr->arg1, + mr->arg2); return (error); } static int -access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, - void *arg) +access_memory(struct vcpu *vcpu, uint64_t paddr, mem_cb_t *cb, void *arg) { struct mmio_rb_range *entry; - int err, perror, immutable; + int err, perror, immutable, vcpuid; + vcpuid = vcpu_id(vcpu); pthread_rwlock_rdlock(&mmio_rwlock); /* * First check the per-vCPU cache */ - if (mmio_hint[vcpu] && - paddr >= mmio_hint[vcpu]->mr_base && - paddr <= mmio_hint[vcpu]->mr_end) { - entry = mmio_hint[vcpu]; + if (mmio_hint[vcpuid] && + 
paddr >= mmio_hint[vcpuid]->mr_base && + paddr <= mmio_hint[vcpuid]->mr_end) { + entry = mmio_hint[vcpuid]; } else entry = NULL; if (entry == NULL) { if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) { /* Update the per-vCPU cache */ - mmio_hint[vcpu] = entry; + mmio_hint[vcpuid] = entry; } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { perror = pthread_rwlock_unlock(&mmio_rwlock); assert(perror == 0); return (ESRCH); } } assert(entry != NULL); /* * An 'immutable' memory range is guaranteed to be never removed * so there is no need to hold 'mmio_rwlock' while calling the * handler. * * XXX writes to the PCIR_COMMAND register can cause register_mem() * to be called. If the guest is using PCI extended config space * to modify the PCIR_COMMAND register then register_mem() can * deadlock on 'mmio_rwlock'. However by registering the extended * config space window as 'immutable' the deadlock can be avoided. */ immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); if (immutable) { perror = pthread_rwlock_unlock(&mmio_rwlock); assert(perror == 0); } - err = cb(ctx, vcpu, paddr, &entry->mr_param, arg); + err = cb(vcpu, paddr, &entry->mr_param, arg); if (!immutable) { perror = pthread_rwlock_unlock(&mmio_rwlock); assert(perror == 0); } - return (err); } struct emulate_mem_args { struct vie *vie; struct vm_guest_paging *paging; }; static int -emulate_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, +emulate_mem_cb(struct vcpu *vcpu, uint64_t paddr, struct mem_range *mr, void *arg) { struct emulate_mem_args *ema; ema = arg; - return (vmm_emulate_instruction(ctx, vcpu, paddr, ema->vie, ema->paging, + return (vmm_emulate_instruction(vcpu, paddr, ema->vie, ema->paging, mem_read, mem_write, mr)); } int -emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, +emulate_mem(struct vcpu *vcpu, uint64_t paddr, struct vie *vie, struct vm_guest_paging *paging) - { struct emulate_mem_args ema; ema.vie = vie; ema.paging = paging; - return (access_memory(ctx, vcpu, paddr, emulate_mem_cb, &ema)); + return (access_memory(vcpu, paddr, emulate_mem_cb, &ema)); } struct rw_mem_args { uint64_t *val; int size; int operation; }; static int -rw_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, - void *arg) +rw_mem_cb(struct vcpu *vcpu, uint64_t paddr, struct mem_range *mr, void *arg) { struct rw_mem_args *rma; rma = arg; - return (mr->handler(ctx, vcpu, rma->operation, paddr, rma->size, + return (mr->handler(vcpu, rma->operation, paddr, rma->size, rma->val, mr->arg1, mr->arg2)); } int -read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size) +read_mem(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size) { struct rw_mem_args rma; rma.val = rval; rma.size = size; rma.operation = MEM_F_READ; - return (access_memory(ctx, vcpu, gpa, rw_mem_cb, &rma)); + return (access_memory(vcpu, gpa, rw_mem_cb, &rma)); } int -write_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size) +write_mem(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int size) { struct rw_mem_args rma; rma.val = &wval; rma.size = size; rma.operation = MEM_F_WRITE; - return (access_memory(ctx, vcpu, gpa, rw_mem_cb, &rma)); + return (access_memory(vcpu, gpa, rw_mem_cb, &rma)); } static int register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) { struct mmio_rb_range *entry, *mrp; int err, perror; err = 0; mrp = malloc(sizeof(struct mmio_rb_range)); if (mrp == NULL) { warn("%s: couldn't allocate memory for mrp\n", __func__); err = 
ENOMEM; } else { mrp->mr_param = *memp; mrp->mr_base = memp->base; mrp->mr_end = memp->base + memp->size - 1; pthread_rwlock_wrlock(&mmio_rwlock); if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) err = mmio_rb_add(rbt, mrp); perror = pthread_rwlock_unlock(&mmio_rwlock); assert(perror == 0); if (err) free(mrp); } return (err); } int register_mem(struct mem_range *memp) { return (register_mem_int(&mmio_rb_root, memp)); } int register_mem_fallback(struct mem_range *memp) { return (register_mem_int(&mmio_rb_fallback, memp)); } int unregister_mem(struct mem_range *memp) { struct mem_range *mr; struct mmio_rb_range *entry = NULL; int err, perror, i; pthread_rwlock_wrlock(&mmio_rwlock); err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); if (err == 0) { mr = &entry->mr_param; assert(mr->name == memp->name); assert(mr->base == memp->base && mr->size == memp->size); assert((mr->flags & MEM_F_IMMUTABLE) == 0); RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry); /* flush Per-vCPU cache */ for (i = 0; i < mmio_ncpu; i++) { if (mmio_hint[i] == entry) mmio_hint[i] = NULL; } } perror = pthread_rwlock_unlock(&mmio_rwlock); assert(perror == 0); if (entry) free(entry); return (err); } void init_mem(int ncpu) { mmio_ncpu = ncpu; mmio_hint = calloc(ncpu, sizeof(*mmio_hint)); RB_INIT(&mmio_rb_root); RB_INIT(&mmio_rb_fallback); pthread_rwlock_init(&mmio_rwlock, NULL); } diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h index 965079107476..c5ed70070fb9 100644 --- a/usr.sbin/bhyve/mem.h +++ b/usr.sbin/bhyve/mem.h @@ -1,67 +1,65 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 * * $FreeBSD$ */ #ifndef _MEM_H_ #define _MEM_H_ #include -struct vmctx; +struct vcpu; -typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, +typedef int (*mem_func_t)(struct vcpu *vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2); struct mem_range { const char *name; int flags; mem_func_t handler; void *arg1; long arg2; uint64_t base; uint64_t size; }; #define MEM_F_READ 0x1 #define MEM_F_WRITE 0x2 #define MEM_F_RW 0x3 #define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */ void init_mem(int ncpu); -int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, +int emulate_mem(struct vcpu *vcpu, uint64_t paddr, struct vie *vie, struct vm_guest_paging *paging); -int read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, - int size); +int read_mem(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size); int register_mem(struct mem_range *memp); int register_mem_fallback(struct mem_range *memp); int unregister_mem(struct mem_range *memp); -int write_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t wval, - int size); +int write_mem(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int size); #endif /* _MEM_H_ */ diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 93411751d635..48d80a63ea51 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -1,2642 +1,2641 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi.h" #include "bhyverun.h" #include "config.h" #include "debug.h" #include "inout.h" #include "ioapic.h" #include "mem.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) #define GB (1024 * 1024 * 1024UL) struct funcinfo { nvlist_t *fi_config; struct pci_devemu *fi_pde; struct pci_devinst *fi_devi; }; struct intxinfo { int ii_count; int ii_pirq_pin; int ii_ioapic_irq; }; struct slotinfo { struct intxinfo si_intpins[4]; struct funcinfo si_funcs[MAXFUNCS]; }; struct businfo { uint16_t iobase, iolimit; /* I/O window */ uint32_t membase32, memlimit32; /* mmio window below 4GB */ uint64_t membase64, memlimit64; /* mmio window above 4GB */ struct slotinfo slotinfo[MAXSLOTS]; }; static struct businfo *pci_businfo[MAXBUSES]; SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; static uint8_t *pci_emul_rombase; static uint64_t pci_emul_romoffset; static uint8_t *pci_emul_romlim; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; static uint64_t pci_emul_memlim64; struct pci_bar_allocation { TAILQ_ENTRY(pci_bar_allocation) chain; struct pci_devinst *pdi; int idx; enum pcibar_type type; uint64_t size; }; static TAILQ_HEAD(pci_bar_list, pci_bar_allocation) pci_bars = TAILQ_HEAD_INITIALIZER(pci_bars); #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 #define PCI_EMUL_ROMSIZE 0x10000000 #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); /* * OVMF always uses 0xC0000000 as base address for 32 bit PCI MMIO. Don't * change this address without changing it in OVMF. */ #define PCI_EMUL_MEMBASE32 0xC0000000 #define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMSIZE64 (32*GB) static struct pci_devemu *pci_emul_finddev(const char *name); static void pci_lintr_route(struct pci_devinst *pi); static void pci_lintr_update(struct pci_devinst *pi); static void pci_cfgrw(int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); static __inline void CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) { if (bytes == 1) pci_set_cfgdata8(pi, coff, val); else if (bytes == 2) pci_set_cfgdata16(pi, coff, val); else pci_set_cfgdata32(pi, coff, val); } static __inline uint32_t CFGREAD(struct pci_devinst *pi, int coff, int bytes) { if (bytes == 1) return (pci_get_cfgdata8(pi, coff)); else if (bytes == 2) return (pci_get_cfgdata16(pi, coff)); else return (pci_get_cfgdata32(pi, coff)); } static int is_pcir_bar(int coff) { return (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)); } static int is_pcir_bios(int coff) { return (coff >= PCIR_BIOS && coff < PCIR_BIOS + 4); } /* * I/O access */ /* * Slot options are in the form: * * ::,[,] * [:],[,] * * slot is 0..31 * func is 0..7 * emul is a string describing the type of PCI device e.g. virtio-net * config is an optional string, depending on the device, that can be * used for configuration. 
* Examples are: * 1,virtio-net,tap0 * 3:0,dummy */ static void pci_parse_slot_usage(char *aopt) { EPRINTLN("Invalid PCI slot info field \"%s\"", aopt); } /* * Helper function to parse a list of comma-separated options where * each option is formatted as "name[=value]". If no value is * provided, the option is treated as a boolean and is given a value * of true. */ int pci_parse_legacy_config(nvlist_t *nvl, const char *opt) { char *config, *name, *tofree, *value; if (opt == NULL) return (0); config = tofree = strdup(opt); while ((name = strsep(&config, ",")) != NULL) { value = strchr(name, '='); if (value != NULL) { *value = '\0'; value++; set_config_value_node(nvl, name, value); } else set_config_bool_node(nvl, name, true); } free(tofree); return (0); } /* * PCI device configuration is stored in MIBs that encode the device's * location: * * pci... * * Where "bus", "slot", and "func" are all decimal values without * leading zeroes. Each valid device must have a "device" node which * identifies the driver model of the device. * * Device backends can provide a parser for the "config" string. If * a custom parser is not provided, pci_parse_legacy_config() is used * to parse the string. */ int pci_parse_slot(char *opt) { char node_name[sizeof("pci.XXX.XX.X")]; struct pci_devemu *pde; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; nvlist_t *nvl; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* :: */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* : */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } pde = pci_emul_finddev(emul); if (pde == NULL) { EPRINTLN("pci slot %d:%d:%d: unknown device \"%s\"", bnum, snum, fnum, emul); goto done; } snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bnum, snum, fnum); nvl = find_config_node(node_name); if (nvl != NULL) { EPRINTLN("pci slot %d:%d:%d already occupied!", bnum, snum, fnum); goto done; } nvl = create_config_node(node_name); if (pde->pe_alias != NULL) set_config_value_node(nvl, "device", pde->pe_alias); else set_config_value_node(nvl, "device", pde->pe_emu); if (pde->pe_legacy_config != NULL) error = pde->pe_legacy_config(nvl, config); else error = pci_parse_legacy_config(nvl, config); done: free(str); return (error); } void pci_print_supported_devices(void) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; printf("%s\n", pdp->pe_emu); } } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char 
*)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accommodate reads from * ddb. */ if (size != 1 && size != 4 && size != 8) return (retval); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned reads */ if ((msix_entry_offset % size) != 0) { return (retval); } tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index < pi->pi_msix.table_count) { /* valid MSI-X Table access */ dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 1) retval = *((uint8_t *)dest); else if (size == 4) retval = *((uint32_t *)dest); else retval = *((uint64_t *)dest); } else if (pci_valid_pba_offset(pi, offset)) { /* return 0 for PBA access */ retval = 0; } return (retval); } int pci_msix_table_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.table_bar); else return (-1); } int pci_msix_pba_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.pba_bar); else return (-1); } static int pci_emul_io_handler(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int i; assert(port >= 0); for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && (uint64_t)port >= pdi->pi_bar[i].addr && (uint64_t)port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) *eax = (*pe->pe_barread)(pdi, i, offset, bytes); else (*pe->pe_barwrite)(pdi, i, offset, bytes, *eax); return (0); } } return (-1); } static int -pci_emul_mem_handler(struct vmctx *ctx __unused, int vcpu __unused, int dir, +pci_emul_mem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int) arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || pdi->pi_bar[bidx].type == PCIBAR_MEM64); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_WRITE) { if (size == 8) { (*pe->pe_barwrite)(pdi, bidx, offset, 4, *val & 0xffffffff); (*pe->pe_barwrite)(pdi, bidx, offset + 4, 4, *val >> 32); } else { (*pe->pe_barwrite)(pdi, bidx, offset, size, *val); } } else { if (size == 8) { *val = (*pe->pe_barread)(pdi, bidx, offset, 4); *val |= (*pe->pe_barread)(pdi, bidx, offset + 4, 4) << 32; } else { *val = (*pe->pe_barread)(pdi, bidx, offset, size); } } return (0); } static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { uint64_t base; assert((size & (size - 1)) == 0); /* must be a power of 2 */ base = roundup2(*baseptr, size); if (base + size <= limit) { *addr = base; *baseptr = base + size; return (0); } else return (-1); } /* * Register (or unregister) the MMIO or I/O region associated with the BAR * register 'idx' of an emulated pci device. 
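* For a memory BAR this means handing pci_emul_mem_handler() to register_mem(), so guest accesses to [addr, addr + size) reach the device model's pe_barread/pe_barwrite callbacks; I/O BARs are wired up through register_inout() instead.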
*/ static void modify_bar_registration(struct pci_devinst *pi, int idx, int registration) { struct pci_devemu *pe; int error; struct inout_port iop; struct mem_range mr; pe = pi->pi_d; switch (pi->pi_bar[idx].type) { case PCIBAR_IO: bzero(&iop, sizeof(struct inout_port)); iop.name = pi->pi_name; iop.port = pi->pi_bar[idx].addr; iop.size = pi->pi_bar[idx].size; if (registration) { iop.flags = IOPORT_F_INOUT; iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); } else error = unregister_inout(&iop); if (pe->pe_baraddr != NULL) (*pe->pe_baraddr)(pi, idx, registration, pi->pi_bar[idx].addr); break; case PCIBAR_MEM32: case PCIBAR_MEM64: bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_mem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); if (pe->pe_baraddr != NULL) (*pe->pe_baraddr)(pi, idx, registration, pi->pi_bar[idx].addr); break; case PCIBAR_ROM: error = 0; if (pe->pe_baraddr != NULL) (*pe->pe_baraddr)(pi, idx, registration, pi->pi_bar[idx].addr); break; default: error = EINVAL; break; } assert(error == 0); } static void unregister_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 0); } static void register_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 1); } /* Is the ROM enabled for the emulated pci device? */ static int romen(struct pci_devinst *pi) { return (pi->pi_bar[PCI_ROM_IDX].lobits & PCIM_BIOS_ENABLE) == PCIM_BIOS_ENABLE; } /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_PORTEN); } /* Are we decoding memory accesses for the emulated pci device? */ static int memen(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_MEMEN); } /* * Update the MMIO or I/O address that is decoded by the BAR register. * * If the pci device has enabled the address space decoding then intercept * the address range decoded by the BAR register. */ static void update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; if (pi->pi_bar[idx].type == PCIBAR_IO) decode = porten(pi); else decode = memen(pi); if (decode) unregister_bar(pi, idx); switch (type) { case PCIBAR_IO: case PCIBAR_MEM32: pi->pi_bar[idx].addr = addr; break; case PCIBAR_MEM64: pi->pi_bar[idx].addr &= ~0xffffffffUL; pi->pi_bar[idx].addr |= addr; break; case PCIBAR_MEMHI64: pi->pi_bar[idx].addr &= 0xffffffff; pi->pi_bar[idx].addr |= addr; break; default: assert(0); } if (decode) register_bar(pi, idx); } int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { assert((type == PCIBAR_ROM) || (idx >= 0 && idx <= PCI_BARMAX)); assert((type != PCIBAR_ROM) || (idx == PCI_ROM_IDX)); if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ /* Enforce minimum BAR sizes required by the PCI standard */ if (type == PCIBAR_IO) { if (size < 4) size = 4; } else if (type == PCIBAR_ROM) { if (size < ~PCIM_BIOS_ADDR_MASK + 1) size = ~PCIM_BIOS_ADDR_MASK + 1; } else { if (size < 16) size = 16; } /* * To reduce fragmentation of the MMIO space, we allocate the BARs by * size. Therefore, don't allocate the BAR yet. We create a list of all * BAR allocation which is sorted by BAR size. 
When all PCI devices are * initialized, we will assign an address to the BARs. */ /* create a new list entry */ struct pci_bar_allocation *const new_bar = malloc(sizeof(*new_bar)); memset(new_bar, 0, sizeof(*new_bar)); new_bar->pdi = pdi; new_bar->idx = idx; new_bar->type = type; new_bar->size = size; /* * Search for a BAR which size is lower than the size of our newly * allocated BAR. */ struct pci_bar_allocation *bar = NULL; TAILQ_FOREACH(bar, &pci_bars, chain) { if (bar->size < size) { break; } } if (bar == NULL) { /* * Either the list is empty or new BAR is the smallest BAR of * the list. Append it to the end of our list. */ TAILQ_INSERT_TAIL(&pci_bars, new_bar, chain); } else { /* * The found BAR is smaller than our new BAR. For that reason, * insert our new BAR before the found BAR. */ TAILQ_INSERT_BEFORE(bar, new_bar, chain); } /* * pci_passthru devices synchronize their physical and virtual command * register on init. For that reason, the virtual cmd reg should be * updated as early as possible. */ uint16_t enbit = 0; switch (type) { case PCIBAR_IO: enbit = PCIM_CMD_PORTEN; break; case PCIBAR_MEM64: case PCIBAR_MEM32: enbit = PCIM_CMD_MEMEN; break; default: enbit = 0; break; } const uint16_t cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND); pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit); return (0); } static int pci_emul_assign_bar(struct pci_devinst *const pdi, const int idx, const enum pcibar_type type, const uint64_t size) { int error; uint64_t *baseptr, limit, addr, mask, lobits, bar; switch (type) { case PCIBAR_NONE: baseptr = NULL; addr = mask = lobits = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; break; case PCIBAR_MEM64: /* * XXX * Some drivers do not work well if the 64-bit BAR is allocated * above 4GB. Allow for this by allocating small requests under * 4GB unless then allocation size is larger than some arbitrary * number (128MB currently). */ if (size > 128 * 1024 * 1024) { baseptr = &pci_emul_membase64; limit = pci_emul_memlim64; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; break; case PCIBAR_ROM: /* do not claim memory for ROM. OVMF will do it for us. 
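* The ROM BAR therefore starts out at address 0 and is only registered for interception once the guest enables it through the expansion ROM enable bit.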
*/ baseptr = NULL; limit = 0; mask = PCIM_BIOS_ADDR_MASK; lobits = 0; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); } if (baseptr != NULL) { error = pci_emul_alloc_resource(baseptr, limit, size, &addr); if (error != 0) return (error); } else { addr = 0; } pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; /* * passthru devices are using same lobits as physical device they set * this property */ if (pdi->pi_bar[idx].lobits != 0) { lobits = pdi->pi_bar[idx].lobits; } else { pdi->pi_bar[idx].lobits = lobits; } /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); if (type == PCIBAR_MEM64) { assert(idx + 1 <= PCI_BARMAX); pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } if (type != PCIBAR_ROM) { register_bar(pdi, idx); } return (0); } int pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size, void **const addr) { /* allocate ROM space once on first call */ if (pci_emul_rombase == 0) { pci_emul_rombase = vm_create_devmem(pdi->pi_vmctx, VM_PCIROM, "pcirom", PCI_EMUL_ROMSIZE); if (pci_emul_rombase == MAP_FAILED) { warnx("%s: failed to create rom segment", __func__); return (-1); } pci_emul_romlim = pci_emul_rombase + PCI_EMUL_ROMSIZE; pci_emul_romoffset = 0; } /* ROM size should be a power of 2 and greater than 2 KB */ const uint64_t rom_size = MAX(1UL << flsl(size), ~PCIM_BIOS_ADDR_MASK + 1); /* check if ROM fits into ROM space */ if (pci_emul_romoffset + rom_size > PCI_EMUL_ROMSIZE) { warnx("%s: no space left in rom segment:", __func__); warnx("%16lu bytes left", PCI_EMUL_ROMSIZE - pci_emul_romoffset); warnx("%16lu bytes required by %d/%d/%d", rom_size, pdi->pi_bus, pdi->pi_slot, pdi->pi_func); return (-1); } /* allocate ROM BAR */ const int error = pci_emul_alloc_bar(pdi, PCI_ROM_IDX, PCIBAR_ROM, rom_size); if (error) return error; /* return address */ *addr = pci_emul_rombase + pci_emul_romoffset; /* save offset into ROM Space */ pdi->pi_romoffset = pci_emul_romoffset; /* increase offset for next ROM */ pci_emul_romoffset += rom_size; return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * pci_emul_finddev(const char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, 
sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pdi->pi_lintr.pirq_pin = 0; pdi->pi_lintr.ioapic_irq = 0; pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(pdi, fi->fi_config); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. * Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } static void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off; off = offset - capoff; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val 
& PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } static void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. */ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; } CFGWRITE(pi, offset, val, bytes); msgctrl = pci_get_cfgdata16(pi, capoff + 2); addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } static void pciecap_cfgwrite(struct pci_devinst *pi, int capoff __unused, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; bzero(&pciecap, sizeof(pciecap)); /* * Use the integrated endpoint type for endpoints on a root complex bus. * * NB: bhyve currently only supports a single PCI bus that is the root * complex bus, so all endpoints are integrated. */ if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0)) type = PCIEM_TYPE_ROOT_INT_EP; pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | type; if (type != PCIEM_TYPE_ROOT_INT_EP) { pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ } err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the * config space. A capoff parameter of zero will force a search for the * offset and type. */ void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val, uint8_t capoff, int capid) { uint8_t nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; if (capoff == 0) { /* Find the capability that we want to update */ capoff = CAP_START_OFFSET; while (1) { nextoff = pci_get_cfgdata8(pi, capoff + 1); if (nextoff == 0) break; if (offset >= capoff && offset < nextoff) break; capoff = nextoff; } assert(offset >= capoff); capid = pci_get_cfgdata8(pi, capoff); } /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. 
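* E.g. an aligned 4-byte write of 0xAABBCC05 at 'capoff' becomes a 2-byte write of 0xAABB at 'capoff + 2', leaving the capability ID and next-pointer bytes untouched.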
*/ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int -pci_emul_fallback_handler(struct vmctx *ctx __unused, int vcpu __unused, - int dir, uint64_t addr __unused, int size __unused, uint64_t *val, +pci_emul_fallback_handler(struct vcpu *vcpu __unused, int dir, + uint64_t addr __unused, int size __unused, uint64_t *val, void *arg1 __unused, long arg2 __unused) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. */ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } static int -pci_emul_ecfg_handler(struct vmctx *ctx __unused, int vcpu __unused, int dir, - uint64_t addr, int bytes, uint64_t *val, void *arg1 __unused, - long arg2 __unused) +pci_emul_ecfg_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, + int bytes, uint64_t *val, void *arg1 __unused, long arg2 __unused) { int bus, slot, func, coff, in; coff = addr & 0xfff; func = (addr >> 12) & 0x7; slot = (addr >> 15) & 0x1f; bus = (addr >> 20) & 0xff; in = (dir == MEM_F_READ); if (in) *val = ~0UL; pci_cfgrw(in, bus, slot, func, coff, bytes, (uint32_t *)val); return (0); } uint64_t pci_ecfg_base(void) { return (PCI_EMUL_ECFG_BASE); } #define BUSIO_ROUNDUP 32 #define BUSMEM32_ROUNDUP (1024 * 1024) #define BUSMEM64_ROUNDUP (512 * 1024 * 1024) int init_pci(struct vmctx *ctx) { char node_name[sizeof("pci.XXX.XX.X")]; struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; nvlist_t *nvl; const char *emul; size_t lowmem; int bus, slot, func; int error; if (vm_get_lowmem_limit(ctx) > PCI_EMUL_MEMBASE32) errx(EX_OSERR, "Invalid lowmem limit"); pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = PCI_EMUL_MEMBASE32; pci_emul_membase64 = 4*GB + vm_get_highmem_size(ctx); pci_emul_membase64 = roundup2(pci_emul_membase64, PCI_EMUL_MEMSIZE64); pci_emul_memlim64 = pci_emul_membase64 + PCI_EMUL_MEMSIZE64; for (bus = 0; bus < MAXBUSES; bus++) { snprintf(node_name, sizeof(node_name), "pci.%d", bus); nvl = find_config_node(node_name); if (nvl == NULL) continue; pci_businfo[bus] = calloc(1, sizeof(struct businfo)); bi = pci_businfo[bus]; /* * Keep track of the i/o and memory resources allocated to * this bus. 
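* The base and limit values recorded here are what pci_bus_write_dsdt() later advertises to the guest in this bus's _CRS resource template.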
*/ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; /* first run: init devices */ for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bus, slot, func); nvl = find_config_node(node_name); if (nvl == NULL) continue; fi->fi_config = nvl; emul = get_config_value_node(nvl, "device"); if (emul == NULL) { EPRINTLN("pci slot %d:%d:%d: missing " "\"device\" value", bus, slot, func); return (EINVAL); } pde = pci_emul_finddev(emul); if (pde == NULL) { EPRINTLN("pci slot %d:%d:%d: unknown " "device \"%s\"", bus, slot, func, emul); return (EINVAL); } if (pde->pe_alias != NULL) { EPRINTLN("pci slot %d:%d:%d: legacy " "device \"%s\", use \"%s\" instead", bus, slot, func, emul, pde->pe_alias); return (EINVAL); } fi->fi_pde = pde; error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* second run: assign BARs and free list */ struct pci_bar_allocation *bar; struct pci_bar_allocation *bar_tmp; TAILQ_FOREACH_SAFE(bar, &pci_bars, chain, bar_tmp) { pci_emul_assign_bar(bar->pdi, bar->idx, bar->type, bar->size); free(bar); } TAILQ_INIT(&pci_bars); /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs. */ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM32_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM32_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM64_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM64_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. */ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } lpc_pirq_routed(); /* * The guest physical memory map looks like the following: * [0, lowmem) guest system memory * [lowmem, 0xC0000000) memory hole (may be absent) * [0xC0000000, 0xE0000000) PCI hole (32-bit BAR allocation) * [0xE0000000, 0xF0000000) PCI extended config window * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) */ /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. 
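* This is done by registering pci_emul_fallback_handler over the whole [lowmem, 4GB) hole as an immutable fallback range.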
*/ lowmem = vm_get_lowmem_size(ctx); bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI hole"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = lowmem; mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; mr.handler = pci_emul_fallback_handler; error = register_mem_fallback(&mr); assert(error == 0); /* PCI extended config space */ bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI ECFG"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = PCI_EMUL_ECFG_BASE; mr.size = PCI_EMUL_ECFG_SIZE; mr.handler = pci_emul_ecfg_handler; error = register_mem(&mr); assert(error == 0); return (0); } static void pci_apic_prt_entry(int bus __unused, int slot, int pin, int pirq_pin __unused, int ioapic_irq, void *arg __unused) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus __unused, int slot, int pin, int pirq_pin, int ioapic_irq __unused, void *arg __unused) { char *name; name = lpc_pirq_name(pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. */ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int count, func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. */ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit 
- bi->iobase); dsdt_line(" ,, , TypeStatic)"); /* mmio window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); count = pci_count_lintr(bus); if (count != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); done: dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = 
pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. */ si = &bi->slotinfo[pi->pi_slot]; bestpin = 0; bestcount = si->si_intpins[0].ii_count; for (pin = 1; pin < 4; pin++) { if (si->si_intpins[pin].ii_count < bestcount) { bestpin = pin; bestcount = si->si_intpins[pin].ii_count; } } si->si_intpins[bestpin].ii_count++; pi->pi_lintr.pin = bestpin + 1; pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); } static void pci_lintr_route(struct pci_devinst *pi) { struct businfo *bi; struct intxinfo *ii; if (pi->pi_lintr.pin == 0) return; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; /* * Attempt to allocate an I/O APIC pin for this intpin if one * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi); assert(ii->ii_ioapic_irq > 0); /* * Attempt to allocate a PIRQ pin for this intpin if one is * not yet assigned. */ if (ii->ii_pirq_pin == 0) ii->ii_pirq_pin = pirq_alloc_pin(pi); assert(ii->ii_pirq_pin > 0); pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin)); } void pci_lintr_assert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == IDLE) { if (pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } else pi->pi_lintr.state = PENDING; } pthread_mutex_unlock(&pi->pi_lintr.lock); } void pci_lintr_deassert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED) { pi->pi_lintr.state = IDLE; pci_irq_deassert(pi); } else if (pi->pi_lintr.state == PENDING) pi->pi_lintr.state = IDLE; pthread_mutex_unlock(&pi->pi_lintr.lock); } static void pci_lintr_update(struct pci_devinst *pi) { pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { pci_irq_deassert(pi); pi->pi_lintr.state = PENDING; } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); } int pci_count_lintr(int bus) { int count, slot, pin; struct slotinfo *slotinfo; count = 0; if (pci_businfo[bus] != NULL) { for (slot = 0; slot < MAXSLOTS; slot++) { slotinfo = &pci_businfo[bus]->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { if (slotinfo->si_intpins[pin].ii_count != 0) count++; } } } return (count); } void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) { struct businfo *bi; struct slotinfo *si; struct intxinfo *ii; int slot, pin; if ((bi = pci_businfo[bus]) == NULL) return; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { ii = &si->si_intpins[pin]; if (ii->ii_count != 0) cb(bus, slot, pin + 1, ii->ii_pirq_pin, ii->ii_ioapic_irq, arg); } } } /* * Return 1 if the emulated device in 'slot' is a multi-function device. * Return 0 otherwise. 
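* (A slot counts as multi-function when more than one of its functions has a device instance attached.)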
*/ static int pci_emul_is_mfdev(int bus, int slot) { struct businfo *bi; struct slotinfo *si; int f, numfuncs; numfuncs = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; for (f = 0; f < MAXFUNCS; f++) { if (si->si_funcs[f].fi_devi != NULL) { numfuncs++; } } } return (numfuncs > 1); } /* * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on * whether or not is a multi-function being emulated in the pci 'slot'. */ static void pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) { int mfdev; if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { mfdev = pci_emul_is_mfdev(bus, slot); switch (bytes) { case 1: case 2: *rv &= ~PCIM_MFDEV; if (mfdev) { *rv |= PCIM_MFDEV; } break; case 4: *rv &= ~(PCIM_MFDEV << 16); if (mfdev) { *rv |= (PCIM_MFDEV << 16); } break; } } } /* * Update device state in response to changes to the PCI command * register. */ void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old) { int i; uint16_t changed, new; new = pci_get_cfgdata16(pi, PCIR_COMMAND); changed = old ^ new; /* * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. */ for (i = 0; i <= PCI_BARMAX_WITH_ROM; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: break; case PCIBAR_IO: /* I/O address space decoding changed? */ if (changed & PCIM_CMD_PORTEN) { if (new & PCIM_CMD_PORTEN) register_bar(pi, i); else unregister_bar(pi, i); } break; case PCIBAR_ROM: /* skip (un-)register of ROM if it disabled */ if (!romen(pi)) break; /* fallthrough */ case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ if (changed & PCIM_CMD_MEMEN) { if (new & PCIM_CMD_MEMEN) register_bar(pi, i); else unregister_bar(pi, i); } break; default: assert(0); } } /* * If INTx has been unmasked and is pending, assert the * interrupt. */ pci_lintr_update(pi); } static void pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { int rshift; uint32_t cmd, old, readonly; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. * * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are * 'write 1 to clear'. However these bits are not set to '1' by * any device emulation so it is simpler to treat them as readonly. */ rshift = (coff & 0x3) * 8; readonly = 0xFFFFF880 >> rshift; old = CFGREAD(pi, coff, bytes); new &= ~readonly; new |= (old & readonly); CFGWRITE(pi, coff, new, bytes); /* update config */ pci_emul_cmd_changed(pi, cmd); } static void pci_cfgrw(int in, int bus, int slot, int func, int coff, int bytes, uint32_t *eax) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; uint64_t addr, bar, mask; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; pi = si->si_funcs[func].fi_devi; } else pi = NULL; /* * Just return if there is no device at this slot:func or if the * the guest is doing an un-aligned access. */ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || (coff & (bytes - 1)) != 0) { if (in) *eax = 0xffffffff; return; } /* * Ignore all writes beyond the standard config space and return all * ones on reads. */ if (coff >= PCI_REGMAX + 1) { if (in) { *eax = 0xffffffff; /* * Extended capabilities begin at offset 256 in config * space. Absence of extended capabilities is signaled * with all 0s in the extended capability header at * offset 256. 
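* As a result the header dword at offset 0x100 reads as zero, while anything beyond it reads as all ones.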
*/ if (coff <= PCI_REGMAX + 4) *eax = 0x00000000; } return; } pe = pi->pi_d; /* * Config read */ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { needcfg = pe->pe_cfgread(pi, coff, bytes, eax); } else { needcfg = 1; } if (needcfg) *eax = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(pi, coff, bytes, *eax) == 0) return; /* * Special handling for write to BAR and ROM registers */ if (is_pcir_bar(coff) || is_pcir_bios(coff)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) return; if (is_pcir_bar(coff)) { idx = (coff - PCIR_BAR(0)) / 4; } else if (is_pcir_bios(coff)) { idx = PCI_ROM_IDX; } else { errx(4, "%s: invalid BAR offset %d", __func__, coff); } mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *eax & mask; addr &= 0xffff; bar = addr | pi->pi_bar[idx].lobits; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *eax & mask; bar |= pi->pi_bar[idx].lobits; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *eax & mask; bar |= pi->pi_bar[idx].lobits; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = ((uint64_t)*eax << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; case PCIBAR_ROM: addr = bar = *eax & mask; if (memen(pi) && romen(pi)) { unregister_bar(pi, idx); } pi->pi_bar[idx].addr = addr; pi->pi_bar[idx].lobits = *eax & PCIM_BIOS_ENABLE; /* romen could have changed it value */ if (memen(pi) && romen(pi)) { register_bar(pi, idx); } bar |= pi->pi_bar[idx].lobits; break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax, 0, 0); } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { pci_emul_cmdsts_write(pi, coff, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } } } static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; static int pci_emul_cfgaddr(struct vmctx *ctx __unused, int in, int port __unused, int bytes, uint32_t *eax, void *arg __unused) { uint32_t x; if (bytes != 4) { if (in) *eax = (bytes == 2) ? 
0xffff : 0xff; return (0); } if (in) { x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; if (cfgenable) x |= CONF1_ENABLE; *eax = x; } else { x = *eax; cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; cfgoff = (x & PCI_REGMAX) & ~0x03; cfgfunc = (x >> 8) & PCI_FUNCMAX; cfgslot = (x >> 11) & PCI_SLOTMAX; cfgbus = (x >> 16) & PCI_BUSMAX; } return (0); } INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); static int pci_emul_cfgdata(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg __unused) { int coff; assert(bytes == 1 || bytes == 2 || bytes == 4); coff = cfgoff + (port - CONF1_DATA_PORT); if (cfgenable) { pci_cfgrw(in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax); } else { /* Ignore accesses to cfgdata if not enabled by cfgaddr */ if (in) *eax = 0xffffffff; } return (0); } INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #ifdef BHYVE_SNAPSHOT /* * Saves/restores PCI device emulated state. Returns 0 on success. */ static int pci_snapshot_pci_dev(struct vm_snapshot_meta *meta) { struct pci_devinst *pi; int i; int ret; pi = meta->dev_data; SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata), meta, ret, done); for (i = 0; i < (int)nitems(pi->pi_bar); i++) { SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done); } /* Restore MSI-X table. 
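* (The addr, msg_data and vector_control fields of every table entry are saved and restored.)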
*/ for (i = 0; i < pi->pi_msix.table_count; i++) { SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control, meta, ret, done); } done: return (ret); } static int pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde, struct pci_devinst **pdi) { struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; int bus, slot, func; assert(dev_name != NULL); assert(pde != NULL); assert(pdi != NULL); for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_pde == NULL) continue; if (strcmp(dev_name, fi->fi_pde->pe_emu) != 0) continue; *pde = fi->fi_pde; *pdi = fi->fi_devi; return (0); } } } return (EINVAL); } int pci_snapshot(struct vm_snapshot_meta *meta) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(meta->dev_name != NULL); ret = pci_find_slotted_dev(meta->dev_name, &pde, &pdi); if (ret != 0) { fprintf(stderr, "%s: no such name: %s\r\n", __func__, meta->dev_name); memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); return (0); } meta->dev_data = pdi; if (pde->pe_snapshot == NULL) { fprintf(stderr, "%s: not implemented yet for: %s\r\n", __func__, meta->dev_name); return (-1); } ret = pci_snapshot_pci_dev(meta); if (ret != 0) { fprintf(stderr, "%s: failed to snapshot pci dev\r\n", __func__); return (-1); } ret = (*pde->pe_snapshot)(meta); return (ret); } int pci_pause(const char *dev_name) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(dev_name != NULL); ret = pci_find_slotted_dev(dev_name, &pde, &pdi); if (ret != 0) { /* * It is possible to call this function without * checking that the device is inserted first. */ fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); return (0); } if (pde->pe_pause == NULL) { /* The pause/resume functionality is optional. */ fprintf(stderr, "%s: not implemented for: %s\n", __func__, dev_name); return (0); } return (*pde->pe_pause)(pdi); } int pci_resume(const char *dev_name) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(dev_name != NULL); ret = pci_find_slotted_dev(dev_name, &pde, &pdi); if (ret != 0) { /* * It is possible to call this function without * checking that the device is inserted first. */ fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); return (0); } if (pde->pe_resume == NULL) { /* The pause/resume functionality is optional. 
*/ fprintf(stderr, "%s: not implemented for: %s\n", __func__, dev_name); return (0); } return (*pde->pe_resume)(pdi); } #endif #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { uint8_t ioregs[DIOSZ]; uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int pci_emul_dinit(struct pci_devinst *pi, nvlist_t *nvl __unused) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void pci_emul_diow(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } i = baridx - 1; /* 'memregs' index */ if (size == 1) { sc->memregs[i][offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } /* * magic interrupt ?? 
*/ } if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t pci_emul_dior(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; int i; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } i = baridx - 1; /* 'memregs' index */ if (size == 1) { value = sc->memregs[i][offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } #ifdef BHYVE_SNAPSHOT static int pci_emul_snapshot(struct vm_snapshot_meta *meta __unused) { return (0); } #endif static const struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, .pe_barread = pci_emul_dior, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_emul_snapshot, #endif }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c index f42bbbda655f..d4382f6cc20e 100644 --- a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -1,1218 +1,1218 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include "config.h" #include "debug.h" #include "mem.h" #include "pci_passthru.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" #endif #define LEGACY_SUPPORT 1 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 static int pcifd = -1; struct passthru_softc { struct pci_devinst *psc_pi; /* ROM is handled like a BAR */ struct pcibar psc_bar[PCI_BARMAX_WITH_ROM + 1]; struct { int capoff; int msgctrl; int emulated; } psc_msi; struct { int capoff; } psc_msix; struct pcisel psc_sel; }; static int msi_caplen(int msgctrl) { int len; len = 10; /* minimum length of msi capability */ if (msgctrl & PCIM_MSICTRL_64BIT) len += 4; #if 0 /* * Ignore the 'mask' and 'pending' bits in the MSI capability. * We'll let the guest manipulate them directly. */ if (msgctrl & PCIM_MSICTRL_VECTOR) len += 10; #endif return (len); } static int pcifd_init(void) { pcifd = open(_PATH_DEVPCI, O_RDWR, 0); if (pcifd < 0) { warn("failed to open %s", _PATH_DEVPCI); return (1); } #ifndef WITHOUT_CAPSICUM cap_rights_t pcifd_rights; cap_rights_init(&pcifd_rights, CAP_IOCTL, CAP_READ, CAP_WRITE); if (caph_rights_limit(pcifd, &pcifd_rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); const cap_ioctl_t pcifd_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR, PCIOCBARIO, PCIOCBARMMAP, PCIOCGETCONF }; if (caph_ioctls_limit(pcifd, pcifd_ioctls, nitems(pcifd_ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif return (0); } uint32_t read_config(const struct pcisel *sel, long reg, int width) { struct pci_io pi; if (pcifd < 0 && pcifd_init()) { return (0); } bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; if (ioctl(pcifd, PCIOCREAD, &pi) < 0) return (0); /* XXX */ else return (pi.pi_data); } void write_config(const struct pcisel *sel, long reg, int width, uint32_t data) { struct pci_io pi; if (pcifd < 0 && pcifd_init()) { return; } bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; pi.pi_data = data; (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ } #ifdef LEGACY_SUPPORT static int passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) { int capoff; struct msicap msicap; u_char *capdata; pci_populate_msicap(&msicap, msgnum, nextptr); /* * XXX * Copy the msi capability structure in the last 16 bytes of the * config space. This is wrong because it could shadow something * useful to the device. */ capoff = 256 - roundup(sizeof(msicap), 4); capdata = (u_char *)&msicap; for (size_t i = 0; i < sizeof(msicap); i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); return (capoff); } #endif /* LEGACY_SUPPORT */ static int cfginitmsi(struct passthru_softc *sc) { int i, ptr, capptr, cap, sts, caplen, table_size; uint32_t u32; struct pcisel sel; struct pci_devinst *pi; struct msixcap msixcap; char *msixcap_ptr; pi = sc->psc_pi; sel = sc->psc_sel; /* * Parse the capabilities and cache the location of the MSI * and MSI-X capabilities. 
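* The capability list is walked as the standard linked chain in config space: PCIR_CAP_PTR gives the offset of the first capability and each entry's PCICAP_NEXTPTR field links to the next, with 0 (or 0xff) terminating the walk.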
*/ sts = read_config(&sel, PCIR_STATUS, 2); if (sts & PCIM_STATUS_CAPPRESENT) { ptr = read_config(&sel, PCIR_CAP_PTR, 1); while (ptr != 0 && ptr != 0xff) { cap = read_config(&sel, ptr + PCICAP_ID, 1); if (cap == PCIY_MSI) { /* * Copy the MSI capability into the config * space of the emulated pci device */ sc->psc_msi.capoff = ptr; sc->psc_msi.msgctrl = read_config(&sel, ptr + 2, 2); sc->psc_msi.emulated = 0; caplen = msi_caplen(sc->psc_msi.msgctrl); capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; } } else if (cap == PCIY_MSIX) { /* * Copy the MSI-X capability */ sc->psc_msix.capoff = ptr; caplen = 12; msixcap_ptr = (char *)&msixcap; capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); memcpy(msixcap_ptr, &u32, 4); pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; msixcap_ptr += 4; } } ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); } } if (sc->psc_msix.capoff != 0) { pi->pi_msix.pba_bar = msixcap.pba_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.pba_offset = msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_bar = msixcap.table_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.table_offset = msixcap.table_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); /* Allocate the emulated MSI-X table array */ table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* Mask all table entries */ for (i = 0; i < pi->pi_msix.table_count; i++) { pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } } #ifdef LEGACY_SUPPORT /* * If the passthrough device does not support MSI then craft a * MSI capability for it. We link the new MSI capability at the * head of the list of capabilities. 
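* The crafted capability exists only in the emulated config space (psc_msi.emulated is set below), and PCIR_CAP_PTR is rewritten so the guest sees it at the head of the chain.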
*/ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { int origptr, msiptr; origptr = read_config(&sel, PCIR_CAP_PTR, 1); msiptr = passthru_add_msicap(pi, 1, origptr); sc->psc_msi.capoff = msiptr; sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); sc->psc_msi.emulated = 1; pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); } #endif /* Make sure one of the capabilities is present */ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) return (-1); else return (0); } static uint64_t msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *src8; uint16_t *src16; uint32_t *src32; uint64_t *src64; uint64_t data; size_t entry_offset; uint32_t table_offset; int index, table_count; pi = sc->psc_pi; table_offset = pi->pi_msix.table_offset; table_count = pi->pi_msix.table_count; if (offset < table_offset || offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) { switch (size) { case 1: src8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset); data = *src8; break; case 2: src16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset); data = *src16; break; case 4: src32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset); data = *src32; break; case 8: src64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset); data = *src64; break; default: return (-1); } return (data); } offset -= table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; assert(index < table_count); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; switch (size) { case 1: src8 = (uint8_t *)((uint8_t *)entry + entry_offset); data = *src8; break; case 2: src16 = (uint16_t *)((uint8_t *)entry + entry_offset); data = *src16; break; case 4: src32 = (uint32_t *)((uint8_t *)entry + entry_offset); data = *src32; break; case 8: src64 = (uint64_t *)((uint8_t *)entry + entry_offset); data = *src64; break; default: return (-1); } return (data); } static void msix_table_write(struct passthru_softc *sc, uint64_t offset, int size, uint64_t data) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *dest8; uint16_t *dest16; uint32_t *dest32; uint64_t *dest64; size_t entry_offset; uint32_t table_offset, vector_control; int index, table_count; pi = sc->psc_pi; table_offset = pi->pi_msix.table_offset; table_count = pi->pi_msix.table_count; if (offset < table_offset || offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) { switch (size) { case 1: dest8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset); *dest8 = data; break; case 2: dest16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset); *dest16 = data; break; case 4: dest32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset); *dest32 = data; break; case 8: dest64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset); *dest64 = data; break; } return; } offset -= table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; assert(index < table_count); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* Only 4 byte naturally-aligned writes are supported */ assert(size == 4); assert(entry_offset % 4 == 0); vector_control = entry->vector_control; dest32 = (uint32_t *)((uint8_t *)entry + entry_offset); *dest32 = data; /* If MSI-X hasn't been enabled, do nothing */ if (pi->pi_msix.enabled) { /* If the entry is masked, don't set it up */ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { - (void)vm_setup_pptdev_msix(sc->psc_pi->pi_vmctx, 0, + (void)vm_setup_pptdev_msix(sc->psc_pi->pi_vmctx, 
sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, index, entry->addr, entry->msg_data, entry->vector_control); } } } static int init_msix_table(struct passthru_softc *sc) { struct pci_devinst *pi = sc->psc_pi; struct pci_bar_mmap pbm; int b, s, f; uint32_t table_size, table_offset; assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); b = sc->psc_sel.pc_bus; s = sc->psc_sel.pc_dev; f = sc->psc_sel.pc_func; /* * Map the region of the BAR containing the MSI-X table. This is * necessary for two reasons: * 1. The PBA may reside in the first or last page containing the MSI-X * table. * 2. While PCI devices are not supposed to use the page(s) containing * the MSI-X table for other purposes, some do in practice. */ memset(&pbm, 0, sizeof(pbm)); pbm.pbm_sel = sc->psc_sel; pbm.pbm_flags = PCIIO_BAR_MMAP_RW; pbm.pbm_reg = PCIR_BAR(pi->pi_msix.table_bar); pbm.pbm_memattr = VM_MEMATTR_DEVICE; if (ioctl(pcifd, PCIOCBARMMAP, &pbm) != 0) { warn("Failed to map MSI-X table BAR on %d/%d/%d", b, s, f); return (-1); } assert(pbm.pbm_bar_off == 0); pi->pi_msix.mapped_addr = (uint8_t *)(uintptr_t)pbm.pbm_map_base; pi->pi_msix.mapped_size = pbm.pbm_map_length; table_offset = rounddown2(pi->pi_msix.table_offset, 4096); table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); /* * Unmap any pages not containing the table, we do not need to emulate * accesses to them. Avoid releasing address space to help ensure that * a buggy out-of-bounds access causes a crash. */ if (table_offset != 0) if (mprotect(pi->pi_msix.mapped_addr, table_offset, PROT_NONE) != 0) warn("Failed to unmap MSI-X table BAR region"); if (table_offset + table_size != pi->pi_msix.mapped_size) if (mprotect( pi->pi_msix.mapped_addr + table_offset + table_size, pi->pi_msix.mapped_size - (table_offset + table_size), PROT_NONE) != 0) warn("Failed to unmap MSI-X table BAR region"); return (0); } static int cfginitbar(struct passthru_softc *sc) { int i, error; struct pci_devinst *pi; struct pci_bar_io bar; enum pcibar_type bartype; uint64_t base, size; pi = sc->psc_pi; /* * Initialize BAR registers */ for (i = 0; i <= PCI_BARMAX; i++) { bzero(&bar, sizeof(bar)); bar.pbi_sel = sc->psc_sel; bar.pbi_reg = PCIR_BAR(i); if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) continue; if (PCI_BAR_IO(bar.pbi_base)) { bartype = PCIBAR_IO; base = bar.pbi_base & PCIM_BAR_IO_BASE; } else { switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { case PCIM_BAR_MEM_64: bartype = PCIBAR_MEM64; break; default: bartype = PCIBAR_MEM32; break; } base = bar.pbi_base & PCIM_BAR_MEM_BASE; } size = bar.pbi_length; if (bartype != PCIBAR_IO) { if (((base | size) & PAGE_MASK) != 0) { warnx("passthru device %d/%d/%d BAR %d: " "base %#lx or size %#lx not page aligned\n", sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, base, size); return (-1); } } /* Cache information about the "real" BAR */ sc->psc_bar[i].type = bartype; sc->psc_bar[i].size = size; sc->psc_bar[i].addr = base; sc->psc_bar[i].lobits = 0; /* Allocate the BAR in the guest I/O or MMIO space */ error = pci_emul_alloc_bar(pi, i, bartype, size); if (error) return (-1); /* Use same lobits as physical bar */ uint8_t lobits = read_config(&sc->psc_sel, PCIR_BAR(i), 0x01); if (bartype == PCIBAR_MEM32 || bartype == PCIBAR_MEM64) { lobits &= ~PCIM_BAR_MEM_BASE; } else { lobits &= ~PCIM_BAR_IO_BASE; } sc->psc_bar[i].lobits = lobits; pi->pi_bar[i].lobits = lobits; /* * 64-bit BAR takes up two slots so skip the next 
one. */ if (bartype == PCIBAR_MEM64) { i++; assert(i <= PCI_BARMAX); sc->psc_bar[i].type = PCIBAR_MEMHI64; } } return (0); } static int cfginit(struct pci_devinst *pi, int bus, int slot, int func) { int error; struct passthru_softc *sc; error = 1; sc = pi->pi_arg; bzero(&sc->psc_sel, sizeof(struct pcisel)); sc->psc_sel.pc_bus = bus; sc->psc_sel.pc_dev = slot; sc->psc_sel.pc_func = func; if (cfginitmsi(sc) != 0) { warnx("failed to initialize MSI for PCI %d/%d/%d", bus, slot, func); goto done; } if (cfginitbar(sc) != 0) { warnx("failed to initialize BARs for PCI %d/%d/%d", bus, slot, func); goto done; } write_config(&sc->psc_sel, PCIR_COMMAND, 2, pci_get_cfgdata16(pi, PCIR_COMMAND)); /* * We need to do this after PCIR_COMMAND got possibly updated, e.g., * a BAR was enabled, as otherwise the PCIOCBARMMAP might fail on us. */ if (pci_msix_table_bar(pi) >= 0) { error = init_msix_table(sc); if (error != 0) { warnx( "failed to initialize MSI-X table for PCI %d/%d/%d: %d", bus, slot, func, error); goto done; } } error = 0; /* success */ done: return (error); } static int passthru_legacy_config(nvlist_t *nvl, const char *opts) { const char *cp; char *tofree; char value[16]; int bus, slot, func; if (opts == NULL) return (0); cp = strchr(opts, ','); if (strncmp(opts, "ppt", strlen("ppt")) == 0) { tofree = strndup(opts, cp - opts); set_config_value_node(nvl, "pptdev", tofree); free(tofree); } else if (sscanf(opts, "pci0:%d:%d:%d", &bus, &slot, &func) == 3 || sscanf(opts, "pci%d:%d:%d", &bus, &slot, &func) == 3 || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) == 3) { snprintf(value, sizeof(value), "%d", bus); set_config_value_node(nvl, "bus", value); snprintf(value, sizeof(value), "%d", slot); set_config_value_node(nvl, "slot", value); snprintf(value, sizeof(value), "%d", func); set_config_value_node(nvl, "func", value); } else { EPRINTLN("passthru: invalid options \"%s\"", opts); return (-1); } if (cp == NULL) { return (0); } return (pci_parse_legacy_config(nvl, cp + 1)); } static int passthru_init_rom(struct passthru_softc *const sc, const char *const romfile) { if (romfile == NULL) { return (0); } const int fd = open(romfile, O_RDONLY); if (fd < 0) { warnx("%s: can't open romfile \"%s\"", __func__, romfile); return (-1); } struct stat sbuf; if (fstat(fd, &sbuf) < 0) { warnx("%s: can't fstat romfile \"%s\"", __func__, romfile); close(fd); return (-1); } const uint64_t rom_size = sbuf.st_size; void *const rom_data = mmap(NULL, rom_size, PROT_READ, MAP_SHARED, fd, 0); if (rom_data == MAP_FAILED) { warnx("%s: unable to mmap romfile \"%s\" (%d)", __func__, romfile, errno); close(fd); return (-1); } void *rom_addr; int error = pci_emul_alloc_rom(sc->psc_pi, rom_size, &rom_addr); if (error) { warnx("%s: failed to alloc rom segment", __func__); munmap(rom_data, rom_size); close(fd); return (error); } memcpy(rom_addr, rom_data, rom_size); sc->psc_bar[PCI_ROM_IDX].type = PCIBAR_ROM; sc->psc_bar[PCI_ROM_IDX].addr = (uint64_t)rom_addr; sc->psc_bar[PCI_ROM_IDX].size = rom_size; munmap(rom_data, rom_size); close(fd); return (0); } static bool passthru_lookup_pptdev(const char *name, int *bus, int *slot, int *func) { struct pci_conf_io pc; struct pci_conf conf[1]; struct pci_match_conf patterns[1]; char *cp; bzero(&pc, sizeof(struct pci_conf_io)); pc.match_buf_len = sizeof(conf); pc.matches = conf; bzero(&patterns, sizeof(patterns)); /* * The pattern structure requires the unit to be split out from * the driver name. Walk backwards from the end of the name to * find the start of the unit. 
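* For example, a device name such as "ppt3" would be split into pd_name "ppt" and pd_unit 3.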
*/ cp = strchr(name, '\0'); assert(cp != NULL); while (cp != name && isdigit(cp[-1])) cp--; if (cp == name || !isdigit(*cp)) { EPRINTLN("Invalid passthru device name %s", name); return (false); } if ((size_t)(cp - name) + 1 > sizeof(patterns[0].pd_name)) { EPRINTLN("Passthru device name %s is too long", name); return (false); } memcpy(patterns[0].pd_name, name, cp - name); patterns[0].pd_unit = strtol(cp, &cp, 10); if (*cp != '\0') { EPRINTLN("Invalid passthru device name %s", name); return (false); } patterns[0].flags = PCI_GETCONF_MATCH_NAME | PCI_GETCONF_MATCH_UNIT; pc.num_patterns = 1; pc.pat_buf_len = sizeof(patterns); pc.patterns = patterns; if (ioctl(pcifd, PCIOCGETCONF, &pc) == -1) { EPRINTLN("ioctl(PCIOCGETCONF): %s", strerror(errno)); return (false); } if (pc.status != PCI_GETCONF_LAST_DEVICE && pc.status != PCI_GETCONF_MORE_DEVS) { EPRINTLN("error returned from PCIOCGETCONF ioctl"); return (false); } if (pc.num_matches == 0) { EPRINTLN("Passthru device %s not found", name); return (false); } if (conf[0].pc_sel.pc_domain != 0) { EPRINTLN("Passthru device %s on unsupported domain", name); return (false); } *bus = conf[0].pc_sel.pc_bus; *slot = conf[0].pc_sel.pc_dev; *func = conf[0].pc_sel.pc_func; return (true); } static int passthru_init(struct pci_devinst *pi, nvlist_t *nvl) { int bus, slot, func, error, memflags; struct passthru_softc *sc; const char *value; sc = NULL; error = 1; memflags = vm_get_memflags(pi->pi_vmctx); if (!(memflags & VM_MEM_F_WIRED)) { warnx("passthru requires guest memory to be wired"); return (error); } if (pcifd < 0 && pcifd_init()) { return (error); } #define GET_INT_CONFIG(var, name) do { \ value = get_config_value_node(nvl, name); \ if (value == NULL) { \ EPRINTLN("passthru: missing required %s setting", name); \ return (error); \ } \ var = atoi(value); \ } while (0) value = get_config_value_node(nvl, "pptdev"); if (value != NULL) { if (!passthru_lookup_pptdev(value, &bus, &slot, &func)) return (error); } else { GET_INT_CONFIG(bus, "bus"); GET_INT_CONFIG(slot, "slot"); GET_INT_CONFIG(func, "func"); } if (vm_assign_pptdev(pi->pi_vmctx, bus, slot, func) != 0) { warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", bus, slot, func); goto done; } sc = calloc(1, sizeof(struct passthru_softc)); pi->pi_arg = sc; sc->psc_pi = pi; /* initialize config space */ if ((error = cfginit(pi, bus, slot, func)) != 0) goto done; /* initialize ROM */ if ((error = passthru_init_rom(sc, get_config_value_node(nvl, "rom"))) != 0) goto done; error = 0; /* success */ done: if (error) { free(sc); vm_unassign_pptdev(pi->pi_vmctx, bus, slot, func); } return (error); } static int bar_access(int coff) { if ((coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) || coff == PCIR_BIOS) return (1); else return (0); } static int msicap_access(struct passthru_softc *sc, int coff) { int caplen; if (sc->psc_msi.capoff == 0) return (0); caplen = msi_caplen(sc->psc_msi.msgctrl); if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) return (1); else return (0); } static int msixcap_access(struct passthru_softc *sc, int coff) { if (sc->psc_msix.capoff == 0) return (0); return (coff >= sc->psc_msix.capoff && coff < sc->psc_msix.capoff + MSIX_CAPLEN); } static int passthru_cfgread(struct pci_devinst *pi, int coff, int bytes, uint32_t *rv) { struct passthru_softc *sc; sc = pi->pi_arg; /* * PCI BARs and MSI capability is emulated. 
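* Returning -1 below defers these accesses to the generic PCI config emulation, which services them from the emulated config space rather than the physical device.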
*/ if (bar_access(coff) || msicap_access(sc, coff) || msixcap_access(sc, coff)) return (-1); #ifdef LEGACY_SUPPORT /* * Emulate PCIR_CAP_PTR if this device does not support MSI capability * natively. */ if (sc->psc_msi.emulated) { if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) return (-1); } #endif /* * Emulate the command register. If a single read reads both the * command and status registers, read the status register from the * device's config space. */ if (coff == PCIR_COMMAND) { if (bytes <= 2) return (-1); *rv = read_config(&sc->psc_sel, PCIR_STATUS, 2) << 16 | pci_get_cfgdata16(pi, PCIR_COMMAND); return (0); } /* Everything else just read from the device's config space */ *rv = read_config(&sc->psc_sel, coff, bytes); return (0); } static int passthru_cfgwrite(struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int error, msix_table_entries, i; struct passthru_softc *sc; uint16_t cmd_old; sc = pi->pi_arg; /* * PCI BARs are emulated */ if (bar_access(coff)) return (-1); /* * MSI capability is emulated */ if (msicap_access(sc, coff)) { pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff, PCIY_MSI); - error = vm_setup_pptdev_msi(pi->pi_vmctx, 0, sc->psc_sel.pc_bus, + error = vm_setup_pptdev_msi(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum); if (error != 0) err(1, "vm_setup_pptdev_msi"); return (0); } if (msixcap_access(sc, coff)) { pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff, PCIY_MSIX); if (pi->pi_msix.enabled) { msix_table_entries = pi->pi_msix.table_count; for (i = 0; i < msix_table_entries; i++) { - error = vm_setup_pptdev_msix(pi->pi_vmctx, 0, + error = vm_setup_pptdev_msix(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, pi->pi_msix.table[i].addr, pi->pi_msix.table[i].msg_data, pi->pi_msix.table[i].vector_control); if (error) err(1, "vm_setup_pptdev_msix"); } } else { error = vm_disable_pptdev_msix(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func); if (error) err(1, "vm_disable_pptdev_msix"); } return (0); } #ifdef LEGACY_SUPPORT /* * If this device does not support MSI natively then we cannot let * the guest disable legacy interrupts from the device. It is the * legacy interrupt that is triggering the virtual MSI to the guest. 
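* To enforce that, the code below clears PCIM_CMD_INTxDIS from guest writes to the command register before passing them through to the device.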
*/ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { if (coff == PCIR_COMMAND && bytes == 2) val &= ~PCIM_CMD_INTxDIS; } #endif write_config(&sc->psc_sel, coff, bytes, val); if (coff == PCIR_COMMAND) { cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND); if (bytes == 1) pci_set_cfgdata8(pi, PCIR_COMMAND, val); else if (bytes == 2) pci_set_cfgdata16(pi, PCIR_COMMAND, val); pci_emul_cmd_changed(pi, cmd_old); } return (0); } static void passthru_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct passthru_softc *sc; struct pci_bar_ioreq pio; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { msix_table_write(sc, offset, size, value); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); assert(size == 1 || size == 2 || size == 4); assert(offset <= UINT32_MAX && offset + size <= UINT32_MAX); bzero(&pio, sizeof(pio)); pio.pbi_sel = sc->psc_sel; pio.pbi_op = PCIBARIO_WRITE; pio.pbi_bar = baridx; pio.pbi_offset = (uint32_t)offset; pio.pbi_width = size; pio.pbi_value = (uint32_t)value; (void)ioctl(pcifd, PCIOCBARIO, &pio); } } static uint64_t passthru_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct passthru_softc *sc; struct pci_bar_ioreq pio; uint64_t val; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { val = msix_table_read(sc, offset, size); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); assert(size == 1 || size == 2 || size == 4); assert(offset <= UINT32_MAX && offset + size <= UINT32_MAX); bzero(&pio, sizeof(pio)); pio.pbi_sel = sc->psc_sel; pio.pbi_op = PCIBARIO_READ; pio.pbi_bar = baridx; pio.pbi_offset = (uint32_t)offset; pio.pbi_width = size; (void)ioctl(pcifd, PCIOCBARIO, &pio); val = pio.pbi_value; } return (val); } static void passthru_msix_addr(struct pci_devinst *pi, int baridx, int enabled, uint64_t address) { struct passthru_softc *sc; size_t remaining; uint32_t table_size, table_offset; sc = pi->pi_arg; table_offset = rounddown2(pi->pi_msix.table_offset, 4096); if (table_offset > 0) { if (!enabled) { if (vm_unmap_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, table_offset) != 0) warnx("pci_passthru: unmap_pptdev_mmio failed"); } else { if (vm_map_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, table_offset, sc->psc_bar[baridx].addr) != 0) warnx("pci_passthru: map_pptdev_mmio failed"); } } table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); remaining = pi->pi_bar[baridx].size - table_offset - table_size; if (remaining > 0) { address += table_offset + table_size; if (!enabled) { if (vm_unmap_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, remaining) != 0) warnx("pci_passthru: unmap_pptdev_mmio failed"); } else { if (vm_map_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, remaining, sc->psc_bar[baridx].addr + table_offset + table_size) != 0) warnx("pci_passthru: map_pptdev_mmio failed"); } } } static void passthru_mmio_addr(struct pci_devinst *pi, int baridx, int enabled, uint64_t address) { struct passthru_softc *sc; sc = pi->pi_arg; if (!enabled) { if (vm_unmap_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, sc->psc_bar[baridx].size) != 0) warnx("pci_passthru: unmap_pptdev_mmio failed"); } else { if (vm_map_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus, 
sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, sc->psc_bar[baridx].size, sc->psc_bar[baridx].addr) != 0) warnx("pci_passthru: map_pptdev_mmio failed"); } } static void passthru_addr_rom(struct pci_devinst *const pi, const int idx, const int enabled) { const uint64_t addr = pi->pi_bar[idx].addr; const uint64_t size = pi->pi_bar[idx].size; if (!enabled) { if (vm_munmap_memseg(pi->pi_vmctx, addr, size) != 0) { errx(4, "%s: munmap_memseg @ [%016lx - %016lx] failed", __func__, addr, addr + size); } } else { if (vm_mmap_memseg(pi->pi_vmctx, addr, VM_PCIROM, pi->pi_romoffset, size, PROT_READ | PROT_EXEC) != 0) { errx(4, "%s: mmap_memseg @ [%016lx - %016lx] failed", __func__, addr, addr + size); } } } static void passthru_addr(struct pci_devinst *pi, int baridx, int enabled, uint64_t address) { switch (pi->pi_bar[baridx].type) { case PCIBAR_IO: /* IO BARs are emulated */ break; case PCIBAR_ROM: passthru_addr_rom(pi, baridx, enabled); break; case PCIBAR_MEM32: case PCIBAR_MEM64: if (baridx == pci_msix_table_bar(pi)) passthru_msix_addr(pi, baridx, enabled, address); else passthru_mmio_addr(pi, baridx, enabled, address); break; default: errx(4, "%s: invalid BAR type %d", __func__, pi->pi_bar[baridx].type); } } static const struct pci_devemu passthru = { .pe_emu = "passthru", .pe_init = passthru_init, .pe_legacy_config = passthru_legacy_config, .pe_cfgwrite = passthru_cfgwrite, .pe_cfgread = passthru_cfgread, .pe_barwrite = passthru_write, .pe_barread = passthru_read, .pe_baraddr = passthru_addr, }; PCI_EMUL_SET(passthru); diff --git a/usr.sbin/bhyve/pctestdev.c b/usr.sbin/bhyve/pctestdev.c index 8b810e641bbc..bd62c2fdb003 100644 --- a/usr.sbin/bhyve/pctestdev.c +++ b/usr.sbin/bhyve/pctestdev.c @@ -1,262 +1,262 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Adam Fenn * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* * Emulation of selected legacy test/debug interfaces expected by KVM-unit-tests */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "debug.h" #include "inout.h" #include "mem.h" #include "pctestdev.h" #define DEBUGEXIT_BASE 0xf4 #define DEBUGEXIT_LEN 4 #define DEBUGEXIT_NAME "isa-debug-exit" #define IOMEM_BASE 0xff000000 #define IOMEM_LEN 0x10000 #define IOMEM_NAME "pc-testdev-iomem" #define IOPORT_BASE 0xe0 #define IOPORT_LEN 4 #define IOPORT_NAME "pc-testdev-ioport" #define IRQ_BASE 0x2000 #define IRQ_IOAPIC_PINCOUNT_MIN 24 #define IRQ_IOAPIC_PINCOUNT_MAX 32 #define IRQ_NAME "pc-testdev-irq-line" #define PCTESTDEV_NAME "pc-testdev" static bool pctestdev_inited; static uint8_t pctestdev_iomem_buf[IOMEM_LEN]; static uint32_t pctestdev_ioport_data; static int pctestdev_debugexit_io(struct vmctx *ctx, int in, int port, int bytes, uint32_t *eax, void *arg); -static int pctestdev_iomem_io(struct vmctx *ctx, int vcpu, int dir, +static int pctestdev_iomem_io(struct vcpu *vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2); static int pctestdev_ioport_io(struct vmctx *ctx, int in, int port, int bytes, uint32_t *eax, void *arg); static int pctestdev_irq_io(struct vmctx *ctx, int in, int port, int bytes, uint32_t *eax, void *arg); const char * pctestdev_getname(void) { return (PCTESTDEV_NAME); } int pctestdev_init(struct vmctx *ctx) { struct mem_range iomem; struct inout_port debugexit, ioport, irq; int err, pincount; if (pctestdev_inited) { EPRINTLN("Only one pc-testdev device is allowed."); return (-1); } err = vm_ioapic_pincount(ctx, &pincount); if (err != 0) { EPRINTLN("pc-testdev: Failed to obtain IOAPIC pin count."); return (-1); } if (pincount < IRQ_IOAPIC_PINCOUNT_MIN || pincount > IRQ_IOAPIC_PINCOUNT_MAX) { EPRINTLN("pc-testdev: Unsupported IOAPIC pin count: %d.", pincount); return (-1); } debugexit.name = DEBUGEXIT_NAME; debugexit.port = DEBUGEXIT_BASE; debugexit.size = DEBUGEXIT_LEN; debugexit.flags = IOPORT_F_INOUT; debugexit.handler = pctestdev_debugexit_io; debugexit.arg = NULL; iomem.name = IOMEM_NAME; iomem.flags = MEM_F_RW | MEM_F_IMMUTABLE; iomem.handler = pctestdev_iomem_io; iomem.arg1 = NULL; iomem.arg2 = 0; iomem.base = IOMEM_BASE; iomem.size = IOMEM_LEN; ioport.name = IOPORT_NAME; ioport.port = IOPORT_BASE; ioport.size = IOPORT_LEN; ioport.flags = IOPORT_F_INOUT; ioport.handler = pctestdev_ioport_io; ioport.arg = NULL; irq.name = IRQ_NAME; irq.port = IRQ_BASE; irq.size = pincount; irq.flags = IOPORT_F_INOUT; irq.handler = pctestdev_irq_io; irq.arg = NULL; err = register_inout(&debugexit); if (err != 0) goto fail; err = register_inout(&ioport); if (err != 0) goto fail_after_debugexit_reg; err = register_inout(&irq); if (err != 0) goto fail_after_ioport_reg; err = register_mem(&iomem); if (err != 0) goto fail_after_irq_reg; pctestdev_inited = true; return (0); fail_after_irq_reg: (void)unregister_inout(&irq); fail_after_ioport_reg: (void)unregister_inout(&ioport); fail_after_debugexit_reg: (void)unregister_inout(&debugexit); fail: return (err); } static int pctestdev_debugexit_io(struct vmctx *ctx __unused, int in, int port __unused, int bytes __unused, uint32_t *eax, void *arg __unused) { if (in) *eax = 0; else exit((*eax << 1) | 1); return (0); } static int -pctestdev_iomem_io(struct vmctx *ctx __unused, int vcpu __unused, int dir, +pctestdev_iomem_io(struct vcpu *vcpu __unused, int dir, uint64_t addr, int size, uint64_t *val, void *arg1 __unused, long arg2 __unused) { 
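/* Accesses to the test iomem window are satisfied from the static pctestdev_iomem_buf[] backing store; anything beyond IOMEM_BASE + IOMEM_LEN is rejected. */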
uint64_t offset; if (addr + size > IOMEM_BASE + IOMEM_LEN) return (-1); offset = addr - IOMEM_BASE; if (dir == MEM_F_READ) { (void)memcpy(val, pctestdev_iomem_buf + offset, size); } else { assert(dir == MEM_F_WRITE); (void)memcpy(pctestdev_iomem_buf + offset, val, size); } return (0); } static int pctestdev_ioport_io(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg __unused) { uint32_t mask; int lsb; if (port + bytes > IOPORT_BASE + IOPORT_LEN) return (-1); lsb = (port & 0x3) * 8; mask = (-1UL >> (32 - (bytes * 8))) << lsb; if (in) *eax = (pctestdev_ioport_data & mask) >> lsb; else { pctestdev_ioport_data &= ~mask; pctestdev_ioport_data |= *eax << lsb; } return (0); } static int pctestdev_irq_io(struct vmctx *ctx, int in, int port, int bytes, uint32_t *eax, void *arg __unused) { int irq; if (bytes != 1) return (-1); if (in) { *eax = 0; return (0); } else { irq = port - IRQ_BASE; if (irq < 16) { if (*eax) return (vm_isa_assert_irq(ctx, irq, irq)); else return (vm_isa_deassert_irq(ctx, irq, irq)); } else { if (*eax) return (vm_ioapic_assert_irq(ctx, irq)); else return (vm_ioapic_deassert_irq(ctx, irq)); } } } diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c index 37aba32a1929..72b63c506c41 100644 --- a/usr.sbin/bhyve/snapshot.c +++ b/usr.sbin/bhyve/snapshot.c @@ -1,1708 +1,1708 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Flavius Anton * Copyright (c) 2016 Mihai Tiganus * Copyright (c) 2016-2019 Mihai Carabas * Copyright (c) 2017-2019 Darius Mihai * Copyright (c) 2017-2019 Elena Mihailescu * Copyright (c) 2018-2019 Sergiu Weisz * All rights reserved. * The bhyve-snapshot feature was developed under sponsorships * from Matthew Grooms. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "debug.h" #include "inout.h" #include "ipc.h" #include "fwctl.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #include "snapshot.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include #include struct spinner_info { const size_t *crtval; const size_t maxval; const size_t total; }; extern int guest_ncpus; static struct winsize winsize; static sig_t old_winch_handler; #define KB (1024UL) #define MB (1024UL * KB) #define GB (1024UL * MB) #define SNAPSHOT_CHUNK (4 * MB) #define PROG_BUF_SZ (8192) #define SNAPSHOT_BUFFER_SIZE (20 * MB) #define JSON_STRUCT_ARR_KEY "structs" #define JSON_DEV_ARR_KEY "devices" #define JSON_BASIC_METADATA_KEY "basic metadata" #define JSON_SNAPSHOT_REQ_KEY "snapshot_req" #define JSON_SIZE_KEY "size" #define JSON_FILE_OFFSET_KEY "file_offset" #define JSON_NCPUS_KEY "ncpus" #define JSON_VMNAME_KEY "vmname" #define JSON_MEMSIZE_KEY "memsize" #define JSON_MEMFLAGS_KEY "memflags" #define min(a,b) \ ({ \ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a < _b ? _a : _b; \ }) static const struct vm_snapshot_dev_info snapshot_devs[] = { { "atkbdc", atkbdc_snapshot, NULL, NULL }, { "virtio-net", pci_snapshot, pci_pause, pci_resume }, { "virtio-blk", pci_snapshot, pci_pause, pci_resume }, { "virtio-rnd", pci_snapshot, NULL, NULL }, { "lpc", pci_snapshot, NULL, NULL }, { "fbuf", pci_snapshot, NULL, NULL }, { "xhci", pci_snapshot, NULL, NULL }, { "e1000", pci_snapshot, NULL, NULL }, { "ahci", pci_snapshot, pci_pause, pci_resume }, { "ahci-hd", pci_snapshot, pci_pause, pci_resume }, { "ahci-cd", pci_snapshot, pci_pause, pci_resume }, }; static const struct vm_snapshot_kern_info snapshot_kern_structs[] = { { "vhpet", STRUCT_VHPET }, { "vm", STRUCT_VM }, { "vioapic", STRUCT_VIOAPIC }, { "vlapic", STRUCT_VLAPIC }, { "vmcx", STRUCT_VMCX }, { "vatpit", STRUCT_VATPIT }, { "vatpic", STRUCT_VATPIC }, { "vpmtmr", STRUCT_VPMTMR }, { "vrtc", STRUCT_VRTC }, }; static cpuset_t vcpus_active, vcpus_suspended; static pthread_mutex_t vcpu_lock; static pthread_cond_t vcpus_idle, vcpus_can_run; static bool checkpoint_active; /* * TODO: Harden this function and all of its callers since 'base_str' is a user * provided string. 
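* Note that both inputs are currently measured with strnlen(..., NAME_MAX) and the concatenation is rejected if the combined length would exceed NAME_MAX.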
*/ static char * strcat_extension(const char *base_str, const char *ext) { char *res; size_t base_len, ext_len; base_len = strnlen(base_str, NAME_MAX); ext_len = strnlen(ext, NAME_MAX); if (base_len + ext_len > NAME_MAX) { fprintf(stderr, "Filename exceeds maximum length.\n"); return (NULL); } res = malloc(base_len + ext_len + 1); if (res == NULL) { perror("Failed to allocate memory."); return (NULL); } memcpy(res, base_str, base_len); memcpy(res + base_len, ext, ext_len); res[base_len + ext_len] = 0; return (res); } void destroy_restore_state(struct restore_state *rstate) { if (rstate == NULL) { fprintf(stderr, "Attempting to destroy NULL restore struct.\n"); return; } if (rstate->kdata_map != MAP_FAILED) munmap(rstate->kdata_map, rstate->kdata_len); if (rstate->kdata_fd > 0) close(rstate->kdata_fd); if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); if (rstate->meta_root_obj != NULL) ucl_object_unref(rstate->meta_root_obj); if (rstate->meta_parser != NULL) ucl_parser_free(rstate->meta_parser); } static int load_vmmem_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->vmmem_fd = open(filename, O_RDONLY); if (rstate->vmmem_fd < 0) { perror("Failed to open restore file"); return (-1); } err = fstat(rstate->vmmem_fd, &sb); if (err < 0) { perror("Failed to stat restore file"); goto err_load_vmmem; } if (sb.st_size == 0) { fprintf(stderr, "Restore file is empty.\n"); goto err_load_vmmem; } rstate->vmmem_len = sb.st_size; return (0); err_load_vmmem: if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); return (-1); } static int load_kdata_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->kdata_fd = open(filename, O_RDONLY); if (rstate->kdata_fd < 0) { perror("Failed to open kernel data file"); return (-1); } err = fstat(rstate->kdata_fd, &sb); if (err < 0) { perror("Failed to stat kernel data file"); goto err_load_kdata; } if (sb.st_size == 0) { fprintf(stderr, "Kernel data file is empty.\n"); goto err_load_kdata; } rstate->kdata_len = sb.st_size; rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ, MAP_SHARED, rstate->kdata_fd, 0); if (rstate->kdata_map == MAP_FAILED) { perror("Failed to map restore file"); goto err_load_kdata; } return (0); err_load_kdata: if (rstate->kdata_fd > 0) close(rstate->kdata_fd); return (-1); } static int load_metadata_file(const char *filename, struct restore_state *rstate) { ucl_object_t *obj; struct ucl_parser *parser; int err; parser = ucl_parser_new(UCL_PARSER_DEFAULT); if (parser == NULL) { fprintf(stderr, "Failed to initialize UCL parser.\n"); err = -1; goto err_load_metadata; } err = ucl_parser_add_file(parser, filename); if (err == 0) { fprintf(stderr, "Failed to parse metadata file: '%s'\n", filename); err = -1; goto err_load_metadata; } obj = ucl_parser_get_object(parser); if (obj == NULL) { fprintf(stderr, "Failed to parse object.\n"); err = -1; goto err_load_metadata; } rstate->meta_parser = parser; rstate->meta_root_obj = (ucl_object_t *)obj; return (0); err_load_metadata: if (parser != NULL) ucl_parser_free(parser); return (err); } int load_restore_file(const char *filename, struct restore_state *rstate) { int err = 0; char *kdata_filename = NULL, *meta_filename = NULL; assert(filename != NULL); assert(rstate != NULL); memset(rstate, 0, sizeof(*rstate)); rstate->kdata_map = MAP_FAILED; err = load_vmmem_file(filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest RAM file.\n"); goto err_restore; } kdata_filename = strcat_extension(filename, 
".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); goto err_restore; } err = load_kdata_file(kdata_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest kernel data file.\n"); goto err_restore; } meta_filename = strcat_extension(filename, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct kernel metadata filename.\n"); goto err_restore; } err = load_metadata_file(meta_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest metadata file.\n"); goto err_restore; } return (0); err_restore: destroy_restore_state(rstate); if (kdata_filename != NULL) free(kdata_filename); if (meta_filename != NULL) free(meta_filename); return (-1); } #define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_toint_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to int.", key); \ return (ret); \ } \ } while(0) #define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_tostring_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to string.", key); \ return (ret); \ } \ } while(0) static void * lookup_struct(enum snapshot_req struct_id, struct restore_state *rstate, size_t *struct_size) { const ucl_object_t *structs = NULL, *obj = NULL; ucl_object_iter_t it = NULL; int64_t snapshot_req, size, file_offset; structs = ucl_object_lookup(rstate->meta_root_obj, JSON_STRUCT_ARR_KEY); if (structs == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_STRUCT_ARR_KEY); return (NULL); } if (ucl_object_type(structs) != UCL_ARRAY) { fprintf(stderr, "Object '%s' is not an array.\n", JSON_STRUCT_ARR_KEY); return (NULL); } while ((obj = ucl_object_iterate(structs, &it, true)) != NULL) { snapshot_req = -1; JSON_GET_INT_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, &snapshot_req, NULL); assert(snapshot_req >= 0); if ((enum snapshot_req) snapshot_req == struct_id) { JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, &size, NULL); assert(size >= 0); JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, &file_offset, NULL); assert(file_offset >= 0); assert((uint64_t)file_offset + size <= rstate->kdata_len); *struct_size = (size_t)size; return ((uint8_t *)rstate->kdata_map + file_offset); } } return (NULL); } static void * lookup_check_dev(const char *dev_name, struct restore_state *rstate, const ucl_object_t *obj, size_t *data_size) { const char *snapshot_req; int64_t size, file_offset; snapshot_req = NULL; JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, &snapshot_req, NULL); assert(snapshot_req != NULL); if (!strcmp(snapshot_req, dev_name)) { JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, &size, NULL); assert(size >= 0); JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, &file_offset, NULL); assert(file_offset >= 0); assert((uint64_t)file_offset + size <= rstate->kdata_len); *data_size = (size_t)size; return ((uint8_t *)rstate->kdata_map + file_offset); } return (NULL); } static void* lookup_dev(const char *dev_name, struct restore_state *rstate, size_t *data_size) { const ucl_object_t *devs = NULL, *obj = NULL; ucl_object_iter_t it = NULL; void *ret; devs = ucl_object_lookup(rstate->meta_root_obj, JSON_DEV_ARR_KEY); if (devs == 
NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_DEV_ARR_KEY); return (NULL); } if (ucl_object_type(devs) != UCL_ARRAY) { fprintf(stderr, "Object '%s' is not an array.\n", JSON_DEV_ARR_KEY); return (NULL); } while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) { ret = lookup_check_dev(dev_name, rstate, obj, data_size); if (ret != NULL) return (ret); } return (NULL); } static const ucl_object_t * lookup_basic_metadata_object(struct restore_state *rstate) { const ucl_object_t *basic_meta_obj = NULL; basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj, JSON_BASIC_METADATA_KEY); if (basic_meta_obj == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } if (ucl_object_type(basic_meta_obj) != UCL_OBJECT) { fprintf(stderr, "Object '%s' is not a JSON object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } return (basic_meta_obj); } const char * lookup_vmname(struct restore_state *rstate) { const char *vmname; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (NULL); JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL); return (vmname); } int lookup_memflags(struct restore_state *rstate) { int64_t memflags; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0); return ((int)memflags); } size_t lookup_memsize(struct restore_state *rstate) { int64_t memsize; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0); if (memsize < 0) memsize = 0; return ((size_t)memsize); } int lookup_guest_ncpus(struct restore_state *rstate) { int64_t ncpus; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0); return ((int)ncpus); } static void winch_handler(int signal __unused) { #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ } static int print_progress(size_t crtval, const size_t maxval) { size_t rc; double crtval_gb, maxval_gb; size_t i, win_width, prog_start, prog_done, prog_end; int mval_len; static char prog_buf[PROG_BUF_SZ]; static const size_t len = sizeof(prog_buf); static size_t div; static const char *div_str; static char wip_bar[] = { '/', '-', '\\', '|' }; static int wip_idx = 0; if (maxval == 0) { printf("[0B / 0B]\r\n"); return (0); } if (crtval > maxval) crtval = maxval; if (maxval > 10 * GB) { div = GB; div_str = "GiB"; } else if (maxval > 10 * MB) { div = MB; div_str = "MiB"; } else { div = KB; div_str = "KiB"; } crtval_gb = (double) crtval / div; maxval_gb = (double) maxval / div; rc = snprintf(prog_buf, len, "%.03lf", maxval_gb); if (rc == len) { fprintf(stderr, "Maxval too big\n"); return (-1); } mval_len = rc; rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |", mval_len, crtval_gb, div_str, maxval_gb, div_str); if (rc == len) { fprintf(stderr, "Buffer too small to print progress\n"); return (-1); } win_width = min(winsize.ws_col, len); prog_start = rc; if (prog_start < (win_width - 2)) { prog_end = win_width - prog_start - 2; prog_done = prog_end * (crtval_gb / maxval_gb); for (i = prog_start; i < prog_start + prog_done; i++) prog_buf[i] = '#'; if (crtval != maxval) { prog_buf[i] = wip_bar[wip_idx]; wip_idx = (wip_idx + 1) % sizeof(wip_bar); i++; } else { prog_buf[i++] = '#'; } for (; i < win_width - 2; i++) prog_buf[i] = '_'; 
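/* Terminate the bar with a closing '|'; the trailing NUL is added below. */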
prog_buf[win_width - 2] = '|'; } prog_buf[win_width - 1] = '\0'; write(STDOUT_FILENO, prog_buf, win_width); return (0); } static void * snapshot_spinner_cb(void *arg) { int rc; size_t crtval, maxval, total; struct spinner_info *si; struct timespec ts; si = arg; if (si == NULL) pthread_exit(NULL); ts.tv_sec = 0; ts.tv_nsec = 50 * 1000 * 1000; /* 50 ms sleep time */ do { crtval = *si->crtval; maxval = si->maxval; total = si->total; rc = print_progress(crtval, total); if (rc < 0) { fprintf(stderr, "Failed to parse progress\n"); break; } nanosleep(&ts, NULL); } while (crtval < maxval); pthread_exit(NULL); return NULL; } static int vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src, const size_t len, const size_t totalmem, const bool op_wr) { int rc; size_t part_done, todo, rem; ssize_t done; bool show_progress; pthread_t spinner_th; struct spinner_info *si; if (lseek(snapfd, foff, SEEK_SET) < 0) { perror("Failed to change file offset"); return (-1); } show_progress = false; if (isatty(STDIN_FILENO) && (winsize.ws_col != 0)) show_progress = true; part_done = foff; rem = len; if (show_progress) { si = &(struct spinner_info) { .crtval = &part_done, .maxval = foff + len, .total = totalmem }; rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, si); if (rc) { perror("Unable to create spinner thread"); show_progress = false; } } while (rem > 0) { if (show_progress) todo = min(SNAPSHOT_CHUNK, rem); else todo = rem; if (op_wr) done = write(snapfd, src, todo); else done = read(snapfd, src, todo); if (done < 0) { perror("Failed to write in file"); return (-1); } src = (uint8_t *)src + done; part_done += done; rem -= done; } if (show_progress) { rc = pthread_join(spinner_th, NULL); if (rc) perror("Unable to end spinner thread"); } return (0); } static size_t vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr) { int ret; size_t lowmem, highmem, totalmem; char *baseaddr; ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem); if (ret) { fprintf(stderr, "%s: unable to retrieve guest memory size\r\n", __func__); return (0); } totalmem = lowmem + highmem; if ((op_wr == false) && (totalmem != memsz)) { fprintf(stderr, "%s: mem size mismatch: %ld vs %ld\r\n", __func__, totalmem, memsz); return (0); } winsize.ws_col = 80; #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ old_winch_handler = signal(SIGWINCH, winch_handler); ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem, totalmem, op_wr); if (ret) { fprintf(stderr, "%s: Could not %s lowmem\r\n", __func__, op_wr ? "write" : "read"); totalmem = 0; goto done; } if (highmem == 0) goto done; ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4*GB, highmem, totalmem, op_wr); if (ret) { fprintf(stderr, "%s: Could not %s highmem\r\n", __func__, op_wr ? 
"write" : "read"); totalmem = 0; goto done; } done: printf("\r\n"); signal(SIGWINCH, old_winch_handler); return (totalmem); } int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate) { size_t restored; restored = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len, false); if (restored != rstate->vmmem_len) return (-1); return (0); } static int vm_restore_kern_struct(struct vmctx *ctx, struct restore_state *rstate, const struct vm_snapshot_kern_info *info) { void *struct_ptr; size_t struct_size; int ret; struct vm_snapshot_meta *meta; struct_ptr = lookup_struct(info->req, rstate, &struct_size); if (struct_ptr == NULL) { fprintf(stderr, "%s: Failed to lookup struct %s\r\n", __func__, info->struct_name); ret = -1; goto done; } if (struct_size == 0) { fprintf(stderr, "%s: Kernel struct size was 0 for: %s\r\n", __func__, info->struct_name); ret = -1; goto done; } meta = &(struct vm_snapshot_meta) { .ctx = ctx, .dev_name = info->struct_name, .dev_req = info->req, .buffer.buf_start = struct_ptr, .buffer.buf_size = struct_size, .buffer.buf = struct_ptr, .buffer.buf_rem = struct_size, .op = VM_SNAPSHOT_RESTORE, }; ret = vm_snapshot_req(meta); if (ret != 0) { fprintf(stderr, "%s: Failed to restore struct: %s\r\n", __func__, info->struct_name); goto done; } done: return (ret); } int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate) { size_t i; int ret; for (i = 0; i < nitems(snapshot_kern_structs); i++) { ret = vm_restore_kern_struct(ctx, rstate, &snapshot_kern_structs[i]); if (ret != 0) return (ret); } return (0); } static int vm_restore_user_dev(struct vmctx *ctx, struct restore_state *rstate, const struct vm_snapshot_dev_info *info) { void *dev_ptr; size_t dev_size; int ret; struct vm_snapshot_meta *meta; dev_ptr = lookup_dev(info->dev_name, rstate, &dev_size); if (dev_ptr == NULL) { fprintf(stderr, "Failed to lookup dev: %s\r\n", info->dev_name); fprintf(stderr, "Continuing the restore/migration process\r\n"); return (0); } if (dev_size == 0) { fprintf(stderr, "%s: Device size is 0. 
" "Assuming %s is not used\r\n", __func__, info->dev_name); return (0); } meta = &(struct vm_snapshot_meta) { .ctx = ctx, .dev_name = info->dev_name, .buffer.buf_start = dev_ptr, .buffer.buf_size = dev_size, .buffer.buf = dev_ptr, .buffer.buf_rem = dev_size, .op = VM_SNAPSHOT_RESTORE, }; ret = (*info->snapshot_cb)(meta); if (ret != 0) { fprintf(stderr, "Failed to restore dev: %s\r\n", info->dev_name); return (-1); } return (0); } int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate) { size_t i; int ret; for (i = 0; i < nitems(snapshot_devs); i++) { ret = vm_restore_user_dev(ctx, rstate, &snapshot_devs[i]); if (ret != 0) return (ret); } return 0; } int vm_pause_user_devs(void) { const struct vm_snapshot_dev_info *info; size_t i; int ret; for (i = 0; i < nitems(snapshot_devs); i++) { info = &snapshot_devs[i]; if (info->pause_cb == NULL) continue; ret = info->pause_cb(info->dev_name); if (ret != 0) return (ret); } return (0); } int vm_resume_user_devs(void) { const struct vm_snapshot_dev_info *info; size_t i; int ret; for (i = 0; i < nitems(snapshot_devs); i++) { info = &snapshot_devs[i]; if (info->resume_cb == NULL) continue; ret = info->resume_cb(info->dev_name); if (ret != 0) return (ret); } return (0); } static int vm_snapshot_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key, struct vm_snapshot_meta *meta, off_t *offset) { int ret; size_t data_size; ssize_t write_cnt; ret = vm_snapshot_req(meta); if (ret != 0) { fprintf(stderr, "%s: Failed to snapshot struct %s\r\n", __func__, meta->dev_name); ret = -1; goto done; } data_size = vm_get_snapshot_size(meta); /* XXX-MJ no handling for short writes. */ write_cnt = write(data_fd, meta->buffer.buf_start, data_size); if (write_cnt < 0 || (size_t)write_cnt != data_size) { perror("Failed to write all snapshotted data."); ret = -1; goto done; } /* Write metadata. 
*/ xo_open_instance_h(xop, array_key); xo_emit_h(xop, "{:debug_name/%s}\n", meta->dev_name); xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%d}\n", meta->dev_req); xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); xo_close_instance_h(xop, JSON_STRUCT_ARR_KEY); *offset += data_size; done: return (ret); } static int vm_snapshot_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) { int ret, error; size_t buf_size, i, offset; char *buffer; struct vm_snapshot_meta *meta; error = 0; offset = 0; buf_size = SNAPSHOT_BUFFER_SIZE; buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char)); if (buffer == NULL) { error = ENOMEM; perror("Failed to allocate memory for snapshot buffer"); goto err_vm_snapshot_kern_data; } meta = &(struct vm_snapshot_meta) { .ctx = ctx, .buffer.buf_start = buffer, .buffer.buf_size = buf_size, .op = VM_SNAPSHOT_SAVE, }; xo_open_list_h(xop, JSON_STRUCT_ARR_KEY); for (i = 0; i < nitems(snapshot_kern_structs); i++) { meta->dev_name = snapshot_kern_structs[i].struct_name; meta->dev_req = snapshot_kern_structs[i].req; memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); meta->buffer.buf = meta->buffer.buf_start; meta->buffer.buf_rem = meta->buffer.buf_size; ret = vm_snapshot_kern_struct(data_fd, xop, JSON_DEV_ARR_KEY, meta, &offset); if (ret != 0) { error = -1; goto err_vm_snapshot_kern_data; } } xo_close_list_h(xop, JSON_STRUCT_ARR_KEY); err_vm_snapshot_kern_data: if (buffer != NULL) free(buffer); return (error); } static int vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz) { xo_open_container_h(xop, JSON_BASIC_METADATA_KEY); xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus); xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vm_get_name(ctx)); xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz); xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", vm_get_memflags(ctx)); xo_close_container_h(xop, JSON_BASIC_METADATA_KEY); return (0); } static int vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key, struct vm_snapshot_meta *meta, off_t *offset) { ssize_t ret; size_t data_size; data_size = vm_get_snapshot_size(meta); /* XXX-MJ no handling for short writes. */ ret = write(data_fd, meta->buffer.buf_start, data_size); if (ret < 0 || (size_t)ret != data_size) { perror("Failed to write all snapshotted data."); return (-1); } /* Write metadata. 
*/ xo_open_instance_h(xop, array_key); xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name); xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); xo_close_instance_h(xop, array_key); *offset += data_size; return (0); } static int vm_snapshot_user_dev(const struct vm_snapshot_dev_info *info, int data_fd, xo_handle_t *xop, struct vm_snapshot_meta *meta, off_t *offset) { int ret; ret = (*info->snapshot_cb)(meta); if (ret != 0) { fprintf(stderr, "Failed to snapshot %s; ret=%d\r\n", meta->dev_name, ret); return (ret); } ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta, offset); if (ret != 0) return (ret); return (0); } static int vm_snapshot_user_devs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) { int ret; off_t offset; void *buffer; size_t buf_size, i; struct vm_snapshot_meta *meta; buf_size = SNAPSHOT_BUFFER_SIZE; offset = lseek(data_fd, 0, SEEK_CUR); if (offset < 0) { perror("Failed to get data file current offset."); return (-1); } buffer = malloc(buf_size); if (buffer == NULL) { perror("Failed to allocate memory for snapshot buffer"); ret = ENOSPC; goto snapshot_err; } meta = &(struct vm_snapshot_meta) { .ctx = ctx, .buffer.buf_start = buffer, .buffer.buf_size = buf_size, .op = VM_SNAPSHOT_SAVE, }; xo_open_list_h(xop, JSON_DEV_ARR_KEY); /* Restore other devices that support this feature */ for (i = 0; i < nitems(snapshot_devs); i++) { meta->dev_name = snapshot_devs[i].dev_name; memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); meta->buffer.buf = meta->buffer.buf_start; meta->buffer.buf_rem = meta->buffer.buf_size; ret = vm_snapshot_user_dev(&snapshot_devs[i], data_fd, xop, meta, &offset); if (ret != 0) goto snapshot_err; } xo_close_list_h(xop, JSON_DEV_ARR_KEY); snapshot_err: if (buffer != NULL) free(buffer); return (ret); } void checkpoint_cpu_add(int vcpu) { pthread_mutex_lock(&vcpu_lock); CPU_SET(vcpu, &vcpus_active); if (checkpoint_active) { CPU_SET(vcpu, &vcpus_suspended); while (checkpoint_active) pthread_cond_wait(&vcpus_can_run, &vcpu_lock); CPU_CLR(vcpu, &vcpus_suspended); } pthread_mutex_unlock(&vcpu_lock); } /* * When a vCPU is suspended for any reason, it calls * checkpoint_cpu_suspend(). This records that the vCPU is idle. * Before returning from suspension, checkpoint_cpu_resume() is * called. In suspend we note that the vCPU is idle. In resume we * pause the vCPU thread until the checkpoint is complete. The reason * for the two-step process is that vCPUs might already be stopped in * the debug server when a checkpoint is requested. This approach * allows us to account for and handle those vCPUs. 
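 *
 * A minimal usage sketch (illustrative only; the real call sites are the
 * vCPU run loop and the debug server):
 *
 *	checkpoint_cpu_suspend(vcpu);	// record that this vCPU is idle
 *	...				// vCPU remains stopped here
 *	checkpoint_cpu_resume(vcpu);	// blocks while a checkpoint runs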
*/ void checkpoint_cpu_suspend(int vcpu) { pthread_mutex_lock(&vcpu_lock); CPU_SET(vcpu, &vcpus_suspended); if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0) pthread_cond_signal(&vcpus_idle); pthread_mutex_unlock(&vcpu_lock); } void checkpoint_cpu_resume(int vcpu) { pthread_mutex_lock(&vcpu_lock); while (checkpoint_active) pthread_cond_wait(&vcpus_can_run, &vcpu_lock); CPU_CLR(vcpu, &vcpus_suspended); pthread_mutex_unlock(&vcpu_lock); } static void vm_vcpu_pause(struct vmctx *ctx) { pthread_mutex_lock(&vcpu_lock); checkpoint_active = true; - vm_suspend_cpu(ctx, -1); + vm_suspend_all_cpus(ctx); while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0) pthread_cond_wait(&vcpus_idle, &vcpu_lock); pthread_mutex_unlock(&vcpu_lock); } static void vm_vcpu_resume(struct vmctx *ctx) { pthread_mutex_lock(&vcpu_lock); checkpoint_active = false; pthread_mutex_unlock(&vcpu_lock); - vm_resume_cpu(ctx, -1); + vm_resume_all_cpus(ctx); pthread_cond_broadcast(&vcpus_can_run); } static int vm_checkpoint(struct vmctx *ctx, const char *checkpoint_file, bool stop_vm) { int fd_checkpoint = 0, kdata_fd = 0; int ret = 0; int error = 0; size_t memsz; xo_handle_t *xop = NULL; char *meta_filename = NULL; char *kdata_filename = NULL; FILE *meta_file = NULL; kdata_filename = strcat_extension(checkpoint_file, ".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); return (-1); } kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); if (kdata_fd < 0) { perror("Failed to open kernel data snapshot file."); error = -1; goto done; } fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); if (fd_checkpoint < 0) { perror("Failed to create checkpoint file"); error = -1; goto done; } meta_filename = strcat_extension(checkpoint_file, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct vm metadata filename.\n"); goto done; } meta_file = fopen(meta_filename, "w"); if (meta_file == NULL) { perror("Failed to open vm metadata snapshot file."); goto done; } xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY); if (xop == NULL) { perror("Failed to get libxo handle on metadata file."); goto done; } vm_vcpu_pause(ctx); ret = vm_pause_user_devs(); if (ret != 0) { fprintf(stderr, "Could not pause devices\r\n"); error = ret; goto done; } memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true); if (memsz == 0) { perror("Could not write guest memory to file"); error = -1; goto done; } ret = vm_snapshot_basic_metadata(ctx, xop, memsz); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm basic metadata.\n"); error = -1; goto done; } ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm kernel data.\n"); error = -1; goto done; } ret = vm_snapshot_user_devs(ctx, kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot device state.\n"); error = -1; goto done; } xo_finish_h(xop); if (stop_vm) { vm_destroy(ctx); exit(0); } done: ret = vm_resume_user_devs(); if (ret != 0) fprintf(stderr, "Could not resume devices\r\n"); vm_vcpu_resume(ctx); if (fd_checkpoint > 0) close(fd_checkpoint); if (meta_filename != NULL) free(meta_filename); if (kdata_filename != NULL) free(kdata_filename); if (xop != NULL) xo_destroy(xop); if (meta_file != NULL) fclose(meta_file); if (kdata_fd > 0) close(kdata_fd); return (error); } static int handle_message(struct vmctx *ctx, nvlist_t *nvl) { const char *cmd; struct ipc_command **ipc_cmd; if (!nvlist_exists_string(nvl, "cmd")) return 
(EINVAL); cmd = nvlist_get_string(nvl, "cmd"); IPC_COMMAND_FOREACH(ipc_cmd, ipc_cmd_set) { if (strcmp(cmd, (*ipc_cmd)->name) == 0) return ((*ipc_cmd)->handler(ctx, nvl)); } return (EOPNOTSUPP); } /* * Listen for commands from bhyvectl */ void * checkpoint_thread(void *param) { int fd; struct checkpoint_thread_info *thread_info; nvlist_t *nvl; pthread_set_name_np(pthread_self(), "checkpoint thread"); thread_info = (struct checkpoint_thread_info *)param; while ((fd = accept(thread_info->socket_fd, NULL, NULL)) != -1) { nvl = nvlist_recv(fd, 0); if (nvl != NULL) handle_message(thread_info->ctx, nvl); else EPRINTLN("nvlist_recv() failed: %s", strerror(errno)); close(fd); nvlist_destroy(nvl); } return (NULL); } static int vm_do_checkpoint(struct vmctx *ctx, const nvlist_t *nvl) { int error; if (!nvlist_exists_string(nvl, "filename") || !nvlist_exists_bool(nvl, "suspend")) error = EINVAL; else error = vm_checkpoint(ctx, nvlist_get_string(nvl, "filename"), nvlist_get_bool(nvl, "suspend")); return (error); } IPC_COMMAND(ipc_cmd_set, checkpoint, vm_do_checkpoint); void init_snapshot(void) { int err; err = pthread_mutex_init(&vcpu_lock, NULL); if (err != 0) errc(1, err, "checkpoint mutex init"); err = pthread_cond_init(&vcpus_idle, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_idle)"); err = pthread_cond_init(&vcpus_can_run, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_can_run)"); } /* * Create the listening socket for IPC with bhyvectl */ int init_checkpoint_thread(struct vmctx *ctx) { struct checkpoint_thread_info *checkpoint_info = NULL; struct sockaddr_un addr; int socket_fd; pthread_t checkpoint_pthread; int err; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif memset(&addr, 0, sizeof(addr)); socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); if (socket_fd < 0) { EPRINTLN("Socket creation failed: %s", strerror(errno)); err = -1; goto fail; } addr.sun_family = AF_UNIX; snprintf(addr.sun_path, sizeof(addr.sun_path), "%s%s", BHYVE_RUN_DIR, vm_get_name(ctx)); addr.sun_len = SUN_LEN(&addr); unlink(addr.sun_path); if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) { EPRINTLN("Failed to bind socket \"%s\": %s\n", addr.sun_path, strerror(errno)); err = -1; goto fail; } if (listen(socket_fd, 10) < 0) { EPRINTLN("ipc socket listen: %s\n", strerror(errno)); err = errno; goto fail; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_RECV, CAP_WRITE, CAP_SEND, CAP_GETSOCKOPT); if (caph_rights_limit(socket_fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif checkpoint_info = calloc(1, sizeof(*checkpoint_info)); checkpoint_info->ctx = ctx; checkpoint_info->socket_fd = socket_fd; err = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread, checkpoint_info); if (err != 0) goto fail; return (0); fail: free(checkpoint_info); if (socket_fd > 0) close(socket_fd); unlink(addr.sun_path); return (err); } void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) { const char *__op; if (op == VM_SNAPSHOT_SAVE) __op = "save"; else if (op == VM_SNAPSHOT_RESTORE) __op = "restore"; else __op = "unknown"; fprintf(stderr, "%s: snapshot-%s failed for %s\r\n", __func__, __op, bufname); } int vm_snapshot_buf(void *data, size_t data_size, struct vm_snapshot_meta *meta) { struct vm_snapshot_buffer *buffer; int op; buffer = &meta->buffer; op = meta->op; if (buffer->buf_rem < data_size) { fprintf(stderr, "%s: buffer too small\r\n", __func__); return (E2BIG); } if (op == VM_SNAPSHOT_SAVE) memcpy(buffer->buf, 
data, data_size); else if (op == VM_SNAPSHOT_RESTORE) memcpy(data, buffer->buf, data_size); else return (EINVAL); buffer->buf += data_size; buffer->buf_rem -= data_size; return (0); } size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta) { size_t length; struct vm_snapshot_buffer *buffer; buffer = &meta->buffer; if (buffer->buf_size < buffer->buf_rem) { fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n", __func__, buffer->buf_size, buffer->buf_rem); length = 0; } else { length = buffer->buf_size - buffer->buf_rem; } return (length); } int vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null, struct vm_snapshot_meta *meta) { int ret; vm_paddr_t gaddr; if (meta->op == VM_SNAPSHOT_SAVE) { gaddr = paddr_host2guest(meta->ctx, *addrp); if (gaddr == (vm_paddr_t) -1) { if (!restore_null || (restore_null && (*addrp != NULL))) { ret = EFAULT; goto done; } } SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); } else if (meta->op == VM_SNAPSHOT_RESTORE) { SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); if (gaddr == (vm_paddr_t) -1) { if (!restore_null) { ret = EFAULT; goto done; } } *addrp = paddr_guest2host(meta->ctx, gaddr, len); } else { ret = EINVAL; } done: return (ret); } int vm_snapshot_buf_cmp(void *data, size_t data_size, struct vm_snapshot_meta *meta) { struct vm_snapshot_buffer *buffer; int op; int ret; buffer = &meta->buffer; op = meta->op; if (buffer->buf_rem < data_size) { fprintf(stderr, "%s: buffer too small\r\n", __func__); ret = E2BIG; goto done; } if (op == VM_SNAPSHOT_SAVE) { ret = 0; memcpy(buffer->buf, data, data_size); } else if (op == VM_SNAPSHOT_RESTORE) { ret = memcmp(data, buffer->buf, data_size); } else { ret = EINVAL; goto done; } buffer->buf += data_size; buffer->buf_rem -= data_size; done: return (ret); } diff --git a/usr.sbin/bhyve/spinup_ap.c b/usr.sbin/bhyve/spinup_ap.c index 0498ee91232d..61cbef96ed6a 100644 --- a/usr.sbin/bhyve/spinup_ap.c +++ b/usr.sbin/bhyve/spinup_ap.c @@ -1,92 +1,89 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "bhyverun.h" #include "spinup_ap.h" static void -spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t rip) +spinup_ap_realmode(struct vcpu *newcpu, uint64_t rip) { int vector, error; uint16_t cs; uint64_t desc_base; uint32_t desc_limit, desc_access; vector = rip >> PAGE_SHIFT; /* * Update the %cs and %rip of the guest so that it starts * executing real mode code at at 'vector << 12'. */ - error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, 0); + error = vm_set_register(newcpu, VM_REG_GUEST_RIP, 0); assert(error == 0); - error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base, + error = vm_get_desc(newcpu, VM_REG_GUEST_CS, &desc_base, &desc_limit, &desc_access); assert(error == 0); desc_base = vector << PAGE_SHIFT; - error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS, + error = vm_set_desc(newcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); assert(error == 0); cs = (vector << PAGE_SHIFT) >> 4; - error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs); + error = vm_set_register(newcpu, VM_REG_GUEST_CS, cs); assert(error == 0); } void -spinup_ap(struct vmctx *ctx, int newcpu, uint64_t rip) +spinup_ap(struct vcpu *newcpu, uint64_t rip) { int error; - assert(newcpu != 0); - assert(newcpu < guest_ncpus); - - error = vcpu_reset(ctx, newcpu); + error = vcpu_reset(newcpu); assert(error == 0); - spinup_ap_realmode(ctx, newcpu, rip); + spinup_ap_realmode(newcpu, rip); - vm_resume_cpu(ctx, newcpu); + vm_resume_cpu(newcpu); } diff --git a/usr.sbin/bhyve/spinup_ap.h b/usr.sbin/bhyve/spinup_ap.h index ee201427c585..db98edccbe82 100644 --- a/usr.sbin/bhyve/spinup_ap.h +++ b/usr.sbin/bhyve/spinup_ap.h @@ -1,36 +1,36 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _SPINUP_AP_H_ #define _SPINUP_AP_H_ -void spinup_ap(struct vmctx *ctx, int newcpu, uint64_t rip); +void spinup_ap(struct vcpu *newcpu, uint64_t rip); #endif diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c index 0dfb536f09f8..b29478ea5b01 100644 --- a/usr.sbin/bhyve/task_switch.c +++ b/usr.sbin/bhyve/task_switch.c @@ -1,942 +1,941 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" /* * Using 'struct i386tss' is tempting but causes myriad sign extension * issues because all of its fields are defined as signed integers. 
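 *
 * Illustrative example of the pitfall (hypothetical, not taken from this
 * change): with a signed field, widening an ESP value of 0x80000000 would
 * sign-extend, e.g.
 *
 *	struct i386tss t;
 *	uint64_t v = t.tss_esp;		// yields 0xffffffff80000000
 *
 * whereas the unsigned fields of struct tss32 below widen cleanly.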
*/ struct tss32 { uint16_t tss_link; uint16_t rsvd1; uint32_t tss_esp0; uint16_t tss_ss0; uint16_t rsvd2; uint32_t tss_esp1; uint16_t tss_ss1; uint16_t rsvd3; uint32_t tss_esp2; uint16_t tss_ss2; uint16_t rsvd4; uint32_t tss_cr3; uint32_t tss_eip; uint32_t tss_eflags; uint32_t tss_eax; uint32_t tss_ecx; uint32_t tss_edx; uint32_t tss_ebx; uint32_t tss_esp; uint32_t tss_ebp; uint32_t tss_esi; uint32_t tss_edi; uint16_t tss_es; uint16_t rsvd5; uint16_t tss_cs; uint16_t rsvd6; uint16_t tss_ss; uint16_t rsvd7; uint16_t tss_ds; uint16_t rsvd8; uint16_t tss_fs; uint16_t rsvd9; uint16_t tss_gs; uint16_t rsvd10; uint16_t tss_ldt; uint16_t rsvd11; uint16_t tss_trap; uint16_t tss_iomap; }; static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed"); #define SEL_START(sel) (((sel) & ~0x7)) #define SEL_LIMIT(sel) (((sel) | 0x7)) #define TSS_BUSY(type) (((type) & 0x2) != 0) static uint64_t -GETREG(struct vmctx *ctx, int vcpu, int reg) +GETREG(struct vcpu *vcpu, int reg) { uint64_t val; int error; - error = vm_get_register(ctx, vcpu, reg, &val); + error = vm_get_register(vcpu, reg, &val); assert(error == 0); return (val); } static void -SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +SETREG(struct vcpu *vcpu, int reg, uint64_t val) { int error; - error = vm_set_register(ctx, vcpu, reg, val); + error = vm_set_register(vcpu, reg, val); assert(error == 0); } static struct seg_desc usd_to_seg_desc(struct user_segment_descriptor *usd) { struct seg_desc seg_desc; seg_desc.base = (u_int)USD_GETBASE(usd); if (usd->sd_gran) seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; else seg_desc.limit = (u_int)USD_GETLIMIT(usd); seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; seg_desc.access |= usd->sd_xx << 12; seg_desc.access |= usd->sd_def32 << 14; seg_desc.access |= usd->sd_gran << 15; return (seg_desc); } /* * Inject an exception with an error code that is a segment selector. * The format of the error code is described in section 6.13, "Error Code", * Intel SDM volume 3. * * Bit 0 (EXT) denotes whether the exception occurred during delivery * of an external event like an interrupt. * * Bit 1 (IDT) indicates whether the selector points to a gate descriptor * in the IDT. * * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). */ static void -sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) +sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext) { /* * Bit 2 from the selector is retained as-is in the error code. * * Bit 1 can be safely cleared because none of the selectors * encountered during task switch emulation refer to a task * gate in the IDT. * * Bit 0 is set depending on the value of 'ext'. */ sel &= ~0x3; if (ext) sel |= 0x1; - vm_inject_fault(ctx, vcpu, vector, 1, sel); + vm_inject_fault(vcpu, vector, 1, sel); } /* * Return 0 if the selector 'sel' in within the limits of the GDT/LDT * and non-zero otherwise. */ static int -desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) +desc_table_limit_check(struct vcpu *vcpu, uint16_t sel) { uint64_t base; uint32_t limit, access; int error, reg; reg = ISLDT(sel) ? 
VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; - error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + error = vm_get_desc(vcpu, reg, &base, &limit, &access); assert(error == 0); if (reg == VM_REG_GUEST_LDTR) { if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) return (-1); } if (limit < SEL_LIMIT(sel)) return (-1); else return (0); } /* * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced * by the selector 'sel'. * * Returns 0 on success. * Returns 1 if an exception was injected into the guest. * Returns -1 otherwise. */ static int -desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging, uint16_t sel, struct user_segment_descriptor *desc, bool doread, int *faultptr) { struct iovec iov[2]; uint64_t base; uint32_t limit, access; int error, reg; reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; - error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + error = vm_get_desc(vcpu, reg, &base, &limit, &access); assert(error == 0); assert(limit >= SEL_LIMIT(sel)); - error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), + error = vm_copy_setup(vcpu, paging, base + SEL_START(sel), sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov), faultptr); if (error || *faultptr) return (error); if (doread) vm_copyin(iov, desc, sizeof(*desc)); else vm_copyout(desc, iov, sizeof(*desc)); return (0); } static int -desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging, uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) { - return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr)); + return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr)); } static int -desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging, uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) { - return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr)); + return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr)); } /* * Read the TSS descriptor referenced by 'sel' into 'desc'. * * Returns 0 on success. * Returns 1 if an exception was injected into the guest. * Returns -1 otherwise. 
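 *
 * Callers below typically consume this convention through the CHKERR()
 * macro; a rough sketch of the pattern (illustrative only):
 *
 *	error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc, &fault);
 *	if (error)
 *		return (VMEXIT_ABORT);		// emulator failure
 *	else if (fault)
 *		return (VMEXIT_CONTINUE);	// exception already injected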
*/ static int -read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, +read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts, uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) { struct vm_guest_paging sup_paging; int error; assert(!ISLDT(sel)); assert(IDXSEL(sel) != 0); /* Fetch the new TSS descriptor */ - if (desc_table_limit_check(ctx, vcpu, sel)) { + if (desc_table_limit_check(vcpu, sel)) { if (ts->reason == TSR_IRET) - sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + sel_exception(vcpu, IDT_TS, sel, ts->ext); else - sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); + sel_exception(vcpu, IDT_GP, sel, ts->ext); return (1); } sup_paging = ts->paging; sup_paging.cpl = 0; /* implicit supervisor mode */ - error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr); + error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr); return (error); } static bool code_desc(int sd_type) { /* code descriptor */ return ((sd_type & 0x18) == 0x18); } static bool stack_desc(int sd_type) { /* writable data descriptor */ return ((sd_type & 0x1A) == 0x12); } static bool data_desc(int sd_type) { /* data descriptor or a readable code descriptor */ return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); } static bool ldt_desc(int sd_type) { return (sd_type == SDT_SYSLDT); } /* * Validate the descriptor 'seg_desc' associated with 'segment'. */ static int -validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, +validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts, int segment, struct seg_desc *seg_desc, int *faultptr) { struct vm_guest_paging sup_paging; struct user_segment_descriptor usd; int error, idtvec; int cpl, dpl, rpl; uint16_t sel, cs; bool ldtseg, codeseg, stackseg, dataseg, conforming; ldtseg = codeseg = stackseg = dataseg = false; switch (segment) { case VM_REG_GUEST_LDTR: ldtseg = true; break; case VM_REG_GUEST_CS: codeseg = true; break; case VM_REG_GUEST_SS: stackseg = true; break; case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: dataseg = true; break; default: assert(0); } /* Get the segment selector */ - sel = GETREG(ctx, vcpu, segment); + sel = GETREG(vcpu, segment); /* LDT selector must point into the GDT */ if (ldtseg && ISLDT(sel)) { - sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } /* Descriptor table limit check */ - if (desc_table_limit_check(ctx, vcpu, sel)) { - sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + if (desc_table_limit_check(vcpu, sel)) { + sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } /* NULL selector */ if (IDXSEL(sel) == 0) { /* Code and stack segment selectors cannot be NULL */ if (codeseg || stackseg) { - sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } seg_desc->base = 0; seg_desc->limit = 0; seg_desc->access = 0x10000; /* unusable */ return (0); } /* Read the descriptor from the GDT/LDT */ sup_paging = ts->paging; sup_paging.cpl = 0; /* implicit supervisor mode */ - error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr); + error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr); if (error || *faultptr) return (error); /* Verify that the descriptor type is compatible with the segment */ if ((ldtseg && !ldt_desc(usd.sd_type)) || (codeseg && !code_desc(usd.sd_type)) || (dataseg && !data_desc(usd.sd_type)) || (stackseg && !stack_desc(usd.sd_type))) { - sel_exception(ctx, vcpu, IDT_TS, sel, 
ts->ext); + sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } /* Segment must be marked present */ if (!usd.sd_p) { if (ldtseg) idtvec = IDT_TS; else if (stackseg) idtvec = IDT_SS; else idtvec = IDT_NP; - sel_exception(ctx, vcpu, idtvec, sel, ts->ext); + sel_exception(vcpu, idtvec, sel, ts->ext); return (1); } - cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + cs = GETREG(vcpu, VM_REG_GUEST_CS); cpl = cs & SEL_RPL_MASK; rpl = sel & SEL_RPL_MASK; dpl = usd.sd_dpl; if (stackseg && (rpl != cpl || dpl != cpl)) { - sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } if (codeseg) { conforming = (usd.sd_type & 0x4) ? true : false; if ((conforming && (cpl < dpl)) || (!conforming && (cpl != dpl))) { - sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } } if (dataseg) { /* * A data segment is always non-conforming except when it's * descriptor is a readable, conforming code segment. */ if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) conforming = true; else conforming = false; if (!conforming && (rpl > dpl || cpl > dpl)) { - sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } } *seg_desc = usd_to_seg_desc(&usd); return (0); } static void -tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, +tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch, uint32_t eip, struct tss32 *tss, struct iovec *iov) { /* General purpose registers */ - tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); - tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); - tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); - tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); - tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); - tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); - tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); - tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); + tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX); + tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX); + tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX); + tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX); + tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP); + tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP); + tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI); + tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI); /* Segment selectors */ - tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); - tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); - tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); - tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); - tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); - tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS); + tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES); + tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS); + tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS); + tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS); + tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS); + tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS); /* eflags and eip */ - tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS); if (task_switch->reason == TSR_IRET) tss->tss_eflags &= ~PSL_NT; tss->tss_eip = eip; /* Copy updated old TSS into guest memory */ vm_copyout(tss, iov, sizeof(struct tss32)); } static void -update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) +update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd) { int error; - error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); + error = 
vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access); assert(error == 0); } /* * Update the vcpu registers to reflect the state of the new task. */ static int -tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, +tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts, uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr) { struct seg_desc seg_desc, seg_desc2; uint64_t *pdpte, maxphyaddr, reserved; uint32_t eflags; int error, i; bool nested; nested = false; if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { tss->tss_link = ot_sel; nested = true; } eflags = tss->tss_eflags; if (nested) eflags |= PSL_NT; /* LDTR */ - SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); + SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); /* PBDR */ if (ts->paging.paging_mode != PAGING_MODE_FLAT) { if (ts->paging.paging_mode == PAGING_MODE_PAE) { /* * XXX Assuming 36-bit MAXPHYADDR. */ maxphyaddr = (1UL << 36) - 1; pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); for (i = 0; i < 4; i++) { /* Check reserved bits if the PDPTE is valid */ if (!(pdpte[i] & 0x1)) continue; /* * Bits 2:1, 8:5 and bits above the processor's * maximum physical address are reserved. */ reserved = ~maxphyaddr | 0x1E6; if (pdpte[i] & reserved) { - vm_inject_gp(ctx, vcpu); + vm_inject_gp(vcpu); return (1); } } - SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); - SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); - SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); - SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); + SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); + SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); + SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); + SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); } - SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); + SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); ts->paging.cr3 = tss->tss_cr3; } /* eflags and eip */ - SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); - SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); + SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags); + SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip); /* General purpose registers */ - SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); - SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); - SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); - SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); - SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); - SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); - SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); - SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); + SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax); + SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); + SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx); + SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); + SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp); + SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); + SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi); + SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi); /* Segment selectors */ - SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); - SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); - SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); - SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); - SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); - SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); + SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es); + SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs); + SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss); + SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds); + SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs); + SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs); /* * If 
this is a nested task then write out the new TSS to update * the previous link field. */ if (nested) vm_copyout(tss, iov, sizeof(*tss)); /* Validate segment descriptors */ - error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, faultptr); if (error || *faultptr) return (error); - update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); + update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc); /* * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. * * The SS and CS attribute checks on VM-entry are inter-dependent so * we need to make sure that both segments are valid before updating * either of them. This ensures that the VMCS state can pass the * VM-entry checks so the guest can handle any exception injected * during task switch emulation. */ - error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc, + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc, faultptr); if (error || *faultptr) return (error); - error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, faultptr); if (error || *faultptr) return (error); - update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); - update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); + update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc); + update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2); ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; - error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc, + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc, faultptr); if (error || *faultptr) return (error); - update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); + update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc); - error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc, + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc, faultptr); if (error || *faultptr) return (error); - update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); + update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc); - error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc, + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc, faultptr); if (error || *faultptr) return (error); - update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); + update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc); - error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc, + error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc, faultptr); if (error || *faultptr) return (error); - update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); + update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc); return (0); } /* * Push an error code on the stack of the new task. This is needed if the * task switch was triggered by a hardware exception that causes an error * code to be saved (e.g. #PF). 
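 *
 * For illustration (assuming a 32-bit busy TSS and a 32-bit stack segment),
 * the push amounts to:
 *
 *	esp -= 4;				// error code is a doubleword
 *	copy errcode to SS.base + esp		// via vm_copy_setup/vm_copyout
 *	SETREG(vcpu, VM_REG_GUEST_RSP, esp);
 *
 * with the alignment and segment-limit checks handled below.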
*/ static int -push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging, int task_type, uint32_t errcode, int *faultptr) { struct iovec iov[2]; struct seg_desc seg_desc; int stacksize, bytes, error; uint64_t gla, cr0, rflags; uint32_t esp; uint16_t stacksel; *faultptr = 0; - cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); - rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); - stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + cr0 = GETREG(vcpu, VM_REG_GUEST_CR0); + rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS); + stacksel = GETREG(vcpu, VM_REG_GUEST_SS); - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, + error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base, &seg_desc.limit, &seg_desc.access); assert(error == 0); /* * Section "Error Code" in the Intel SDM vol 3: the error code is * pushed on the stack as a doubleword or word (depending on the * default interrupt, trap or task gate size). */ if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) bytes = 4; else bytes = 2; /* * PUSH instruction from Intel SDM vol 2: the 'B' flag in the * stack-segment descriptor determines the size of the stack * pointer outside of 64-bit mode. */ if (SEG_DESC_DEF32(seg_desc.access)) stacksize = 4; else stacksize = 2; - esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + esp = GETREG(vcpu, VM_REG_GUEST_RSP); esp -= bytes; if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { - sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); + sel_exception(vcpu, IDT_SS, stacksel, 1); *faultptr = 1; return (0); } if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { - vm_inject_ac(ctx, vcpu, 1); + vm_inject_ac(vcpu, 1); *faultptr = 1; return (0); } - error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, + error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE, iov, nitems(iov), faultptr); if (error || *faultptr) return (error); vm_copyout(&errcode, iov, bytes); - SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); + SETREG(vcpu, VM_REG_GUEST_RSP, esp); return (0); } /* * Evaluate return value from helper functions and potentially return to * the VM run loop. */ #define CHKERR(error,fault) \ do { \ assert((error == 0) || (error == EFAULT)); \ if (error) \ return (VMEXIT_ABORT); \ else if (fault) \ return (VMEXIT_CONTINUE); \ } while (0) int -vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit) { struct seg_desc nt; struct tss32 oldtss, newtss; struct vm_task_switch *task_switch; struct vm_guest_paging *paging, sup_paging; struct user_segment_descriptor nt_desc, ot_desc; struct iovec nt_iov[2], ot_iov[2]; uint64_t cr0, ot_base; uint32_t eip, ot_lim, access; - int error, ext, fault, minlimit, nt_type, ot_type, vcpu; + int error, ext, fault, minlimit, nt_type, ot_type; enum task_switch_reason reason; uint16_t nt_sel, ot_sel; task_switch = &vmexit->u.task_switch; nt_sel = task_switch->tsssel; ext = vmexit->u.task_switch.ext; reason = vmexit->u.task_switch.reason; paging = &vmexit->u.task_switch.paging; - vcpu = *pvcpu; assert(paging->cpu_mode == CPU_MODE_PROTECTED); /* * Calculate the instruction pointer to store in the old TSS. */ eip = vmexit->rip + vmexit->inst_length; /* * Section 4.6, "Access Rights" in Intel SDM Vol 3. 
* The following page table accesses are implicitly supervisor mode: * - accesses to GDT or LDT to load segment descriptors * - accesses to the task state segment during task switch */ sup_paging = *paging; sup_paging.cpl = 0; /* implicit supervisor mode */ /* Fetch the new TSS descriptor */ - error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc, + error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc, &fault); CHKERR(error, fault); nt = usd_to_seg_desc(&nt_desc); /* Verify the type of the new TSS */ nt_type = SEG_DESC_TYPE(nt.access); if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { - sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + sel_exception(vcpu, IDT_TS, nt_sel, ext); goto done; } /* TSS descriptor must have present bit set */ if (!SEG_DESC_PRESENT(nt.access)) { - sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); + sel_exception(vcpu, IDT_NP, nt_sel, ext); goto done; } /* * TSS must have a minimum length of 104 bytes for a 32-bit TSS and * 44 bytes for a 16-bit TSS. */ if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) minlimit = 104 - 1; else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) minlimit = 44 - 1; else minlimit = 0; assert(minlimit > 0); if (nt.limit < (unsigned int)minlimit) { - sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + sel_exception(vcpu, IDT_TS, nt_sel, ext); goto done; } /* TSS must be busy if task switch is due to IRET */ if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { - sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + sel_exception(vcpu, IDT_TS, nt_sel, ext); goto done; } /* * TSS must be available (not busy) if task switch reason is * CALL, JMP, exception or interrupt. */ if (reason != TSR_IRET && TSS_BUSY(nt_type)) { - sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); + sel_exception(vcpu, IDT_GP, nt_sel, ext); goto done; } /* Fetch the new TSS */ - error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, + error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1, PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault); CHKERR(error, fault); vm_copyin(nt_iov, &newtss, minlimit + 1); /* Get the old TSS selector from the guest's task register */ - ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); + ot_sel = GETREG(vcpu, VM_REG_GUEST_TR); if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { /* * This might happen if a task switch was attempted without * ever loading the task register with LTR. In this case the * TR would contain the values from power-on: * (sel = 0, base = 0, limit = 0xffff). 
*/ - sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); + sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext); goto done; } /* Get the old TSS base and limit from the guest's task register */ - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, + error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, &access); assert(error == 0); assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); ot_type = SEG_DESC_TYPE(access); assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); /* Fetch the old TSS descriptor */ - error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc, + error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc, &fault); CHKERR(error, fault); /* Get the old TSS */ - error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, + error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1, PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault); CHKERR(error, fault); vm_copyin(ot_iov, &oldtss, minlimit + 1); /* * Clear the busy bit in the old TSS descriptor if the task switch * due to an IRET or JMP instruction. */ if (reason == TSR_IRET || reason == TSR_JMP) { ot_desc.sd_type &= ~0x2; - error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, + error = desc_table_write(vcpu, &sup_paging, ot_sel, &ot_desc, &fault); CHKERR(error, fault); } if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { EPRINTLN("Task switch to 16-bit TSS not supported"); return (VMEXIT_ABORT); } /* Save processor state in old TSS */ - tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); + tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov); /* * If the task switch was triggered for any reason other than IRET * then set the busy bit in the new TSS descriptor. */ if (reason != TSR_IRET) { nt_desc.sd_type |= 0x2; - error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, + error = desc_table_write(vcpu, &sup_paging, nt_sel, &nt_desc, &fault); CHKERR(error, fault); } /* Update task register to point at the new TSS */ - SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); + SETREG(vcpu, VM_REG_GUEST_TR, nt_sel); /* Update the hidden descriptor state of the task register */ nt = usd_to_seg_desc(&nt_desc); - update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); + update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt); /* Set CR0.TS */ - cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); - SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); + cr0 = GETREG(vcpu, VM_REG_GUEST_CR0); + SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); /* * We are now committed to the task switch. Any exceptions encountered * after this point will be handled in the context of the new task and * the saved instruction pointer will belong to the new task. */ - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); + error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); assert(error == 0); /* Load processor state from new TSS */ error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov, &fault); CHKERR(error, fault); /* * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception * caused an error code to be generated, this error code is copied * to the stack of the new task. */ if (task_switch->errcode_valid) { assert(task_switch->ext); assert(task_switch->reason == TSR_IDT_GATE); - error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, + error = push_errcode(vcpu, &task_switch->paging, nt_type, task_switch->errcode, &fault); CHKERR(error, fault); } /* * Treatment of virtual-NMI blocking if NMI is delivered through * a task gate. 
* * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: * If the virtual NMIs VM-execution control is 1, VM entry injects * an NMI, and delivery of the NMI causes a task switch that causes * a VM exit, virtual-NMI blocking is in effect before the VM exit * commences. * * Thus, virtual-NMI blocking is in effect at the time of the task * switch VM exit. */ /* * Treatment of virtual-NMI unblocking on IRET from NMI handler task. * * Section "Changes to Instruction Behavior in VMX Non-Root Operation" * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. * This unblocking of virtual-NMI occurs even if IRET causes a fault. * * Thus, virtual-NMI blocking is cleared at the time of the task switch * VM exit. */ /* * If the task switch was triggered by an event delivered through * the IDT then extinguish the pending event from the vcpu's * exitintinfo. */ if (task_switch->reason == TSR_IDT_GATE) { - error = vm_set_intinfo(ctx, vcpu, 0); + error = vm_set_intinfo(vcpu, 0); assert(error == 0); } /* * XXX should inject debug exception if 'T' bit is 1 */ done: return (VMEXIT_CONTINUE); } diff --git a/usr.sbin/bhyve/vga.c b/usr.sbin/bhyve/vga.c index f139dc38937b..a63943efe486 100644 --- a/usr.sbin/bhyve/vga.c +++ b/usr.sbin/bhyve/vga.c @@ -1,1334 +1,1333 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "bhyvegc.h" #include "console.h" #include "inout.h" #include "mem.h" #include "vga.h" #define KB (1024UL) #define MB (1024 * 1024UL) struct vga_softc { struct mem_range mr; struct bhyvegc *gc; int gc_width; int gc_height; struct bhyvegc_image *gc_image; uint8_t *vga_ram; /* * General registers */ uint8_t vga_misc; uint8_t vga_sts1; /* * Sequencer */ struct { int seq_index; uint8_t seq_reset; uint8_t seq_clock_mode; int seq_cm_dots; uint8_t seq_map_mask; uint8_t seq_cmap_sel; int seq_cmap_pri_off; int seq_cmap_sec_off; uint8_t seq_mm; } vga_seq; /* * CRT Controller */ struct { int crtc_index; uint8_t crtc_mode_ctrl; uint8_t crtc_horiz_total; uint8_t crtc_horiz_disp_end; uint8_t crtc_start_horiz_blank; uint8_t crtc_end_horiz_blank; uint8_t crtc_start_horiz_retrace; uint8_t crtc_end_horiz_retrace; uint8_t crtc_vert_total; uint8_t crtc_overflow; uint8_t crtc_present_row_scan; uint8_t crtc_max_scan_line; uint8_t crtc_cursor_start; uint8_t crtc_cursor_on; uint8_t crtc_cursor_end; uint8_t crtc_start_addr_high; uint8_t crtc_start_addr_low; uint16_t crtc_start_addr; uint8_t crtc_cursor_loc_low; uint8_t crtc_cursor_loc_high; uint16_t crtc_cursor_loc; uint8_t crtc_vert_retrace_start; uint8_t crtc_vert_retrace_end; uint8_t crtc_vert_disp_end; uint8_t crtc_offset; uint8_t crtc_underline_loc; uint8_t crtc_start_vert_blank; uint8_t crtc_end_vert_blank; uint8_t crtc_line_compare; } vga_crtc; /* * Graphics Controller */ struct { int gc_index; uint8_t gc_set_reset; uint8_t gc_enb_set_reset; uint8_t gc_color_compare; uint8_t gc_rotate; uint8_t gc_op; uint8_t gc_read_map_sel; uint8_t gc_mode; bool gc_mode_c4; /* chain 4 */ bool gc_mode_oe; /* odd/even */ uint8_t gc_mode_rm; /* read mode */ uint8_t gc_mode_wm; /* write mode */ uint8_t gc_misc; uint8_t gc_misc_gm; /* graphics mode */ uint8_t gc_misc_mm; /* memory map */ uint8_t gc_color_dont_care; uint8_t gc_bit_mask; uint8_t gc_latch0; uint8_t gc_latch1; uint8_t gc_latch2; uint8_t gc_latch3; } vga_gc; /* * Attribute Controller */ struct { int atc_flipflop; int atc_index; uint8_t atc_palette[16]; uint8_t atc_mode; uint8_t atc_overscan_color; uint8_t atc_color_plane_enb; uint8_t atc_horiz_pixel_panning; uint8_t atc_color_select; uint8_t atc_color_select_45; uint8_t atc_color_select_67; } vga_atc; /* * DAC */ struct { uint8_t dac_state; uint8_t dac_rd_index; uint8_t dac_rd_subindex; uint8_t dac_wr_index; uint8_t dac_wr_subindex; uint8_t dac_palette[3 * 256]; uint32_t dac_palette_rgb[256]; } vga_dac; }; static bool vga_in_reset(struct vga_softc *sc) { return (((sc->vga_seq.seq_clock_mode & SEQ_CM_SO) != 0) || ((sc->vga_seq.seq_reset & SEQ_RESET_ASYNC) == 0) || ((sc->vga_seq.seq_reset & SEQ_RESET_SYNC) == 0) || ((sc->vga_crtc.crtc_mode_ctrl & CRTC_MC_TE) == 0)); } static void vga_check_size(struct bhyvegc *gc, struct vga_softc *sc) { int old_width, old_height; if (vga_in_reset(sc)) return; //old_width = sc->gc_width; //old_height = sc->gc_height; old_width = sc->gc_image->width; old_height = sc->gc_image->height; /* * Horizontal Display End: For text modes this is the number * of characters. For graphics modes this is the number of * pixels per scanlines divided by the number of pixels per * character clock. 
*/ sc->gc_width = (sc->vga_crtc.crtc_horiz_disp_end + 1) * sc->vga_seq.seq_cm_dots; sc->gc_height = (sc->vga_crtc.crtc_vert_disp_end | (((sc->vga_crtc.crtc_overflow & CRTC_OF_VDE8) >> CRTC_OF_VDE8_SHIFT) << 8) | (((sc->vga_crtc.crtc_overflow & CRTC_OF_VDE9) >> CRTC_OF_VDE9_SHIFT) << 9)) + 1; if (old_width != sc->gc_width || old_height != sc->gc_height) bhyvegc_resize(gc, sc->gc_width, sc->gc_height); } static uint32_t vga_get_pixel(struct vga_softc *sc, int x, int y) { int offset; int bit; uint8_t data; uint8_t idx; offset = (y * sc->gc_width / 8) + (x / 8); bit = 7 - (x % 8); data = (((sc->vga_ram[offset + 0 * 64*KB] >> bit) & 0x1) << 0) | (((sc->vga_ram[offset + 1 * 64*KB] >> bit) & 0x1) << 1) | (((sc->vga_ram[offset + 2 * 64*KB] >> bit) & 0x1) << 2) | (((sc->vga_ram[offset + 3 * 64*KB] >> bit) & 0x1) << 3); data &= sc->vga_atc.atc_color_plane_enb; if (sc->vga_atc.atc_mode & ATC_MC_IPS) { idx = sc->vga_atc.atc_palette[data] & 0x0f; idx |= sc->vga_atc.atc_color_select_45; } else { idx = sc->vga_atc.atc_palette[data]; } idx |= sc->vga_atc.atc_color_select_67; return (sc->vga_dac.dac_palette_rgb[idx]); } static void vga_render_graphics(struct vga_softc *sc) { int x, y; for (y = 0; y < sc->gc_height; y++) { for (x = 0; x < sc->gc_width; x++) { int offset; offset = y * sc->gc_width + x; sc->gc_image->data[offset] = vga_get_pixel(sc, x, y); } } } static uint32_t vga_get_text_pixel(struct vga_softc *sc, int x, int y) { int dots, offset, bit, font_offset; uint8_t ch, attr, font; uint8_t idx; dots = sc->vga_seq.seq_cm_dots; offset = 2 * sc->vga_crtc.crtc_start_addr; offset += (y / 16 * sc->gc_width / dots) * 2 + (x / dots) * 2; bit = 7 - (x % dots > 7 ? 7 : x % dots); ch = sc->vga_ram[offset + 0 * 64*KB]; attr = sc->vga_ram[offset + 1 * 64*KB]; if (sc->vga_crtc.crtc_cursor_on && (offset == (sc->vga_crtc.crtc_cursor_loc * 2)) && ((y % 16) >= (sc->vga_crtc.crtc_cursor_start & CRTC_CS_CS)) && ((y % 16) <= (sc->vga_crtc.crtc_cursor_end & CRTC_CE_CE))) { idx = sc->vga_atc.atc_palette[attr & 0xf]; return (sc->vga_dac.dac_palette_rgb[idx]); } if ((sc->vga_seq.seq_mm & SEQ_MM_EM) && sc->vga_seq.seq_cmap_pri_off != sc->vga_seq.seq_cmap_sec_off) { if (attr & 0x8) font_offset = sc->vga_seq.seq_cmap_pri_off + (ch << 5) + y % 16; else font_offset = sc->vga_seq.seq_cmap_sec_off + (ch << 5) + y % 16; attr &= ~0x8; } else { font_offset = (ch << 5) + y % 16; } font = sc->vga_ram[font_offset + 2 * 64*KB]; if (font & (1 << bit)) idx = sc->vga_atc.atc_palette[attr & 0xf]; else idx = sc->vga_atc.atc_palette[attr >> 4]; return (sc->vga_dac.dac_palette_rgb[idx]); } static void vga_render_text(struct vga_softc *sc) { int x, y; for (y = 0; y < sc->gc_height; y++) { for (x = 0; x < sc->gc_width; x++) { int offset; offset = y * sc->gc_width + x; sc->gc_image->data[offset] = vga_get_text_pixel(sc, x, y); } } } void vga_render(struct bhyvegc *gc, void *arg) { struct vga_softc *sc = arg; vga_check_size(gc, sc); if (vga_in_reset(sc)) { memset(sc->gc_image->data, 0, sc->gc_image->width * sc->gc_image->height * sizeof (uint32_t)); return; } if (sc->vga_gc.gc_misc_gm && (sc->vga_atc.atc_mode & ATC_MC_GA)) vga_render_graphics(sc); else vga_render_text(sc); } static uint64_t -vga_mem_rd_handler(struct vmctx *ctx __unused, uint64_t addr, void *arg1) +vga_mem_rd_handler(uint64_t addr, void *arg1) { struct vga_softc *sc = arg1; uint8_t map_sel; int offset; offset = addr; switch (sc->vga_gc.gc_misc_mm) { case 0x0: /* * extended mode: base 0xa0000 size 128k */ offset -=0xa0000; offset &= (128 * KB - 1); break; case 0x1: /* * EGA/VGA 
mode: base 0xa0000 size 64k */ offset -=0xa0000; offset &= (64 * KB - 1); break; case 0x2: /* * monochrome text mode: base 0xb0000 size 32kb */ assert(0); case 0x3: /* * color text mode and CGA: base 0xb8000 size 32kb */ offset -=0xb8000; offset &= (32 * KB - 1); break; } /* Fill latches. */ sc->vga_gc.gc_latch0 = sc->vga_ram[offset + 0*64*KB]; sc->vga_gc.gc_latch1 = sc->vga_ram[offset + 1*64*KB]; sc->vga_gc.gc_latch2 = sc->vga_ram[offset + 2*64*KB]; sc->vga_gc.gc_latch3 = sc->vga_ram[offset + 3*64*KB]; if (sc->vga_gc.gc_mode_rm) { /* read mode 1 */ assert(0); } map_sel = sc->vga_gc.gc_read_map_sel; if (sc->vga_gc.gc_mode_oe) { map_sel |= (offset & 1); offset &= ~1; } /* read mode 0: return the byte from the selected plane. */ offset += map_sel * 64*KB; return (sc->vga_ram[offset]); } static void -vga_mem_wr_handler(struct vmctx *ctx __unused, uint64_t addr, uint8_t val, - void *arg1) +vga_mem_wr_handler(uint64_t addr, uint8_t val, void *arg1) { struct vga_softc *sc = arg1; uint8_t c0, c1, c2, c3; uint8_t m0, m1, m2, m3; uint8_t set_reset; uint8_t enb_set_reset; uint8_t mask; int offset; offset = addr; switch (sc->vga_gc.gc_misc_mm) { case 0x0: /* * extended mode: base 0xa0000 size 128kb */ offset -=0xa0000; offset &= (128 * KB - 1); break; case 0x1: /* * EGA/VGA mode: base 0xa0000 size 64kb */ offset -=0xa0000; offset &= (64 * KB - 1); break; case 0x2: /* * monochrome text mode: base 0xb0000 size 32kb */ assert(0); case 0x3: /* * color text mode and CGA: base 0xb8000 size 32kb */ offset -=0xb8000; offset &= (32 * KB - 1); break; } set_reset = sc->vga_gc.gc_set_reset; enb_set_reset = sc->vga_gc.gc_enb_set_reset; c0 = sc->vga_gc.gc_latch0; c1 = sc->vga_gc.gc_latch1; c2 = sc->vga_gc.gc_latch2; c3 = sc->vga_gc.gc_latch3; switch (sc->vga_gc.gc_mode_wm) { case 0: /* write mode 0 */ mask = sc->vga_gc.gc_bit_mask; val = (val >> sc->vga_gc.gc_rotate) | (val << (8 - sc->vga_gc.gc_rotate)); switch (sc->vga_gc.gc_op) { case 0x00: /* replace */ m0 = (set_reset & 1) ? mask : 0x00; m1 = (set_reset & 2) ? mask : 0x00; m2 = (set_reset & 4) ? mask : 0x00; m3 = (set_reset & 8) ? mask : 0x00; c0 = (enb_set_reset & 1) ? (c0 & ~mask) : (val & mask); c1 = (enb_set_reset & 2) ? (c1 & ~mask) : (val & mask); c2 = (enb_set_reset & 4) ? (c2 & ~mask) : (val & mask); c3 = (enb_set_reset & 8) ? (c3 & ~mask) : (val & mask); c0 |= m0; c1 |= m1; c2 |= m2; c3 |= m3; break; case 0x08: /* AND */ m0 = set_reset & 1 ? 0xff : ~mask; m1 = set_reset & 2 ? 0xff : ~mask; m2 = set_reset & 4 ? 0xff : ~mask; m3 = set_reset & 8 ? 0xff : ~mask; c0 = enb_set_reset & 1 ? c0 & m0 : val & m0; c1 = enb_set_reset & 2 ? c1 & m1 : val & m1; c2 = enb_set_reset & 4 ? c2 & m2 : val & m2; c3 = enb_set_reset & 8 ? c3 & m3 : val & m3; break; case 0x10: /* OR */ m0 = set_reset & 1 ? mask : 0x00; m1 = set_reset & 2 ? mask : 0x00; m2 = set_reset & 4 ? mask : 0x00; m3 = set_reset & 8 ? mask : 0x00; c0 = enb_set_reset & 1 ? c0 | m0 : val | m0; c1 = enb_set_reset & 2 ? c1 | m1 : val | m1; c2 = enb_set_reset & 4 ? c2 | m2 : val | m2; c3 = enb_set_reset & 8 ? c3 | m3 : val | m3; break; case 0x18: /* XOR */ m0 = set_reset & 1 ? mask : 0x00; m1 = set_reset & 2 ? mask : 0x00; m2 = set_reset & 4 ? mask : 0x00; m3 = set_reset & 8 ? mask : 0x00; c0 = enb_set_reset & 1 ? c0 ^ m0 : val ^ m0; c1 = enb_set_reset & 2 ? c1 ^ m1 : val ^ m1; c2 = enb_set_reset & 4 ? c2 ^ m2 : val ^ m2; c3 = enb_set_reset & 8 ? 
c3 ^ m3 : val ^ m3; break; } break; case 1: /* write mode 1 */ break; case 2: /* write mode 2 */ mask = sc->vga_gc.gc_bit_mask; switch (sc->vga_gc.gc_op) { case 0x00: /* replace */ m0 = (val & 1 ? 0xff : 0x00) & mask; m1 = (val & 2 ? 0xff : 0x00) & mask; m2 = (val & 4 ? 0xff : 0x00) & mask; m3 = (val & 8 ? 0xff : 0x00) & mask; c0 &= ~mask; c1 &= ~mask; c2 &= ~mask; c3 &= ~mask; c0 |= m0; c1 |= m1; c2 |= m2; c3 |= m3; break; case 0x08: /* AND */ m0 = (val & 1 ? 0xff : 0x00) | ~mask; m1 = (val & 2 ? 0xff : 0x00) | ~mask; m2 = (val & 4 ? 0xff : 0x00) | ~mask; m3 = (val & 8 ? 0xff : 0x00) | ~mask; c0 &= m0; c1 &= m1; c2 &= m2; c3 &= m3; break; case 0x10: /* OR */ m0 = (val & 1 ? 0xff : 0x00) & mask; m1 = (val & 2 ? 0xff : 0x00) & mask; m2 = (val & 4 ? 0xff : 0x00) & mask; m3 = (val & 8 ? 0xff : 0x00) & mask; c0 |= m0; c1 |= m1; c2 |= m2; c3 |= m3; break; case 0x18: /* XOR */ m0 = (val & 1 ? 0xff : 0x00) & mask; m1 = (val & 2 ? 0xff : 0x00) & mask; m2 = (val & 4 ? 0xff : 0x00) & mask; m3 = (val & 8 ? 0xff : 0x00) & mask; c0 ^= m0; c1 ^= m1; c2 ^= m2; c3 ^= m3; break; } break; case 3: /* write mode 3 */ mask = sc->vga_gc.gc_bit_mask & val; val = (val >> sc->vga_gc.gc_rotate) | (val << (8 - sc->vga_gc.gc_rotate)); switch (sc->vga_gc.gc_op) { case 0x00: /* replace */ m0 = (set_reset & 1 ? 0xff : 0x00) & mask; m1 = (set_reset & 2 ? 0xff : 0x00) & mask; m2 = (set_reset & 4 ? 0xff : 0x00) & mask; m3 = (set_reset & 8 ? 0xff : 0x00) & mask; c0 &= ~mask; c1 &= ~mask; c2 &= ~mask; c3 &= ~mask; c0 |= m0; c1 |= m1; c2 |= m2; c3 |= m3; break; case 0x08: /* AND */ m0 = (set_reset & 1 ? 0xff : 0x00) | ~mask; m1 = (set_reset & 2 ? 0xff : 0x00) | ~mask; m2 = (set_reset & 4 ? 0xff : 0x00) | ~mask; m3 = (set_reset & 8 ? 0xff : 0x00) | ~mask; c0 &= m0; c1 &= m1; c2 &= m2; c3 &= m3; break; case 0x10: /* OR */ m0 = (set_reset & 1 ? 0xff : 0x00) & mask; m1 = (set_reset & 2 ? 0xff : 0x00) & mask; m2 = (set_reset & 4 ? 0xff : 0x00) & mask; m3 = (set_reset & 8 ? 0xff : 0x00) & mask; c0 |= m0; c1 |= m1; c2 |= m2; c3 |= m3; break; case 0x18: /* XOR */ m0 = (set_reset & 1 ? 0xff : 0x00) & mask; m1 = (set_reset & 2 ? 0xff : 0x00) & mask; m2 = (set_reset & 4 ? 0xff : 0x00) & mask; m3 = (set_reset & 8 ? 
0xff : 0x00) & mask; c0 ^= m0; c1 ^= m1; c2 ^= m2; c3 ^= m3; break; } break; } if (sc->vga_gc.gc_mode_oe) { if (offset & 1) { offset &= ~1; if (sc->vga_seq.seq_map_mask & 2) sc->vga_ram[offset + 1*64*KB] = c1; if (sc->vga_seq.seq_map_mask & 8) sc->vga_ram[offset + 3*64*KB] = c3; } else { if (sc->vga_seq.seq_map_mask & 1) sc->vga_ram[offset + 0*64*KB] = c0; if (sc->vga_seq.seq_map_mask & 4) sc->vga_ram[offset + 2*64*KB] = c2; } } else { if (sc->vga_seq.seq_map_mask & 1) sc->vga_ram[offset + 0*64*KB] = c0; if (sc->vga_seq.seq_map_mask & 2) sc->vga_ram[offset + 1*64*KB] = c1; if (sc->vga_seq.seq_map_mask & 4) sc->vga_ram[offset + 2*64*KB] = c2; if (sc->vga_seq.seq_map_mask & 8) sc->vga_ram[offset + 3*64*KB] = c3; } } static int -vga_mem_handler(struct vmctx *ctx, int vcpu __unused, int dir, uint64_t addr, - int size, uint64_t *val, void *arg1, long arg2 __unused) +vga_mem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int size, + uint64_t *val, void *arg1, long arg2 __unused) { if (dir == MEM_F_WRITE) { switch (size) { case 1: - vga_mem_wr_handler(ctx, addr, *val, arg1); + vga_mem_wr_handler(addr, *val, arg1); break; case 2: - vga_mem_wr_handler(ctx, addr, *val, arg1); - vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); + vga_mem_wr_handler(addr, *val, arg1); + vga_mem_wr_handler(addr + 1, *val >> 8, arg1); break; case 4: - vga_mem_wr_handler(ctx, addr, *val, arg1); - vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); - vga_mem_wr_handler(ctx, addr + 2, *val >> 16, arg1); - vga_mem_wr_handler(ctx, addr + 3, *val >> 24, arg1); + vga_mem_wr_handler(addr, *val, arg1); + vga_mem_wr_handler(addr + 1, *val >> 8, arg1); + vga_mem_wr_handler(addr + 2, *val >> 16, arg1); + vga_mem_wr_handler(addr + 3, *val >> 24, arg1); break; case 8: - vga_mem_wr_handler(ctx, addr, *val, arg1); - vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1); - vga_mem_wr_handler(ctx, addr + 2, *val >> 16, arg1); - vga_mem_wr_handler(ctx, addr + 3, *val >> 24, arg1); - vga_mem_wr_handler(ctx, addr + 4, *val >> 32, arg1); - vga_mem_wr_handler(ctx, addr + 5, *val >> 40, arg1); - vga_mem_wr_handler(ctx, addr + 6, *val >> 48, arg1); - vga_mem_wr_handler(ctx, addr + 7, *val >> 56, arg1); + vga_mem_wr_handler(addr, *val, arg1); + vga_mem_wr_handler(addr + 1, *val >> 8, arg1); + vga_mem_wr_handler(addr + 2, *val >> 16, arg1); + vga_mem_wr_handler(addr + 3, *val >> 24, arg1); + vga_mem_wr_handler(addr + 4, *val >> 32, arg1); + vga_mem_wr_handler(addr + 5, *val >> 40, arg1); + vga_mem_wr_handler(addr + 6, *val >> 48, arg1); + vga_mem_wr_handler(addr + 7, *val >> 56, arg1); break; } } else { switch (size) { case 1: - *val = vga_mem_rd_handler(ctx, addr, arg1); + *val = vga_mem_rd_handler(addr, arg1); break; case 2: - *val = vga_mem_rd_handler(ctx, addr, arg1); - *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; + *val = vga_mem_rd_handler(addr, arg1); + *val |= vga_mem_rd_handler(addr + 1, arg1) << 8; break; case 4: - *val = vga_mem_rd_handler(ctx, addr, arg1); - *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; - *val |= vga_mem_rd_handler(ctx, addr + 2, arg1) << 16; - *val |= vga_mem_rd_handler(ctx, addr + 3, arg1) << 24; + *val = vga_mem_rd_handler(addr, arg1); + *val |= vga_mem_rd_handler(addr + 1, arg1) << 8; + *val |= vga_mem_rd_handler(addr + 2, arg1) << 16; + *val |= vga_mem_rd_handler(addr + 3, arg1) << 24; break; case 8: - *val = vga_mem_rd_handler(ctx, addr, arg1); - *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8; - *val |= vga_mem_rd_handler(ctx, addr + 2, arg1) << 16; - *val |= 
vga_mem_rd_handler(ctx, addr + 3, arg1) << 24; - *val |= vga_mem_rd_handler(ctx, addr + 4, arg1) << 32; - *val |= vga_mem_rd_handler(ctx, addr + 5, arg1) << 40; - *val |= vga_mem_rd_handler(ctx, addr + 6, arg1) << 48; - *val |= vga_mem_rd_handler(ctx, addr + 7, arg1) << 56; + *val = vga_mem_rd_handler(addr, arg1); + *val |= vga_mem_rd_handler(addr + 1, arg1) << 8; + *val |= vga_mem_rd_handler(addr + 2, arg1) << 16; + *val |= vga_mem_rd_handler(addr + 3, arg1) << 24; + *val |= vga_mem_rd_handler(addr + 4, arg1) << 32; + *val |= vga_mem_rd_handler(addr + 5, arg1) << 40; + *val |= vga_mem_rd_handler(addr + 6, arg1) << 48; + *val |= vga_mem_rd_handler(addr + 7, arg1) << 56; break; } } return (0); } static int vga_port_in_handler(struct vmctx *ctx __unused, int in __unused, int port, int bytes __unused, uint8_t *val, void *arg) { struct vga_softc *sc = arg; switch (port) { case CRTC_IDX_MONO_PORT: case CRTC_IDX_COLOR_PORT: *val = sc->vga_crtc.crtc_index; break; case CRTC_DATA_MONO_PORT: case CRTC_DATA_COLOR_PORT: switch (sc->vga_crtc.crtc_index) { case CRTC_HORIZ_TOTAL: *val = sc->vga_crtc.crtc_horiz_total; break; case CRTC_HORIZ_DISP_END: *val = sc->vga_crtc.crtc_horiz_disp_end; break; case CRTC_START_HORIZ_BLANK: *val = sc->vga_crtc.crtc_start_horiz_blank; break; case CRTC_END_HORIZ_BLANK: *val = sc->vga_crtc.crtc_end_horiz_blank; break; case CRTC_START_HORIZ_RETRACE: *val = sc->vga_crtc.crtc_start_horiz_retrace; break; case CRTC_END_HORIZ_RETRACE: *val = sc->vga_crtc.crtc_end_horiz_retrace; break; case CRTC_VERT_TOTAL: *val = sc->vga_crtc.crtc_vert_total; break; case CRTC_OVERFLOW: *val = sc->vga_crtc.crtc_overflow; break; case CRTC_PRESET_ROW_SCAN: *val = sc->vga_crtc.crtc_present_row_scan; break; case CRTC_MAX_SCAN_LINE: *val = sc->vga_crtc.crtc_max_scan_line; break; case CRTC_CURSOR_START: *val = sc->vga_crtc.crtc_cursor_start; break; case CRTC_CURSOR_END: *val = sc->vga_crtc.crtc_cursor_end; break; case CRTC_START_ADDR_HIGH: *val = sc->vga_crtc.crtc_start_addr_high; break; case CRTC_START_ADDR_LOW: *val = sc->vga_crtc.crtc_start_addr_low; break; case CRTC_CURSOR_LOC_HIGH: *val = sc->vga_crtc.crtc_cursor_loc_high; break; case CRTC_CURSOR_LOC_LOW: *val = sc->vga_crtc.crtc_cursor_loc_low; break; case CRTC_VERT_RETRACE_START: *val = sc->vga_crtc.crtc_vert_retrace_start; break; case CRTC_VERT_RETRACE_END: *val = sc->vga_crtc.crtc_vert_retrace_end; break; case CRTC_VERT_DISP_END: *val = sc->vga_crtc.crtc_vert_disp_end; break; case CRTC_OFFSET: *val = sc->vga_crtc.crtc_offset; break; case CRTC_UNDERLINE_LOC: *val = sc->vga_crtc.crtc_underline_loc; break; case CRTC_START_VERT_BLANK: *val = sc->vga_crtc.crtc_start_vert_blank; break; case CRTC_END_VERT_BLANK: *val = sc->vga_crtc.crtc_end_vert_blank; break; case CRTC_MODE_CONTROL: *val = sc->vga_crtc.crtc_mode_ctrl; break; case CRTC_LINE_COMPARE: *val = sc->vga_crtc.crtc_line_compare; break; default: //printf("XXX VGA CRTC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index); assert(0); break; } break; case ATC_IDX_PORT: *val = sc->vga_atc.atc_index; break; case ATC_DATA_PORT: switch (sc->vga_atc.atc_index) { case ATC_PALETTE0 ... 
ATC_PALETTE15: *val = sc->vga_atc.atc_palette[sc->vga_atc.atc_index]; break; case ATC_MODE_CONTROL: *val = sc->vga_atc.atc_mode; break; case ATC_OVERSCAN_COLOR: *val = sc->vga_atc.atc_overscan_color; break; case ATC_COLOR_PLANE_ENABLE: *val = sc->vga_atc.atc_color_plane_enb; break; case ATC_HORIZ_PIXEL_PANNING: *val = sc->vga_atc.atc_horiz_pixel_panning; break; case ATC_COLOR_SELECT: *val = sc->vga_atc.atc_color_select; break; default: //printf("XXX VGA ATC inb 0x%04x at index %d\n", port , sc->vga_atc.atc_index); assert(0); break; } break; case SEQ_IDX_PORT: *val = sc->vga_seq.seq_index; break; case SEQ_DATA_PORT: switch (sc->vga_seq.seq_index) { case SEQ_RESET: *val = sc->vga_seq.seq_reset; break; case SEQ_CLOCKING_MODE: *val = sc->vga_seq.seq_clock_mode; break; case SEQ_MAP_MASK: *val = sc->vga_seq.seq_map_mask; break; case SEQ_CHAR_MAP_SELECT: *val = sc->vga_seq.seq_cmap_sel; break; case SEQ_MEMORY_MODE: *val = sc->vga_seq.seq_mm; break; default: //printf("XXX VGA SEQ: inb 0x%04x at index %d\n", port, sc->vga_seq.seq_index); assert(0); break; } break; case DAC_DATA_PORT: *val = sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_rd_index + sc->vga_dac.dac_rd_subindex]; sc->vga_dac.dac_rd_subindex++; if (sc->vga_dac.dac_rd_subindex == 3) { sc->vga_dac.dac_rd_index++; sc->vga_dac.dac_rd_subindex = 0; } break; case GC_IDX_PORT: *val = sc->vga_gc.gc_index; break; case GC_DATA_PORT: switch (sc->vga_gc.gc_index) { case GC_SET_RESET: *val = sc->vga_gc.gc_set_reset; break; case GC_ENABLE_SET_RESET: *val = sc->vga_gc.gc_enb_set_reset; break; case GC_COLOR_COMPARE: *val = sc->vga_gc.gc_color_compare; break; case GC_DATA_ROTATE: *val = sc->vga_gc.gc_rotate; break; case GC_READ_MAP_SELECT: *val = sc->vga_gc.gc_read_map_sel; break; case GC_MODE: *val = sc->vga_gc.gc_mode; break; case GC_MISCELLANEOUS: *val = sc->vga_gc.gc_misc; break; case GC_COLOR_DONT_CARE: *val = sc->vga_gc.gc_color_dont_care; break; case GC_BIT_MASK: *val = sc->vga_gc.gc_bit_mask; break; default: //printf("XXX VGA GC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index); assert(0); break; } break; case GEN_MISC_OUTPUT_PORT: *val = sc->vga_misc; break; case GEN_INPUT_STS0_PORT: assert(0); break; case GEN_INPUT_STS1_MONO_PORT: case GEN_INPUT_STS1_COLOR_PORT: sc->vga_atc.atc_flipflop = 0; sc->vga_sts1 = GEN_IS1_VR | GEN_IS1_DE; //sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE); *val = sc->vga_sts1; break; case GEN_FEATURE_CTRL_PORT: // OpenBSD calls this with bytes = 1 //assert(0); *val = 0; break; case 0x3c3: *val = 0; break; default: printf("XXX vga_port_in_handler() unhandled port 0x%x\n", port); //assert(0); return (-1); } return (0); } static int vga_port_out_handler(struct vmctx *ctx __unused, int in __unused, int port, int bytes __unused, uint8_t val, void *arg) { struct vga_softc *sc = arg; switch (port) { case CRTC_IDX_MONO_PORT: case CRTC_IDX_COLOR_PORT: sc->vga_crtc.crtc_index = val; break; case CRTC_DATA_MONO_PORT: case CRTC_DATA_COLOR_PORT: switch (sc->vga_crtc.crtc_index) { case CRTC_HORIZ_TOTAL: sc->vga_crtc.crtc_horiz_total = val; break; case CRTC_HORIZ_DISP_END: sc->vga_crtc.crtc_horiz_disp_end = val; break; case CRTC_START_HORIZ_BLANK: sc->vga_crtc.crtc_start_horiz_blank = val; break; case CRTC_END_HORIZ_BLANK: sc->vga_crtc.crtc_end_horiz_blank = val; break; case CRTC_START_HORIZ_RETRACE: sc->vga_crtc.crtc_start_horiz_retrace = val; break; case CRTC_END_HORIZ_RETRACE: sc->vga_crtc.crtc_end_horiz_retrace = val; break; case CRTC_VERT_TOTAL: sc->vga_crtc.crtc_vert_total = val; break; case CRTC_OVERFLOW: 
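/*
 * The overflow register packs the high-order bits of several vertical
 * timing values; in particular bits 1 and 6 carry bits 8 and 9 of the
 * vertical display end, which vga_check_size() folds back in when it
 * computes the display height.
 */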
sc->vga_crtc.crtc_overflow = val; break; case CRTC_PRESET_ROW_SCAN: sc->vga_crtc.crtc_present_row_scan = val; break; case CRTC_MAX_SCAN_LINE: sc->vga_crtc.crtc_max_scan_line = val; break; case CRTC_CURSOR_START: sc->vga_crtc.crtc_cursor_start = val; sc->vga_crtc.crtc_cursor_on = (val & CRTC_CS_CO) == 0; break; case CRTC_CURSOR_END: sc->vga_crtc.crtc_cursor_end = val; break; case CRTC_START_ADDR_HIGH: sc->vga_crtc.crtc_start_addr_high = val; sc->vga_crtc.crtc_start_addr &= 0x00ff; sc->vga_crtc.crtc_start_addr |= (val << 8); break; case CRTC_START_ADDR_LOW: sc->vga_crtc.crtc_start_addr_low = val; sc->vga_crtc.crtc_start_addr &= 0xff00; sc->vga_crtc.crtc_start_addr |= (val & 0xff); break; case CRTC_CURSOR_LOC_HIGH: sc->vga_crtc.crtc_cursor_loc_high = val; sc->vga_crtc.crtc_cursor_loc &= 0x00ff; sc->vga_crtc.crtc_cursor_loc |= (val << 8); break; case CRTC_CURSOR_LOC_LOW: sc->vga_crtc.crtc_cursor_loc_low = val; sc->vga_crtc.crtc_cursor_loc &= 0xff00; sc->vga_crtc.crtc_cursor_loc |= (val & 0xff); break; case CRTC_VERT_RETRACE_START: sc->vga_crtc.crtc_vert_retrace_start = val; break; case CRTC_VERT_RETRACE_END: sc->vga_crtc.crtc_vert_retrace_end = val; break; case CRTC_VERT_DISP_END: sc->vga_crtc.crtc_vert_disp_end = val; break; case CRTC_OFFSET: sc->vga_crtc.crtc_offset = val; break; case CRTC_UNDERLINE_LOC: sc->vga_crtc.crtc_underline_loc = val; break; case CRTC_START_VERT_BLANK: sc->vga_crtc.crtc_start_vert_blank = val; break; case CRTC_END_VERT_BLANK: sc->vga_crtc.crtc_end_vert_blank = val; break; case CRTC_MODE_CONTROL: sc->vga_crtc.crtc_mode_ctrl = val; break; case CRTC_LINE_COMPARE: sc->vga_crtc.crtc_line_compare = val; break; default: //printf("XXX VGA CRTC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_crtc.crtc_index); assert(0); break; } break; case ATC_IDX_PORT: if (sc->vga_atc.atc_flipflop == 0) { if (sc->vga_atc.atc_index & 0x20) assert(0); sc->vga_atc.atc_index = val & ATC_IDX_MASK; } else { switch (sc->vga_atc.atc_index) { case ATC_PALETTE0 ... ATC_PALETTE15: sc->vga_atc.atc_palette[sc->vga_atc.atc_index] = val & 0x3f; break; case ATC_MODE_CONTROL: sc->vga_atc.atc_mode = val; break; case ATC_OVERSCAN_COLOR: sc->vga_atc.atc_overscan_color = val; break; case ATC_COLOR_PLANE_ENABLE: sc->vga_atc.atc_color_plane_enb = val; break; case ATC_HORIZ_PIXEL_PANNING: sc->vga_atc.atc_horiz_pixel_panning = val; break; case ATC_COLOR_SELECT: sc->vga_atc.atc_color_select = val; sc->vga_atc.atc_color_select_45 = (val & ATC_CS_C45) << 4; sc->vga_atc.atc_color_select_67 = ((val & ATC_CS_C67) >> 2) << 6; break; default: //printf("XXX VGA ATC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_atc.atc_index); assert(0); break; } } sc->vga_atc.atc_flipflop ^= 1; break; case ATC_DATA_PORT: break; case SEQ_IDX_PORT: sc->vga_seq.seq_index = val & 0x1f; break; case SEQ_DATA_PORT: switch (sc->vga_seq.seq_index) { case SEQ_RESET: sc->vga_seq.seq_reset = val; break; case SEQ_CLOCKING_MODE: sc->vga_seq.seq_clock_mode = val; sc->vga_seq.seq_cm_dots = (val & SEQ_CM_89) ? 
8 : 9; break; case SEQ_MAP_MASK: sc->vga_seq.seq_map_mask = val; break; case SEQ_CHAR_MAP_SELECT: sc->vga_seq.seq_cmap_sel = val; sc->vga_seq.seq_cmap_pri_off = ((((val & SEQ_CMS_SA) >> SEQ_CMS_SA_SHIFT) * 2) + ((val & SEQ_CMS_SAH) >> SEQ_CMS_SAH_SHIFT)) * 8 * KB; sc->vga_seq.seq_cmap_sec_off = ((((val & SEQ_CMS_SB) >> SEQ_CMS_SB_SHIFT) * 2) + ((val & SEQ_CMS_SBH) >> SEQ_CMS_SBH_SHIFT)) * 8 * KB; break; case SEQ_MEMORY_MODE: sc->vga_seq.seq_mm = val; /* Windows queries Chain4 */ //assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0); break; default: //printf("XXX VGA SEQ: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_seq.seq_index); assert(0); break; } break; case DAC_MASK: break; case DAC_IDX_RD_PORT: sc->vga_dac.dac_rd_index = val; sc->vga_dac.dac_rd_subindex = 0; break; case DAC_IDX_WR_PORT: sc->vga_dac.dac_wr_index = val; sc->vga_dac.dac_wr_subindex = 0; break; case DAC_DATA_PORT: sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_wr_index + sc->vga_dac.dac_wr_subindex] = val; sc->vga_dac.dac_wr_subindex++; if (sc->vga_dac.dac_wr_subindex == 3) { sc->vga_dac.dac_palette_rgb[sc->vga_dac.dac_wr_index] = ((((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] << 2) | ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1) << 1) | (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1)) << 16) | (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] << 2) | ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1) << 1) | (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1)) << 8) | (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] << 2) | ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1) << 1) | (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1)) << 0)); sc->vga_dac.dac_wr_index++; sc->vga_dac.dac_wr_subindex = 0; } break; case GC_IDX_PORT: sc->vga_gc.gc_index = val; break; case GC_DATA_PORT: switch (sc->vga_gc.gc_index) { case GC_SET_RESET: sc->vga_gc.gc_set_reset = val; break; case GC_ENABLE_SET_RESET: sc->vga_gc.gc_enb_set_reset = val; break; case GC_COLOR_COMPARE: sc->vga_gc.gc_color_compare = val; break; case GC_DATA_ROTATE: sc->vga_gc.gc_rotate = val; sc->vga_gc.gc_op = (val >> 3) & 0x3; break; case GC_READ_MAP_SELECT: sc->vga_gc.gc_read_map_sel = val; break; case GC_MODE: sc->vga_gc.gc_mode = val; sc->vga_gc.gc_mode_c4 = (val & GC_MODE_C4) != 0; assert(!sc->vga_gc.gc_mode_c4); sc->vga_gc.gc_mode_oe = (val & GC_MODE_OE) != 0; sc->vga_gc.gc_mode_rm = (val >> 3) & 0x1; sc->vga_gc.gc_mode_wm = val & 0x3; if (sc->gc_image) sc->gc_image->vgamode = 1; break; case GC_MISCELLANEOUS: sc->vga_gc.gc_misc = val; sc->vga_gc.gc_misc_gm = val & GC_MISC_GM; sc->vga_gc.gc_misc_mm = (val & GC_MISC_MM) >> GC_MISC_MM_SHIFT; break; case GC_COLOR_DONT_CARE: sc->vga_gc.gc_color_dont_care = val; break; case GC_BIT_MASK: sc->vga_gc.gc_bit_mask = val; break; default: //printf("XXX VGA GC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_gc.gc_index); assert(0); break; } break; case GEN_INPUT_STS0_PORT: /* write to Miscellaneous Output Register */ sc->vga_misc = val; break; case GEN_INPUT_STS1_MONO_PORT: case GEN_INPUT_STS1_COLOR_PORT: /* write to Feature Control Register */ break; // case 0x3c3: // break; default: printf("XXX vga_port_out_handler() unhandled port 0x%x, val 0x%x\n", port, val); //assert(0); return (-1); } return (0); } static int vga_port_handler(struct vmctx *ctx, int in, int port, int bytes, uint32_t *eax, void *arg) { uint8_t val; int error; switch (bytes) { case 1: if (in) { *eax &= ~0xff; error = 
vga_port_in_handler(ctx, in, port, 1, &val, arg); if (!error) { *eax |= val & 0xff; } } else { val = *eax & 0xff; error = vga_port_out_handler(ctx, in, port, 1, val, arg); } break; case 2: if (in) { *eax &= ~0xffff; error = vga_port_in_handler(ctx, in, port, 1, &val, arg); if (!error) { *eax |= val & 0xff; } error = vga_port_in_handler(ctx, in, port + 1, 1, &val, arg); if (!error) { *eax |= (val & 0xff) << 8; } } else { val = *eax & 0xff; error = vga_port_out_handler(ctx, in, port, 1, val, arg); val = (*eax >> 8) & 0xff; error =vga_port_out_handler(ctx, in, port + 1, 1, val, arg); } break; default: assert(0); return (-1); } return (error); } void * vga_init(int io_only) { struct inout_port iop; struct vga_softc *sc; int port, error; sc = calloc(1, sizeof(struct vga_softc)); bzero(&iop, sizeof(struct inout_port)); iop.name = "VGA"; for (port = VGA_IOPORT_START; port <= VGA_IOPORT_END; port++) { iop.port = port; iop.size = 1; iop.flags = IOPORT_F_INOUT; iop.handler = vga_port_handler; iop.arg = sc; error = register_inout(&iop); assert(error == 0); } sc->gc_image = console_get_image(); /* only handle io ports; vga graphics is disabled */ if (io_only) return(sc); sc->mr.name = "VGA memory"; sc->mr.flags = MEM_F_RW; sc->mr.base = 640 * KB; sc->mr.size = 128 * KB; sc->mr.handler = vga_mem_handler; sc->mr.arg1 = sc; error = register_mem_fallback(&sc->mr); assert(error == 0); sc->vga_ram = malloc(256 * KB); memset(sc->vga_ram, 0, 256 * KB); { static uint8_t palette[] = { 0x00,0x00,0x00, 0x00,0x00,0x2a, 0x00,0x2a,0x00, 0x00,0x2a,0x2a, 0x2a,0x00,0x00, 0x2a,0x00,0x2a, 0x2a,0x2a,0x00, 0x2a,0x2a,0x2a, 0x00,0x00,0x15, 0x00,0x00,0x3f, 0x00,0x2a,0x15, 0x00,0x2a,0x3f, 0x2a,0x00,0x15, 0x2a,0x00,0x3f, 0x2a,0x2a,0x15, 0x2a,0x2a,0x3f, }; int i; memcpy(sc->vga_dac.dac_palette, palette, 16 * 3 * sizeof (uint8_t)); for (i = 0; i < 16; i++) { sc->vga_dac.dac_palette_rgb[i] = ((((sc->vga_dac.dac_palette[3*i + 0] << 2) | ((sc->vga_dac.dac_palette[3*i + 0] & 0x1) << 1) | (sc->vga_dac.dac_palette[3*i + 0] & 0x1)) << 16) | (((sc->vga_dac.dac_palette[3*i + 1] << 2) | ((sc->vga_dac.dac_palette[3*i + 1] & 0x1) << 1) | (sc->vga_dac.dac_palette[3*i + 1] & 0x1)) << 8) | (((sc->vga_dac.dac_palette[3*i + 2] << 2) | ((sc->vga_dac.dac_palette[3*i + 2] & 0x1) << 1) | (sc->vga_dac.dac_palette[3*i + 2] & 0x1)) << 0)); } } return (sc); } diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c index 5f735152e92c..8672752a0cde 100644 --- a/usr.sbin/bhyve/xmsr.c +++ b/usr.sbin/bhyve/xmsr.c @@ -1,248 +1,246 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "debug.h" #include "xmsr.h" static int cpu_vendor_intel, cpu_vendor_amd, cpu_vendor_hygon; int -emulate_wrmsr(struct vmctx *ctx __unused, int vcpu __unused, uint32_t num, - uint64_t val __unused) +emulate_wrmsr(struct vcpu *vcpu __unused, uint32_t num, uint64_t val __unused) { if (cpu_vendor_intel) { switch (num) { case 0xd04: /* Sandy Bridge uncore PMCs */ case 0xc24: return (0); case MSR_BIOS_UPDT_TRIG: return (0); case MSR_BIOS_SIGN: return (0); default: break; } } else if (cpu_vendor_amd || cpu_vendor_hygon) { switch (num) { case MSR_HWCR: /* * Ignore writes to hardware configuration MSR. */ return (0); case MSR_NB_CFG1: case MSR_LS_CFG: case MSR_IC_CFG: return (0); /* Ignore writes */ case MSR_PERFEVSEL0: case MSR_PERFEVSEL1: case MSR_PERFEVSEL2: case MSR_PERFEVSEL3: /* Ignore writes to the PerfEvtSel MSRs */ return (0); case MSR_K7_PERFCTR0: case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: case MSR_K7_PERFCTR3: /* Ignore writes to the PerfCtr MSRs */ return (0); case MSR_P_STATE_CONTROL: /* Ignore write to change the P-state */ return (0); default: break; } } return (-1); } int -emulate_rdmsr(struct vmctx *ctx __unused, int vcpu __unused, uint32_t num, - uint64_t *val) +emulate_rdmsr(struct vcpu *vcpu __unused, uint32_t num, uint64_t *val) { int error = 0; if (cpu_vendor_intel) { switch (num) { case MSR_BIOS_SIGN: case MSR_IA32_PLATFORM_ID: case MSR_PKG_ENERGY_STATUS: case MSR_PP0_ENERGY_STATUS: case MSR_PP1_ENERGY_STATUS: case MSR_DRAM_ENERGY_STATUS: case MSR_MISC_FEATURE_ENABLES: *val = 0; break; case MSR_RAPL_POWER_UNIT: /* * Use the default value documented in section * "RAPL Interfaces" in Intel SDM vol3. */ *val = 0x000a1003; break; case MSR_IA32_FEATURE_CONTROL: /* * Windows guests check this MSR. * Set the lock bit to avoid writes * to this MSR. */ *val = IA32_FEATURE_CONTROL_LOCK; break; default: error = -1; break; } } else if (cpu_vendor_amd || cpu_vendor_hygon) { switch (num) { case MSR_BIOS_SIGN: *val = 0; break; case MSR_HWCR: /* * Bios and Kernel Developer's Guides for AMD Families * 12H, 14H, 15H and 16H. */ *val = 0x01000010; /* Reset value */ *val |= 1 << 9; /* MONITOR/MWAIT disable */ break; case MSR_NB_CFG1: case MSR_LS_CFG: case MSR_IC_CFG: /* * The reset value is processor family dependent so * just return 0. */ *val = 0; break; case MSR_PERFEVSEL0: case MSR_PERFEVSEL1: case MSR_PERFEVSEL2: case MSR_PERFEVSEL3: /* * PerfEvtSel MSRs are not properly virtualized so just * return zero. */ *val = 0; break; case MSR_K7_PERFCTR0: case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: case MSR_K7_PERFCTR3: /* * PerfCtr MSRs are not properly virtualized so just * return zero. */ *val = 0; break; case MSR_SMM_ADDR: case MSR_SMM_MASK: /* * Return the reset value defined in the AMD Bios and * Kernel Developer's Guide. 
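 * (Both SMM_ADDR and SMM_MASK reset to zero, hence the zero returned
 * below.)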
*/ *val = 0; break; case MSR_P_STATE_LIMIT: case MSR_P_STATE_CONTROL: case MSR_P_STATE_STATUS: case MSR_P_STATE_CONFIG(0): /* P0 configuration */ *val = 0; break; /* * OpenBSD guests test bit 0 of this MSR to detect if the * workaround for erratum 721 is already applied. * https://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf */ case 0xC0011029: *val = 1; break; default: error = -1; break; } } else { error = -1; } return (error); } int init_msr(void) { int error; u_int regs[4]; char cpu_vendor[13]; do_cpuid(0, regs); ((u_int *)&cpu_vendor)[0] = regs[1]; ((u_int *)&cpu_vendor)[1] = regs[3]; ((u_int *)&cpu_vendor)[2] = regs[2]; cpu_vendor[12] = '\0'; error = 0; if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { cpu_vendor_amd = 1; } else if (strcmp(cpu_vendor, "HygonGenuine") == 0) { cpu_vendor_hygon = 1; } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { cpu_vendor_intel = 1; } else { EPRINTLN("Unknown cpu vendor \"%s\"", cpu_vendor); error = -1; } return (error); } diff --git a/usr.sbin/bhyve/xmsr.h b/usr.sbin/bhyve/xmsr.h index 1fb47c3ae2f0..31349c4621d7 100644 --- a/usr.sbin/bhyve/xmsr.h +++ b/usr.sbin/bhyve/xmsr.h @@ -1,38 +1,38 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _XMSR_H_ #define _XMSR_H_ int init_msr(void); -int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val); -int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val); +int emulate_wrmsr(struct vcpu *vcpu, uint32_t code, uint64_t val); +int emulate_rdmsr(struct vcpu *vcpu, uint32_t code, uint64_t *val); #endif diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c index e5789eaf30e8..5c3440ce8343 100644 --- a/usr.sbin/bhyvectl/bhyvectl.c +++ b/usr.sbin/bhyvectl/bhyvectl.c @@ -1,2392 +1,2394 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "amd/vmcb.h" #include "intel/vmcs.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #define MB (1UL << 20) #define GB (1UL << 30) #define REQ_ARG required_argument #define NO_ARG no_argument #define OPT_ARG optional_argument static const char *progname; static void usage(bool cpu_intel) { (void)fprintf(stderr, "Usage: %s --vm=\n" " [--cpu=]\n" " [--create]\n" " [--destroy]\n" #ifdef BHYVE_SNAPSHOT " [--checkpoint= | --suspend=]\n" #endif " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" " [--get-desc-ds]\n" " [--set-desc-es]\n" " [--get-desc-es]\n" " [--set-desc-gs]\n" " [--get-desc-gs]\n" " [--set-desc-fs]\n" " [--get-desc-fs]\n" " [--set-desc-cs]\n" " [--get-desc-cs]\n" " [--set-desc-ss]\n" " [--get-desc-ss]\n" " [--set-desc-tr]\n" " [--get-desc-tr]\n" " [--set-desc-ldtr]\n" " [--get-desc-ldtr]\n" " [--set-desc-gdtr]\n" " [--get-desc-gdtr]\n" " [--set-desc-idtr]\n" " [--get-desc-idtr]\n" " [--run]\n" " [--capname=]\n" " [--getcap]\n" " [--setcap=<0|1>]\n" " [--desc-base=]\n" " [--desc-limit=]\n" " [--desc-access=]\n" " [--set-cr0=]\n" " [--get-cr0]\n" " [--set-cr2=]\n" " [--get-cr2]\n" " [--set-cr3=]\n" " [--get-cr3]\n" " [--set-cr4=]\n" " [--get-cr4]\n" " [--set-dr0=]\n" " [--get-dr0]\n" " [--set-dr1=]\n" " [--get-dr1]\n" " [--set-dr2=]\n" " [--get-dr2]\n" " [--set-dr3=]\n" " [--get-dr3]\n" " [--set-dr6=]\n" " [--get-dr6]\n" " [--set-dr7=]\n" " [--get-dr7]\n" " [--set-rsp=]\n" " [--get-rsp]\n" " [--set-rip=]\n" " [--get-rip]\n" " [--get-rax]\n" " [--set-rax=]\n" " [--get-rbx]\n" " [--get-rcx]\n" " [--get-rdx]\n" " [--get-rsi]\n" " [--get-rdi]\n" " [--get-rbp]\n" " [--get-r8]\n" " [--get-r9]\n" " [--get-r10]\n" " [--get-r11]\n" " [--get-r12]\n" " [--get-r13]\n" " [--get-r14]\n" " [--get-r15]\n" " [--set-rflags=]\n" " [--get-rflags]\n" " [--set-cs]\n" " [--get-cs]\n" " [--set-ds]\n" " [--get-ds]\n" " [--set-es]\n" " [--get-es]\n" " [--set-fs]\n" " [--get-fs]\n" " [--set-gs]\n" " [--get-gs]\n" " [--set-ss]\n" " [--get-ss]\n" " [--get-tr]\n" " [--get-ldtr]\n" " [--set-x2apic-state=]\n" " [--get-x2apic-state]\n" " [--unassign-pptdev=]\n" " [--set-mem=]\n" " [--get-lowmem]\n" " [--get-highmem]\n" " [--get-gpa-pmap]\n" " [--assert-lapic-lvt=]\n" " [--inject-nmi]\n" " [--force-reset]\n" " [--force-poweroff]\n" " [--get-rtc-time]\n" " [--set-rtc-time=]\n" " 
[--get-rtc-nvram]\n" " [--set-rtc-nvram=]\n" " [--rtc-nvram-offset=]\n" " [--get-active-cpus]\n" " [--get-suspended-cpus]\n" " [--get-intinfo]\n" " [--get-eptp]\n" " [--set-exception-bitmap]\n" " [--get-exception-bitmap]\n" " [--get-tsc-offset]\n" " [--get-guest-pat]\n" " [--get-io-bitmap-address]\n" " [--get-msr-bitmap]\n" " [--get-msr-bitmap-address]\n" " [--get-guest-sysenter]\n" " [--get-exit-reason]\n" " [--get-cpu-topology]\n", progname); if (cpu_intel) { (void)fprintf(stderr, " [--get-vmcs-pinbased-ctls]\n" " [--get-vmcs-procbased-ctls]\n" " [--get-vmcs-procbased-ctls2]\n" " [--get-vmcs-entry-interruption-info]\n" " [--set-vmcs-entry-interruption-info=]\n" " [--get-vmcs-guest-physical-address\n" " [--get-vmcs-guest-linear-address\n" " [--get-vmcs-host-pat]\n" " [--get-vmcs-host-cr0]\n" " [--get-vmcs-host-cr3]\n" " [--get-vmcs-host-cr4]\n" " [--get-vmcs-host-rip]\n" " [--get-vmcs-host-rsp]\n" " [--get-vmcs-cr0-mask]\n" " [--get-vmcs-cr0-shadow]\n" " [--get-vmcs-cr4-mask]\n" " [--get-vmcs-cr4-shadow]\n" " [--get-vmcs-cr3-targets]\n" " [--get-vmcs-apic-access-address]\n" " [--get-vmcs-virtual-apic-address]\n" " [--get-vmcs-tpr-threshold]\n" " [--get-vmcs-vpid]\n" " [--get-vmcs-instruction-error]\n" " [--get-vmcs-exit-ctls]\n" " [--get-vmcs-entry-ctls]\n" " [--get-vmcs-link]\n" " [--get-vmcs-exit-qualification]\n" " [--get-vmcs-exit-interruption-info]\n" " [--get-vmcs-exit-interruption-error]\n" " [--get-vmcs-interruptibility]\n" ); } else { (void)fprintf(stderr, " [--get-vmcb-intercepts]\n" " [--get-vmcb-asid]\n" " [--get-vmcb-exit-details]\n" " [--get-vmcb-tlb-ctrl]\n" " [--get-vmcb-virq]\n" " [--get-avic-apic-bar]\n" " [--get-avic-backing-page]\n" " [--get-avic-table]\n" ); } exit(1); } static int get_rtc_time, set_rtc_time; static int get_rtc_nvram, set_rtc_nvram; static int rtc_nvram_offset; static uint8_t rtc_nvram_value; static time_t rtc_secs; static int get_stats, getcap, setcap, capval, get_gpa_pmap; static int inject_nmi, assert_lapic_lvt; static int force_reset, force_poweroff; static const char *capname; static int create, destroy, get_memmap, get_memseg; static int get_intinfo; static int get_active_cpus, get_suspended_cpus; static uint64_t memsize; static int set_cr0, get_cr0, set_cr2, get_cr2, set_cr3, get_cr3; static int set_cr4, get_cr4; static int set_efer, get_efer; static int set_dr0, get_dr0; static int set_dr1, get_dr1; static int set_dr2, get_dr2; static int set_dr3, get_dr3; static int set_dr6, get_dr6; static int set_dr7, get_dr7; static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; static int set_rax, get_rax; static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp; static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15; static int set_desc_ds, get_desc_ds; static int set_desc_es, get_desc_es; static int set_desc_fs, get_desc_fs; static int set_desc_gs, get_desc_gs; static int set_desc_cs, get_desc_cs; static int set_desc_ss, get_desc_ss; static int set_desc_gdtr, get_desc_gdtr; static int set_desc_idtr, get_desc_idtr; static int set_desc_tr, get_desc_tr; static int set_desc_ldtr, get_desc_ldtr; static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; static int set_x2apic_state, get_x2apic_state; static enum x2apic_state x2apic_state; static int unassign_pptdev, bus, slot, func; static int run; static int get_cpu_topology; #ifdef BHYVE_SNAPSHOT static int vm_suspend_opt; #endif /* * VMCB specific. 
*/ static int get_vmcb_intercept, get_vmcb_exit_details, get_vmcb_tlb_ctrl; static int get_vmcb_virq, get_avic_table; /* * VMCS-specific fields */ static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2; static int get_eptp, get_io_bitmap, get_tsc_offset; static int get_vmcs_entry_interruption_info; static int get_vmcs_interruptibility; static int get_vmcs_gpa, get_vmcs_gla; static int get_exception_bitmap; static int get_cr0_mask, get_cr0_shadow; static int get_cr4_mask, get_cr4_shadow; static int get_cr3_targets; static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; static int get_msr_bitmap, get_msr_bitmap_address; static int get_vpid_asid; static int get_inst_err, get_exit_ctls, get_entry_ctls; static int get_host_cr0, get_host_cr3, get_host_cr4; static int get_host_rip, get_host_rsp; static int get_guest_pat, get_host_pat; static int get_guest_sysenter, get_vmcs_link; static int get_exit_reason, get_vmcs_exit_qualification; static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; static int get_vmcs_exit_inst_length; static uint64_t desc_base; static uint32_t desc_limit, desc_access; static int get_all; static void dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) { printf("vm exit[%d]\n", vcpu); printf("\trip\t\t0x%016lx\n", vmexit->rip); printf("\tinst_length\t%d\n", vmexit->inst_length); switch (vmexit->exitcode) { case VM_EXITCODE_INOUT: printf("\treason\t\tINOUT\n"); printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT"); printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes); printf("\tflags\t\t%s%s\n", vmexit->u.inout.string ? "STRING " : "", vmexit->u.inout.rep ? "REP " : ""); printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); break; case VM_EXITCODE_VMX: printf("\treason\t\tVMX\n"); printf("\tstatus\t\t%d\n", vmexit->u.vmx.status); printf("\texit_reason\t0x%08x (%u)\n", vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason); printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); break; case VM_EXITCODE_SVM: printf("\treason\t\tSVM\n"); printf("\texit_reason\t\t%#lx\n", vmexit->u.svm.exitcode); printf("\texitinfo1\t\t%#lx\n", vmexit->u.svm.exitinfo1); printf("\texitinfo2\t\t%#lx\n", vmexit->u.svm.exitinfo2); break; default: printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); break; } } /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000 #define MSR_AMD6TH_END 0xC0001FFF /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000 #define MSR_AMD7TH_END 0xC0011FFF static const char * msr_name(uint32_t msr) { static char buf[32]; switch(msr) { case MSR_TSC: return ("MSR_TSC"); case MSR_EFER: return ("MSR_EFER"); case MSR_STAR: return ("MSR_STAR"); case MSR_LSTAR: return ("MSR_LSTAR"); case MSR_CSTAR: return ("MSR_CSTAR"); case MSR_SF_MASK: return ("MSR_SF_MASK"); case MSR_FSBASE: return ("MSR_FSBASE"); case MSR_GSBASE: return ("MSR_GSBASE"); case MSR_KGSBASE: return ("MSR_KGSBASE"); case MSR_SYSENTER_CS_MSR: return ("MSR_SYSENTER_CS_MSR"); case MSR_SYSENTER_ESP_MSR: return ("MSR_SYSENTER_ESP_MSR"); case MSR_SYSENTER_EIP_MSR: return ("MSR_SYSENTER_EIP_MSR"); case MSR_PAT: return ("MSR_PAT"); } snprintf(buf, sizeof(buf), "MSR %#08x", msr); return (buf); } static inline void print_msr_pm(uint64_t msr, int vcpu, int readable, int writeable) { if (readable || writeable) { 
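		/*
		 * Only MSRs that the guest can read or write without an
		 * intercept are listed; fully intercepted MSRs are skipped.
		 */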
printf("%-20s[%d]\t\t%c%c\n", msr_name(msr), vcpu, readable ? 'R' : '-', writeable ? 'W' : '-'); } } /* * Reference APM vol2, section 15.11 MSR Intercepts. */ static void dump_amd_msr_pm(const char *bitmap, int vcpu) { int byte, bit, readable, writeable; uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 4; bit = (msr % 4) * 2; /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 2048; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, writeable); /* MSR 0xC0010000 to 0xC0011FF is only for AMD */ byte += 4096; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD7TH_START, vcpu, readable, writeable); } } /* * Reference Intel SDM Vol3 Section 24.6.9 MSR-Bitmap Address */ static void dump_intel_msr_pm(const char *bitmap, int vcpu) { int byte, bit, readable, writeable; uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 8; bit = msr & 0x7; /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 1024; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, writeable); } } static int dump_msr_bitmap(int vcpu, uint64_t addr, bool cpu_intel) { char *bitmap; int error, fd, map_size; error = -1; bitmap = MAP_FAILED; fd = open("/dev/mem", O_RDONLY, 0); if (fd < 0) { perror("Couldn't open /dev/mem"); goto done; } if (cpu_intel) map_size = PAGE_SIZE; else map_size = 2 * PAGE_SIZE; bitmap = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, addr); if (bitmap == MAP_FAILED) { perror("mmap failed"); goto done; } if (cpu_intel) dump_intel_msr_pm(bitmap, vcpu); else dump_amd_msr_pm(bitmap, vcpu); error = 0; done: if (bitmap != MAP_FAILED) munmap((void *)bitmap, map_size); if (fd >= 0) close(fd); return (error); } static int -vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) +vm_get_vmcs_field(struct vcpu *vcpu, int field, uint64_t *ret_val) { - return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val)); + return (vm_get_register(vcpu, VMCS_IDENT(field), ret_val)); } static int -vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, +vm_get_vmcb_field(struct vcpu *vcpu, int off, int bytes, uint64_t *ret_val) { - return (vm_get_register(ctx, vcpu, VMCB_ACCESS(off, bytes), ret_val)); + return (vm_get_register(vcpu, VMCB_ACCESS(off, bytes), ret_val)); } enum { VMNAME = 1000, /* avoid collision with return values from getopt */ VCPU, SET_MEM, SET_EFER, SET_CR0, SET_CR2, SET_CR3, SET_CR4, SET_DR0, SET_DR1, SET_DR2, SET_DR3, SET_DR6, SET_DR7, SET_RSP, SET_RIP, SET_RAX, SET_RFLAGS, DESC_BASE, DESC_LIMIT, DESC_ACCESS, SET_CS, SET_DS, SET_ES, SET_FS, SET_GS, SET_SS, SET_TR, SET_LDTR, SET_X2APIC_STATE, SET_CAP, CAPNAME, UNASSIGN_PPTDEV, GET_GPA_PMAP, ASSERT_LAPIC_LVT, SET_RTC_TIME, SET_RTC_NVRAM, RTC_NVRAM_OFFSET, #ifdef BHYVE_SNAPSHOT SET_CHECKPOINT_FILE, SET_SUSPEND_FILE, #endif }; static void print_cpus(const char *banner, const cpuset_t *cpus) { int i, first; 
first = 1; printf("%s:\t", banner); if (!CPU_EMPTY(cpus)) { for (i = 0; i < CPU_SETSIZE; i++) { if (CPU_ISSET(i, cpus)) { printf("%s%d", first ? " " : ", ", i); first = 0; } } } else printf(" (none)"); printf("\n"); } static void print_intinfo(const char *banner, uint64_t info) { int type; printf("%s:\t", banner); if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; switch (type) { case VM_INTINFO_HWINTR: printf("extint"); break; case VM_INTINFO_NMI: printf("nmi"); break; case VM_INTINFO_SWINTR: printf("swint"); break; default: printf("exception"); break; } printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); if (info & VM_INTINFO_DEL_ERRCODE) printf(" errcode %#x", (u_int)(info >> 32)); } else { printf("n/a"); } printf("\n"); } static bool cpu_vendor_intel(void) { u_int regs[4], v[3]; do_cpuid(0, regs); v[0] = regs[1]; v[1] = regs[3]; v[2] = regs[2]; if (memcmp(v, "GenuineIntel", sizeof(v)) == 0) return (true); if (memcmp(v, "AuthenticAMD", sizeof(v)) == 0 || memcmp(v, "HygonGenuine", sizeof(v)) == 0) return (false); fprintf(stderr, "Unknown cpu vendor \"%s\"\n", (const char *)v); exit(1); } static int -get_all_registers(struct vmctx *ctx, int vcpu) +get_all_registers(struct vcpu *vcpu, int vcpuid) { uint64_t cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; uint64_t rsp, rip, rflags, efer; uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; uint64_t r8, r9, r10, r11, r12, r13, r14, r15; int error = 0; if (!error && (get_efer || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + error = vm_get_register(vcpu, VM_REG_GUEST_EFER, &efer); if (error == 0) - printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + printf("efer[%d]\t\t0x%016lx\n", vcpuid, efer); } if (!error && (get_cr0 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + error = vm_get_register(vcpu, VM_REG_GUEST_CR0, &cr0); if (error == 0) - printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + printf("cr0[%d]\t\t0x%016lx\n", vcpuid, cr0); } if (!error && (get_cr2 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR2, &cr2); + error = vm_get_register(vcpu, VM_REG_GUEST_CR2, &cr2); if (error == 0) - printf("cr2[%d]\t\t0x%016lx\n", vcpu, cr2); + printf("cr2[%d]\t\t0x%016lx\n", vcpuid, cr2); } if (!error && (get_cr3 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + error = vm_get_register(vcpu, VM_REG_GUEST_CR3, &cr3); if (error == 0) - printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + printf("cr3[%d]\t\t0x%016lx\n", vcpuid, cr3); } if (!error && (get_cr4 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); + error = vm_get_register(vcpu, VM_REG_GUEST_CR4, &cr4); if (error == 0) - printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + printf("cr4[%d]\t\t0x%016lx\n", vcpuid, cr4); } if (!error && (get_dr0 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR0, &dr0); + error = vm_get_register(vcpu, VM_REG_GUEST_DR0, &dr0); if (error == 0) - printf("dr0[%d]\t\t0x%016lx\n", vcpu, dr0); + printf("dr0[%d]\t\t0x%016lx\n", vcpuid, dr0); } if (!error && (get_dr1 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR1, &dr1); + error = vm_get_register(vcpu, VM_REG_GUEST_DR1, &dr1); if (error == 0) - printf("dr1[%d]\t\t0x%016lx\n", vcpu, dr1); + printf("dr1[%d]\t\t0x%016lx\n", vcpuid, dr1); } if (!error && (get_dr2 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR2, &dr2); + error = vm_get_register(vcpu, VM_REG_GUEST_DR2, &dr2); if (error == 0) - printf("dr2[%d]\t\t0x%016lx\n", vcpu, dr2); + 
printf("dr2[%d]\t\t0x%016lx\n", vcpuid, dr2); } if (!error && (get_dr3 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR3, &dr3); + error = vm_get_register(vcpu, VM_REG_GUEST_DR3, &dr3); if (error == 0) - printf("dr3[%d]\t\t0x%016lx\n", vcpu, dr3); + printf("dr3[%d]\t\t0x%016lx\n", vcpuid, dr3); } if (!error && (get_dr6 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR6, &dr6); + error = vm_get_register(vcpu, VM_REG_GUEST_DR6, &dr6); if (error == 0) - printf("dr6[%d]\t\t0x%016lx\n", vcpu, dr6); + printf("dr6[%d]\t\t0x%016lx\n", vcpuid, dr6); } if (!error && (get_dr7 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7); + error = vm_get_register(vcpu, VM_REG_GUEST_DR7, &dr7); if (error == 0) - printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7); + printf("dr7[%d]\t\t0x%016lx\n", vcpuid, dr7); } if (!error && (get_rsp || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp); + error = vm_get_register(vcpu, VM_REG_GUEST_RSP, &rsp); if (error == 0) - printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp); + printf("rsp[%d]\t\t0x%016lx\n", vcpuid, rsp); } if (!error && (get_rip || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip); if (error == 0) - printf("rip[%d]\t\t0x%016lx\n", vcpu, rip); + printf("rip[%d]\t\t0x%016lx\n", vcpuid, rip); } if (!error && (get_rax || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax); + error = vm_get_register(vcpu, VM_REG_GUEST_RAX, &rax); if (error == 0) - printf("rax[%d]\t\t0x%016lx\n", vcpu, rax); + printf("rax[%d]\t\t0x%016lx\n", vcpuid, rax); } if (!error && (get_rbx || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx); + error = vm_get_register(vcpu, VM_REG_GUEST_RBX, &rbx); if (error == 0) - printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx); + printf("rbx[%d]\t\t0x%016lx\n", vcpuid, rbx); } if (!error && (get_rcx || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx); + error = vm_get_register(vcpu, VM_REG_GUEST_RCX, &rcx); if (error == 0) - printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx); + printf("rcx[%d]\t\t0x%016lx\n", vcpuid, rcx); } if (!error && (get_rdx || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx); + error = vm_get_register(vcpu, VM_REG_GUEST_RDX, &rdx); if (error == 0) - printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx); + printf("rdx[%d]\t\t0x%016lx\n", vcpuid, rdx); } if (!error && (get_rsi || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi); + error = vm_get_register(vcpu, VM_REG_GUEST_RSI, &rsi); if (error == 0) - printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi); + printf("rsi[%d]\t\t0x%016lx\n", vcpuid, rsi); } if (!error && (get_rdi || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi); + error = vm_get_register(vcpu, VM_REG_GUEST_RDI, &rdi); if (error == 0) - printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi); + printf("rdi[%d]\t\t0x%016lx\n", vcpuid, rdi); } if (!error && (get_rbp || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp); + error = vm_get_register(vcpu, VM_REG_GUEST_RBP, &rbp); if (error == 0) - printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp); + printf("rbp[%d]\t\t0x%016lx\n", vcpuid, rbp); } if (!error && (get_r8 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8); + error = vm_get_register(vcpu, VM_REG_GUEST_R8, &r8); if (error == 0) - printf("r8[%d]\t\t0x%016lx\n", vcpu, r8); + printf("r8[%d]\t\t0x%016lx\n", vcpuid, r8); } if (!error && 
(get_r9 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9); + error = vm_get_register(vcpu, VM_REG_GUEST_R9, &r9); if (error == 0) - printf("r9[%d]\t\t0x%016lx\n", vcpu, r9); + printf("r9[%d]\t\t0x%016lx\n", vcpuid, r9); } if (!error && (get_r10 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10); + error = vm_get_register(vcpu, VM_REG_GUEST_R10, &r10); if (error == 0) - printf("r10[%d]\t\t0x%016lx\n", vcpu, r10); + printf("r10[%d]\t\t0x%016lx\n", vcpuid, r10); } if (!error && (get_r11 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11); + error = vm_get_register(vcpu, VM_REG_GUEST_R11, &r11); if (error == 0) - printf("r11[%d]\t\t0x%016lx\n", vcpu, r11); + printf("r11[%d]\t\t0x%016lx\n", vcpuid, r11); } if (!error && (get_r12 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12); + error = vm_get_register(vcpu, VM_REG_GUEST_R12, &r12); if (error == 0) - printf("r12[%d]\t\t0x%016lx\n", vcpu, r12); + printf("r12[%d]\t\t0x%016lx\n", vcpuid, r12); } if (!error && (get_r13 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13); + error = vm_get_register(vcpu, VM_REG_GUEST_R13, &r13); if (error == 0) - printf("r13[%d]\t\t0x%016lx\n", vcpu, r13); + printf("r13[%d]\t\t0x%016lx\n", vcpuid, r13); } if (!error && (get_r14 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14); + error = vm_get_register(vcpu, VM_REG_GUEST_R14, &r14); if (error == 0) - printf("r14[%d]\t\t0x%016lx\n", vcpu, r14); + printf("r14[%d]\t\t0x%016lx\n", vcpuid, r14); } if (!error && (get_r15 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15); + error = vm_get_register(vcpu, VM_REG_GUEST_R15, &r15); if (error == 0) - printf("r15[%d]\t\t0x%016lx\n", vcpu, r15); + printf("r15[%d]\t\t0x%016lx\n", vcpuid, r15); } if (!error && (get_rflags || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + error = vm_get_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error == 0) - printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); + printf("rflags[%d]\t0x%016lx\n", vcpuid, rflags); } return (error); } static int -get_all_segments(struct vmctx *ctx, int vcpu) +get_all_segments(struct vcpu *vcpu, int vcpuid) { uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; int error = 0; if (!error && (get_desc_ds || get_all)) { - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, + error = vm_get_desc(vcpu, VM_REG_GUEST_DS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_es || get_all)) { - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES, + error = vm_get_desc(vcpu, VM_REG_GUEST_ES, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_fs || get_all)) { - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS, + error = vm_get_desc(vcpu, VM_REG_GUEST_FS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_gs || get_all)) { - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS, + error = vm_get_desc(vcpu, VM_REG_GUEST_GS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gs 
desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_ss || get_all)) { - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, + error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_cs || get_all)) { - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS, + error = vm_get_desc(vcpu, VM_REG_GUEST_CS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_tr || get_all)) { - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, + error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_ldtr || get_all)) { - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + error = vm_get_desc(vcpu, VM_REG_GUEST_LDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpuid, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_gdtr || get_all)) { - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + error = vm_get_desc(vcpu, VM_REG_GUEST_GDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", - vcpu, desc_base, desc_limit); + vcpuid, desc_base, desc_limit); } } if (!error && (get_desc_idtr || get_all)) { - error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + error = vm_get_desc(vcpu, VM_REG_GUEST_IDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("idtr[%d]\t\t0x%016lx/0x%08x\n", - vcpu, desc_base, desc_limit); + vcpuid, desc_base, desc_limit); } } if (!error && (get_cs || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs); + error = vm_get_register(vcpu, VM_REG_GUEST_CS, &cs); if (error == 0) - printf("cs[%d]\t\t0x%04lx\n", vcpu, cs); + printf("cs[%d]\t\t0x%04lx\n", vcpuid, cs); } if (!error && (get_ds || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds); + error = vm_get_register(vcpu, VM_REG_GUEST_DS, &ds); if (error == 0) - printf("ds[%d]\t\t0x%04lx\n", vcpu, ds); + printf("ds[%d]\t\t0x%04lx\n", vcpuid, ds); } if (!error && (get_es || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es); + error = vm_get_register(vcpu, VM_REG_GUEST_ES, &es); if (error == 0) - printf("es[%d]\t\t0x%04lx\n", vcpu, es); + printf("es[%d]\t\t0x%04lx\n", vcpuid, es); } if (!error && (get_fs || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs); + error = vm_get_register(vcpu, VM_REG_GUEST_FS, &fs); if (error == 0) - printf("fs[%d]\t\t0x%04lx\n", vcpu, fs); + printf("fs[%d]\t\t0x%04lx\n", vcpuid, fs); } if (!error && (get_gs || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs); + error = vm_get_register(vcpu, VM_REG_GUEST_GS, &gs); if (error == 0) - printf("gs[%d]\t\t0x%04lx\n", vcpu, gs); + printf("gs[%d]\t\t0x%04lx\n", vcpuid, gs); } if (!error && (get_ss || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss); + error = vm_get_register(vcpu, VM_REG_GUEST_SS, 
&ss); if (error == 0) - printf("ss[%d]\t\t0x%04lx\n", vcpu, ss); + printf("ss[%d]\t\t0x%04lx\n", vcpuid, ss); } if (!error && (get_tr || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr); + error = vm_get_register(vcpu, VM_REG_GUEST_TR, &tr); if (error == 0) - printf("tr[%d]\t\t0x%04lx\n", vcpu, tr); + printf("tr[%d]\t\t0x%04lx\n", vcpuid, tr); } if (!error && (get_ldtr || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr); + error = vm_get_register(vcpu, VM_REG_GUEST_LDTR, &ldtr); if (error == 0) - printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); + printf("ldtr[%d]\t\t0x%04lx\n", vcpuid, ldtr); } return (error); } static int -get_misc_vmcs(struct vmctx *ctx, int vcpu) +get_misc_vmcs(struct vcpu *vcpu, int vcpuid) { uint64_t ctl, cr0, cr3, cr4, rsp, rip, pat, addr, u64; int error = 0; if (!error && (get_cr0_mask || get_all)) { uint64_t cr0mask; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask); + error = vm_get_vmcs_field(vcpu, VMCS_CR0_MASK, &cr0mask); if (error == 0) - printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask); + printf("cr0_mask[%d]\t\t0x%016lx\n", vcpuid, cr0mask); } if (!error && (get_cr0_shadow || get_all)) { uint64_t cr0shadow; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW, + error = vm_get_vmcs_field(vcpu, VMCS_CR0_SHADOW, &cr0shadow); if (error == 0) - printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow); + printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpuid, cr0shadow); } if (!error && (get_cr4_mask || get_all)) { uint64_t cr4mask; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask); + error = vm_get_vmcs_field(vcpu, VMCS_CR4_MASK, &cr4mask); if (error == 0) - printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask); + printf("cr4_mask[%d]\t\t0x%016lx\n", vcpuid, cr4mask); } if (!error && (get_cr4_shadow || get_all)) { uint64_t cr4shadow; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW, + error = vm_get_vmcs_field(vcpu, VMCS_CR4_SHADOW, &cr4shadow); if (error == 0) - printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow); + printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpuid, cr4shadow); } if (!error && (get_cr3_targets || get_all)) { uint64_t target_count, target_addr; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, + error = vm_get_vmcs_field(vcpu, VMCS_CR3_TARGET_COUNT, &target_count); if (error == 0) { printf("cr3_target_count[%d]\t0x%016lx\n", - vcpu, target_count); + vcpuid, target_count); } - error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0, + error = vm_get_vmcs_field(vcpu, VMCS_CR3_TARGET0, &target_addr); if (error == 0) { printf("cr3_target0[%d]\t\t0x%016lx\n", - vcpu, target_addr); + vcpuid, target_addr); } - error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1, + error = vm_get_vmcs_field(vcpu, VMCS_CR3_TARGET1, &target_addr); if (error == 0) { printf("cr3_target1[%d]\t\t0x%016lx\n", - vcpu, target_addr); + vcpuid, target_addr); } - error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2, + error = vm_get_vmcs_field(vcpu, VMCS_CR3_TARGET2, &target_addr); if (error == 0) { printf("cr3_target2[%d]\t\t0x%016lx\n", - vcpu, target_addr); + vcpuid, target_addr); } - error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3, + error = vm_get_vmcs_field(vcpu, VMCS_CR3_TARGET3, &target_addr); if (error == 0) { printf("cr3_target3[%d]\t\t0x%016lx\n", - vcpu, target_addr); + vcpuid, target_addr); } } if (!error && (get_pinbased_ctls || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); + error = vm_get_vmcs_field(vcpu, VMCS_PIN_BASED_CTLS, &ctl); if (error == 
0) - printf("pinbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); + printf("pinbased_ctls[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_procbased_ctls || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_PRI_PROC_BASED_CTLS, &ctl); if (error == 0) - printf("procbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); + printf("procbased_ctls[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_procbased_ctls2 || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_SEC_PROC_BASED_CTLS, &ctl); if (error == 0) - printf("procbased_ctls2[%d]\t0x%016lx\n", vcpu, ctl); + printf("procbased_ctls2[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_vmcs_gla || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_GUEST_LINEAR_ADDRESS, &u64); if (error == 0) - printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); + printf("gla[%d]\t\t0x%016lx\n", vcpuid, u64); } if (!error && (get_vmcs_gpa || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_GUEST_PHYSICAL_ADDRESS, &u64); if (error == 0) - printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); + printf("gpa[%d]\t\t0x%016lx\n", vcpuid, u64); } if (!error && (get_vmcs_entry_interruption_info || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); + error = vm_get_vmcs_field(vcpu, VMCS_ENTRY_INTR_INFO,&u64); if (error == 0) { printf("entry_interruption_info[%d]\t0x%016lx\n", - vcpu, u64); + vcpuid, u64); } } if (!error && (get_tpr_threshold || get_all)) { uint64_t threshold; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, + error = vm_get_vmcs_field(vcpu, VMCS_TPR_THRESHOLD, &threshold); if (error == 0) - printf("tpr_threshold[%d]\t0x%016lx\n", vcpu, threshold); + printf("tpr_threshold[%d]\t0x%016lx\n", vcpuid, threshold); } if (!error && (get_inst_err || get_all)) { uint64_t insterr; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, + error = vm_get_vmcs_field(vcpu, VMCS_INSTRUCTION_ERROR, &insterr); if (error == 0) { printf("instruction_error[%d]\t0x%016lx\n", - vcpu, insterr); + vcpuid, insterr); } } if (!error && (get_exit_ctls || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); + error = vm_get_vmcs_field(vcpu, VMCS_EXIT_CTLS, &ctl); if (error == 0) - printf("exit_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); + printf("exit_ctls[%d]\t\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_entry_ctls || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); + error = vm_get_vmcs_field(vcpu, VMCS_ENTRY_CTLS, &ctl); if (error == 0) - printf("entry_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); + printf("entry_ctls[%d]\t\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_host_pat || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat); + error = vm_get_vmcs_field(vcpu, VMCS_HOST_IA32_PAT, &pat); if (error == 0) - printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); + printf("host_pat[%d]\t\t0x%016lx\n", vcpuid, pat); } if (!error && (get_host_cr0 || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); + error = vm_get_vmcs_field(vcpu, VMCS_HOST_CR0, &cr0); if (error == 0) - printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + printf("host_cr0[%d]\t\t0x%016lx\n", vcpuid, cr0); } if (!error && (get_host_cr3 || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3); + error = vm_get_vmcs_field(vcpu, VMCS_HOST_CR3, &cr3); if (error == 0) - printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + 
printf("host_cr3[%d]\t\t0x%016lx\n", vcpuid, cr3); } if (!error && (get_host_cr4 || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4); + error = vm_get_vmcs_field(vcpu, VMCS_HOST_CR4, &cr4); if (error == 0) - printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + printf("host_cr4[%d]\t\t0x%016lx\n", vcpuid, cr4); } if (!error && (get_host_rip || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip); + error = vm_get_vmcs_field(vcpu, VMCS_HOST_RIP, &rip); if (error == 0) - printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip); + printf("host_rip[%d]\t\t0x%016lx\n", vcpuid, rip); } if (!error && (get_host_rsp || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp); + error = vm_get_vmcs_field(vcpu, VMCS_HOST_RSP, &rsp); if (error == 0) - printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); + printf("host_rsp[%d]\t\t0x%016lx\n", vcpuid, rsp); } if (!error && (get_vmcs_link || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); + error = vm_get_vmcs_field(vcpu, VMCS_LINK_POINTER, &addr); if (error == 0) - printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); + printf("vmcs_pointer[%d]\t0x%016lx\n", vcpuid, addr); } if (!error && (get_vmcs_exit_interruption_info || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64); + error = vm_get_vmcs_field(vcpu, VMCS_EXIT_INTR_INFO, &u64); if (error == 0) { printf("vmcs_exit_interruption_info[%d]\t0x%016lx\n", - vcpu, u64); + vcpuid, u64); } } if (!error && (get_vmcs_exit_interruption_error || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE, + error = vm_get_vmcs_field(vcpu, VMCS_EXIT_INTR_ERRCODE, &u64); if (error == 0) { printf("vmcs_exit_interruption_error[%d]\t0x%016lx\n", - vcpu, u64); + vcpuid, u64); } } if (!error && (get_vmcs_interruptibility || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_GUEST_INTERRUPTIBILITY, &u64); if (error == 0) { printf("vmcs_guest_interruptibility[%d]\t0x%016lx\n", - vcpu, u64); + vcpuid, u64); } } if (!error && (get_vmcs_exit_inst_length || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_EXIT_INSTRUCTION_LENGTH, &u64); if (error == 0) - printf("vmcs_exit_inst_length[%d]\t0x%08x\n", vcpu, + printf("vmcs_exit_inst_length[%d]\t0x%08x\n", vcpuid, (uint32_t)u64); } if (!error && (get_vmcs_exit_qualification || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, + error = vm_get_vmcs_field(vcpu, VMCS_EXIT_QUALIFICATION, &u64); if (error == 0) printf("vmcs_exit_qualification[%d]\t0x%016lx\n", - vcpu, u64); + vcpuid, u64); } return (error); } static int -get_misc_vmcb(struct vmctx *ctx, int vcpu) +get_misc_vmcb(struct vcpu *vcpu, int vcpuid) { uint64_t ctl, addr; int error = 0; if (!error && (get_vmcb_intercept || get_all)) { - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_CR_INTERCEPT, 4, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_CR_INTERCEPT, 4, &ctl); if (error == 0) - printf("cr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + printf("cr_intercept[%d]\t0x%08x\n", vcpuid, (int)ctl); - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_DR_INTERCEPT, 4, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_DR_INTERCEPT, 4, &ctl); if (error == 0) - printf("dr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + printf("dr_intercept[%d]\t0x%08x\n", vcpuid, (int)ctl); - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXC_INTERCEPT, 4, &ctl); if (error == 0) - 
printf("exc_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + printf("exc_intercept[%d]\t0x%08x\n", vcpuid, (int)ctl); - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST1_INTERCEPT, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_INST1_INTERCEPT, 4, &ctl); if (error == 0) - printf("inst1_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + printf("inst1_intercept[%d]\t0x%08x\n", vcpuid, (int)ctl); - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST2_INTERCEPT, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_INST2_INTERCEPT, 4, &ctl); if (error == 0) - printf("inst2_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + printf("inst2_intercept[%d]\t0x%08x\n", vcpuid, (int)ctl); } if (!error && (get_vmcb_tlb_ctrl || get_all)) { - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TLB_CTRL, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_TLB_CTRL, 4, &ctl); if (error == 0) - printf("TLB ctrl[%d]\t0x%016lx\n", vcpu, ctl); + printf("TLB ctrl[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_vmcb_exit_details || get_all)) { - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO1, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXITINFO1, 8, &ctl); if (error == 0) - printf("exitinfo1[%d]\t0x%016lx\n", vcpu, ctl); - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO2, + printf("exitinfo1[%d]\t0x%016lx\n", vcpuid, ctl); + error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXITINFO2, 8, &ctl); if (error == 0) - printf("exitinfo2[%d]\t0x%016lx\n", vcpu, ctl); - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINTINFO, + printf("exitinfo2[%d]\t0x%016lx\n", vcpuid, ctl); + error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXITINTINFO, 8, &ctl); if (error == 0) - printf("exitintinfo[%d]\t0x%016lx\n", vcpu, ctl); + printf("exitintinfo[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_vmcb_virq || get_all)) { - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_VIRQ, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_VIRQ, 8, &ctl); if (error == 0) - printf("v_irq/tpr[%d]\t0x%016lx\n", vcpu, ctl); + printf("v_irq/tpr[%d]\t0x%016lx\n", vcpuid, ctl); } if (!error && (get_apic_access_addr || get_all)) { - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_BAR, 8, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_AVIC_BAR, 8, &addr); if (error == 0) - printf("AVIC apic_bar[%d]\t0x%016lx\n", vcpu, addr); + printf("AVIC apic_bar[%d]\t0x%016lx\n", vcpuid, addr); } if (!error && (get_virtual_apic_addr || get_all)) { - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PAGE, 8, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_AVIC_PAGE, 8, &addr); if (error == 0) - printf("AVIC backing page[%d]\t0x%016lx\n", vcpu, addr); + printf("AVIC backing page[%d]\t0x%016lx\n", vcpuid, addr); } if (!error && (get_avic_table || get_all)) { - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_LT, 8, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_AVIC_LT, 8, &addr); if (error == 0) printf("AVIC logical table[%d]\t0x%016lx\n", - vcpu, addr); - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PT, 8, + vcpuid, addr); + error = vm_get_vmcb_field(vcpu, VMCB_OFF_AVIC_PT, 8, &addr); if (error == 0) printf("AVIC physical table[%d]\t0x%016lx\n", - vcpu, addr); + vcpuid, addr); } return (error); } static struct option * setup_options(bool cpu_intel) { const struct option common_opts[] = { { "vm", REQ_ARG, 0, VMNAME }, { "cpu", REQ_ARG, 0, VCPU }, { "set-mem", REQ_ARG, 0, SET_MEM }, { "set-efer", REQ_ARG, 0, SET_EFER }, { "set-cr0", REQ_ARG, 0, SET_CR0 }, { "set-cr2", REQ_ARG, 0, SET_CR2 }, { "set-cr3", REQ_ARG, 0, SET_CR3 }, { "set-cr4", REQ_ARG, 0, SET_CR4 }, { "set-dr0", REQ_ARG, 0, SET_DR0 }, { "set-dr1", 
REQ_ARG, 0, SET_DR1 }, { "set-dr2", REQ_ARG, 0, SET_DR2 }, { "set-dr3", REQ_ARG, 0, SET_DR3 }, { "set-dr6", REQ_ARG, 0, SET_DR6 }, { "set-dr7", REQ_ARG, 0, SET_DR7 }, { "set-rsp", REQ_ARG, 0, SET_RSP }, { "set-rip", REQ_ARG, 0, SET_RIP }, { "set-rax", REQ_ARG, 0, SET_RAX }, { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, { "desc-base", REQ_ARG, 0, DESC_BASE }, { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, { "desc-access",REQ_ARG, 0, DESC_ACCESS }, { "set-cs", REQ_ARG, 0, SET_CS }, { "set-ds", REQ_ARG, 0, SET_DS }, { "set-es", REQ_ARG, 0, SET_ES }, { "set-fs", REQ_ARG, 0, SET_FS }, { "set-gs", REQ_ARG, 0, SET_GS }, { "set-ss", REQ_ARG, 0, SET_SS }, { "set-tr", REQ_ARG, 0, SET_TR }, { "set-ldtr", REQ_ARG, 0, SET_LDTR }, { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, { "capname", REQ_ARG, 0, CAPNAME }, { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, { "setcap", REQ_ARG, 0, SET_CAP }, { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, { "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT }, { "get-rtc-time", NO_ARG, &get_rtc_time, 1 }, { "set-rtc-time", REQ_ARG, 0, SET_RTC_TIME }, { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET }, { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 }, { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM }, { "getcap", NO_ARG, &getcap, 1 }, { "get-stats", NO_ARG, &get_stats, 1 }, { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, { "get-desc-es",NO_ARG, &get_desc_es, 1 }, { "set-desc-es",NO_ARG, &set_desc_es, 1 }, { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, { "get-memmap", NO_ARG, &get_memmap, 1 }, { "get-memseg", NO_ARG, &get_memseg, 1 }, { "get-efer", NO_ARG, &get_efer, 1 }, { "get-cr0", NO_ARG, &get_cr0, 1 }, { "get-cr2", NO_ARG, &get_cr2, 1 }, { "get-cr3", NO_ARG, &get_cr3, 1 }, { "get-cr4", NO_ARG, &get_cr4, 1 }, { "get-dr0", NO_ARG, &get_dr0, 1 }, { "get-dr1", NO_ARG, &get_dr1, 1 }, { "get-dr2", NO_ARG, &get_dr2, 1 }, { "get-dr3", NO_ARG, &get_dr3, 1 }, { "get-dr6", NO_ARG, &get_dr6, 1 }, { "get-dr7", NO_ARG, &get_dr7, 1 }, { "get-rsp", NO_ARG, &get_rsp, 1 }, { "get-rip", NO_ARG, &get_rip, 1 }, { "get-rax", NO_ARG, &get_rax, 1 }, { "get-rbx", NO_ARG, &get_rbx, 1 }, { "get-rcx", NO_ARG, &get_rcx, 1 }, { "get-rdx", NO_ARG, &get_rdx, 1 }, { "get-rsi", NO_ARG, &get_rsi, 1 }, { "get-rdi", NO_ARG, &get_rdi, 1 }, { "get-rbp", NO_ARG, &get_rbp, 1 }, { "get-r8", NO_ARG, &get_r8, 1 }, { "get-r9", NO_ARG, &get_r9, 1 }, { "get-r10", NO_ARG, &get_r10, 1 }, { "get-r11", NO_ARG, &get_r11, 1 }, { "get-r12", NO_ARG, &get_r12, 1 }, { "get-r13", NO_ARG, &get_r13, 1 }, { "get-r14", NO_ARG, &get_r14, 1 }, { "get-r15", NO_ARG, &get_r15, 1 }, { "get-rflags", NO_ARG, &get_rflags, 1 }, { "get-cs", NO_ARG, &get_cs, 1 }, { "get-ds", NO_ARG, &get_ds, 1 }, { "get-es", NO_ARG, &get_es, 1 }, { "get-fs", NO_ARG, &get_fs, 1 }, { "get-gs", NO_ARG, &get_gs, 1 }, { "get-ss", NO_ARG, &get_ss, 1 }, { "get-tr", 
NO_ARG, &get_tr, 1 }, { "get-ldtr", NO_ARG, &get_ldtr, 1 }, { "get-eptp", NO_ARG, &get_eptp, 1 }, { "get-exception-bitmap", NO_ARG, &get_exception_bitmap, 1 }, { "get-io-bitmap-address", NO_ARG, &get_io_bitmap, 1 }, { "get-tsc-offset", NO_ARG, &get_tsc_offset, 1 }, { "get-msr-bitmap", NO_ARG, &get_msr_bitmap, 1 }, { "get-msr-bitmap-address", NO_ARG, &get_msr_bitmap_address, 1 }, { "get-guest-pat", NO_ARG, &get_guest_pat, 1 }, { "get-guest-sysenter", NO_ARG, &get_guest_sysenter, 1 }, { "get-exit-reason", NO_ARG, &get_exit_reason, 1 }, { "get-x2apic-state", NO_ARG, &get_x2apic_state, 1 }, { "get-all", NO_ARG, &get_all, 1 }, { "run", NO_ARG, &run, 1 }, { "create", NO_ARG, &create, 1 }, { "destroy", NO_ARG, &destroy, 1 }, { "inject-nmi", NO_ARG, &inject_nmi, 1 }, { "force-reset", NO_ARG, &force_reset, 1 }, { "force-poweroff", NO_ARG, &force_poweroff, 1 }, { "get-active-cpus", NO_ARG, &get_active_cpus, 1 }, { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, { "get-intinfo", NO_ARG, &get_intinfo, 1 }, { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, #ifdef BHYVE_SNAPSHOT { "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE}, { "suspend", REQ_ARG, 0, SET_SUSPEND_FILE}, #endif }; const struct option intel_opts[] = { { "get-vmcs-pinbased-ctls", NO_ARG, &get_pinbased_ctls, 1 }, { "get-vmcs-procbased-ctls", NO_ARG, &get_procbased_ctls, 1 }, { "get-vmcs-procbased-ctls2", NO_ARG, &get_procbased_ctls2, 1 }, { "get-vmcs-guest-linear-address", NO_ARG, &get_vmcs_gla, 1 }, { "get-vmcs-guest-physical-address", NO_ARG, &get_vmcs_gpa, 1 }, { "get-vmcs-entry-interruption-info", NO_ARG, &get_vmcs_entry_interruption_info, 1}, { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, { "get-vmcs-cr4-shadow", NO_ARG, &get_cr4_shadow, 1 }, { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1 }, { "get-vmcs-tpr-threshold", NO_ARG, &get_tpr_threshold, 1 }, { "get-vmcs-vpid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, { "get-vmcs-entry-ctls", NO_ARG, &get_entry_ctls, 1 }, { "get-vmcs-instruction-error", NO_ARG, &get_inst_err, 1 }, { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, { "get-vmcs-host-cr0", NO_ARG, &get_host_cr0, 1 }, { "get-vmcs-exit-qualification", NO_ARG, &get_vmcs_exit_qualification, 1 }, { "get-vmcs-exit-inst-length", NO_ARG, &get_vmcs_exit_inst_length, 1 }, { "get-vmcs-interruptibility", NO_ARG, &get_vmcs_interruptibility, 1 }, { "get-vmcs-exit-interruption-error", NO_ARG, &get_vmcs_exit_interruption_error, 1 }, { "get-vmcs-exit-interruption-info", NO_ARG, &get_vmcs_exit_interruption_info, 1 }, { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, { "get-vmcs-host-cr3", NO_ARG, &get_host_cr3, 1 }, { "get-vmcs-host-cr4", NO_ARG, &get_host_cr4, 1 }, { "get-vmcs-host-rip", NO_ARG, &get_host_rip, 1 }, { "get-vmcs-host-rsp", NO_ARG, &get_host_rsp, 1 }, { "get-apic-access-address", NO_ARG, &get_apic_access_addr, 1}, { "get-virtual-apic-address", NO_ARG, &get_virtual_apic_addr, 1} }; const struct option amd_opts[] = { { "get-vmcb-intercepts", NO_ARG, &get_vmcb_intercept, 1 }, { "get-vmcb-asid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcb-exit-details", NO_ARG, &get_vmcb_exit_details, 1 }, { "get-vmcb-tlb-ctrl", NO_ARG, &get_vmcb_tlb_ctrl, 1 }, { "get-vmcb-virq", NO_ARG, &get_vmcb_virq, 1 }, { "get-avic-apic-bar", NO_ARG, &get_apic_access_addr, 1 }, { "get-avic-backing-page", NO_ARG, &get_virtual_apic_addr, 1 }, { "get-avic-table", NO_ARG, &get_avic_table, 1 } }; const 
struct option null_opt = { NULL, 0, NULL, 0 }; struct option *all_opts; char *cp; int optlen; optlen = sizeof(common_opts); if (cpu_intel) optlen += sizeof(intel_opts); else optlen += sizeof(amd_opts); optlen += sizeof(null_opt); all_opts = malloc(optlen); cp = (char *)all_opts; memcpy(cp, common_opts, sizeof(common_opts)); cp += sizeof(common_opts); if (cpu_intel) { memcpy(cp, intel_opts, sizeof(intel_opts)); cp += sizeof(intel_opts); } else { memcpy(cp, amd_opts, sizeof(amd_opts)); cp += sizeof(amd_opts); } memcpy(cp, &null_opt, sizeof(null_opt)); cp += sizeof(null_opt); return (all_opts); } static const char * wday_str(int idx) { static const char *weekdays[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }; if (idx >= 0 && idx < 7) return (weekdays[idx]); else return ("UNK"); } static const char * mon_str(int idx) { static const char *months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; if (idx >= 0 && idx < 12) return (months[idx]); else return ("UNK"); } static int show_memmap(struct vmctx *ctx) { char name[SPECNAMELEN + 1], numbuf[8]; vm_ooffset_t segoff; vm_paddr_t gpa; size_t maplen, seglen; int error, flags, prot, segid, delim; printf("Address Length Segment Offset "); printf("Prot Flags\n"); gpa = 0; while (1) { error = vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen, &prot, &flags); if (error) return (errno == ENOENT ? 0 : error); error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); if (error) return (error); printf("%-12lX", gpa); humanize_number(numbuf, sizeof(numbuf), maplen, "B", HN_AUTOSCALE, HN_NOSPACE); printf("%-12s", numbuf); printf("%-12s", name[0] ? name : "sysmem"); printf("%-12lX", segoff); printf("%c%c%c ", prot & PROT_READ ? 'R' : '-', prot & PROT_WRITE ? 'W' : '-', prot & PROT_EXEC ? 'X' : '-'); delim = '\0'; if (flags & VM_MEMMAP_F_WIRED) { printf("%cwired", delim); delim = '/'; } if (flags & VM_MEMMAP_F_IOMMU) { printf("%ciommu", delim); delim = '/'; } printf("\n"); gpa += maplen; } } static int show_memseg(struct vmctx *ctx) { char name[SPECNAMELEN + 1], numbuf[8]; size_t seglen; int error, segid; printf("ID Length Name\n"); segid = 0; while (1) { error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); if (error) return (errno == EINVAL ? 0 : error); if (seglen) { printf("%-4d", segid); humanize_number(numbuf, sizeof(numbuf), seglen, "B", HN_AUTOSCALE, HN_NOSPACE); printf("%-12s", numbuf); printf("%s", name[0] ? 
name : "sysmem"); printf("\n"); } segid++; } } #ifdef BHYVE_SNAPSHOT static int send_message(const char *vmname, nvlist_t *nvl) { struct sockaddr_un addr; int err = 0, socket_fd; socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); if (socket_fd < 0) { perror("Error creating bhyvectl socket"); err = errno; goto done; } memset(&addr, 0, sizeof(struct sockaddr_un)); snprintf(addr.sun_path, sizeof(addr.sun_path), "%s%s", BHYVE_RUN_DIR, vmname); addr.sun_family = AF_UNIX; addr.sun_len = SUN_LEN(&addr); if (connect(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) { perror("connect() failed"); err = errno; goto done; } if (nvlist_send(socket_fd, nvl) < 0) { perror("nvlist_send() failed"); err = errno; } done: nvlist_destroy(nvl); if (socket_fd >= 0) close(socket_fd); return (err); } static int snapshot_request(const char *vmname, const char *file, bool suspend) { nvlist_t *nvl; nvl = nvlist_create(0); nvlist_add_string(nvl, "cmd", "checkpoint"); nvlist_add_string(nvl, "filename", file); nvlist_add_bool(nvl, "suspend", suspend); return (send_message(vmname, nvl)); } #endif int main(int argc, char *argv[]) { char *vmname; - int error, ch, vcpu, ptenum; + int error, ch, vcpuid, ptenum; vm_paddr_t gpa_pmap; struct vm_exit vmexit; uint64_t rax, cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; uint64_t rsp, rip, rflags, efer, pat; uint64_t eptp, bm, addr, u64, pteval[4], *pte, info[2]; struct vmctx *ctx; + struct vcpu *vcpu; cpuset_t cpus; bool cpu_intel; uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; struct tm tm; struct option *opts; #ifdef BHYVE_SNAPSHOT char *checkpoint_file = NULL; #endif cpu_intel = cpu_vendor_intel(); opts = setup_options(cpu_intel); - vcpu = 0; + vcpuid = 0; vmname = NULL; assert_lapic_lvt = -1; progname = basename(argv[0]); while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { switch (ch) { case 0: break; case VMNAME: vmname = optarg; break; case VCPU: - vcpu = atoi(optarg); + vcpuid = atoi(optarg); break; case SET_MEM: memsize = atoi(optarg) * MB; memsize = roundup(memsize, 2 * MB); break; case SET_EFER: efer = strtoul(optarg, NULL, 0); set_efer = 1; break; case SET_CR0: cr0 = strtoul(optarg, NULL, 0); set_cr0 = 1; break; case SET_CR2: cr2 = strtoul(optarg, NULL, 0); set_cr2 = 1; break; case SET_CR3: cr3 = strtoul(optarg, NULL, 0); set_cr3 = 1; break; case SET_CR4: cr4 = strtoul(optarg, NULL, 0); set_cr4 = 1; break; case SET_DR0: dr0 = strtoul(optarg, NULL, 0); set_dr0 = 1; break; case SET_DR1: dr1 = strtoul(optarg, NULL, 0); set_dr1 = 1; break; case SET_DR2: dr2 = strtoul(optarg, NULL, 0); set_dr2 = 1; break; case SET_DR3: dr3 = strtoul(optarg, NULL, 0); set_dr3 = 1; break; case SET_DR6: dr6 = strtoul(optarg, NULL, 0); set_dr6 = 1; break; case SET_DR7: dr7 = strtoul(optarg, NULL, 0); set_dr7 = 1; break; case SET_RSP: rsp = strtoul(optarg, NULL, 0); set_rsp = 1; break; case SET_RIP: rip = strtoul(optarg, NULL, 0); set_rip = 1; break; case SET_RAX: rax = strtoul(optarg, NULL, 0); set_rax = 1; break; case SET_RFLAGS: rflags = strtoul(optarg, NULL, 0); set_rflags = 1; break; case DESC_BASE: desc_base = strtoul(optarg, NULL, 0); break; case DESC_LIMIT: desc_limit = strtoul(optarg, NULL, 0); break; case DESC_ACCESS: desc_access = strtoul(optarg, NULL, 0); break; case SET_CS: cs = strtoul(optarg, NULL, 0); set_cs = 1; break; case SET_DS: ds = strtoul(optarg, NULL, 0); set_ds = 1; break; case SET_ES: es = strtoul(optarg, NULL, 0); set_es = 1; break; case SET_FS: fs = strtoul(optarg, NULL, 0); set_fs = 1; break; case SET_GS: gs = strtoul(optarg, NULL, 0); set_gs = 1; break; 
case SET_SS: ss = strtoul(optarg, NULL, 0); set_ss = 1; break; case SET_TR: tr = strtoul(optarg, NULL, 0); set_tr = 1; break; case SET_LDTR: ldtr = strtoul(optarg, NULL, 0); set_ldtr = 1; break; case SET_X2APIC_STATE: x2apic_state = strtol(optarg, NULL, 0); set_x2apic_state = 1; break; case SET_CAP: capval = strtoul(optarg, NULL, 0); setcap = 1; break; case SET_RTC_TIME: rtc_secs = strtoul(optarg, NULL, 0); set_rtc_time = 1; break; case SET_RTC_NVRAM: rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0); set_rtc_nvram = 1; break; case RTC_NVRAM_OFFSET: rtc_nvram_offset = strtoul(optarg, NULL, 0); break; case GET_GPA_PMAP: gpa_pmap = strtoul(optarg, NULL, 0); get_gpa_pmap = 1; break; case CAPNAME: capname = optarg; break; case UNASSIGN_PPTDEV: unassign_pptdev = 1; if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) usage(cpu_intel); break; case ASSERT_LAPIC_LVT: assert_lapic_lvt = atoi(optarg); break; #ifdef BHYVE_SNAPSHOT case SET_CHECKPOINT_FILE: case SET_SUSPEND_FILE: if (checkpoint_file != NULL) usage(cpu_intel); checkpoint_file = optarg; vm_suspend_opt = (ch == SET_SUSPEND_FILE); break; #endif default: usage(cpu_intel); } } argc -= optind; argv += optind; if (vmname == NULL) usage(cpu_intel); error = 0; if (!error && create) error = vm_create(vmname); if (!error) { ctx = vm_open(vmname); if (ctx == NULL) { fprintf(stderr, "vm_open: %s could not be opened: %s\n", vmname, strerror(errno)); exit (1); } + vcpu = vm_vcpu_open(ctx, vcpuid); } if (!error && memsize) error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (!error && set_efer) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); + error = vm_set_register(vcpu, VM_REG_GUEST_EFER, efer); if (!error && set_cr0) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); + error = vm_set_register(vcpu, VM_REG_GUEST_CR0, cr0); if (!error && set_cr2) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR2, cr2); + error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2); if (!error && set_cr3) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); + error = vm_set_register(vcpu, VM_REG_GUEST_CR3, cr3); if (!error && set_cr4) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); + error = vm_set_register(vcpu, VM_REG_GUEST_CR4, cr4); if (!error && set_dr0) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR0, dr0); + error = vm_set_register(vcpu, VM_REG_GUEST_DR0, dr0); if (!error && set_dr1) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR1, dr1); + error = vm_set_register(vcpu, VM_REG_GUEST_DR1, dr1); if (!error && set_dr2) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR2, dr2); + error = vm_set_register(vcpu, VM_REG_GUEST_DR2, dr2); if (!error && set_dr3) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR3, dr3); + error = vm_set_register(vcpu, VM_REG_GUEST_DR3, dr3); if (!error && set_dr6) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR6, dr6); + error = vm_set_register(vcpu, VM_REG_GUEST_DR6, dr6); if (!error && set_dr7) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); + error = vm_set_register(vcpu, VM_REG_GUEST_DR7, dr7); if (!error && set_rsp) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); + error = vm_set_register(vcpu, VM_REG_GUEST_RSP, rsp); if (!error && set_rip) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); + error = vm_set_register(vcpu, VM_REG_GUEST_RIP, rip); if (!error && set_rax) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); + error = vm_set_register(vcpu, VM_REG_GUEST_RAX, rax); if (!error && set_rflags) { - error = 
vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + error = vm_set_register(vcpu, VM_REG_GUEST_RFLAGS, rflags); } if (!error && set_desc_ds) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, + error = vm_set_desc(vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); } if (!error && set_desc_es) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, + error = vm_set_desc(vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); } if (!error && set_desc_ss) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, + error = vm_set_desc(vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); } if (!error && set_desc_cs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, + error = vm_set_desc(vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); } if (!error && set_desc_fs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, + error = vm_set_desc(vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); } if (!error && set_desc_gs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, + error = vm_set_desc(vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); } if (!error && set_desc_tr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, + error = vm_set_desc(vcpu, VM_REG_GUEST_TR, desc_base, desc_limit, desc_access); } if (!error && set_desc_ldtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + error = vm_set_desc(vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); } if (!error && set_desc_gdtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + error = vm_set_desc(vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, 0); } if (!error && set_desc_idtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + error = vm_set_desc(vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, 0); } if (!error && set_cs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); + error = vm_set_register(vcpu, VM_REG_GUEST_CS, cs); if (!error && set_ds) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); + error = vm_set_register(vcpu, VM_REG_GUEST_DS, ds); if (!error && set_es) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); + error = vm_set_register(vcpu, VM_REG_GUEST_ES, es); if (!error && set_fs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); + error = vm_set_register(vcpu, VM_REG_GUEST_FS, fs); if (!error && set_gs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); + error = vm_set_register(vcpu, VM_REG_GUEST_GS, gs); if (!error && set_ss) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); + error = vm_set_register(vcpu, VM_REG_GUEST_SS, ss); if (!error && set_tr) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); + error = vm_set_register(vcpu, VM_REG_GUEST_TR, tr); if (!error && set_ldtr) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); + error = vm_set_register(vcpu, VM_REG_GUEST_LDTR, ldtr); if (!error && set_x2apic_state) - error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); + error = vm_set_x2apic_state(vcpu, x2apic_state); if (!error && unassign_pptdev) error = vm_unassign_pptdev(ctx, bus, slot, func); if (!error && inject_nmi) { - error = vm_inject_nmi(ctx, vcpu); + error = vm_inject_nmi(vcpu); } if (!error && assert_lapic_lvt != -1) { - error = vm_lapic_local_irq(ctx, vcpu, assert_lapic_lvt); + error = vm_lapic_local_irq(vcpu, assert_lapic_lvt); } if (!error && (get_memseg || get_all)) error = show_memseg(ctx); if (!error && (get_memmap || get_all)) error = show_memmap(ctx); if (!error) - error = get_all_registers(ctx, vcpu); + error = get_all_registers(vcpu, vcpuid); if (!error) - error = 
get_all_segments(ctx, vcpu); + error = get_all_segments(vcpu, vcpuid); if (!error) { if (cpu_intel) - error = get_misc_vmcs(ctx, vcpu); + error = get_misc_vmcs(vcpu, vcpuid); else - error = get_misc_vmcb(ctx, vcpu); + error = get_misc_vmcb(vcpu, vcpuid); } if (!error && (get_x2apic_state || get_all)) { - error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); + error = vm_get_x2apic_state(vcpu, &x2apic_state); if (error == 0) - printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); + printf("x2apic_state[%d]\t%d\n", vcpuid, x2apic_state); } if (!error && (get_eptp || get_all)) { if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); + error = vm_get_vmcs_field(vcpu, VMCS_EPTP, &eptp); else - error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_NPT_BASE, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_NPT_BASE, 8, &eptp); if (error == 0) printf("%s[%d]\t\t0x%016lx\n", - cpu_intel ? "eptp" : "rvi/npt", vcpu, eptp); + cpu_intel ? "eptp" : "rvi/npt", vcpuid, eptp); } if (!error && (get_exception_bitmap || get_all)) { if(cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_EXCEPTION_BITMAP, &bm); else - error = vm_get_vmcb_field(ctx, vcpu, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXC_INTERCEPT, 4, &bm); if (error == 0) - printf("exception_bitmap[%d]\t%#lx\n", vcpu, bm); + printf("exception_bitmap[%d]\t%#lx\n", vcpuid, bm); } if (!error && (get_io_bitmap || get_all)) { if (cpu_intel) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, + error = vm_get_vmcs_field(vcpu, VMCS_IO_BITMAP_A, &bm); if (error == 0) - printf("io_bitmap_a[%d]\t%#lx\n", vcpu, bm); - error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, + printf("io_bitmap_a[%d]\t%#lx\n", vcpuid, bm); + error = vm_get_vmcs_field(vcpu, VMCS_IO_BITMAP_B, &bm); if (error == 0) - printf("io_bitmap_b[%d]\t%#lx\n", vcpu, bm); + printf("io_bitmap_b[%d]\t%#lx\n", vcpuid, bm); } else { - error = vm_get_vmcb_field(ctx, vcpu, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_IO_PERM, 8, &bm); if (error == 0) - printf("io_bitmap[%d]\t%#lx\n", vcpu, bm); + printf("io_bitmap[%d]\t%#lx\n", vcpuid, bm); } } if (!error && (get_tsc_offset || get_all)) { uint64_t tscoff; if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, + error = vm_get_vmcs_field(vcpu, VMCS_TSC_OFFSET, &tscoff); else - error = vm_get_vmcb_field(ctx, vcpu, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_TSC_OFFSET, 8, &tscoff); if (error == 0) - printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); + printf("tsc_offset[%d]\t0x%016lx\n", vcpuid, tscoff); } if (!error && (get_msr_bitmap_address || get_all)) { if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, + error = vm_get_vmcs_field(vcpu, VMCS_MSR_BITMAP, &addr); else - error = vm_get_vmcb_field(ctx, vcpu, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_MSR_PERM, 8, &addr); if (error == 0) - printf("msr_bitmap[%d]\t\t%#lx\n", vcpu, addr); + printf("msr_bitmap[%d]\t\t%#lx\n", vcpuid, addr); } if (!error && (get_msr_bitmap || get_all)) { if (cpu_intel) { - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_MSR_BITMAP, &addr); } else { - error = vm_get_vmcb_field(ctx, vcpu, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_MSR_PERM, 8, &addr); } if (error == 0) - error = dump_msr_bitmap(vcpu, addr, cpu_intel); + error = dump_msr_bitmap(vcpuid, addr, cpu_intel); } if (!error && (get_vpid_asid || get_all)) { uint64_t vpid; if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); + error = vm_get_vmcs_field(vcpu, VMCS_VPID, &vpid); else - 
error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_ASID, 4, &vpid); if (error == 0) printf("%s[%d]\t\t0x%04lx\n", - cpu_intel ? "vpid" : "asid", vcpu, vpid); + cpu_intel ? "vpid" : "asid", vcpuid, vpid); } if (!error && (get_guest_pat || get_all)) { if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_GUEST_IA32_PAT, &pat); else - error = vm_get_vmcb_field(ctx, vcpu, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_GUEST_PAT, 8, &pat); if (error == 0) - printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); + printf("guest_pat[%d]\t\t0x%016lx\n", vcpuid, pat); } if (!error && (get_guest_sysenter || get_all)) { if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_GUEST_IA32_SYSENTER_CS, &cs); else - error = vm_get_vmcb_field(ctx, vcpu, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_SYSENTER_CS, 8, &cs); if (error == 0) - printf("guest_sysenter_cs[%d]\t%#lx\n", vcpu, cs); + printf("guest_sysenter_cs[%d]\t%#lx\n", vcpuid, cs); if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); else - error = vm_get_vmcb_field(ctx, vcpu, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_SYSENTER_ESP, 8, &rsp); if (error == 0) - printf("guest_sysenter_sp[%d]\t%#lx\n", vcpu, rsp); + printf("guest_sysenter_sp[%d]\t%#lx\n", vcpuid, rsp); if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, + error = vm_get_vmcs_field(vcpu, VMCS_GUEST_IA32_SYSENTER_EIP, &rip); else - error = vm_get_vmcb_field(ctx, vcpu, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_SYSENTER_EIP, 8, &rip); if (error == 0) - printf("guest_sysenter_ip[%d]\t%#lx\n", vcpu, rip); + printf("guest_sysenter_ip[%d]\t%#lx\n", vcpuid, rip); } if (!error && (get_exit_reason || get_all)) { if (cpu_intel) - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, + error = vm_get_vmcs_field(vcpu, VMCS_EXIT_REASON, &u64); else - error = vm_get_vmcb_field(ctx, vcpu, + error = vm_get_vmcb_field(vcpu, VMCB_OFF_EXIT_REASON, 8, &u64); if (error == 0) - printf("exit_reason[%d]\t%#lx\n", vcpu, u64); + printf("exit_reason[%d]\t%#lx\n", vcpuid, u64); } if (!error && setcap) { int captype; captype = vm_capability_name2type(capname); - error = vm_set_capability(ctx, vcpu, captype, capval); + error = vm_set_capability(vcpu, captype, capval); if (error != 0 && errno == ENOENT) printf("Capability \"%s\" is not available\n", capname); } if (!error && get_gpa_pmap) { error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum); if (error == 0) { printf("gpa %#lx:", gpa_pmap); pte = &pteval[0]; while (ptenum-- > 0) printf(" %#lx", *pte++); printf("\n"); } } if (!error && set_rtc_nvram) error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value); if (!error && (get_rtc_nvram || get_all)) { error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value); if (error == 0) { printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset, rtc_nvram_value); } } if (!error && set_rtc_time) error = vm_rtc_settime(ctx, rtc_secs); if (!error && (get_rtc_time || get_all)) { error = vm_rtc_gettime(ctx, &rtc_secs); if (error == 0) { gmtime_r(&rtc_secs, &tm); printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n", rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon), tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, 1900 + tm.tm_year); } } if (!error && (getcap || get_all)) { int captype, val, getcaptype; if (getcap && capname) getcaptype = vm_capability_name2type(capname); else getcaptype = -1; for (captype = 0; captype < VM_CAP_MAX; captype++) { 
if (getcaptype >= 0 && captype != getcaptype) continue; - error = vm_get_capability(ctx, vcpu, captype, &val); + error = vm_get_capability(vcpu, captype, &val); if (error == 0) { printf("Capability \"%s\" is %s on vcpu %d\n", vm_capability_type2name(captype), - val ? "set" : "not set", vcpu); + val ? "set" : "not set", vcpuid); } else if (errno == ENOENT) { error = 0; printf("Capability \"%s\" is not available\n", vm_capability_type2name(captype)); } else { break; } } } if (!error && (get_active_cpus || get_all)) { error = vm_active_cpus(ctx, &cpus); if (!error) print_cpus("active cpus", &cpus); } if (!error && (get_suspended_cpus || get_all)) { error = vm_suspended_cpus(ctx, &cpus); if (!error) print_cpus("suspended cpus", &cpus); } if (!error && (get_intinfo || get_all)) { - error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); + error = vm_get_intinfo(vcpu, &info[0], &info[1]); if (!error) { print_intinfo("pending", info[0]); print_intinfo("current", info[1]); } } if (!error && (get_stats || get_all)) { int i, num_stats; uint64_t *stats; struct timeval tv; const char *desc; - stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); + stats = vm_get_stats(vcpu, &tv, &num_stats); if (stats != NULL) { - printf("vcpu%d stats:\n", vcpu); + printf("vcpu%d stats:\n", vcpuid); for (i = 0; i < num_stats; i++) { desc = vm_get_stat_desc(ctx, i); printf("%-40s\t%ld\n", desc, stats[i]); } } } if (!error && (get_cpu_topology || get_all)) { uint16_t sockets, cores, threads, maxcpus; vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); printf("cpu_topology:\tsockets=%hu, cores=%hu, threads=%hu, " "maxcpus=%hu\n", sockets, cores, threads, maxcpus); } if (!error && run) { - error = vm_run(ctx, vcpu, &vmexit); + error = vm_run(vcpu, &vmexit); if (error == 0) - dump_vm_run_exitcode(&vmexit, vcpu); + dump_vm_run_exitcode(&vmexit, vcpuid); else printf("vm_run error %d\n", error); } if (!error && force_reset) error = vm_suspend(ctx, VM_SUSPEND_RESET); if (!error && force_poweroff) error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); if (error) printf("errno = %d\n", errno); if (!error && destroy) vm_destroy(ctx); #ifdef BHYVE_SNAPSHOT if (!error && checkpoint_file) error = snapshot_request(vmname, checkpoint_file, vm_suspend_opt); #endif free (opts); exit(error); } diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c index 09a9653ece34..6801acf75803 100644 --- a/usr.sbin/bhyveload/bhyveload.c +++ b/usr.sbin/bhyveload/bhyveload.c @@ -1,861 +1,867 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /*- * Copyright (c) 2011 Google, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "userboot.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) #define BSP 0 #define NDISKS 32 static char *host_base; static struct termios term, oldterm; static int disk_fd[NDISKS]; static int ndisks; static int consin_fd, consout_fd; static int need_reinit; static void *loader_hdl; static char *loader; static int explicit_loader; static jmp_buf jb; static char *vmname, *progname; static struct vmctx *ctx; +static struct vcpu *vcpu; static uint64_t gdtbase, cr3, rsp; static void cb_exit(void *arg, int v); /* * Console i/o callbacks */ static void cb_putc(void *arg __unused, int ch) { char c = ch; (void) write(consout_fd, &c, 1); } static int cb_getc(void *arg __unused) { char c; if (read(consin_fd, &c, 1) == 1) return (c); return (-1); } static int cb_poll(void *arg __unused) { int n; if (ioctl(consin_fd, FIONREAD, &n) >= 0) return (n > 0); return (0); } /* * Host filesystem i/o callbacks */ struct cb_file { int cf_isdir; size_t cf_size; struct stat cf_stat; union { int fd; DIR *dir; } cf_u; }; static int cb_open(void *arg __unused, const char *filename, void **hp) { struct cb_file *cf; char path[PATH_MAX]; if (!host_base) return (ENOENT); strlcpy(path, host_base, PATH_MAX); if (path[strlen(path) - 1] == '/') path[strlen(path) - 1] = 0; strlcat(path, filename, PATH_MAX); cf = malloc(sizeof(struct cb_file)); if (stat(path, &cf->cf_stat) < 0) { free(cf); return (errno); } cf->cf_size = cf->cf_stat.st_size; if (S_ISDIR(cf->cf_stat.st_mode)) { cf->cf_isdir = 1; cf->cf_u.dir = opendir(path); if (!cf->cf_u.dir) goto out; *hp = cf; return (0); } if (S_ISREG(cf->cf_stat.st_mode)) { cf->cf_isdir = 0; cf->cf_u.fd = open(path, O_RDONLY); if (cf->cf_u.fd < 0) goto out; *hp = cf; return (0); } out: free(cf); return (EINVAL); } static int cb_close(void *arg __unused, void *h) { struct cb_file *cf = h; if (cf->cf_isdir) closedir(cf->cf_u.dir); else close(cf->cf_u.fd); free(cf); return (0); } static int cb_isdir(void *arg __unused, void *h) { struct cb_file *cf = h; return (cf->cf_isdir); } static int cb_read(void *arg __unused, void *h, void *buf, size_t size, size_t *resid) { struct cb_file *cf = h; ssize_t sz; if (cf->cf_isdir) return (EINVAL); sz = read(cf->cf_u.fd, buf, size); if (sz < 0) return (EINVAL); *resid = size - sz; return (0); } static int cb_readdir(void *arg __unused, void *h, uint32_t *fileno_return, uint8_t *type_return, size_t *namelen_return, char *name) { struct cb_file *cf = h; struct dirent *dp; if (!cf->cf_isdir) return (EINVAL); dp = readdir(cf->cf_u.dir); if (!dp) return (ENOENT); /* * Note: d_namlen is in the range 0..255 and therefore less * than PATH_MAX so we don't need to test before copying. 
*/ *fileno_return = dp->d_fileno; *type_return = dp->d_type; *namelen_return = dp->d_namlen; memcpy(name, dp->d_name, dp->d_namlen); name[dp->d_namlen] = 0; return (0); } static int cb_seek(void *arg __unused, void *h, uint64_t offset, int whence) { struct cb_file *cf = h; if (cf->cf_isdir) return (EINVAL); if (lseek(cf->cf_u.fd, offset, whence) < 0) return (errno); return (0); } static int cb_stat(void *arg __unused, void *h, struct stat *sbp) { struct cb_file *cf = h; memset(sbp, 0, sizeof(struct stat)); sbp->st_mode = cf->cf_stat.st_mode; sbp->st_uid = cf->cf_stat.st_uid; sbp->st_gid = cf->cf_stat.st_gid; sbp->st_size = cf->cf_stat.st_size; sbp->st_mtime = cf->cf_stat.st_mtime; sbp->st_dev = cf->cf_stat.st_dev; sbp->st_ino = cf->cf_stat.st_ino; return (0); } /* * Disk image i/o callbacks */ static int cb_diskread(void *arg __unused, int unit, uint64_t from, void *to, size_t size, size_t *resid) { ssize_t n; if (unit < 0 || unit >= ndisks) return (EIO); n = pread(disk_fd[unit], to, size, from); if (n < 0) return (errno); *resid = size - n; return (0); } static int cb_diskwrite(void *arg __unused, int unit, uint64_t offset, void *src, size_t size, size_t *resid) { ssize_t n; if (unit < 0 || unit >= ndisks) return (EIO); n = pwrite(disk_fd[unit], src, size, offset); if (n < 0) return (errno); *resid = size - n; return (0); } static int cb_diskioctl(void *arg __unused, int unit, u_long cmd, void *data) { struct stat sb; if (unit < 0 || unit >= ndisks) return (EBADF); switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = 512; break; case DIOCGMEDIASIZE: if (fstat(disk_fd[unit], &sb) != 0) return (ENOTTY); if (S_ISCHR(sb.st_mode) && ioctl(disk_fd[unit], DIOCGMEDIASIZE, &sb.st_size) != 0) return (ENOTTY); *(off_t *)data = sb.st_size; break; default: return (ENOTTY); } return (0); } /* * Guest virtual machine i/o callbacks */ static int cb_copyin(void *arg __unused, const void *from, uint64_t to, size_t size) { char *ptr; to &= 0x7fffffff; ptr = vm_map_gpa(ctx, to, size); if (ptr == NULL) return (EFAULT); memcpy(ptr, from, size); return (0); } static int cb_copyout(void *arg __unused, uint64_t from, void *to, size_t size) { char *ptr; from &= 0x7fffffff; ptr = vm_map_gpa(ctx, from, size); if (ptr == NULL) return (EFAULT); memcpy(to, ptr, size); return (0); } static void cb_setreg(void *arg __unused, int r, uint64_t v) { int error; enum vm_reg_name vmreg; vmreg = VM_REG_LAST; switch (r) { case 4: vmreg = VM_REG_GUEST_RSP; rsp = v; break; default: break; } if (vmreg == VM_REG_LAST) { printf("test_setreg(%d): not implemented\n", r); cb_exit(NULL, USERBOOT_EXIT_QUIT); } - error = vm_set_register(ctx, BSP, vmreg, v); + error = vm_set_register(vcpu, vmreg, v); if (error) { perror("vm_set_register"); cb_exit(NULL, USERBOOT_EXIT_QUIT); } } static void cb_setmsr(void *arg __unused, int r, uint64_t v) { int error; enum vm_reg_name vmreg; vmreg = VM_REG_LAST; switch (r) { case MSR_EFER: vmreg = VM_REG_GUEST_EFER; break; default: break; } if (vmreg == VM_REG_LAST) { printf("test_setmsr(%d): not implemented\n", r); cb_exit(NULL, USERBOOT_EXIT_QUIT); } - error = vm_set_register(ctx, BSP, vmreg, v); + error = vm_set_register(vcpu, vmreg, v); if (error) { perror("vm_set_msr"); cb_exit(NULL, USERBOOT_EXIT_QUIT); } } static void cb_setcr(void *arg __unused, int r, uint64_t v) { int error; enum vm_reg_name vmreg; vmreg = VM_REG_LAST; switch (r) { case 0: vmreg = VM_REG_GUEST_CR0; break; case 3: vmreg = VM_REG_GUEST_CR3; cr3 = v; break; case 4: vmreg = VM_REG_GUEST_CR4; break; default: break; } if (vmreg == 
VM_REG_LAST) { printf("test_setcr(%d): not implemented\n", r); cb_exit(NULL, USERBOOT_EXIT_QUIT); } - error = vm_set_register(ctx, BSP, vmreg, v); + error = vm_set_register(vcpu, vmreg, v); if (error) { perror("vm_set_cr"); cb_exit(NULL, USERBOOT_EXIT_QUIT); } } static void cb_setgdt(void *arg __unused, uint64_t base, size_t size) { int error; - error = vm_set_desc(ctx, BSP, VM_REG_GUEST_GDTR, base, size - 1, 0); + error = vm_set_desc(vcpu, VM_REG_GUEST_GDTR, base, size - 1, 0); if (error != 0) { perror("vm_set_desc(gdt)"); cb_exit(NULL, USERBOOT_EXIT_QUIT); } gdtbase = base; } static void cb_exec(void *arg __unused, uint64_t rip) { int error; if (cr3 == 0) - error = vm_setup_freebsd_registers_i386(ctx, BSP, rip, gdtbase, + error = vm_setup_freebsd_registers_i386(vcpu, rip, gdtbase, rsp); else - error = vm_setup_freebsd_registers(ctx, BSP, rip, cr3, gdtbase, + error = vm_setup_freebsd_registers(vcpu, rip, cr3, gdtbase, rsp); if (error) { perror("vm_setup_freebsd_registers"); cb_exit(NULL, USERBOOT_EXIT_QUIT); } cb_exit(NULL, 0); } /* * Misc */ static void cb_delay(void *arg __unused, int usec) { usleep(usec); } static void cb_exit(void *arg __unused, int v) { tcsetattr(consout_fd, TCSAFLUSH, &oldterm); exit(v); } static void cb_getmem(void *arg __unused, uint64_t *ret_lowmem, uint64_t *ret_highmem) { *ret_lowmem = vm_get_lowmem_size(ctx); *ret_highmem = vm_get_highmem_size(ctx); } struct env { char *str; /* name=value */ SLIST_ENTRY(env) next; }; static SLIST_HEAD(envhead, env) envhead; static void addenv(const char *str) { struct env *env; env = malloc(sizeof(struct env)); if (env == NULL) err(EX_OSERR, "malloc"); env->str = strdup(str); if (env->str == NULL) err(EX_OSERR, "strdup"); SLIST_INSERT_HEAD(&envhead, env, next); } static char * cb_getenv(void *arg __unused, int num) { int i; struct env *env; i = 0; SLIST_FOREACH(env, &envhead, next) { if (i == num) return (env->str); i++; } return (NULL); } static int -cb_vm_set_register(void *arg __unused, int vcpu, int reg, uint64_t val) +cb_vm_set_register(void *arg __unused, int vcpuid, int reg, uint64_t val) { - return (vm_set_register(ctx, vcpu, reg, val)); + assert(vcpuid == BSP); + return (vm_set_register(vcpu, reg, val)); } static int -cb_vm_set_desc(void *arg __unused, int vcpu, int reg, uint64_t base, +cb_vm_set_desc(void *arg __unused, int vcpuid, int reg, uint64_t base, u_int limit, u_int access) { - return (vm_set_desc(ctx, vcpu, reg, base, limit, access)); + assert(vcpuid == BSP); + return (vm_set_desc(vcpu, reg, base, limit, access)); } static void cb_swap_interpreter(void *arg __unused, const char *interp_req) { /* * If the user specified a loader but we detected a mismatch, we should * not try to pivot to a different loader on them. 
*/ free(loader); if (explicit_loader == 1) { perror("requested loader interpreter does not match guest userboot"); cb_exit(NULL, 1); } if (interp_req == NULL || *interp_req == '\0') { perror("guest failed to request an interpreter"); cb_exit(NULL, 1); } if (asprintf(&loader, "/boot/userboot_%s.so", interp_req) == -1) err(EX_OSERR, "malloc"); need_reinit = 1; longjmp(jb, 1); } static struct loader_callbacks cb = { .getc = cb_getc, .putc = cb_putc, .poll = cb_poll, .open = cb_open, .close = cb_close, .isdir = cb_isdir, .read = cb_read, .readdir = cb_readdir, .seek = cb_seek, .stat = cb_stat, .diskread = cb_diskread, .diskwrite = cb_diskwrite, .diskioctl = cb_diskioctl, .copyin = cb_copyin, .copyout = cb_copyout, .setreg = cb_setreg, .setmsr = cb_setmsr, .setcr = cb_setcr, .setgdt = cb_setgdt, .exec = cb_exec, .delay = cb_delay, .exit = cb_exit, .getmem = cb_getmem, .getenv = cb_getenv, /* Version 4 additions */ .vm_set_register = cb_vm_set_register, .vm_set_desc = cb_vm_set_desc, /* Version 5 additions */ .swap_interpreter = cb_swap_interpreter, }; static int altcons_open(char *path) { struct stat sb; int err; int fd; /* * Allow stdio to be passed in so that the same string * can be used for the bhyveload console and bhyve com-port * parameters */ if (!strcmp(path, "stdio")) return (0); err = stat(path, &sb); if (err == 0) { if (!S_ISCHR(sb.st_mode)) err = ENOTSUP; else { fd = open(path, O_RDWR | O_NONBLOCK); if (fd < 0) err = errno; else consin_fd = consout_fd = fd; } } return (err); } static int disk_open(char *path) { int fd; if (ndisks >= NDISKS) return (ERANGE); fd = open(path, O_RDWR); if (fd < 0) return (errno); disk_fd[ndisks] = fd; ndisks++; return (0); } static void usage(void) { fprintf(stderr, "usage: %s [-S][-c <console-device>] [-d <disk-path>] [-e <name=value>]\n" " %*s [-h <host-path>] [-m memsize[K|k|M|m|G|g|T|t]] <vmname>\n", progname, (int)strlen(progname), ""); exit(1); } int main(int argc, char** argv) { void (*func)(struct loader_callbacks *, void *, int, int); uint64_t mem_size; int opt, error, memflags; progname = basename(argv[0]); memflags = 0; mem_size = 256 * MB; consin_fd = STDIN_FILENO; consout_fd = STDOUT_FILENO; while ((opt = getopt(argc, argv, "CSc:d:e:h:l:m:")) != -1) { switch (opt) { case 'c': error = altcons_open(optarg); if (error != 0) errx(EX_USAGE, "Could not open '%s'", optarg); break; case 'd': error = disk_open(optarg); if (error != 0) errx(EX_USAGE, "Could not open '%s'", optarg); break; case 'e': addenv(optarg); break; case 'h': host_base = optarg; break; case 'l': if (loader != NULL) errx(EX_USAGE, "-l can only be given once"); loader = strdup(optarg); if (loader == NULL) err(EX_OSERR, "malloc"); explicit_loader = 1; break; case 'm': error = vm_parse_memsize(optarg, &mem_size); if (error != 0) errx(EX_USAGE, "Invalid memsize '%s'", optarg); break; case 'C': memflags |= VM_MEM_F_INCORE; break; case 'S': memflags |= VM_MEM_F_WIRED; break; case '?': usage(); } } argc -= optind; argv += optind; if (argc != 1) usage(); vmname = argv[0]; need_reinit = 0; error = vm_create(vmname); if (error) { if (errno != EEXIST) { perror("vm_create"); exit(1); } need_reinit = 1; } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(1); } + vcpu = vm_vcpu_open(ctx, BSP); + /* * setjmp in the case the guest wants to swap out interpreter, * cb_swap_interpreter will swap out loader as appropriate and set * need_reinit so that we end up in a clean state once again.
*/ setjmp(jb); if (need_reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(1); } } vm_set_memflags(ctx, memflags); error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL); if (error) { perror("vm_setup_memory"); exit(1); } if (loader == NULL) { loader = strdup("/boot/userboot.so"); if (loader == NULL) err(EX_OSERR, "malloc"); } if (loader_hdl != NULL) dlclose(loader_hdl); loader_hdl = dlopen(loader, RTLD_LOCAL); if (!loader_hdl) { printf("%s\n", dlerror()); free(loader); return (1); } func = dlsym(loader_hdl, "loader_main"); if (!func) { printf("%s\n", dlerror()); free(loader); return (1); } tcgetattr(consout_fd, &term); oldterm = term; cfmakeraw(&term); term.c_cflag |= CLOCAL; tcsetattr(consout_fd, TCSAFLUSH, &term); addenv("smbios.bios.vendor=BHYVE"); addenv("boot_serial=1"); func(&cb, NULL, USERBOOT_VERSION_5, ndisks); free(loader); return (0); }
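
Note on the copyin/copyout callbacks above: they resolve guest addresses to host pointers with vm_map_gpa() and memcpy() through the mapping; the 0x7fffffff mask strips the high bits of the kernel virtual addresses userboot hands in. A minimal sketch of the same pattern as a standalone helper (the function name and error handling are illustrative, not part of this change):

#include <sys/types.h>

#include <machine/vmm.h>

#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <vmmapi.h>

/*
 * Copy a host buffer into guest physical memory the way cb_copyin() does:
 * translate the guest address into a host pointer with vm_map_gpa() and
 * memcpy() through it.  The mask mirrors bhyveload's handling of the
 * addresses userboot passes in.
 */
static int
copy_to_guest(struct vmctx *ctx, uint64_t gpa, const void *buf, size_t len)
{
	char *ptr;

	gpa &= 0x7fffffff;
	ptr = vm_map_gpa(ctx, gpa, len);
	if (ptr == NULL)
		return (EFAULT);
	memcpy(ptr, buf, len);
	return (0);
}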
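
The register and descriptor callbacks now funnel through a struct vcpu handle rather than a (ctx, vcpuid) pair. A minimal consumer-side sketch of the new libvmmapi interface, assuming an existing guest; the VM name, the register values, and the use of vcpu 0 for the BSP are illustrative:

#include <sys/types.h>

#include <machine/vmm.h>

#include <err.h>
#include <stdint.h>
#include <vmmapi.h>

int
main(void)
{
	struct vmctx *ctx;
	struct vcpu *vcpu;

	ctx = vm_open("myvm");			/* guest must already exist */
	if (ctx == NULL)
		err(1, "vm_open");
	vcpu = vm_vcpu_open(ctx, 0);		/* BSP */

	/* Program %rsp and the GDT base through the vcpu handle. */
	if (vm_set_register(vcpu, VM_REG_GUEST_RSP, 0x1000) != 0)
		err(1, "vm_set_register");
	if (vm_set_desc(vcpu, VM_REG_GUEST_GDTR, 0x2000, 0x17, 0) != 0)
		err(1, "vm_set_desc");

	vm_vcpu_close(vcpu);
	vm_close(ctx);
	return (0);
}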
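
cb_swap_interpreter() and the setjmp() in main() cooperate to restart loading with a different userboot when the guest requests one. A self-contained skeleton of that control flow; load_loader() is a hypothetical stand-in for the dlopen()/loader_main() sequence, and the "4th" interpreter request is made up for the sake of the example:

#include <setjmp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static jmp_buf jb;
static char *loader;
static int need_reinit;

/* Rebuild the loader path, flag a reinit, and jump back to the setup path. */
static void
swap_interpreter(const char *interp_req)
{
	free(loader);
	if (asprintf(&loader, "/boot/userboot_%s.so", interp_req) == -1)
		exit(1);
	need_reinit = 1;
	longjmp(jb, 1);
}

/* Hypothetical stand-in for dlopen(loader) + loader_main(). */
static void
load_loader(void)
{
	printf("%s %s\n", need_reinit ? "reloading" : "loading", loader);
	if (!need_reinit)
		swap_interpreter("4th");	/* pretend the guest asked for 4th */
}

int
main(void)
{
	loader = strdup("/boot/userboot.so");
	setjmp(jb);			/* swap_interpreter() lands back here */
	load_loader();
	free(loader);
	return (0);
}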
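
Before handing control to userboot, main() saves the console termios, switches it to raw mode with CLOCAL set, and cb_exit() restores the saved state on the way out. A compact sketch of that save/raw/restore pattern (the function names are illustrative):

#include <stdio.h>
#include <termios.h>
#include <unistd.h>

static struct termios oldterm;

/* Save the current settings and flip the console to raw mode, as main() does. */
static void
console_raw(int fd)
{
	struct termios term;

	tcgetattr(fd, &oldterm);
	term = oldterm;
	cfmakeraw(&term);
	term.c_cflag |= CLOCAL;
	tcsetattr(fd, TCSAFLUSH, &term);
}

/* Put the console back the way we found it, as cb_exit() does. */
static void
console_restore(int fd)
{
	tcsetattr(fd, TCSAFLUSH, &oldterm);
}

int
main(void)
{
	console_raw(STDOUT_FILENO);
	dprintf(STDOUT_FILENO, "console is in raw mode\r\n");
	console_restore(STDOUT_FILENO);
	return (0);
}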