diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 454d7ee21b36..a65b77300b3a 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -1,1800 +1,1802 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmmapi.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) /* * Size of the guard region before and after the virtual address space * mapping the guest physical memory. This must be a multiple of the * superpage size for performance reasons. */ #define VM_MMAP_GUARD_SIZE (4 * MB) #define PROT_RW (PROT_READ | PROT_WRITE) #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) struct vmctx { int fd; uint32_t lowmem_limit; int memflags; size_t lowmem; size_t highmem; char *baseaddr; char *name; }; #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) static int vm_device_open(const char *name) { int fd, len; char *vmfile; len = strlen("/dev/vmm/") + strlen(name) + 1; vmfile = malloc(len); assert(vmfile != NULL); snprintf(vmfile, len, "/dev/vmm/%s", name); /* Open the device file */ fd = open(vmfile, O_RDWR, 0); free(vmfile); return (fd); } int vm_create(const char *name) { /* Try to load vmm(4) module before creating a guest. */ if (modfind("vmm") < 0) kldload("vmm"); return (CREATE(name)); } struct vmctx * vm_open(const char *name) { struct vmctx *vm; int saved_errno; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); vm->fd = -1; vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); if ((vm->fd = vm_device_open(vm->name)) < 0) goto err; return (vm); err: saved_errno = errno; free(vm); errno = saved_errno; return (NULL); } void vm_close(struct vmctx *vm) { assert(vm != NULL); close(vm->fd); free(vm); } void vm_destroy(struct vmctx *vm) { assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); DESTROY(vm->name); free(vm); } int vm_parse_memsize(const char *opt, size_t *ret_memsize) { char *endptr; size_t optval; int error; optval = strtoul(opt, &endptr, 0); if (*opt != '\0' && *endptr == '\0') { /* * For the sake of backward compatibility if the memory size * specified on the command line is less than a megabyte then * it is interpreted as being in units of MB. */ if (optval < MB) optval *= MB; *ret_memsize = optval; error = 0; } else error = expand_number(opt, ret_memsize); return (error); } uint32_t vm_get_lowmem_limit(struct vmctx *ctx) { return (ctx->lowmem_limit); } void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { ctx->lowmem_limit = limit; } void vm_set_memflags(struct vmctx *ctx, int flags) { ctx->memflags = flags; } int vm_get_memflags(struct vmctx *ctx) { return (ctx->memflags); } /* * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot) { struct vm_memmap memmap; int error, flags; memmap.gpa = gpa; memmap.segid = segid; memmap.segoff = off; memmap.len = len; memmap.prot = prot; memmap.flags = 0; if (ctx->memflags & VM_MEM_F_WIRED) memmap.flags |= VM_MEMMAP_F_WIRED; /* * If this mapping already exists then don't create it again. This * is the common case for SYSMEM mappings created by bhyveload(8). */ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); if (error == 0 && gpa == memmap.gpa) { if (segid != memmap.segid || off != memmap.segoff || prot != memmap.prot || flags != memmap.flags) { errno = EEXIST; return (-1); } else { return (0); } } error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size) { *guest_baseaddr = ctx->baseaddr; *lowmem_size = ctx->lowmem; *highmem_size = ctx->highmem; return (0); } int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len) { struct vm_munmap munmap; int error; munmap.gpa = gpa; munmap.len = len; error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap); return (error); } int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct vm_memmap memmap; int error; bzero(&memmap, sizeof(struct vm_memmap)); memmap.gpa = *gpa; error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); if (error == 0) { *gpa = memmap.gpa; *segid = memmap.segid; *segoff = memmap.segoff; *len = memmap.len; *prot = memmap.prot; *flags = memmap.flags; } return (error); } /* * Return 0 if the segments are identical and non-zero otherwise. * * This is slightly complicated by the fact that only device memory segments * are named. */ static int cmpseg(size_t len, const char *str, size_t len2, const char *str2) { if (len == len2) { if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) return (0); } return (-1); } static int vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { struct vm_memseg memseg; size_t n; int error; /* * If the memory segment has already been created then just return. * This is the usual case for the SYSMEM segment created by userspace * loaders like bhyveload(8). */ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, sizeof(memseg.name)); if (error) return (error); if (memseg.len != 0) { if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { errno = EINVAL; return (-1); } else { return (0); } } bzero(&memseg, sizeof(struct vm_memseg)); memseg.segid = segid; memseg.len = len; if (name != NULL) { n = strlcpy(memseg.name, name, sizeof(memseg.name)); if (n >= sizeof(memseg.name)) { errno = ENAMETOOLONG; return (-1); } } error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); return (error); } int vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, size_t bufsize) { struct vm_memseg memseg; size_t n; int error; memseg.segid = segid; error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); if (error == 0) { *lenp = memseg.len; n = strlcpy(namebuf, memseg.name, bufsize); if (n >= bufsize) { errno = ENAMETOOLONG; error = -1; } } return (error); } static int setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) { char *ptr; int error, flags; /* Map 'len' bytes starting at 'gpa' in the guest address space */ error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); if (error) return (error); flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap into the process address space on the host */ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); if (ptr == MAP_FAILED) return (-1); return (0); } int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { size_t objsize, len; vm_paddr_t gpa; char *baseaddr, *ptr; int error; assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then * create another 'highmem' segment above 4GB for the remainder. */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; ctx->highmem = memsize - ctx->lowmem_limit; objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; objsize = ctx->lowmem; } error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); if (error) return (error); /* * Stake out a contiguous region covering the guest physical memory * and the adjoining guard regions. */ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (ptr == MAP_FAILED) return (-1); baseaddr = ptr + VM_MMAP_GUARD_SIZE; if (ctx->highmem > 0) { gpa = 4*GB; len = ctx->highmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } if (ctx->lowmem > 0) { gpa = 0; len = ctx->lowmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } ctx->baseaddr = baseaddr; return (0); } /* * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in * the lowmem or highmem regions. * * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. * The instruction emulation code depends on this behavior. */ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { if (ctx->lowmem > 0) { if (gaddr < ctx->lowmem && len <= ctx->lowmem && gaddr + len <= ctx->lowmem) return (ctx->baseaddr + gaddr); } if (ctx->highmem > 0) { if (gaddr >= 4*GB) { if (gaddr < 4*GB + ctx->highmem && len <= ctx->highmem && gaddr + len <= 4*GB + ctx->highmem) return (ctx->baseaddr + gaddr); } } return (NULL); } vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr) { vm_paddr_t offaddr; offaddr = (char *)addr - ctx->baseaddr; if (ctx->lowmem > 0) if (offaddr <= ctx->lowmem) return (offaddr); if (ctx->highmem > 0) if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem) return (offaddr); return ((vm_paddr_t)-1); } const char * vm_get_name(struct vmctx *ctx) { return (ctx->name); } size_t vm_get_lowmem_size(struct vmctx *ctx) { return (ctx->lowmem); } size_t vm_get_highmem_size(struct vmctx *ctx) { return (ctx->highmem); } void * vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) { char pathname[MAXPATHLEN]; size_t len2; char *base, *ptr; int fd, error, flags; fd = -1; ptr = MAP_FAILED; if (name == NULL || strlen(name) == 0) { errno = EINVAL; goto done; } error = vm_alloc_memseg(ctx, segid, len, name); if (error) goto done; strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); strlcat(pathname, ctx->name, sizeof(pathname)); strlcat(pathname, ".", sizeof(pathname)); strlcat(pathname, name, sizeof(pathname)); fd = open(pathname, O_RDWR); if (fd < 0) goto done; /* * Stake out a contiguous region covering the device memory and the * adjoining guard regions. */ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (base == MAP_FAILED) goto done; flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap the devmem region in the host address space */ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); done: if (fd >= 0) close(fd); return (ptr); } int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; vmsegdesc.desc.base = base; vmsegdesc.desc.limit = limit; vmsegdesc.desc.access = access; error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); return (error); } int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); if (error == 0) { *base = vmsegdesc.desc.base; *limit = vmsegdesc.desc.limit; *access = vmsegdesc.desc.access; } return (error); } int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) { int error; error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, &seg_desc->access); return (error); } int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; vmreg.regval = val; error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); return (error); } int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.cpuid = vcpu; vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset); return (error); } int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.cpuid = vcpu; vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset); return (error); } int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { int error; struct vm_run vmrun; bzero(&vmrun, sizeof(vmrun)); vmrun.cpuid = vcpu; error = ioctl(ctx->fd, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); return (error); } int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { struct vm_suspend vmsuspend; bzero(&vmsuspend, sizeof(vmsuspend)); vmsuspend.how = how; return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); } int vm_reinit(struct vmctx *ctx) { return (ioctl(ctx->fd, VM_REINIT, 0)); } int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vm_exception exc; exc.cpuid = vcpu; exc.vector = vector; exc.error_code = errcode; exc.error_code_valid = errcode_valid; exc.restart_instruction = restart_instruction; return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); } int vm_apicid2vcpu(struct vmctx *ctx __unused, int apicid) { /* * The apic id associated with the 'vcpu' has the same numerical value * as the 'vcpu' itself. */ return (apicid); } int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); } int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq)); } int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) { struct vm_lapic_msi vmmsi; bzero(&vmmsi, sizeof(vmmsi)); vmmsi.addr = addr; vmmsi.msg = msg; return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); } int vm_ioapic_assert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); } int vm_ioapic_pincount(struct vmctx *ctx, int *pincount) { return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); } int vm_readwrite_kernemu_device(struct vmctx *ctx, int vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value) { struct vm_readwrite_kernemu_device irp = { .vcpuid = vcpu, .access_width = fls(size) - 1, .gpa = gpa, .value = write ? *value : ~0ul, }; long cmd = (write ? VM_SET_KERNEMU_DEV : VM_GET_KERNEMU_DEV); int rc; rc = ioctl(ctx->fd, cmd, &irp); if (rc == 0 && !write) *value = irp.value; return (rc); } int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); } int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); } int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); } int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger) { struct vm_isa_irq_trigger isa_irq_trigger; bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); isa_irq_trigger.atpic_irq = atpic_irq; isa_irq_trigger.trigger = trigger; return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); } int vm_inject_nmi(struct vmctx *ctx, int vcpu) { struct vm_nmi vmnmi; bzero(&vmnmi, sizeof(vmnmi)); vmnmi.cpuid = vcpu; return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); } static const char *capstrmap[] = { [VM_CAP_HALT_EXIT] = "hlt_exit", [VM_CAP_MTRAP_EXIT] = "mtrap_exit", [VM_CAP_PAUSE_EXIT] = "pause_exit", [VM_CAP_UNRESTRICTED_GUEST] = "unrestricted_guest", [VM_CAP_ENABLE_INVPCID] = "enable_invpcid", [VM_CAP_BPT_EXIT] = "bpt_exit", }; int vm_capability_name2type(const char *capname) { int i; for (i = 0; i < (int)nitems(capstrmap); i++) { if (strcmp(capstrmap[i], capname) == 0) return (i); } return (-1); } const char * vm_capability_type2name(int type) { if (type >= 0 && type < (int)nitems(capstrmap)) return (capstrmap[type]); return (NULL); } int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); } int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); } int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; pptmmio.hpa = hpa; return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); } int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; return (ioctl(ctx->fd, VM_UNMAP_PPTDEV_MMIO, &pptmmio)); } int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { struct vm_pptdev_msi pptmsi; bzero(&pptmsi, sizeof(pptmsi)); pptmsi.vcpu = vcpu; pptmsi.bus = bus; pptmsi.slot = slot; pptmsi.func = func; pptmsi.msg = msg; pptmsi.addr = addr; pptmsi.numvec = numvec; return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct vm_pptdev_msix pptmsix; bzero(&pptmsix, sizeof(pptmsix)); pptmsix.vcpu = vcpu; pptmsix.bus = bus; pptmsix.slot = slot; pptmsix.func = func; pptmsix.idx = idx; pptmsix.msg = msg; pptmsix.addr = addr; pptmsix.vector_control = vector_control; return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev ppt; bzero(&ppt, sizeof(ppt)); ppt.bus = bus; ppt.slot = slot; ppt.func = func; return ioctl(ctx->fd, VM_PPTDEV_DISABLE_MSIX, &ppt); } uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) { static _Thread_local uint64_t *stats_buf; static _Thread_local u_int stats_count; uint64_t *new_stats; struct vm_stats vmstats; u_int count, index; bool have_stats; have_stats = false; vmstats.cpuid = vcpu; count = 0; for (index = 0;; index += nitems(vmstats.statbuf)) { vmstats.index = index; if (ioctl(ctx->fd, VM_STATS, &vmstats) != 0) break; if (stats_count < index + vmstats.num_entries) { new_stats = realloc(stats_buf, (index + vmstats.num_entries) * sizeof(uint64_t)); if (new_stats == NULL) { errno = ENOMEM; return (NULL); } stats_count = index + vmstats.num_entries; stats_buf = new_stats; } memcpy(stats_buf + index, vmstats.statbuf, vmstats.num_entries * sizeof(uint64_t)); count += vmstats.num_entries; have_stats = true; if (vmstats.num_entries != nitems(vmstats.statbuf)) break; } if (have_stats) { if (ret_entries) *ret_entries = count; if (ret_tv) *ret_tv = vmstats.tv; return (stats_buf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); *state = x2apic.state; return (error); } int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; x2apic.state = state; error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); return (error); } /* * From Intel Vol 3a: * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT */ int vcpu_reset(struct vmctx *vmctx, int vcpu) { int error; uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; uint32_t desc_access, desc_limit; uint16_t sel; zero = 0; rflags = 0x2; error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; rip = 0xfff0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; /* * According to Intels Software Developer Manual CR0 should be * initialized with CR0_ET | CR0_NW | CR0_CD but that crashes some * guests like Windows. */ cr0 = CR0_NE; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR2, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) goto done; cr4 = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; /* * CS: present, r/w, accessed, 16-bit, byte granularity, usable */ desc_base = 0xffff0000; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0xf000; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) goto done; /* * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, zero)) != 0) goto done; /* General purpose registers */ rdx = 0xf00; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R8, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R9, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R10, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R11, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R12, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R13, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R14, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_R15, zero)) != 0) goto done; /* GDTR, IDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; /* TR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0000008b; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) goto done; /* LDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x00000082; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DR6, 0xffff0ff0)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DR7, 0x400)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_INTR_SHADOW, zero)) != 0) goto done; error = 0; done: return (error); } int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) { int error, i; struct vm_gpa_pte gpapte; bzero(&gpapte, sizeof(gpapte)); gpapte.gpa = gpa; error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); if (error == 0) { *num = gpapte.ptenum; for (i = 0; i < gpapte.ptenum; i++) pte[i] = gpapte.pte[i]; } return (error); } int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) { int error; struct vm_hpet_cap cap; bzero(&cap, sizeof(struct vm_hpet_cap)); error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); if (capabilities != NULL) *capabilities = cap.capabilities; return (error); } int vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = ioctl(ctx->fd, VM_GLA2GPA, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } int vm_gla2gpa_nofault(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } #ifndef min #define min(a,b) (((a) < (b)) ? (a) : (b)) #endif int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault) { void *va; uint64_t gpa, off; int error, i, n; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; iov[i].iov_len = 0; } while (len) { assert(iovcnt > 0); error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = MIN(len, PAGE_SIZE - off); va = vm_map_gpa(ctx, gpa, n); if (va == NULL) return (EFAULT); iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; gla += n; len -= n; } return (0); } void -vm_copy_teardown(struct vmctx *ctx __unused, int vcpu __unused, - struct iovec *iov __unused, int iovcnt __unused) +vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused) { + /* + * Intentionally empty. This is used by the instruction + * emulation code shared with the kernel. The in-kernel + * version of this is non-empty. + */ } void -vm_copyin(struct vmctx *ctx __unused, int vcpu __unused, struct iovec *iov, - void *vp, size_t len) +vm_copyin(struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; size_t n; dst = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); src = iov->iov_base; bcopy(src, dst, n); iov++; dst += n; len -= n; } } void -vm_copyout(struct vmctx *ctx __unused, int vcpu __unused, const void *vp, - struct iovec *iov, size_t len) +vm_copyout(const void *vp, struct iovec *iov, size_t len) { const char *src; char *dst; size_t n; src = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); dst = iov->iov_base; bcopy(src, dst, n); iov++; src += n; len -= n; } } static int vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) { struct vm_cpuset vm_cpuset; int error; bzero(&vm_cpuset, sizeof(struct vm_cpuset)); vm_cpuset.which = which; vm_cpuset.cpusetsize = sizeof(cpuset_t); vm_cpuset.cpus = cpus; error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); return (error); } int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); } int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); } int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); } int vm_activate_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); return (error); } int vm_suspend_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); return (error); } int vm_resume_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); return (error); } int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); if (error == 0) { *info1 = vmii.info1; *info2 = vmii.info2; } return (error); } int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; vmii.info1 = info1; error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); return (error); } int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; rtcdata.value = value; error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); return (error); } int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); if (error == 0) *retval = rtcdata.value; return (error); } int vm_rtc_settime(struct vmctx *ctx, time_t secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); rtctime.secs = secs; error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); return (error); } int vm_rtc_gettime(struct vmctx *ctx, time_t *secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); if (error == 0) *secs = rtctime.secs; return (error); } int vm_restart_instruction(void *arg, int vcpu) { struct vmctx *ctx = arg; return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } int vm_snapshot_req(struct vm_snapshot_meta *meta) { if (ioctl(meta->ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { #ifdef SNAPSHOT_DEBUG fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", __func__, meta->dev_name, errno); #endif return (-1); } return (0); } int vm_restore_time(struct vmctx *ctx) { int dummy; dummy = 0; return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); } int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { struct vm_cpu_topology topology; bzero(&topology, sizeof (struct vm_cpu_topology)); topology.sockets = sockets; topology.cores = cores; topology.threads = threads; topology.maxcpus = maxcpus; return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); } int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { struct vm_cpu_topology topology; int error; bzero(&topology, sizeof (struct vm_cpu_topology)); error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); if (error == 0) { *sockets = topology.sockets; *cores = topology.cores; *threads = topology.threads; *maxcpus = topology.maxcpus; } return (error); } /* Keep in sync with machine/vmm_dev.h. */ static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_GETNEXT, VM_MUNMAP_MEMSEG, VM_SET_REGISTER, VM_GET_REGISTER, VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR, VM_SET_REGISTER_SET, VM_GET_REGISTER_SET, VM_SET_KERNEMU_DEV, VM_GET_KERNEMU_DEV, VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, VM_PPTDEV_MSIX, VM_UNMAP_PPTDEV_MMIO, VM_PPTDEV_DISABLE_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, VM_GLA2GPA_NOFAULT, VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, VM_SET_INTINFO, VM_GET_INTINFO, VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY }; int vm_limit_rights(struct vmctx *ctx) { cap_rights_t rights; size_t ncmds; cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); if (caph_rights_limit(ctx->fd, &rights) != 0) return (-1); ncmds = nitems(vm_ioctl_cmds); if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, ncmds) != 0) return (-1); return (0); } /* * Avoid using in new code. Operations on the fd should be wrapped here so that * capability rights can be kept in sync. */ int vm_get_device_fd(struct vmctx *ctx) { return (ctx->fd); } /* Legacy interface, do not use. */ const cap_ioctl_t * vm_get_ioctls(size_t *len) { cap_ioctl_t *cmds; if (len == NULL) { cmds = malloc(sizeof(vm_ioctl_cmds)); if (cmds == NULL) return (NULL); bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); return (cmds); } *len = nitems(vm_ioctl_cmds); return (NULL); } diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index b26f12f7c60e..82c3dcdf3118 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -1,276 +1,273 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include #include #include #include #include #include /* * API version for out-of-tree consumers like grub-bhyve for making compile * time decisions. */ -#define VMMAPI_VERSION 0103 /* 2 digit major followed by 2 digit minor */ +#define VMMAPI_VERSION 0104 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vmctx; struct vm_snapshot_meta; enum x2apic_state; /* * Different styles of mapping the memory assigned to a VM into the address * space of the controlling process. */ enum vm_mmap_style { VM_MMAP_NONE, /* no mapping */ VM_MMAP_ALL, /* fully and statically mapped */ VM_MMAP_SPARSE, /* mappings created on-demand */ }; /* * 'flags' value passed to 'vm_set_memflags()'. */ #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ /* * Identifiers for memory segments: * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. * - the remaining identifiers can be used to create devmem segments. */ enum { VM_SYSMEM, VM_BOOTROM, VM_FRAMEBUFFER, VM_PCIROM, }; __BEGIN_DECLS /* * Get the length and name of the memory segment identified by 'segid'. * Note that system memory segments are identified with a nul name. * * Returns 0 on success and non-zero otherwise. */ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, size_t namesiz); /* * Iterate over the guest address space. This function finds an address range * that starts at an address >= *gpa. * * Returns 0 if the next address range was found and non-zero otherwise. */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size); /* * Create a device memory segment identified by 'segid'. * * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. */ void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len); /* * Map the memory segment identified by 'segid' into the guest address space * at [gpa,gpa+len) with protection 'prot'. */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t segoff, size_t len, int prot); int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len); int vm_create(const char *name); struct vmctx *vm_open(const char *name); void vm_close(struct vmctx *ctx); void vm_destroy(struct vmctx *ctx); int vm_limit_rights(struct vmctx *ctx); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); /* inverse operation to vm_map_gpa - extract guest address from host pointer */ vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); int vm_gla2gpa_nofault(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); const char *vm_get_name(struct vmctx *ctx); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); int vm_readwrite_kernemu_device(struct vmctx *ctx, int vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value); int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger); int vm_inject_nmi(struct vmctx *ctx, int vcpu); int vm_capability_name2type(const char *capname); const char *vm_capability_type2name(int type); int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval); int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val); int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len); int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func); int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries); const char *vm_get_stat_desc(struct vmctx *ctx, int index); int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s); int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. * The 'iovcnt' should be big enough to accommodate all GPA segments. * * retval fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Error */ int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault); -void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, - void *host_dst, size_t len); -void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, - struct iovec *guest_iov, size_t len); -void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, - int iovcnt); +void vm_copyin(struct iovec *guest_iov, void *host_dst, size_t len); +void vm_copyout(const void *host_src, struct iovec *guest_iov, size_t len); +void vm_copy_teardown(struct iovec *iov, int iovcnt); /* RTC */ int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); int vm_rtc_settime(struct vmctx *ctx, time_t secs); int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vmctx *ctx, int vcpu); int vm_suspend_cpu(struct vmctx *ctx, int vcpu); int vm_resume_cpu(struct vmctx *ctx, int vcpu); /* CPU topology */ int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); /* * FreeBSD specific APIs */ int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp); int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); /* * Save and restore */ int vm_snapshot_req(struct vm_snapshot_meta *meta); int vm_restore_time(struct vmctx *ctx); /* * Deprecated interfaces, do not use them in new code. */ int vm_get_device_fd(struct vmctx *ctx); const cap_ioctl_t *vm_get_ioctls(size_t *len); __END_DECLS #endif /* _VMMAPI_H_ */ diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 7896a4997f53..a223eb17afe9 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -1,788 +1,785 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #include struct vm_snapshot_meta; #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_GUEST_DR0, VM_REG_GUEST_DR1, VM_REG_GUEST_DR2, VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, VM_REG_GUEST_ENTRY_INST_LENGTH, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. * A prefix and a suffix are added to the name specified by the user. * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. */ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MIN_NAMELEN 6 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); struct vcpu; struct vm; struct vm_exception; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; enum snapshot_req; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vcpui, register_t rip, struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef void * (*vmi_vcpu_init_func_t)(void *vmi, struct vcpu *vcpu, int vcpu_id); typedef void (*vmi_vcpu_cleanup_func_t)(void *vcpui); typedef int (*vmi_get_register_t)(void *vcpui, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vcpui, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vcpui, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vcpui, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vcpui, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vcpui, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vcpui); typedef void (*vmi_vlapic_cleanup)(struct vlapic *vlapic); typedef int (*vmi_snapshot_t)(void *vmi, struct vm_snapshot_meta *meta); typedef int (*vmi_snapshot_vcpu_t)(void *vcpui, struct vm_snapshot_meta *meta); typedef int (*vmi_restore_tsc_t)(void *vcpui, uint64_t now); struct vmm_ops { vmm_init_func_t modinit; /* module wide initialization */ vmm_cleanup_func_t modcleanup; vmm_resume_func_t modresume; vmi_init_func_t init; /* vm-specific initialization */ vmi_run_func_t run; vmi_cleanup_func_t cleanup; vmi_vcpu_init_func_t vcpu_init; vmi_vcpu_cleanup_func_t vcpu_cleanup; vmi_get_register_t getreg; vmi_set_register_t setreg; vmi_get_desc_t getdesc; vmi_set_desc_t setdesc; vmi_get_cap_t getcap; vmi_set_cap_t setcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; /* checkpoint operations */ vmi_snapshot_t snapshot; vmi_snapshot_vcpu_t vcpu_snapshot; vmi_restore_tsc_t restore_tsc; }; extern const struct vmm_ops vmm_ops_intel; extern const struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); /* * APIs that modify the guest memory map require all vcpus to be frozen. */ int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot, int flags); int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); /* * APIs that inspect the guest memory map require only a *single* vcpu to * be frozen. This acts like a read lock on the guest memory map since any * modification requires *all* vcpus to be frozen. */ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, struct vm_object **objptr); vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); int vm_inject_extint(struct vm *vm, int vcpu); int vm_extint_pending(struct vm *vm, int vcpuid); void vm_extint_clear(struct vm *vm, int vcpuid); int vcpu_vcpuid(struct vcpu *vcpu); struct vm *vcpu_vm(struct vcpu *vcpu); struct vcpu *vm_vcpu(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vm *vm, int cpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); int vm_suspend_cpu(struct vm *vm, int vcpu); int vm_resume_cpu(struct vm *vm, int vcpu); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); int vm_restore_time(struct vm *vm); #ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * If the rendezvous is being initiated from a vcpu context then the * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. */ typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); int vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); #endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { return (*((uintptr_t *)(info->rptr)) != 0); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } static __inline int vcpu_reqidle(struct vm_eventinfo *info) { return (*info->iptr); } int vcpu_debugged(struct vm *vm, int vcpuid); /* * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return false otherwise. */ bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vm *vm, int vcpu) { struct thread *td; td = curthread; return (td->td_ast != 0 || td->td_owepreempt != 0); } #endif void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. */ int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); /* * Function used to keep track of the guest's TSC offset. The * offset is used by the virutalization extensions to provide a consistent * value for the Time Stamp Counter to the guest. * * Return value is 0 on success and non-zero on failure. */ int vm_set_tsc_offset(struct vm *vm, int vcpu_id, uint64_t offset); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * retval is_fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *is_fault); -void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, - int num_copyinfo); -void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, - void *kaddr, size_t len); -void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, - struct vm_copyinfo *copyinfo, size_t len); +void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo); +void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len); +void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vm *vm, int vcpuid); int vcpu_trap_wbinvd(struct vm *vm, int vcpuid); #endif /* KERNEL */ #ifdef _KERNEL #define VM_MAXCPU 16 /* maximum virtual cpus */ #endif /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_BPT_EXIT, VM_CAP_RDPID, VM_CAP_RDTSCP, VM_CAP_IPI_EXIT, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, PAGING_MODE_64_LA57, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. MOV) */ uint16_t op_flags; }; _Static_assert(sizeof(struct vie_op) == 4, "ABI"); _Static_assert(_Alignof(struct vie_op) == 2, "ABI"); #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ /* The following fields are all zeroed upon restart. */ #define vie_startzero num_processed uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ vex_present:1, /* VEX prefixed */ vex_l:1, /* L bit */ index:4, /* SIB byte */ base:4; /* SIB byte */ uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; uint8_t vex_reg:4, /* vvvv: first source register specifier */ vex_pp:2, /* pp */ _sparebits:2; uint8_t _sparebytes[2]; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ uint8_t _sparebyte; struct vie_op op; /* opcode description */ }; _Static_assert(sizeof(struct vie) == 64, "ABI"); _Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI"); _Static_assert(__offsetof(struct vie, scale) == 24, "ABI"); _Static_assert(__offsetof(struct vie, base_register) == 28, "ABI"); enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, VM_EXITCODE_BPT, VM_EXITCODE_IPI, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. */ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; struct { int inst_length; } bpt; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; uint64_t intr_status; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct { uint32_t mode; uint8_t vector; cpuset_t dmask; } ipi; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); } static __inline void vm_inject_gp(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); } static __inline void vm_inject_ac(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); } void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); int vm_restart_instruction(void *vm, int vcpuid); #endif /* _VMM_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 2a4ff4799cc0..bffba382b667 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1,3010 +1,3007 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int vcpuid; /* (o) */ int hostcpu; /* (o) vcpu's host cpu */ int reqidle; /* (i) request vcpu to idle */ struct vm *vm; /* (o) */ void *cookie; /* (i) cpu-specific data */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ int exception_pending; /* (i) exception pending */ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ uint64_t tsc_offset; /* (o) TSC offsetting */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { size_t len; bool sysmem; struct vm_object *object; }; #define VM_MAX_MEMSEGS 4 struct mem_map { vm_paddr_t gpa; size_t len; vm_ooffset_t segoff; int segid; int prot; int flags; }; #define VM_MAX_MEMMAPS 8 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ int suspend; /* (i) stop VM execution */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ void *rendezvous_arg; /* (x) rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ struct vmspace *vmspace; /* (o) guest's address space */ char name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */ struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ /* The following describe the vm cpu topology */ uint16_t sockets; /* (o) num of sockets */ uint16_t cores; /* (o) num of cores/socket */ uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max pluggable cpus */ }; #define VMM_CTR0(vcpu, format) \ VCPU_CTR0((vcpu)->vm, (vcpu)->vcpuid, format) #define VMM_CTR1(vcpu, format, p1) \ VCPU_CTR1((vcpu)->vm, (vcpu)->vcpuid, format, p1) #define VMM_CTR2(vcpu, format, p1, p2) \ VCPU_CTR2((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2) #define VMM_CTR3(vcpu, format, p1, p2, p3) \ VCPU_CTR3((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3) #define VMM_CTR4(vcpu, format, p1, p2, p3, p4) \ VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4) static int vmm_initialized; static void vmmops_panic(void); static void vmmops_panic(void) { panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()"); } #define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ DEFINE_IFUNC(static, ret_type, vmmops_##opname, args) \ { \ if (vmm_is_intel()) \ return (vmm_ops_intel.opname); \ else if (vmm_is_svm()) \ return (vmm_ops_amd.opname); \ else \ return ((ret_type (*)args)vmmops_panic); \ } DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) DEFINE_VMMOPS_IFUNC(void, modresume, (void)) DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t rip, struct pmap *pmap, struct vm_eventinfo *info)) DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, int vcpu_id)) DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vcpui, int num, struct seg_desc *desc)) DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vcpui, int num, struct seg_desc *desc)) DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, vm_offset_t max)) DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vcpui)) DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (struct vlapic *vlapic)) #ifdef BHYVE_SNAPSHOT DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta)) DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, struct vm_snapshot_meta *meta)) DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) #endif #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() SDT_PROVIDER_DEFINE(vmm); static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); /* * Halt the guest if all vcpus are executing a HLT instruction with * interrupts disabled. */ static int halt_detection_enabled = 1; SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static int trace_guest_exceptions; SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); static int trap_wbinvd; SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0, "WBINVD triggers a VM-exit"); static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) { switch (state) { case VCPU_IDLE: return ("idle"); case VCPU_FROZEN: return ("frozen"); case VCPU_RUNNING: return ("running"); case VCPU_SLEEPING: return ("sleeping"); default: return ("unknown"); } } #endif static __inline void * vcpu_cookie(struct vm *vm, int i) { return (vm->vcpu[i].cookie); } static void vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; vmmops_vlapic_cleanup(vcpu->vlapic); vmmops_vcpu_cleanup(vcpu->cookie); vcpu->cookie = NULL; if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); } } static void vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = &vm->vcpu[vcpu_id]; if (create) { KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " "initialized", vcpu_id)); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vm = vm; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); vcpu->tsc_offset = 0; } vcpu->cookie = vmmops_vcpu_init(vm->cookie, vcpu, vcpu_id); vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } int vcpu_trace_exceptions(struct vm *vm, int vcpuid) { return (trace_guest_exceptions); } int vcpu_trap_wbinvd(struct vm *vm, int vcpuid) { return (trap_wbinvd); } struct vm_exit * vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; if (cpuid < 0 || cpuid >= vm->maxcpus) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; return (&vcpu->exitinfo); } static int vmm_init(void) { int error; if (!vmm_is_hw_supported()) return (ENXIO); vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) return (error); vmm_resume_p = vmmops_modresume; return (vmmops_modinit(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: if (vmm_is_hw_supported()) { vmmdev_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; } else { error = ENXIO; } break; case MOD_UNLOAD: if (vmm_is_hw_supported()) { error = vmmdev_cleanup(); if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) lapic_ipi_free(vmm_ipinum); error = vmmops_modcleanup(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } } else { error = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). */ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); if (create) vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); for (i = 0; i < vm->maxcpus; i++) vcpu_init(vm, i, create); } /* * The default CPU topology is a single thread per package. */ u_int cores_per_package = 1; u_int threads_per_core = 1; int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) == VM_MAX_NAMELEN + 1) return (EINVAL); vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ vm->threads = threads_per_core; /* XXX backwards compatibility */ vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { if (maxcpus != 0) return (EINVAL); /* XXX remove when supported */ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); /* XXX need to check sockets * cores * threads == vCPU, how? */ vm->sockets = sockets; vm->cores = cores; vm->threads = threads; vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { struct mem_map *mm; int i; ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); if (destroy) vrtc_cleanup(vm->vrtc); else vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (i = 0; i < vm->maxcpus; i++) vcpu_cleanup(vm, i, destroy); vmmops_cleanup(vm->cookie); /* * System memory is removed from the guest address space only when * the VM is destroyed. This is because the mapping remains the same * across VM reset. * * Device memory can be relocated by the guest (e.g. using PCI BARs) * so those mappings are removed on a VM reset. */ for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (destroy || !sysmem_mapping(vm, mm)) vm_free_memmap(vm, i); } if (destroy) { for (i = 0; i < VM_MAX_MEMSEGS; i++) vm_free_memseg(vm, i); vmmops_vmspace_free(vm->vmspace); vm->vmspace = NULL; } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } /* * Return 'true' if 'gpa' is allocated in the guest address space. * * This function is called in the context of a running vcpu which acts as * an implicit lock on 'vm->mem_maps[]'. */ bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) { struct mem_map *mm; int i; #ifdef INVARIANTS int hostcpu, state; state = vcpu_get_state(vm, vcpuid, &hostcpu); KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); #endif for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) return (true); /* 'gpa' is sysmem or devmem */ } if (ppt_is_mmio(vm, gpa)) return (true); /* 'gpa' is pci passthru mmio */ return (false); } int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { struct mem_seg *seg; vm_object_t obj; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); if (len == 0 || (len & PAGE_MASK)) return (EINVAL); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { if (seg->len == len && seg->sysmem == sysmem) return (EEXIST); else return (EINVAL); } obj = vm_object_allocate(OBJT_SWAP, len >> PAGE_SHIFT); if (obj == NULL) return (ENOMEM); seg->len = len; seg->object = obj; seg->sysmem = sysmem; return (0); } int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, vm_object_t *objptr) { struct mem_seg *seg; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[ident]; if (len) *len = seg->len; if (sysmem) *sysmem = seg->sysmem; if (objptr) *objptr = seg->object; return (0); } void vm_free_memseg(struct vm *vm, int ident) { struct mem_seg *seg; KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, ("%s: invalid memseg ident %d", __func__, ident)); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { vm_object_deallocate(seg->object); bzero(seg, sizeof(struct mem_seg)); } } int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, size_t len, int prot, int flags) { struct mem_seg *seg; struct mem_map *m, *map; vm_ooffset_t last; int i, error; if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) return (EINVAL); if (flags & ~VM_MEMMAP_F_WIRED) return (EINVAL); if (segid < 0 || segid >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[segid]; if (seg->object == NULL) return (EINVAL); last = first + len; if (first < 0 || first >= last || last > seg->len) return (EINVAL); if ((gpa | first | last) & PAGE_MASK) return (EINVAL); map = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->len == 0) { map = m; break; } } if (map == NULL) return (ENOSPC); error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, len, 0, VMFS_NO_SPACE, prot, prot, 0); if (error != KERN_SUCCESS) return (EFAULT); vm_object_reference(seg->object); if (flags & VM_MEMMAP_F_WIRED) { error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM : EFAULT); } } map->gpa = gpa; map->len = len; map->segoff = first; map->segid = segid; map->prot = prot; map->flags = flags; return (0); } int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len) { struct mem_map *m; int i; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->gpa == gpa && m->len == len && (m->flags & VM_MEMMAP_F_IOMMU) == 0) { vm_free_memmap(vm, i); return (0); } } return (EINVAL); } int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct mem_map *mm, *mmnext; int i; mmnext = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len == 0 || mm->gpa < *gpa) continue; if (mmnext == NULL || mm->gpa < mmnext->gpa) mmnext = mm; } if (mmnext != NULL) { *gpa = mmnext->gpa; if (segid) *segid = mmnext->segid; if (segoff) *segoff = mmnext->segoff; if (len) *len = mmnext->len; if (prot) *prot = mmnext->prot; if (flags) *flags = mmnext->flags; return (0); } else { return (ENOENT); } } static void vm_free_memmap(struct vm *vm, int ident) { struct mem_map *mm; int error __diagused; mm = &vm->mem_maps[ident]; if (mm->len) { error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, mm->gpa + mm->len); KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", __func__, error)); bzero(mm, sizeof(struct mem_map)); } } static __inline bool sysmem_mapping(struct vm *vm, struct mem_map *mm) { if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) return (true); else return (false); } vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm) { struct mem_map *mm; vm_paddr_t maxaddr; int i; maxaddr = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (sysmem_mapping(vm, mm)) { if (maxaddr < mm->gpa + mm->len) maxaddr = mm->gpa + mm->len; } } return (maxaddr); } static void vm_iommu_modify(struct vm *vm, bool map) { int i, sz; vm_paddr_t gpa, hpa; struct mem_map *mm; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (!sysmem_mapping(vm, mm)) continue; if (map) { KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, ("iommu map found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) continue; mm->flags |= VM_MEMMAP_F_IOMMU; } else { if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) continue; mm->flags &= ~VM_MEMMAP_F_IOMMU; KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, ("iommu unmap found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); } gpa = mm->gpa; while (gpa < mm->gpa + mm->len) { vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); vm_gpa_release(cookie); hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); } else { iommu_remove_mapping(vm->iommu, gpa, sz); } gpa += PAGE_SIZE; } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. */ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false) #define vm_iommu_map(vm) vm_iommu_modify((vm), true) int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) vm_iommu_unmap(vm); return (0); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vmm_sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); return (error); } void * vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { int i, count, pageoff; struct mem_map *mm; vm_page_t m; #ifdef INVARIANTS /* * All vcpus are frozen by ioctls that modify the memory map * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is * guaranteed if at least one vcpu is in the VCPU_FROZEN state. */ int state; KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d", __func__, vcpuid)); for (i = 0; i < vm->maxcpus; i++) { if (vcpuid != -1 && vcpuid != i) continue; state = vcpu_get_state(vm, i, NULL); KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", __func__, state)); } #endif pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); count = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) { count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); break; } } if (count == 1) { *cookie = m; return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); } else { *cookie = NULL; return (NULL); } } void vm_gpa_release(void *cookie) { vm_page_t m = cookie; vm_page_unwire(m, PQ_ACTIVE); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (vmmops_getreg(vcpu_cookie(vm, vcpu), reg, retval)); } int vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) { struct vcpu *vcpu; int error; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; error = vmmops_setreg(vcpu->cookie, reg, val); if (error || reg != VM_REG_GUEST_RIP) return (error); /* Set 'nextrip' to match the value of %rip */ VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); vcpu->nextrip = val; return (0); } static bool is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (true); default: return (false); } } static bool is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (true); default: return (false); } } int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_getdesc(vcpu_cookie(vm, vcpu), reg, desc)); } int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_setdesc(vcpu_cookie(vm, vcpu), reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { struct vcpu *vcpu; int error; vcpu = &vm->vcpu[vcpuid]; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu->reqidle = 1; vcpu_notify_event_locked(vcpu, false); VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ do { \ if (vcpuid >= 0) \ VCPU_CTR0(vm, vcpuid, fmt); \ else \ VM_CTR0(vm, fmt); \ } while (0) static int vm_handle_rendezvous(struct vm *vm, int vcpuid) { struct thread *td; int error; KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus), ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); error = 0; td = curthread; mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus); if (vcpuid != -1 && CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); vm->rendezvous_func = NULL; wakeup(&vm->rendezvous_func); break; } RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if (td_ast_pending(td, TDA_SUSPEND)) { mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) return (error); mtx_lock(&vm->rendezvous_mtx); } } mtx_unlock(&vm->rendezvous_mtx); return (0); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { struct vcpu *vcpu; const char *wmesg; struct thread *td; int error, t, vcpu_halted, vm_halted; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu = &vm->vcpu[vcpuid]; vcpu_halted = 0; vm_halted = 0; error = 0; td = curthread; vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from vmmops_run() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) break; if (vm_nmi_pending(vm, vcpuid)) break; if (!intr_disabled) { if (vm_extint_pending(vm, vcpuid) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vm, vcpuid)) break; if (vcpu_debugged(vm, vcpuid)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; VCPU_CTR0(vm, vcpuid, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); vmm_stat_incr(vcpu, VCPU_IDLE_TICKS, ticks - t); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); if (error != 0) { if (vcpu_halted) { CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); } return (error); } vcpu_lock(vcpu); } } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) { int rv, ftype; struct vm_map *map; struct vcpu *vcpu; struct vm_exit *vme; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) { VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", ftype == VM_PROT_READ ? "accessed" : "dirty", vme->u.paging.gpa); goto done; } } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: return (0); } static int vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) { struct vie *vie; struct vcpu *vcpu; struct vm_exit *vme; uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error, fault; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ error = fault = 0; } if (error || fault) return (error); if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", vme->rip + cs_base); *retu = true; /* dump instruction bytes in userspace */ return (0); } /* * Update 'nextrip' based on the length of the emulated instruction. */ vme->inst_length = vie->num_processed; vcpu->nextrip += vie->num_processed; VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " "decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, mread, mwrite, retu); return (error); } static int vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) { int error, i; struct vcpu *vcpu; struct thread *td; error = 0; vcpu = &vm->vcpu[vcpuid]; td = curthread; CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } else { VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); vcpu_unlock(vcpu); error = vm_handle_rendezvous(vm, vcpuid); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm, i, false); } } *retu = true; return (error); } static int vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) { struct vcpu *vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); vcpu->reqidle = 0; vcpu_unlock(vcpu); *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm, i, false); } return (0); } void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_DEBUG; } void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vm_vcpu(vm, vcpuid), VMEXIT_RENDEZVOUS, 1); } void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_REQIDLE; vmm_stat_incr(vm_vcpu(vm, vcpuid), VMEXIT_REQIDLE, 1); } void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vm_vcpu(vm, vcpuid), VMEXIT_ASTPENDING, 1); } int vm_run(struct vm *vm, struct vm_run *vmrun) { struct vm_eventinfo evinfo; int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; vcpuid = vmrun->cpuid; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_func; evinfo.sptr = &vm->suspend; evinfo.iptr = &vcpu->reqidle; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_fpustate(vcpu); vcpu_require_state(vm, vcpuid, VCPU_RUNNING); error = vmmops_run(vcpu->cookie, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); save_guest_fpustate(vcpu); vmm_stat_incr(vcpu, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_REQIDLE: error = vm_handle_reqidle(vm, vcpuid, &retu); break; case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vm, vcpuid, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: error = vm_handle_rendezvous(vm, vcpuid); break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vm, vcpuid, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vm, vcpuid, vme, &retu); break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: case VM_EXITCODE_VMINSN: vm_inject_ud(vm, vcpuid); break; default: retu = true; /* handled in userland */ break; } } /* * VM_EXITCODE_INST_EMUL could access the apic which could transform the * exit code into VM_EXITCODE_IPI. */ if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) { retu = false; error = vm_handle_ipi(vm, vcpuid, vme, &retu); } if (error == 0 && retu == false) goto restart; vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1); VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); /* copy the exit information */ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); return (error); } int vm_restart_instruction(void *arg, int vcpuid) { struct vm *vm; struct vcpu *vcpu; enum vcpu_state state; uint64_t rip; int error __diagused; vm = arg; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; state = vcpu_get_state(vm, vcpuid, NULL); if (state == VCPU_RUNNING) { /* * When a vcpu is "running" the next instruction is determined * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. * Thus setting 'inst_length' to zero will cause the current * instruction to be restarted. */ vcpu->exitinfo.inst_length = 0; VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " "setting inst_length to zero", vcpu->exitinfo.rip); } else if (state == VCPU_FROZEN) { /* * When a vcpu is "frozen" it is outside the critical section * around vmmops_run() and 'nextrip' points to the next * instruction. Thus instruction restart is achieved by setting * 'nextrip' to the vcpu's %rip. */ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); KASSERT(!error, ("%s: error %d getting rip", __func__, error)); VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " "nextrip from %#lx to %#lx", vcpu->nextrip, rip); vcpu->nextrip = rip; } else { panic("%s: invalid state %d", __func__, state); } return (0); } int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) { struct vcpu *vcpu; int type, vector; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. */ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). */ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. */ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); } int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) { struct vcpu *vcpu; uint64_t info1, info2; int valid; KASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); vcpu = &vm->vcpu[vcpuid]; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vm, vcpuid, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vcpu *vcpu; uint64_t regval; int error __diagused; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (vector < 0 || vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. */ if (vector == IDT_DF) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } if (errcode_valid) { /* * Exceptions don't deliver an error code in real mode. */ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); if (!(regval & CR0_PE)) errcode_valid = 0; } /* * From section 26.6.1 "Interruptibility State" in Intel SDM: * * Event blocking by "STI" or "MOV SS" is cleared after guest executes * one instruction or incurs an exception. */ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", __func__, error)); if (restart_instruction) vm_restart_instruction(vm, vcpuid); vcpu->exception_pending = 1; vcpu->exc_vector = vector; vcpu->exc_errcode = errcode; vcpu->exc_errcode_valid = errcode_valid; VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); return (0); } void vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, int errcode) { struct vm *vm; int error __diagused, restart_instruction; vm = vmarg; restart_instruction = 1; error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); } void vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) { struct vm *vm; int error __diagused; vm = vmarg; VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_nmi_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->nmi_pending); } void vm_nmi_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vcpu, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int vm_inject_extint(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_extint_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->extint_pending); } void vm_extint_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vcpu, VCPU_EXTINT_COUNT, 1); } int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_getcap(vcpu_cookie(vm, vcpu), type, retval)); } int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_setcap(vcpu_cookie(vm, vcpu), type, val)); } struct vm * vcpu_vm(struct vcpu *vcpu) { return (vcpu->vm); } int vcpu_vcpuid(struct vcpu *vcpu) { return (vcpu->vcpuid); } struct vcpu * vm_vcpu(struct vm *vm, int vcpuid) { return (&vm->vcpu[vcpuid]); } struct vlapic * vm_lapic(struct vm *vm, int cpu) { return (vm->vcpu[cpu].vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } bool vmm_is_pptdev(int bus, int slot, int func) { int b, f, i, n, s; char *val, *cp, *cp2; bool found; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! */ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = false; for (i = 0; names[i] != NULL && !found; i++) { cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = true; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { int error; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) { struct vcpu *vcpu; enum vcpu_state state; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_activate_cpu(struct vm *vm, int vcpuid) { if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->active_cpus)) return (EBUSY); VCPU_CTR0(vm, vcpuid, "activated"); CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); return (0); } int vm_suspend_cpu(struct vm *vm, int vcpuid) { int i; if (vcpuid < -1 || vcpuid >= vm->maxcpus) return (EINVAL); if (vcpuid == -1) { vm->debug_cpus = vm->active_cpus; for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm, i, false); } } else { if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); vcpu_notify_event(vm, vcpuid, false); } return (0); } int vm_resume_cpu(struct vm *vm, int vcpuid) { if (vcpuid < -1 || vcpuid >= vm->maxcpus) return (EINVAL); if (vcpuid == -1) { CPU_ZERO(&vm->debug_cpus); } else { if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) return (EINVAL); CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vm *vm, int vcpuid) { return (CPU_ISSET(vcpuid, &vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } void * vcpu_stats(struct vcpu *vcpu) { return (vcpu->stats); } int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; return (0); } int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (state >= X2APIC_STATE_LAST) return (EINVAL); vm->vcpu[vcpuid].x2apic_state = state; vlapic_set_x2apic_state(vm, vcpuid, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) { struct vcpu *vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } struct vmspace * vm_get_vmspace(struct vm *vm) { return (vm->vmspace); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } int vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { int error, i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus), ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpuid' is one * of the targets of the rendezvous. */ RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); error = vm_handle_rendezvous(vm, vcpuid); if (error != 0) return (error); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm->rendezvous_func = func; mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &dest)) vcpu_notify_event(vm, i, false); } return (vm_handle_rendezvous(vm, vcpuid)); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } struct vpmtmr * vm_pmtmr(struct vm *vm) { return (vm->vpmtmr); } struct vrtc * vm_rtc(struct vm *vm) { return (vm->vrtc); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void -vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, - int num_copyinfo) +vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { - vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); + vm_copy_teardown(copyinfo, num_copyinfo); return (EFAULT); } else { *fault = 0; return (0); } } void -vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, - size_t len) +vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void -vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, - struct vm_copyinfo *copyinfo, size_t len) +vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. Since * these are global stats, only return the values with for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm_vcpu(vm, vcpu), VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vm->vmspace)); } } static void vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm_vcpu(vm, vcpu), VMM_MEM_WIRED, PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); #ifdef BHYVE_SNAPSHOT static int vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta) { uint64_t tsc, now; int ret; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = &vm->vcpu[i]; SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); /* * Save the absolute TSC value by adding now to tsc_offset. * * It will be turned turned back into an actual offset when the * TSC restore function is called */ tsc = now + vcpu->tsc_offset; SNAPSHOT_VAR_OR_LEAVE(tsc, meta, ret, done); } done: return (ret); } static int vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta) { int ret; ret = vm_snapshot_vcpus(vm, meta); if (ret != 0) goto done; done: return (ret); } static int vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta) { int error; struct vcpu *vcpu; uint16_t i, maxcpus; error = 0; maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = &vm->vcpu[i]; error = vmmops_vcpu_snapshot(vcpu->cookie, meta); if (error != 0) { printf("%s: failed to snapshot vmcs/vmcb data for " "vCPU: %d; error: %d\n", __func__, i, error); goto done; } } done: return (error); } /* * Save kernel-side structures to user-space for snapshotting. */ int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) { int ret = 0; switch (meta->dev_req) { case STRUCT_VMX: ret = vmmops_snapshot(vm->cookie, meta); break; case STRUCT_VMCX: ret = vm_snapshot_vcpu(vm, meta); break; case STRUCT_VM: ret = vm_snapshot_vm(vm, meta); break; case STRUCT_VIOAPIC: ret = vioapic_snapshot(vm_ioapic(vm), meta); break; case STRUCT_VLAPIC: ret = vlapic_snapshot(vm, meta); break; case STRUCT_VHPET: ret = vhpet_snapshot(vm_hpet(vm), meta); break; case STRUCT_VATPIC: ret = vatpic_snapshot(vm_atpic(vm), meta); break; case STRUCT_VATPIT: ret = vatpit_snapshot(vm_atpit(vm), meta); break; case STRUCT_VPMTMR: ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); break; case STRUCT_VRTC: ret = vrtc_snapshot(vm_rtc(vm), meta); break; default: printf("%s: failed to find the requested type %#x\n", __func__, meta->dev_req); ret = (EINVAL); } return (ret); } int vm_set_tsc_offset(struct vm *vm, int vcpuid, uint64_t offset) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->tsc_offset = offset; return (0); } int vm_restore_time(struct vm *vm) { int error; uint64_t now; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); error = vhpet_restore_time(vm_hpet(vm)); if (error) return (error); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = &vm->vcpu[i]; error = vmmops_restore_tsc(vcpu->cookie, vcpu->tsc_offset - now); if (error) return (error); } return (0); } #endif diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 3b48eee2d8ed..0b774e72b0c3 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -1,2944 +1,2944 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Sandvine, Inc. * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include #include #include #include #include #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #include #include #include #include #include #include #include #define __diagused #define KASSERT(exp,msg) assert((exp)) #define panic(...) errx(4, __VA_ARGS__) #endif /* _KERNEL */ #include #include #include /* struct vie_op.op_type */ enum { VIE_OP_TYPE_NONE = 0, VIE_OP_TYPE_MOV, VIE_OP_TYPE_MOVSX, VIE_OP_TYPE_MOVZX, VIE_OP_TYPE_AND, VIE_OP_TYPE_OR, VIE_OP_TYPE_SUB, VIE_OP_TYPE_TWO_BYTE, VIE_OP_TYPE_PUSH, VIE_OP_TYPE_CMP, VIE_OP_TYPE_POP, VIE_OP_TYPE_MOVS, VIE_OP_TYPE_GROUP1, VIE_OP_TYPE_STOS, VIE_OP_TYPE_BITTEST, VIE_OP_TYPE_TWOB_GRP15, VIE_OP_TYPE_ADD, VIE_OP_TYPE_TEST, VIE_OP_TYPE_BEXTR, VIE_OP_TYPE_LAST }; /* struct vie_op.op_flags */ #define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ #define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ #define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ #define VIE_OP_F_NO_MODRM (1 << 3) #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) static const struct vie_op three_byte_opcodes_0f38[256] = { [0xF7] = { .op_byte = 0xF7, .op_type = VIE_OP_TYPE_BEXTR, }, }; static const struct vie_op two_byte_opcodes[256] = { [0xAE] = { .op_byte = 0xAE, .op_type = VIE_OP_TYPE_TWOB_GRP15, }, [0xB6] = { .op_byte = 0xB6, .op_type = VIE_OP_TYPE_MOVZX, }, [0xB7] = { .op_byte = 0xB7, .op_type = VIE_OP_TYPE_MOVZX, }, [0xBA] = { .op_byte = 0xBA, .op_type = VIE_OP_TYPE_BITTEST, .op_flags = VIE_OP_F_IMM8, }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, }, }; static const struct vie_op one_byte_opcodes[256] = { [0x03] = { .op_byte = 0x03, .op_type = VIE_OP_TYPE_ADD, }, [0x0F] = { .op_byte = 0x0F, .op_type = VIE_OP_TYPE_TWO_BYTE }, [0x0B] = { .op_byte = 0x0B, .op_type = VIE_OP_TYPE_OR, }, [0x2B] = { .op_byte = 0x2B, .op_type = VIE_OP_TYPE_SUB, }, [0x39] = { .op_byte = 0x39, .op_type = VIE_OP_TYPE_CMP, }, [0x3B] = { .op_byte = 0x3B, .op_type = VIE_OP_TYPE_CMP, }, [0x88] = { .op_byte = 0x88, .op_type = VIE_OP_TYPE_MOV, }, [0x89] = { .op_byte = 0x89, .op_type = VIE_OP_TYPE_MOV, }, [0x8A] = { .op_byte = 0x8A, .op_type = VIE_OP_TYPE_MOV, }, [0x8B] = { .op_byte = 0x8B, .op_type = VIE_OP_TYPE_MOV, }, [0xA1] = { .op_byte = 0xA1, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, [0xA3] = { .op_byte = 0xA3, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, }, [0xA4] = { .op_byte = 0xA4, .op_type = VIE_OP_TYPE_MOVS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xA5] = { .op_byte = 0xA5, .op_type = VIE_OP_TYPE_MOVS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xAA] = { .op_byte = 0xAA, .op_type = VIE_OP_TYPE_STOS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xAB] = { .op_byte = 0xAB, .op_type = VIE_OP_TYPE_STOS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, [0xC6] = { /* XXX Group 11 extended opcode - not just MOV */ .op_byte = 0xC6, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_IMM8, }, [0xC7] = { .op_byte = 0xC7, .op_type = VIE_OP_TYPE_MOV, .op_flags = VIE_OP_F_IMM, }, [0x23] = { .op_byte = 0x23, .op_type = VIE_OP_TYPE_AND, }, [0x80] = { /* Group 1 extended opcode */ .op_byte = 0x80, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, }, [0x81] = { /* Group 1 extended opcode */ .op_byte = 0x81, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM, }, [0x83] = { /* Group 1 extended opcode */ .op_byte = 0x83, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, }, [0x8F] = { /* XXX Group 1A extended opcode - not just POP */ .op_byte = 0x8F, .op_type = VIE_OP_TYPE_POP, }, [0xF7] = { /* XXX Group 3 extended opcode - not just TEST */ .op_byte = 0xF7, .op_type = VIE_OP_TYPE_TEST, .op_flags = VIE_OP_F_IMM, }, [0xFF] = { /* XXX Group 5 extended opcode - not just PUSH */ .op_byte = 0xFF, .op_type = VIE_OP_TYPE_PUSH, } }; /* struct vie.mod */ #define VIE_MOD_INDIRECT 0 #define VIE_MOD_INDIRECT_DISP8 1 #define VIE_MOD_INDIRECT_DISP32 2 #define VIE_MOD_DIRECT 3 /* struct vie.rm */ #define VIE_RM_SIB 4 #define VIE_RM_DISP32 5 #define GB (1024 * 1024 * 1024) static enum vm_reg_name gpr_map[16] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RBX, VM_REG_GUEST_RSP, VM_REG_GUEST_RBP, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15 }; static uint64_t size2mask[] = { [1] = 0xff, [2] = 0xffff, [4] = 0xffffffff, [8] = 0xffffffffffffffff, }; static int vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) { int error; error = vm_get_register(vm, vcpuid, reg, rval); return (error); } static void vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) { *lhbr = 0; *reg = gpr_map[vie->reg]; /* * 64-bit mode imposes limitations on accessing legacy high byte * registers (lhbr). * * The legacy high-byte registers cannot be addressed if the REX * prefix is present. In this case the values 4, 5, 6 and 7 of the * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. * * If the REX prefix is not present then the values 4, 5, 6 and 7 * of the 'ModRM:reg' field address the legacy high-byte registers, * %ah, %ch, %dh and %bh respectively. */ if (!vie->rex_present) { if (vie->reg & 0x4) { *lhbr = 1; *reg = gpr_map[vie->reg & 0x3]; } } } static int vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) { uint64_t val; int error, lhbr; enum vm_reg_name reg; vie_calc_bytereg(vie, ®, &lhbr); error = vm_get_register(vm, vcpuid, reg, &val); /* * To obtain the value of a legacy high byte register shift the * base register right by 8 bits (%ah = %rax >> 8). */ if (lhbr) *rval = val >> 8; else *rval = val; return (error); } static int vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) { uint64_t origval, val, mask; int error, lhbr; enum vm_reg_name reg; vie_calc_bytereg(vie, ®, &lhbr); error = vm_get_register(vm, vcpuid, reg, &origval); if (error == 0) { val = byte; mask = 0xff; if (lhbr) { /* * Shift left by 8 to store 'byte' in a legacy high * byte register. */ val <<= 8; mask <<= 8; } val |= origval & ~mask; error = vm_set_register(vm, vcpuid, reg, val); } return (error); } int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, int size) { int error; uint64_t origval; switch (size) { case 1: case 2: error = vie_read_register(vm, vcpuid, reg, &origval); if (error) return (error); val &= size2mask[size]; val |= origval & ~size2mask[size]; break; case 4: val &= 0xffffffffUL; break; case 8: break; default: return (EINVAL); } error = vm_set_register(vm, vcpuid, reg, val); return (error); } #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) /* * Return the status flags that would result from doing (x - y). */ #define GETCC(sz) \ static u_long \ getcc##sz(uint##sz##_t x, uint##sz##_t y) \ { \ u_long rflags; \ \ __asm __volatile("sub %2,%1; pushfq; popq %0" : \ "=r" (rflags), "+r" (x) : "m" (y)); \ return (rflags); \ } struct __hack GETCC(8); GETCC(16); GETCC(32); GETCC(64); static u_long getcc(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, ("getcc: invalid operand size %d", opsize)); if (opsize == 1) return (getcc8(x, y)); else if (opsize == 2) return (getcc16(x, y)); else if (opsize == 4) return (getcc32(x, y)); else return (getcc64(x, y)); } /* * Macro creation of functions getaddflags{8,16,32,64} */ #define GETADDFLAGS(sz) \ static u_long \ getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ { \ u_long rflags; \ \ __asm __volatile("add %2,%1; pushfq; popq %0" : \ "=r" (rflags), "+r" (x) : "m" (y)); \ return (rflags); \ } struct __hack GETADDFLAGS(8); GETADDFLAGS(16); GETADDFLAGS(32); GETADDFLAGS(64); static u_long getaddflags(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, ("getaddflags: invalid operand size %d", opsize)); if (opsize == 1) return (getaddflags8(x, y)); else if (opsize == 2) return (getaddflags16(x, y)); else if (opsize == 4) return (getaddflags32(x, y)); else return (getaddflags64(x, y)); } /* * Return the status flags that would result from doing (x & y). */ #define GETANDFLAGS(sz) \ static u_long \ getandflags##sz(uint##sz##_t x, uint##sz##_t y) \ { \ u_long rflags; \ \ __asm __volatile("and %2,%1; pushfq; popq %0" : \ "=r" (rflags), "+r" (x) : "m" (y)); \ return (rflags); \ } struct __hack GETANDFLAGS(8); GETANDFLAGS(16); GETANDFLAGS(32); GETANDFLAGS(64); static u_long getandflags(int opsize, uint64_t x, uint64_t y) { KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, ("getandflags: invalid operand size %d", opsize)); if (opsize == 1) return (getandflags8(x, y)); else if (opsize == 2) return (getandflags16(x, y)); else if (opsize == 4) return (getandflags32(x, y)); else return (getandflags64(x, y)); } static int emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint8_t byte; uint64_t val; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x88: /* * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) * 88/r: mov r/m8, r8 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) */ size = 1; /* override for byte operation */ error = vie_read_bytereg(vm, vcpuid, vie, &byte); if (error == 0) error = memwrite(vm, vcpuid, gpa, byte, size, arg); break; case 0x89: /* * MOV from reg (ModRM:reg) to mem (ModRM:r/m) * 89/r: mov r/m16, r16 * 89/r: mov r/m32, r32 * REX.W + 89/r mov r/m64, r64 */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val); if (error == 0) { val &= size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); } break; case 0x8A: /* * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) * 8A/r: mov r8, r/m8 * REX + 8A/r: mov r8, r/m8 */ size = 1; /* override for byte operation */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) error = vie_write_bytereg(vm, vcpuid, vie, val); break; case 0x8B: /* * MOV from mem (ModRM:r/m) to reg (ModRM:reg) * 8B/r: mov r16, r/m16 * 8B/r: mov r32, r/m32 * REX.W 8B/r: mov r64, r/m64 */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) { reg = gpr_map[vie->reg]; error = vie_update_register(vm, vcpuid, reg, val, size); } break; case 0xA1: /* * MOV from seg:moffset to AX/EAX/RAX * A1: mov AX, moffs16 * A1: mov EAX, moffs32 * REX.W + A1: mov RAX, moffs64 */ error = memread(vm, vcpuid, gpa, &val, size, arg); if (error == 0) { reg = VM_REG_GUEST_RAX; error = vie_update_register(vm, vcpuid, reg, val, size); } break; case 0xA3: /* * MOV from AX/EAX/RAX to seg:moffset * A3: mov moffs16, AX * A3: mov moffs32, EAX * REX.W + A3: mov moffs64, RAX */ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); if (error == 0) { val &= size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); } break; case 0xC6: /* * MOV from imm8 to mem (ModRM:r/m) * C6/0 mov r/m8, imm8 * REX + C6/0 mov r/m8, imm8 */ size = 1; /* override for byte operation */ error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); break; case 0xC7: /* * MOV from imm16/imm32 to mem (ModRM:r/m) * C7/0 mov r/m16, imm16 * C7/0 mov r/m32, imm32 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) */ val = vie->immediate & size2mask[size]; error = memwrite(vm, vcpuid, gpa, val, size, arg); break; default: break; } return (error); } static int emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { int error, size; enum vm_reg_name reg; uint64_t val; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0xB6: /* * MOV and zero extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F B6/r movzx r16, r/m8 * 0F B6/r movzx r32, r/m8 * REX.W + 0F B6/r movzx r64, r/m8 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val, 1, arg); if (error) break; /* get the second operand */ reg = gpr_map[vie->reg]; /* zero-extend byte */ val = (uint8_t)val; /* write the result */ error = vie_update_register(vm, vcpuid, reg, val, size); break; case 0xB7: /* * MOV and zero extend word from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F B7/r movzx r32, r/m16 * REX.W + 0F B7/r movzx r64, r/m16 */ error = memread(vm, vcpuid, gpa, &val, 2, arg); if (error) return (error); reg = gpr_map[vie->reg]; /* zero-extend word */ val = (uint16_t)val; error = vie_update_register(vm, vcpuid, reg, val, size); break; case 0xBE: /* * MOV and sign extend byte from mem (ModRM:r/m) to * reg (ModRM:reg). * * 0F BE/r movsx r16, r/m8 * 0F BE/r movsx r32, r/m8 * REX.W + 0F BE/r movsx r64, r/m8 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val, 1, arg); if (error) break; /* get the second operand */ reg = gpr_map[vie->reg]; /* sign extend byte */ val = (int8_t)val; /* write the result */ error = vie_update_register(vm, vcpuid, reg, val, size); break; default: break; } return (error); } /* * Helper function to calculate and validate a linear address. */ static int get_gla(void *vm, int vcpuid, struct vie *vie __unused, struct vm_guest_paging *paging, int opsize, int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault) { struct seg_desc desc; uint64_t cr0, val, rflags; int error __diagused; error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); error = vm_get_seg_desc(vm, vcpuid, seg, &desc); KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", __func__, error, seg)); error = vie_read_register(vm, vcpuid, gpr, &val); KASSERT(error == 0, ("%s: error %d getting register %d", __func__, error, gpr)); if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, addrsize, prot, gla)) { if (seg == VM_REG_GUEST_SS) vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); goto guest_fault; } if (vie_canonical_check(paging->cpu_mode, *gla)) { if (seg == VM_REG_GUEST_SS) vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); goto guest_fault; } if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { vm_inject_ac(vm, vcpuid, 0); goto guest_fault; } *fault = 0; return (0); guest_fault: *fault = 1; return (0); } static int emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { #ifdef _KERNEL struct vm_copyinfo copyinfo[2]; #else struct iovec copyinfo[2]; #endif uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; uint64_t rcx, rdi, rsi, rflags; int error, fault, opsize, seg, repeat; opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; val = 0; error = 0; /* * XXX although the MOVS instruction is only supposed to be used with * the "rep" prefix some guests like FreeBSD will use "repnz" instead. * * Empirically the "repnz" prefix has identical behavior to "rep" * and the zero flag does not make a difference. */ repeat = vie->repz_present | vie->repnz_present; if (repeat) { error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); /* * The count register is %rcx, %ecx or %cx depending on the * address size of the instruction. */ if ((rcx & vie_size2mask(vie->addrsize)) == 0) { error = 0; goto done; } } /* * Source Destination Comments * -------------------------------------------- * (1) memory memory n/a * (2) memory mmio emulated * (3) mmio memory emulated * (4) mmio mmio emulated * * At this point we don't have sufficient information to distinguish * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this * out because it will succeed only when operating on regular memory. * * XXX the emulation doesn't properly handle the case where 'gpa' * is straddling the boundary between the normal memory and MMIO. */ seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); if (error || fault) goto done; error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, copyinfo, nitems(copyinfo), &fault); if (error == 0) { if (fault) goto done; /* Resume guest to handle fault */ /* * case (2): read from system memory and write to mmio. */ - vm_copyin(vm, vcpuid, copyinfo, &val, opsize); - vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vm_copyin(copyinfo, &val, opsize); + vm_copy_teardown(copyinfo, nitems(copyinfo)); error = memwrite(vm, vcpuid, gpa, val, opsize, arg); if (error) goto done; } else { /* * 'vm_copy_setup()' is expected to fail for cases (3) and (4) * if 'srcaddr' is in the mmio space. */ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, &fault); if (error || fault) goto done; error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, PROT_WRITE, copyinfo, nitems(copyinfo), &fault); if (error == 0) { if (fault) goto done; /* Resume guest to handle fault */ /* * case (3): read from MMIO and write to system memory. * * A MMIO read can have side-effects so we * commit to it only after vm_copy_setup() is * successful. If a page-fault needs to be * injected into the guest then it will happen * before the MMIO read is attempted. */ error = memread(vm, vcpuid, gpa, &val, opsize, arg); if (error) goto done; - vm_copyout(vm, vcpuid, &val, copyinfo, opsize); - vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vm_copyout(&val, copyinfo, opsize); + vm_copy_teardown(copyinfo, nitems(copyinfo)); } else { /* * Case (4): read from and write to mmio. * * Commit to the MMIO read/write (with potential * side-effects) only after we are sure that the * instruction is not going to be restarted due * to address translation faults. */ error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, PROT_READ, &srcgpa, &fault); if (error || fault) goto done; error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, PROT_WRITE, &dstgpa, &fault); if (error || fault) goto done; error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); if (error) goto done; error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); if (error) goto done; } } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); if (rflags & PSL_D) { rsi -= opsize; rdi -= opsize; } else { rsi += opsize; rdi += opsize; } error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); if (repeat) { rcx = rcx - 1; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, rcx, vie->addrsize); KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); /* * Repeat the instruction if the count register is not zero. */ if ((rcx & vie_size2mask(vie->addrsize)) != 0) vm_restart_instruction(vm, vcpuid); } done: KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", __func__, error)); return (error); } static int emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused, mem_region_write_t memwrite, void *arg) { int error, opsize, repeat; uint64_t val; uint64_t rcx, rdi, rflags; opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; repeat = vie->repz_present | vie->repnz_present; if (repeat) { error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); /* * The count register is %rcx, %ecx or %cx depending on the * address size of the instruction. */ if ((rcx & vie_size2mask(vie->addrsize)) == 0) return (0); } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); KASSERT(!error, ("%s: error %d getting rax", __func__, error)); error = memwrite(vm, vcpuid, gpa, val, opsize, arg); if (error) return (error); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); if (rflags & PSL_D) rdi -= opsize; else rdi += opsize; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, vie->addrsize); KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); if (repeat) { rcx = rcx - 1; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, rcx, vie->addrsize); KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); /* * Repeat the instruction if the count register is not zero. */ if ((rcx & vie_size2mask(vie->addrsize)) != 0) vm_restart_instruction(vm, vcpuid); } return (0); } static int emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t result, rflags, rflags2, val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x23: /* * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the * result in reg. * * 23/r and r16, r/m16 * 23/r and r32, r/m32 * REX.W + 23/r and r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ result = val1 & val2; error = vie_update_register(vm, vcpuid, reg, result, size); break; case 0x81: case 0x83: /* * AND mem (ModRM:r/m) with immediate and store the * result in mem. * * 81 /4 and r/m16, imm16 * 81 /4 and r/m32, imm32 * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 * * 83 /4 and r/m16, imm8 sign-extended to 16 * 83 /4 and r/m32, imm8 sign-extended to 32 * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); if (error) break; /* * perform the operation with the pre-fetched immediate * operand and write the result */ result = val1 & vie->immediate; error = memwrite(vm, vcpuid, gpa, result, size, arg); break; default: break; } if (error) return (error); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. * * The updated status flags are obtained by subtracting 0 from 'result'. */ rflags2 = getcc(size, result, 0); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; enum vm_reg_name reg; uint64_t result, rflags, rflags2, val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x0B: /* * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the * result in reg. * * 0b/r or r16, r/m16 * 0b/r or r32, r/m32 * REX.W + 0b/r or r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ result = val1 | val2; error = vie_update_register(vm, vcpuid, reg, result, size); break; case 0x81: case 0x83: /* * OR mem (ModRM:r/m) with immediate and store the * result in mem. * * 81 /1 or r/m16, imm16 * 81 /1 or r/m32, imm32 * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 * * 83 /1 or r/m16, imm8 sign-extended to 16 * 83 /1 or r/m32, imm8 sign-extended to 32 * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 */ /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); if (error) break; /* * perform the operation with the pre-fetched immediate * operand and write the result */ result = val1 | vie->immediate; error = memwrite(vm, vcpuid, gpa, result, size, arg); break; default: break; } if (error) return (error); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. * * The updated status flags are obtained by subtracting 0 from 'result'. */ rflags2 = getcc(size, result, 0); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { int error, size; uint64_t regop, memop, op1, op2, rflags, rflags2; enum vm_reg_name reg; size = vie->opsize; switch (vie->op.op_byte) { case 0x39: case 0x3B: /* * 39/r CMP r/m16, r16 * 39/r CMP r/m32, r32 * REX.W 39/r CMP r/m64, r64 * * 3B/r CMP r16, r/m16 * 3B/r CMP r32, r/m32 * REX.W + 3B/r CMP r64, r/m64 * * Compare the first operand with the second operand and * set status flags in EFLAGS register. The comparison is * performed by subtracting the second operand from the first * operand and then setting the status flags. */ /* Get the register operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, ®op); if (error) return (error); /* Get the memory operand */ error = memread(vm, vcpuid, gpa, &memop, size, arg); if (error) return (error); if (vie->op.op_byte == 0x3B) { op1 = regop; op2 = memop; } else { op1 = memop; op2 = regop; } rflags2 = getcc(size, op1, op2); break; case 0x80: case 0x81: case 0x83: /* * 80 /7 cmp r/m8, imm8 * REX + 80 /7 cmp r/m8, imm8 * * 81 /7 cmp r/m16, imm16 * 81 /7 cmp r/m32, imm32 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 * * 83 /7 cmp r/m16, imm8 sign-extended to 16 * 83 /7 cmp r/m32, imm8 sign-extended to 32 * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 * * Compare mem (ModRM:r/m) with immediate and set * status flags according to the results. The * comparison is performed by subtracting the * immediate from the first operand and then setting * the status flags. * */ if (vie->op.op_byte == 0x80) size = 1; /* get the first operand */ error = memread(vm, vcpuid, gpa, &op1, size, arg); if (error) return (error); rflags2 = getcc(size, op1, vie->immediate); break; default: return (EINVAL); } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { int error, size; uint64_t op1, rflags, rflags2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0xF7: /* * F7 /0 test r/m16, imm16 * F7 /0 test r/m32, imm32 * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64 * * Test mem (ModRM:r/m) with immediate and set status * flags according to the results. The comparison is * performed by anding the immediate from the first * operand and then setting the status flags. */ if ((vie->reg & 7) != 0) return (EINVAL); error = memread(vm, vcpuid, gpa, &op1, size, arg); if (error) return (error); rflags2 = getandflags(size, op1, vie->immediate); break; default: return (EINVAL); } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); /* * OF and CF are cleared; the SF, ZF and PF flags are set according * to the result; AF is undefined. */ rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_bextr(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { uint64_t src1, src2, dst, rflags; unsigned start, len, size; int error; size = vie->opsize; error = EINVAL; /* * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b * * Destination operand is ModRM:reg. Source operands are ModRM:r/m and * Vex.vvvv. * * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored). */ if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT) size = 4; /* * Extracts contiguous bits from the first /source/ operand (second * operand) using an index and length specified in the second /source/ * operand (third operand). */ error = memread(vm, vcpuid, gpa, &src1, size, arg); if (error) return (error); error = vie_read_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2); if (error) return (error); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); start = (src2 & 0xff); len = (src2 & 0xff00) >> 8; /* If no bits are extracted, the destination register is cleared. */ dst = 0; /* If START exceeds the operand size, no bits are extracted. */ if (start > size * 8) goto done; /* Length is bounded by both the destination size and start offset. */ if (start + len > size * 8) len = (size * 8) - start; if (len == 0) goto done; if (start > 0) src1 = (src1 >> start); if (len < 64) src1 = src1 & ((1ull << len) - 1); dst = src1; done: error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size); if (error) return (error); /* * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result. * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared. */ rflags &= ~RFLAGS_STATUS_BITS; if (dst == 0) rflags |= PSL_Z; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); return (error); } static int emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { int error, size; uint64_t nval, rflags, rflags2, val1, val2; enum vm_reg_name reg; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x03: /* * ADD r/m to r and store the result in r * * 03/r ADD r16, r/m16 * 03/r ADD r32, r/m32 * REX.W + 03/r ADD r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ nval = val1 + val2; error = vie_update_register(vm, vcpuid, reg, nval, size); break; default: break; } if (!error) { rflags2 = getaddflags(size, val1, val2); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); } return (error); } static int emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) { int error, size; uint64_t nval, rflags, rflags2, val1, val2; enum vm_reg_name reg; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { case 0x2B: /* * SUB r/m from r and store the result in r * * 2B/r SUB r16, r/m16 * 2B/r SUB r32, r/m32 * REX.W + 2B/r SUB r64, r/m64 */ /* get the first operand */ reg = gpr_map[vie->reg]; error = vie_read_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ error = memread(vm, vcpuid, gpa, &val2, size, arg); if (error) break; /* perform the operation and write the result */ nval = val1 - val2; error = vie_update_register(vm, vcpuid, reg, nval, size); break; default: break; } if (!error) { rflags2 = getcc(size, val1, val2); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; rflags |= rflags2 & RFLAGS_STATUS_BITS; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); } return (error); } static int emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { #ifdef _KERNEL struct vm_copyinfo copyinfo[2]; #else struct iovec copyinfo[2]; #endif struct seg_desc ss_desc; uint64_t cr0, rflags, rsp, stack_gla, val; int error, fault, size, stackaddrsize, pushop; val = 0; size = vie->opsize; pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0; /* * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 */ if (paging->cpu_mode == CPU_MODE_REAL) { stackaddrsize = 2; } else if (paging->cpu_mode == CPU_MODE_64BIT) { /* * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 * - Stack pointer size is always 64-bits. * - PUSH/POP of 32-bit values is not possible in 64-bit mode. * - 16-bit PUSH/POP is supported by using the operand size * override prefix (66H). */ stackaddrsize = 8; size = vie->opsize_override ? 2 : 8; } else { /* * In protected or compatibility mode the 'B' flag in the * stack-segment descriptor determines the size of the * stack pointer. */ error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); KASSERT(error == 0, ("%s: error %d getting SS descriptor", __func__, error)); if (SEG_DESC_DEF32(ss_desc.access)) stackaddrsize = 4; else stackaddrsize = 2; } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); if (pushop) { rsp -= size; } if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, &stack_gla)) { vm_inject_ss(vm, vcpuid, 0); return (0); } if (vie_canonical_check(paging->cpu_mode, stack_gla)) { vm_inject_ss(vm, vcpuid, 0); return (0); } if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { vm_inject_ac(vm, vcpuid, 0); return (0); } error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), &fault); if (error || fault) return (error); if (pushop) { error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); if (error == 0) - vm_copyout(vm, vcpuid, &val, copyinfo, size); + vm_copyout(&val, copyinfo, size); } else { - vm_copyin(vm, vcpuid, copyinfo, &val, size); + vm_copyin(copyinfo, &val, size); error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); rsp += size; } - vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vm_copy_teardown(copyinfo, nitems(copyinfo)); if (error == 0) { error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, stackaddrsize); KASSERT(error == 0, ("error %d updating rsp", error)); } return (error); } static int emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error; /* * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. * * PUSH is part of the group 5 extended opcodes and is identified * by ModRM:reg = b110. */ if ((vie->reg & 7) != 6) return (EINVAL); error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, memwrite, arg); return (error); } static int emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error; /* * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. * * POP is part of the group 1A extended opcodes and is identified * by ModRM:reg = b000. */ if ((vie->reg & 7) != 0) return (EINVAL); error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, memwrite, arg); return (error); } static int emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging __unused, mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) { int error; switch (vie->reg & 7) { case 0x1: /* OR */ error = emulate_or(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case 0x4: /* AND */ error = emulate_and(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case 0x7: /* CMP */ error = emulate_cmp(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; default: error = EINVAL; break; } return (error); } static int emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *memarg) { uint64_t val, rflags; int error, bitmask, bitoff; /* * 0F BA is a Group 8 extended opcode. * * Currently we only emulate the 'Bit Test' instruction which is * identified by a ModR/M:reg encoding of 100b. */ if ((vie->reg & 7) != 4) return (EINVAL); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg); if (error) return (error); /* * Intel SDM, Vol 2, Table 3-2: * "Range of Bit Positions Specified by Bit Offset Operands" */ bitmask = vie->opsize * 8 - 1; bitoff = vie->immediate & bitmask; /* Copy the bit into the Carry flag in %rflags */ if (val & (1UL << bitoff)) rflags |= PSL_C; else rflags &= ~PSL_C; error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); return (0); } static int emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite __unused, void *memarg) { int error; uint64_t buf; switch (vie->reg & 7) { case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ if (vie->mod == 0x3) { /* * SFENCE. Ignore it, VM exit provides enough * barriers on its own. */ error = 0; } else { /* * CLFLUSH, CLFLUSHOPT. Only check for access * rights. */ error = memread(vm, vcpuid, gpa, &buf, 1, memarg); } break; default: error = EINVAL; break; } return (error); } int vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) { int error; if (!vie->decoded) return (EINVAL); switch (vie->op.op_type) { case VIE_OP_TYPE_GROUP1: error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_POP: error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_PUSH: error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_CMP: error = emulate_cmp(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOV: error = emulate_mov(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOVSX: case VIE_OP_TYPE_MOVZX: error = emulate_movx(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_MOVS: error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_STOS: error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; case VIE_OP_TYPE_AND: error = emulate_and(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_OR: error = emulate_or(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_SUB: error = emulate_sub(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_BITTEST: error = emulate_bittest(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_TWOB_GRP15: error = emulate_twob_group15(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_ADD: error = emulate_add(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_TEST: error = emulate_test(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; case VIE_OP_TYPE_BEXTR: error = emulate_bextr(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; default: error = EINVAL; break; } return (error); } int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) { KASSERT(size == 1 || size == 2 || size == 4 || size == 8, ("%s: invalid size %d", __func__, size)); KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) return (0); return ((gla & (size - 1)) ? 1 : 0); } int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) { uint64_t mask; if (cpu_mode != CPU_MODE_64BIT) return (0); /* * The value of the bit 47 in the 'gla' should be replicated in the * most significant 16 bits. */ mask = ~((1UL << 48) - 1); if (gla & (1UL << 47)) return ((gla & mask) != mask); else return ((gla & mask) != 0); } uint64_t vie_size2mask(int size) { KASSERT(size == 1 || size == 2 || size == 4 || size == 8, ("vie_size2mask: invalid size %d", size)); return (size2mask[size]); } int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, struct seg_desc *desc, uint64_t offset, int length, int addrsize, int prot, uint64_t *gla) { uint64_t firstoff, low_limit, high_limit, segbase; int glasize, type; KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, ("%s: invalid segment %d", __func__, seg)); KASSERT(length == 1 || length == 2 || length == 4 || length == 8, ("%s: invalid operand size %d", __func__, length)); KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, ("%s: invalid prot %#x", __func__, prot)); firstoff = offset; if (cpu_mode == CPU_MODE_64BIT) { KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); glasize = 8; } else { KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); glasize = 4; /* * If the segment selector is loaded with a NULL selector * then the descriptor is unusable and attempting to use * it results in a #GP(0). */ if (SEG_DESC_UNUSABLE(desc->access)) return (-1); /* * The processor generates a #NP exception when a segment * register is loaded with a selector that points to a * descriptor that is not present. If this was the case then * it would have been checked before the VM-exit. */ KASSERT(SEG_DESC_PRESENT(desc->access), ("segment %d not present: %#x", seg, desc->access)); /* * The descriptor type must indicate a code/data segment. */ type = SEG_DESC_TYPE(desc->access); KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " "descriptor type %#x", seg, type)); if (prot & PROT_READ) { /* #GP on a read access to a exec-only code segment */ if ((type & 0xA) == 0x8) return (-1); } if (prot & PROT_WRITE) { /* * #GP on a write access to a code segment or a * read-only data segment. */ if (type & 0x8) /* code segment */ return (-1); if ((type & 0xA) == 0) /* read-only data seg */ return (-1); } /* * 'desc->limit' is fully expanded taking granularity into * account. */ if ((type & 0xC) == 0x4) { /* expand-down data segment */ low_limit = desc->limit + 1; high_limit = SEG_DESC_DEF32(desc->access) ? 0xffffffff : 0xffff; } else { /* code segment or expand-up data segment */ low_limit = 0; high_limit = desc->limit; } while (length > 0) { offset &= vie_size2mask(addrsize); if (offset < low_limit || offset > high_limit) return (-1); offset++; length--; } } /* * In 64-bit mode all segments except %fs and %gs have a segment * base address of 0. */ if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { segbase = 0; } else { segbase = desc->base; } /* * Truncate 'firstoff' to the effective address size before adding * it to the segment base. */ firstoff &= vie_size2mask(addrsize); *gla = (segbase + firstoff) & vie_size2mask(glasize); return (0); } /* * Prepare a partially decoded vie for a 2nd attempt. */ void vie_restart(struct vie *vie) { _Static_assert( offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) && offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero), "restart should not erase instruction length or contents"); memset((char *)vie + offsetof(struct vie, vie_startzero), 0, sizeof(*vie) - offsetof(struct vie, vie_startzero)); vie->base_register = VM_REG_LAST; vie->index_register = VM_REG_LAST; vie->segment_register = VM_REG_LAST; } void vie_init(struct vie *vie, const char *inst_bytes, int inst_length) { KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, ("%s: invalid instruction length (%d)", __func__, inst_length)); vie_restart(vie); memset(vie->inst, 0, sizeof(vie->inst)); if (inst_length != 0) memcpy(vie->inst, inst_bytes, inst_length); vie->num_valid = inst_length; } #ifdef _KERNEL static int pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) { int error_code = 0; if (pte & PG_V) error_code |= PGEX_P; if (prot & VM_PROT_WRITE) error_code |= PGEX_W; if (usermode) error_code |= PGEX_U; if (rsvd) error_code |= PGEX_RSV; if (prot & VM_PROT_EXECUTE) error_code |= PGEX_I; return (error_code); } static void ptp_release(void **cookie) { if (*cookie != NULL) { vm_gpa_release(*cookie); *cookie = NULL; } } static void * ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) { void *ptr; ptp_release(cookie); ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie); return (ptr); } static int _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) { int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; u_int retries; uint64_t *ptpbase, ptpphys, pte, pgsize; uint32_t *ptpbase32, pte32; void *cookie; *guest_fault = 0; usermode = (paging->cpl == 3 ? 1 : 0); writable = prot & VM_PROT_WRITE; cookie = NULL; retval = 0; retries = 0; restart: ptpphys = paging->cr3; /* root of the page tables */ ptp_release(&cookie); if (retries++ > 0) maybe_yield(); if (vie_canonical_check(paging->cpu_mode, gla)) { /* * XXX assuming a non-stack reference otherwise a stack fault * should be generated. */ if (!check_only) vm_inject_gp(vm, vcpuid); goto fault; } if (paging->paging_mode == PAGING_MODE_FLAT) { *gpa = gla; goto done; } if (paging->paging_mode == PAGING_MODE_32) { nlevels = 2; while (--nlevels >= 0) { /* Zero out the lower 12 bits. */ ptpphys &= ~0xfff; ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); if (ptpbase32 == NULL) goto error; ptpshift = PAGE_SHIFT + nlevels * 10; ptpindex = (gla >> ptpshift) & 0x3FF; pgsize = 1UL << ptpshift; pte32 = ptpbase32[ptpindex]; if ((pte32 & PG_V) == 0 || (usermode && (pte32 & PG_U) == 0) || (writable && (pte32 & PG_RW) == 0)) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 0, pte32); vm_inject_pf(vm, vcpuid, pfcode, gla); } goto fault; } /* * Emulate the x86 MMU's management of the accessed * and dirty flags. While the accessed flag is set * at every level of the page table, the dirty flag * is only set at the last level providing the guest * physical address. */ if (!check_only && (pte32 & PG_A) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_A) == 0) { goto restart; } } /* XXX must be ignored if CR4.PSE=0 */ if (nlevels > 0 && (pte32 & PG_PS) != 0) break; ptpphys = pte32; } /* Set the dirty bit in the page table entry if necessary */ if (!check_only && writable && (pte32 & PG_M) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_M) == 0) { goto restart; } } /* Zero out the lower 'ptpshift' bits */ pte32 >>= ptpshift; pte32 <<= ptpshift; *gpa = pte32 | (gla & (pgsize - 1)); goto done; } if (paging->paging_mode == PAGING_MODE_PAE) { /* Zero out the lower 5 bits and the upper 32 bits */ ptpphys &= 0xffffffe0UL; ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4, &cookie); if (ptpbase == NULL) goto error; ptpindex = (gla >> 30) & 0x3; pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); } goto fault; } ptpphys = pte; nlevels = 2; } else if (paging->paging_mode == PAGING_MODE_64_LA57) { nlevels = 5; } else { nlevels = 4; } while (--nlevels >= 0) { /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); if (ptpbase == NULL) goto error; ptpshift = PAGE_SHIFT + nlevels * 9; ptpindex = (gla >> ptpshift) & 0x1FF; pgsize = 1UL << ptpshift; pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0 || (usermode && (pte & PG_U) == 0) || (writable && (pte & PG_RW) == 0)) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 0, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); } goto fault; } /* Set the accessed bit in the page table entry */ if (!check_only && (pte & PG_A) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_A) == 0) { goto restart; } } if (nlevels > 0 && (pte & PG_PS) != 0) { if (pgsize > 1 * GB) { if (!check_only) { pfcode = pf_error_code(usermode, prot, 1, pte); vm_inject_pf(vm, vcpuid, pfcode, gla); } goto fault; } break; } ptpphys = pte; } /* Set the dirty bit in the page table entry if necessary */ if (!check_only && writable && (pte & PG_M) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) goto restart; } /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; *gpa = pte | (gla & (pgsize - 1)); done: ptp_release(&cookie); KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", __func__, retval)); return (retval); error: retval = EFAULT; goto done; fault: *guest_fault = 1; goto done; } int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) { return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, false)); } int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) { return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, true)); } int vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t rip, int inst_length, struct vie *vie, int *faultptr) { struct vm_copyinfo copyinfo[2]; int error, prot; if (inst_length > VIE_INST_SIZE) panic("vmm_fetch_instruction: invalid length %d", inst_length); prot = PROT_READ | PROT_EXEC; error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, copyinfo, nitems(copyinfo), faultptr); if (error || *faultptr) return (error); - vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); - vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vm_copyin(copyinfo, vie->inst, inst_length); + vm_copy_teardown(copyinfo, nitems(copyinfo)); vie->num_valid = inst_length; return (0); } #endif /* _KERNEL */ static int vie_peek(struct vie *vie, uint8_t *x) { if (vie->num_processed < vie->num_valid) { *x = vie->inst[vie->num_processed]; return (0); } else return (-1); } static void vie_advance(struct vie *vie) { vie->num_processed++; } static bool segment_override(uint8_t x, int *seg) { switch (x) { case 0x2E: *seg = VM_REG_GUEST_CS; break; case 0x36: *seg = VM_REG_GUEST_SS; break; case 0x3E: *seg = VM_REG_GUEST_DS; break; case 0x26: *seg = VM_REG_GUEST_ES; break; case 0x64: *seg = VM_REG_GUEST_FS; break; case 0x65: *seg = VM_REG_GUEST_GS; break; default: return (false); } return (true); } static int decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) { uint8_t x; while (1) { if (vie_peek(vie, &x)) return (-1); if (x == 0x66) vie->opsize_override = 1; else if (x == 0x67) vie->addrsize_override = 1; else if (x == 0xF3) vie->repz_present = 1; else if (x == 0xF2) vie->repnz_present = 1; else if (segment_override(x, &vie->segment_register)) vie->segment_override = 1; else break; vie_advance(vie); } /* * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: * - Only one REX prefix is allowed per instruction. * - The REX prefix must immediately precede the opcode byte or the * escape opcode byte. * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) * the mandatory prefix must come before the REX prefix. */ if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { vie->rex_present = 1; vie->rex_w = x & 0x8 ? 1 : 0; vie->rex_r = x & 0x4 ? 1 : 0; vie->rex_x = x & 0x2 ? 1 : 0; vie->rex_b = x & 0x1 ? 1 : 0; vie_advance(vie); } /* * ยง 2.3.5, "The VEX Prefix", SDM Vol 2. */ if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) { const struct vie_op *optab; /* 3-byte VEX prefix. */ vie->vex_present = 1; vie_advance(vie); if (vie_peek(vie, &x)) return (-1); /* * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted * relative to REX encoding. */ vie->rex_r = x & 0x80 ? 0 : 1; vie->rex_x = x & 0x40 ? 0 : 1; vie->rex_b = x & 0x20 ? 0 : 1; switch (x & 0x1F) { case 0x2: /* 0F 38. */ optab = three_byte_opcodes_0f38; break; case 0x1: /* 0F class - nothing handled here yet. */ /* FALLTHROUGH */ case 0x3: /* 0F 3A class - nothing handled here yet. */ /* FALLTHROUGH */ default: /* Reserved (#UD). */ return (-1); } vie_advance(vie); if (vie_peek(vie, &x)) return (-1); /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */ vie->rex_w = x & 0x80 ? 1 : 0; vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3); vie->vex_l = !!(x & 0x4); vie->vex_pp = (x & 0x3); /* PP: 1=66 2=F3 3=F2 prefixes. */ switch (vie->vex_pp) { case 0x1: vie->opsize_override = 1; break; case 0x2: vie->repz_present = 1; break; case 0x3: vie->repnz_present = 1; break; } vie_advance(vie); /* Opcode, sans literal prefix prefix. */ if (vie_peek(vie, &x)) return (-1); vie->op = optab[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); } /* * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 */ if (cpu_mode == CPU_MODE_64BIT) { /* * Default address size is 64-bits and default operand size * is 32-bits. */ vie->addrsize = vie->addrsize_override ? 4 : 8; if (vie->rex_w) vie->opsize = 8; else if (vie->opsize_override) vie->opsize = 2; else vie->opsize = 4; } else if (cs_d) { /* Default address and operand sizes are 32-bits */ vie->addrsize = vie->addrsize_override ? 2 : 4; vie->opsize = vie->opsize_override ? 2 : 4; } else { /* Default address and operand sizes are 16-bits */ vie->addrsize = vie->addrsize_override ? 4 : 2; vie->opsize = vie->opsize_override ? 4 : 2; } return (0); } static int decode_two_byte_opcode(struct vie *vie) { uint8_t x; if (vie_peek(vie, &x)) return (-1); vie->op = two_byte_opcodes[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); return (0); } static int decode_opcode(struct vie *vie) { uint8_t x; if (vie_peek(vie, &x)) return (-1); /* Already did this via VEX prefix. */ if (vie->op.op_type != VIE_OP_TYPE_NONE) return (0); vie->op = one_byte_opcodes[x]; if (vie->op.op_type == VIE_OP_TYPE_NONE) return (-1); vie_advance(vie); if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) return (decode_two_byte_opcode(vie)); return (0); } static int decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) { uint8_t x; if (vie->op.op_flags & VIE_OP_F_NO_MODRM) return (0); if (cpu_mode == CPU_MODE_REAL) return (-1); if (vie_peek(vie, &x)) return (-1); vie->mod = (x >> 6) & 0x3; vie->rm = (x >> 0) & 0x7; vie->reg = (x >> 3) & 0x7; /* * A direct addressing mode makes no sense in the context of an EPT * fault. There has to be a memory access involved to cause the * EPT fault. */ if (vie->mod == VIE_MOD_DIRECT) return (-1); if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { /* * Table 2-5: Special Cases of REX Encodings * * mod=0, r/m=5 is used in the compatibility mode to * indicate a disp32 without a base register. * * mod!=3, r/m=4 is used in the compatibility mode to * indicate that the SIB byte is present. * * The 'b' bit in the REX prefix is don't care in * this case. */ } else { vie->rm |= (vie->rex_b << 3); } vie->reg |= (vie->rex_r << 3); /* SIB */ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) goto done; vie->base_register = gpr_map[vie->rm]; switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; break; case VIE_MOD_INDIRECT_DISP32: vie->disp_bytes = 4; break; case VIE_MOD_INDIRECT: if (vie->rm == VIE_RM_DISP32) { vie->disp_bytes = 4; /* * Table 2-7. RIP-Relative Addressing * * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 * whereas in compatibility mode it just implies disp32. */ if (cpu_mode == CPU_MODE_64BIT) vie->base_register = VM_REG_GUEST_RIP; else vie->base_register = VM_REG_LAST; } break; } done: vie_advance(vie); return (0); } static int decode_sib(struct vie *vie) { uint8_t x; /* Proceed only if SIB byte is present */ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) return (0); if (vie_peek(vie, &x)) return (-1); /* De-construct the SIB byte */ vie->ss = (x >> 6) & 0x3; vie->index = (x >> 3) & 0x7; vie->base = (x >> 0) & 0x7; /* Apply the REX prefix modifiers */ vie->index |= vie->rex_x << 3; vie->base |= vie->rex_b << 3; switch (vie->mod) { case VIE_MOD_INDIRECT_DISP8: vie->disp_bytes = 1; break; case VIE_MOD_INDIRECT_DISP32: vie->disp_bytes = 4; break; } if (vie->mod == VIE_MOD_INDIRECT && (vie->base == 5 || vie->base == 13)) { /* * Special case when base register is unused if mod = 0 * and base = %rbp or %r13. * * Documented in: * Table 2-3: 32-bit Addressing Forms with the SIB Byte * Table 2-5: Special Cases of REX Encodings */ vie->disp_bytes = 4; } else { vie->base_register = gpr_map[vie->base]; } /* * All encodings of 'index' are valid except for %rsp (4). * * Documented in: * Table 2-3: 32-bit Addressing Forms with the SIB Byte * Table 2-5: Special Cases of REX Encodings */ if (vie->index != 4) vie->index_register = gpr_map[vie->index]; /* 'scale' makes sense only in the context of an index register */ if (vie->index_register < VM_REG_LAST) vie->scale = 1 << vie->ss; vie_advance(vie); return (0); } static int decode_displacement(struct vie *vie) { int n, i; uint8_t x; union { char buf[4]; int8_t signed8; int32_t signed32; } u; if ((n = vie->disp_bytes) == 0) return (0); if (n != 1 && n != 4) panic("decode_displacement: invalid disp_bytes %d", n); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } if (n == 1) vie->displacement = u.signed8; /* sign-extended */ else vie->displacement = u.signed32; /* sign-extended */ return (0); } static int decode_immediate(struct vie *vie) { int i, n; uint8_t x; union { char buf[4]; int8_t signed8; int16_t signed16; int32_t signed32; } u; /* Figure out immediate operand size (if any) */ if (vie->op.op_flags & VIE_OP_F_IMM) { /* * Section 2.2.1.5 "Immediates", Intel SDM: * In 64-bit mode the typical size of immediate operands * remains 32-bits. When the operand size if 64-bits, the * processor sign-extends all immediates to 64-bits prior * to their use. */ if (vie->opsize == 4 || vie->opsize == 8) vie->imm_bytes = 4; else vie->imm_bytes = 2; } else if (vie->op.op_flags & VIE_OP_F_IMM8) { vie->imm_bytes = 1; } if ((n = vie->imm_bytes) == 0) return (0); KASSERT(n == 1 || n == 2 || n == 4, ("%s: invalid number of immediate bytes: %d", __func__, n)); for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } /* sign-extend the immediate value before use */ if (n == 1) vie->immediate = u.signed8; else if (n == 2) vie->immediate = u.signed16; else vie->immediate = u.signed32; return (0); } static int decode_moffset(struct vie *vie) { int i, n; uint8_t x; union { char buf[8]; uint64_t u64; } u; if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) return (0); /* * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: * The memory offset size follows the address-size of the instruction. */ n = vie->addrsize; KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); u.u64 = 0; for (i = 0; i < n; i++) { if (vie_peek(vie, &x)) return (-1); u.buf[i] = x; vie_advance(vie); } vie->displacement = u.u64; return (0); } #ifdef _KERNEL /* * Verify that the 'guest linear address' provided as collateral of the nested * page table fault matches with our instruction decoding. */ static int verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie, enum vm_cpu_mode cpu_mode) { int error; uint64_t base, segbase, idx, gla2; enum vm_reg_name seg; struct seg_desc desc; /* Skip 'gla' verification */ if (gla == VIE_INVALID_GLA) return (0); base = 0; if (vie->base_register != VM_REG_LAST) { error = vm_get_register(vm, cpuid, vie->base_register, &base); if (error) { printf("verify_gla: error %d getting base reg %d\n", error, vie->base_register); return (-1); } /* * RIP-relative addressing starts from the following * instruction */ if (vie->base_register == VM_REG_GUEST_RIP) base += vie->num_processed; } idx = 0; if (vie->index_register != VM_REG_LAST) { error = vm_get_register(vm, cpuid, vie->index_register, &idx); if (error) { printf("verify_gla: error %d getting index reg %d\n", error, vie->index_register); return (-1); } } /* * From "Specifying a Segment Selector", Intel SDM, Vol 1 * * In 64-bit mode, segmentation is generally (but not * completely) disabled. The exceptions are the FS and GS * segments. * * In legacy IA-32 mode, when the ESP or EBP register is used * as the base, the SS segment is the default segment. For * other data references, except when relative to stack or * string destination the DS segment is the default. These * can be overridden to allow other segments to be accessed. */ if (vie->segment_override) seg = vie->segment_register; else if (vie->base_register == VM_REG_GUEST_RSP || vie->base_register == VM_REG_GUEST_RBP) seg = VM_REG_GUEST_SS; else seg = VM_REG_GUEST_DS; if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) { segbase = 0; } else { error = vm_get_seg_desc(vm, cpuid, seg, &desc); if (error) { printf("verify_gla: error %d getting segment" " descriptor %d", error, vie->segment_register); return (-1); } segbase = desc.base; } gla2 = segbase + base + vie->scale * idx + vie->displacement; gla2 &= size2mask[vie->addrsize]; if (gla != gla2) { printf("verify_gla mismatch: segbase(0x%0lx)" "base(0x%0lx), scale(%d), index(0x%0lx), " "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", segbase, base, vie->scale, idx, vie->displacement, gla, gla2); return (-1); } return (0); } #endif /* _KERNEL */ int #ifdef _KERNEL vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) #else vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) #endif { if (decode_prefixes(vie, cpu_mode, cs_d)) return (-1); if (decode_opcode(vie)) return (-1); if (decode_modrm(vie, cpu_mode)) return (-1); if (decode_sib(vie)) return (-1); if (decode_displacement(vie)) return (-1); if (decode_immediate(vie)) return (-1); if (decode_moffset(vie)) return (-1); #ifdef _KERNEL if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { if (verify_gla(vm, cpuid, gla, vie, cpu_mode)) return (-1); } #endif vie->decoded = 1; /* success */ return (0); } diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c index 02b38c9b7a99..bdbdd0d1c223 100644 --- a/usr.sbin/bhyve/inout.c +++ b/usr.sbin/bhyve/inout.c @@ -1,301 +1,301 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "inout.h" SET_DECLARE(inout_port_set, struct inout_port); #define MAX_IOPORTS (1 << 16) #define VERIFY_IOPORT(port, size) \ assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS) static struct { const char *name; int flags; inout_func_t handler; void *arg; } inout_handlers[MAX_IOPORTS]; static int default_inout(struct vmctx *ctx __unused, int vcpu __unused, int in, int port __unused, int bytes, uint32_t *eax, void *arg __unused) { if (in) { switch (bytes) { case 4: *eax = 0xffffffff; break; case 2: *eax = 0xffff; break; case 1: *eax = 0xff; break; } } return (0); } static void register_default_iohandler(int start, int size) { struct inout_port iop; VERIFY_IOPORT(start, size); bzero(&iop, sizeof(iop)); iop.name = "default"; iop.port = start; iop.size = size; iop.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT; iop.handler = default_inout; register_inout(&iop); } int emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { int addrsize, bytes, flags, in, port, prot, rep; uint32_t eax, val; inout_func_t handler; void *arg; int error, fault, retval; enum vm_reg_name idxreg; uint64_t gla, index, iterations, count; struct vm_inout_str *vis; struct iovec iov[2]; bytes = vmexit->u.inout.bytes; in = vmexit->u.inout.in; port = vmexit->u.inout.port; assert(port < MAX_IOPORTS); assert(bytes == 1 || bytes == 2 || bytes == 4); handler = inout_handlers[port].handler; if (handler == default_inout && get_config_bool_default("x86.strictio", false)) return (-1); flags = inout_handlers[port].flags; arg = inout_handlers[port].arg; if (in) { if (!(flags & IOPORT_F_IN)) return (-1); } else { if (!(flags & IOPORT_F_OUT)) return (-1); } retval = 0; if (vmexit->u.inout.string) { vis = &vmexit->u.inout_str; rep = vis->inout.rep; addrsize = vis->addrsize; prot = in ? PROT_WRITE : PROT_READ; assert(addrsize == 2 || addrsize == 4 || addrsize == 8); /* Index register */ idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; index = vis->index & vie_size2mask(addrsize); /* Count register */ count = vis->count & vie_size2mask(addrsize); /* Limit number of back-to-back in/out emulations to 16 */ iterations = MIN(count, 16); while (iterations > 0) { assert(retval == 0); if (vie_calculate_gla(vis->paging.cpu_mode, vis->seg_name, &vis->seg_desc, index, bytes, addrsize, prot, &gla)) { vm_inject_gp(ctx, vcpu); break; } error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, bytes, prot, iov, nitems(iov), &fault); if (error) { retval = -1; /* Unrecoverable error */ break; } else if (fault) { retval = 0; /* Resume guest to handle fault */ break; } if (vie_alignment_check(vis->paging.cpl, bytes, vis->cr0, vis->rflags, gla)) { vm_inject_ac(ctx, vcpu, 0); break; } val = 0; if (!in) - vm_copyin(ctx, vcpu, iov, &val, bytes); + vm_copyin(iov, &val, bytes); retval = handler(ctx, vcpu, in, port, bytes, &val, arg); if (retval != 0) break; if (in) - vm_copyout(ctx, vcpu, &val, iov, bytes); + vm_copyout(&val, iov, bytes); /* Update index */ if (vis->rflags & PSL_D) index -= bytes; else index += bytes; count--; iterations--; } /* Update index register */ error = vie_update_register(ctx, vcpu, idxreg, index, addrsize); assert(error == 0); /* * Update count register only if the instruction had a repeat * prefix. */ if (rep) { error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX, count, addrsize); assert(error == 0); } /* Restart the instruction if more iterations remain */ if (retval == 0 && count != 0) { error = vm_restart_instruction(ctx, vcpu); assert(error == 0); } } else { eax = vmexit->u.inout.eax; val = eax & vie_size2mask(bytes); retval = handler(ctx, vcpu, in, port, bytes, &val, arg); if (retval == 0 && in) { eax &= ~vie_size2mask(bytes); eax |= val & vie_size2mask(bytes); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); } } return (retval); } void init_inout(void) { struct inout_port **iopp, *iop; /* * Set up the default handler for all ports */ register_default_iohandler(0, MAX_IOPORTS); /* * Overwrite with specified handlers */ SET_FOREACH(iopp, inout_port_set) { iop = *iopp; assert(iop->port < MAX_IOPORTS); inout_handlers[iop->port].name = iop->name; inout_handlers[iop->port].flags = iop->flags; inout_handlers[iop->port].handler = iop->handler; inout_handlers[iop->port].arg = NULL; } } int register_inout(struct inout_port *iop) { int i; VERIFY_IOPORT(iop->port, iop->size); /* * Verify that the new registration is not overwriting an already * allocated i/o range. */ if ((iop->flags & IOPORT_F_DEFAULT) == 0) { for (i = iop->port; i < iop->port + iop->size; i++) { if ((inout_handlers[i].flags & IOPORT_F_DEFAULT) == 0) return (-1); } } for (i = iop->port; i < iop->port + iop->size; i++) { inout_handlers[i].name = iop->name; inout_handlers[i].flags = iop->flags; inout_handlers[i].handler = iop->handler; inout_handlers[i].arg = iop->arg; } return (0); } int unregister_inout(struct inout_port *iop) { VERIFY_IOPORT(iop->port, iop->size); assert(inout_handlers[iop->port].name == iop->name); register_default_iohandler(iop->port, iop->size); return (0); } diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c index 78dfb8190e48..0dfb536f09f8 100644 --- a/usr.sbin/bhyve/task_switch.c +++ b/usr.sbin/bhyve/task_switch.c @@ -1,942 +1,942 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" /* * Using 'struct i386tss' is tempting but causes myriad sign extension * issues because all of its fields are defined as signed integers. */ struct tss32 { uint16_t tss_link; uint16_t rsvd1; uint32_t tss_esp0; uint16_t tss_ss0; uint16_t rsvd2; uint32_t tss_esp1; uint16_t tss_ss1; uint16_t rsvd3; uint32_t tss_esp2; uint16_t tss_ss2; uint16_t rsvd4; uint32_t tss_cr3; uint32_t tss_eip; uint32_t tss_eflags; uint32_t tss_eax; uint32_t tss_ecx; uint32_t tss_edx; uint32_t tss_ebx; uint32_t tss_esp; uint32_t tss_ebp; uint32_t tss_esi; uint32_t tss_edi; uint16_t tss_es; uint16_t rsvd5; uint16_t tss_cs; uint16_t rsvd6; uint16_t tss_ss; uint16_t rsvd7; uint16_t tss_ds; uint16_t rsvd8; uint16_t tss_fs; uint16_t rsvd9; uint16_t tss_gs; uint16_t rsvd10; uint16_t tss_ldt; uint16_t rsvd11; uint16_t tss_trap; uint16_t tss_iomap; }; static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed"); #define SEL_START(sel) (((sel) & ~0x7)) #define SEL_LIMIT(sel) (((sel) | 0x7)) #define TSS_BUSY(type) (((type) & 0x2) != 0) static uint64_t GETREG(struct vmctx *ctx, int vcpu, int reg) { uint64_t val; int error; error = vm_get_register(ctx, vcpu, reg, &val); assert(error == 0); return (val); } static void SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) { int error; error = vm_set_register(ctx, vcpu, reg, val); assert(error == 0); } static struct seg_desc usd_to_seg_desc(struct user_segment_descriptor *usd) { struct seg_desc seg_desc; seg_desc.base = (u_int)USD_GETBASE(usd); if (usd->sd_gran) seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; else seg_desc.limit = (u_int)USD_GETLIMIT(usd); seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; seg_desc.access |= usd->sd_xx << 12; seg_desc.access |= usd->sd_def32 << 14; seg_desc.access |= usd->sd_gran << 15; return (seg_desc); } /* * Inject an exception with an error code that is a segment selector. * The format of the error code is described in section 6.13, "Error Code", * Intel SDM volume 3. * * Bit 0 (EXT) denotes whether the exception occurred during delivery * of an external event like an interrupt. * * Bit 1 (IDT) indicates whether the selector points to a gate descriptor * in the IDT. * * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). */ static void sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) { /* * Bit 2 from the selector is retained as-is in the error code. * * Bit 1 can be safely cleared because none of the selectors * encountered during task switch emulation refer to a task * gate in the IDT. * * Bit 0 is set depending on the value of 'ext'. */ sel &= ~0x3; if (ext) sel |= 0x1; vm_inject_fault(ctx, vcpu, vector, 1, sel); } /* * Return 0 if the selector 'sel' in within the limits of the GDT/LDT * and non-zero otherwise. */ static int desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) { uint64_t base; uint32_t limit, access; int error, reg; reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); assert(error == 0); if (reg == VM_REG_GUEST_LDTR) { if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) return (-1); } if (limit < SEL_LIMIT(sel)) return (-1); else return (0); } /* * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced * by the selector 'sel'. * * Returns 0 on success. * Returns 1 if an exception was injected into the guest. * Returns -1 otherwise. */ static int desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint16_t sel, struct user_segment_descriptor *desc, bool doread, int *faultptr) { struct iovec iov[2]; uint64_t base; uint32_t limit, access; int error, reg; reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); assert(error == 0); assert(limit >= SEL_LIMIT(sel)); error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov), faultptr); if (error || *faultptr) return (error); if (doread) - vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); + vm_copyin(iov, desc, sizeof(*desc)); else - vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); + vm_copyout(desc, iov, sizeof(*desc)); return (0); } static int desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) { return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr)); } static int desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) { return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr)); } /* * Read the TSS descriptor referenced by 'sel' into 'desc'. * * Returns 0 on success. * Returns 1 if an exception was injected into the guest. * Returns -1 otherwise. */ static int read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) { struct vm_guest_paging sup_paging; int error; assert(!ISLDT(sel)); assert(IDXSEL(sel) != 0); /* Fetch the new TSS descriptor */ if (desc_table_limit_check(ctx, vcpu, sel)) { if (ts->reason == TSR_IRET) sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); else sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); return (1); } sup_paging = ts->paging; sup_paging.cpl = 0; /* implicit supervisor mode */ error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr); return (error); } static bool code_desc(int sd_type) { /* code descriptor */ return ((sd_type & 0x18) == 0x18); } static bool stack_desc(int sd_type) { /* writable data descriptor */ return ((sd_type & 0x1A) == 0x12); } static bool data_desc(int sd_type) { /* data descriptor or a readable code descriptor */ return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); } static bool ldt_desc(int sd_type) { return (sd_type == SDT_SYSLDT); } /* * Validate the descriptor 'seg_desc' associated with 'segment'. */ static int validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, int segment, struct seg_desc *seg_desc, int *faultptr) { struct vm_guest_paging sup_paging; struct user_segment_descriptor usd; int error, idtvec; int cpl, dpl, rpl; uint16_t sel, cs; bool ldtseg, codeseg, stackseg, dataseg, conforming; ldtseg = codeseg = stackseg = dataseg = false; switch (segment) { case VM_REG_GUEST_LDTR: ldtseg = true; break; case VM_REG_GUEST_CS: codeseg = true; break; case VM_REG_GUEST_SS: stackseg = true; break; case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: dataseg = true; break; default: assert(0); } /* Get the segment selector */ sel = GETREG(ctx, vcpu, segment); /* LDT selector must point into the GDT */ if (ldtseg && ISLDT(sel)) { sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); return (1); } /* Descriptor table limit check */ if (desc_table_limit_check(ctx, vcpu, sel)) { sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); return (1); } /* NULL selector */ if (IDXSEL(sel) == 0) { /* Code and stack segment selectors cannot be NULL */ if (codeseg || stackseg) { sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); return (1); } seg_desc->base = 0; seg_desc->limit = 0; seg_desc->access = 0x10000; /* unusable */ return (0); } /* Read the descriptor from the GDT/LDT */ sup_paging = ts->paging; sup_paging.cpl = 0; /* implicit supervisor mode */ error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr); if (error || *faultptr) return (error); /* Verify that the descriptor type is compatible with the segment */ if ((ldtseg && !ldt_desc(usd.sd_type)) || (codeseg && !code_desc(usd.sd_type)) || (dataseg && !data_desc(usd.sd_type)) || (stackseg && !stack_desc(usd.sd_type))) { sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); return (1); } /* Segment must be marked present */ if (!usd.sd_p) { if (ldtseg) idtvec = IDT_TS; else if (stackseg) idtvec = IDT_SS; else idtvec = IDT_NP; sel_exception(ctx, vcpu, idtvec, sel, ts->ext); return (1); } cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); cpl = cs & SEL_RPL_MASK; rpl = sel & SEL_RPL_MASK; dpl = usd.sd_dpl; if (stackseg && (rpl != cpl || dpl != cpl)) { sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); return (1); } if (codeseg) { conforming = (usd.sd_type & 0x4) ? true : false; if ((conforming && (cpl < dpl)) || (!conforming && (cpl != dpl))) { sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); return (1); } } if (dataseg) { /* * A data segment is always non-conforming except when it's * descriptor is a readable, conforming code segment. */ if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) conforming = true; else conforming = false; if (!conforming && (rpl > dpl || cpl > dpl)) { sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); return (1); } } *seg_desc = usd_to_seg_desc(&usd); return (0); } static void tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, uint32_t eip, struct tss32 *tss, struct iovec *iov) { /* General purpose registers */ tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); /* Segment selectors */ tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS); /* eflags and eip */ tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); if (task_switch->reason == TSR_IRET) tss->tss_eflags &= ~PSL_NT; tss->tss_eip = eip; /* Copy updated old TSS into guest memory */ - vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32)); + vm_copyout(tss, iov, sizeof(struct tss32)); } static void update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) { int error; error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); assert(error == 0); } /* * Update the vcpu registers to reflect the state of the new task. */ static int tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr) { struct seg_desc seg_desc, seg_desc2; uint64_t *pdpte, maxphyaddr, reserved; uint32_t eflags; int error, i; bool nested; nested = false; if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { tss->tss_link = ot_sel; nested = true; } eflags = tss->tss_eflags; if (nested) eflags |= PSL_NT; /* LDTR */ SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); /* PBDR */ if (ts->paging.paging_mode != PAGING_MODE_FLAT) { if (ts->paging.paging_mode == PAGING_MODE_PAE) { /* * XXX Assuming 36-bit MAXPHYADDR. */ maxphyaddr = (1UL << 36) - 1; pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); for (i = 0; i < 4; i++) { /* Check reserved bits if the PDPTE is valid */ if (!(pdpte[i] & 0x1)) continue; /* * Bits 2:1, 8:5 and bits above the processor's * maximum physical address are reserved. */ reserved = ~maxphyaddr | 0x1E6; if (pdpte[i] & reserved) { vm_inject_gp(ctx, vcpu); return (1); } } SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); } SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); ts->paging.cr3 = tss->tss_cr3; } /* eflags and eip */ SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); /* General purpose registers */ SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); /* Segment selectors */ SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); /* * If this is a nested task then write out the new TSS to update * the previous link field. */ if (nested) - vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); + vm_copyout(tss, iov, sizeof(*tss)); /* Validate segment descriptors */ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, faultptr); if (error || *faultptr) return (error); update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); /* * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. * * The SS and CS attribute checks on VM-entry are inter-dependent so * we need to make sure that both segments are valid before updating * either of them. This ensures that the VMCS state can pass the * VM-entry checks so the guest can handle any exception injected * during task switch emulation. */ error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc, faultptr); if (error || *faultptr) return (error); error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, faultptr); if (error || *faultptr) return (error); update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc, faultptr); if (error || *faultptr) return (error); update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc, faultptr); if (error || *faultptr) return (error); update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc, faultptr); if (error || *faultptr) return (error); update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc, faultptr); if (error || *faultptr) return (error); update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); return (0); } /* * Push an error code on the stack of the new task. This is needed if the * task switch was triggered by a hardware exception that causes an error * code to be saved (e.g. #PF). */ static int push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, int task_type, uint32_t errcode, int *faultptr) { struct iovec iov[2]; struct seg_desc seg_desc; int stacksize, bytes, error; uint64_t gla, cr0, rflags; uint32_t esp; uint16_t stacksel; *faultptr = 0; cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, &seg_desc.limit, &seg_desc.access); assert(error == 0); /* * Section "Error Code" in the Intel SDM vol 3: the error code is * pushed on the stack as a doubleword or word (depending on the * default interrupt, trap or task gate size). */ if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) bytes = 4; else bytes = 2; /* * PUSH instruction from Intel SDM vol 2: the 'B' flag in the * stack-segment descriptor determines the size of the stack * pointer outside of 64-bit mode. */ if (SEG_DESC_DEF32(seg_desc.access)) stacksize = 4; else stacksize = 2; esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); esp -= bytes; if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); *faultptr = 1; return (0); } if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { vm_inject_ac(ctx, vcpu, 1); *faultptr = 1; return (0); } error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, iov, nitems(iov), faultptr); if (error || *faultptr) return (error); - vm_copyout(ctx, vcpu, &errcode, iov, bytes); + vm_copyout(&errcode, iov, bytes); SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); return (0); } /* * Evaluate return value from helper functions and potentially return to * the VM run loop. */ #define CHKERR(error,fault) \ do { \ assert((error == 0) || (error == EFAULT)); \ if (error) \ return (VMEXIT_ABORT); \ else if (fault) \ return (VMEXIT_CONTINUE); \ } while (0) int vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { struct seg_desc nt; struct tss32 oldtss, newtss; struct vm_task_switch *task_switch; struct vm_guest_paging *paging, sup_paging; struct user_segment_descriptor nt_desc, ot_desc; struct iovec nt_iov[2], ot_iov[2]; uint64_t cr0, ot_base; uint32_t eip, ot_lim, access; int error, ext, fault, minlimit, nt_type, ot_type, vcpu; enum task_switch_reason reason; uint16_t nt_sel, ot_sel; task_switch = &vmexit->u.task_switch; nt_sel = task_switch->tsssel; ext = vmexit->u.task_switch.ext; reason = vmexit->u.task_switch.reason; paging = &vmexit->u.task_switch.paging; vcpu = *pvcpu; assert(paging->cpu_mode == CPU_MODE_PROTECTED); /* * Calculate the instruction pointer to store in the old TSS. */ eip = vmexit->rip + vmexit->inst_length; /* * Section 4.6, "Access Rights" in Intel SDM Vol 3. * The following page table accesses are implicitly supervisor mode: * - accesses to GDT or LDT to load segment descriptors * - accesses to the task state segment during task switch */ sup_paging = *paging; sup_paging.cpl = 0; /* implicit supervisor mode */ /* Fetch the new TSS descriptor */ error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc, &fault); CHKERR(error, fault); nt = usd_to_seg_desc(&nt_desc); /* Verify the type of the new TSS */ nt_type = SEG_DESC_TYPE(nt.access); if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); goto done; } /* TSS descriptor must have present bit set */ if (!SEG_DESC_PRESENT(nt.access)) { sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); goto done; } /* * TSS must have a minimum length of 104 bytes for a 32-bit TSS and * 44 bytes for a 16-bit TSS. */ if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) minlimit = 104 - 1; else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) minlimit = 44 - 1; else minlimit = 0; assert(minlimit > 0); if (nt.limit < (unsigned int)minlimit) { sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); goto done; } /* TSS must be busy if task switch is due to IRET */ if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); goto done; } /* * TSS must be available (not busy) if task switch reason is * CALL, JMP, exception or interrupt. */ if (reason != TSR_IRET && TSS_BUSY(nt_type)) { sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); goto done; } /* Fetch the new TSS */ error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault); CHKERR(error, fault); - vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1); + vm_copyin(nt_iov, &newtss, minlimit + 1); /* Get the old TSS selector from the guest's task register */ ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { /* * This might happen if a task switch was attempted without * ever loading the task register with LTR. In this case the * TR would contain the values from power-on: * (sel = 0, base = 0, limit = 0xffff). */ sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); goto done; } /* Get the old TSS base and limit from the guest's task register */ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, &access); assert(error == 0); assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); ot_type = SEG_DESC_TYPE(access); assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); /* Fetch the old TSS descriptor */ error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc, &fault); CHKERR(error, fault); /* Get the old TSS */ error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault); CHKERR(error, fault); - vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1); + vm_copyin(ot_iov, &oldtss, minlimit + 1); /* * Clear the busy bit in the old TSS descriptor if the task switch * due to an IRET or JMP instruction. */ if (reason == TSR_IRET || reason == TSR_JMP) { ot_desc.sd_type &= ~0x2; error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, &ot_desc, &fault); CHKERR(error, fault); } if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { EPRINTLN("Task switch to 16-bit TSS not supported"); return (VMEXIT_ABORT); } /* Save processor state in old TSS */ tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); /* * If the task switch was triggered for any reason other than IRET * then set the busy bit in the new TSS descriptor. */ if (reason != TSR_IRET) { nt_desc.sd_type |= 0x2; error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, &nt_desc, &fault); CHKERR(error, fault); } /* Update task register to point at the new TSS */ SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); /* Update the hidden descriptor state of the task register */ nt = usd_to_seg_desc(&nt_desc); update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); /* Set CR0.TS */ cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); /* * We are now committed to the task switch. Any exceptions encountered * after this point will be handled in the context of the new task and * the saved instruction pointer will belong to the new task. */ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); assert(error == 0); /* Load processor state from new TSS */ error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov, &fault); CHKERR(error, fault); /* * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception * caused an error code to be generated, this error code is copied * to the stack of the new task. */ if (task_switch->errcode_valid) { assert(task_switch->ext); assert(task_switch->reason == TSR_IDT_GATE); error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, task_switch->errcode, &fault); CHKERR(error, fault); } /* * Treatment of virtual-NMI blocking if NMI is delivered through * a task gate. * * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: * If the virtual NMIs VM-execution control is 1, VM entry injects * an NMI, and delivery of the NMI causes a task switch that causes * a VM exit, virtual-NMI blocking is in effect before the VM exit * commences. * * Thus, virtual-NMI blocking is in effect at the time of the task * switch VM exit. */ /* * Treatment of virtual-NMI unblocking on IRET from NMI handler task. * * Section "Changes to Instruction Behavior in VMX Non-Root Operation" * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. * This unblocking of virtual-NMI occurs even if IRET causes a fault. * * Thus, virtual-NMI blocking is cleared at the time of the task switch * VM exit. */ /* * If the task switch was triggered by an event delivered through * the IDT then extinguish the pending event from the vcpu's * exitintinfo. */ if (task_switch->reason == TSR_IDT_GATE) { error = vm_set_intinfo(ctx, vcpu, 0); assert(error == 0); } /* * XXX should inject debug exception if 'T' bit is 1 */ done: return (VMEXIT_CONTINUE); }