diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 05b0d1135fd1..25e19c0d9931 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -1,1836 +1,1829 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmmapi.h" #include "internal.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) /* * Size of the guard region before and after the virtual address space * mapping the guest physical memory. This must be a multiple of the * superpage size for performance reasons. */ #define VM_MMAP_GUARD_SIZE (4 * MB) #define PROT_RW (PROT_READ | PROT_WRITE) #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) struct vmctx { int fd; uint32_t lowmem_limit; int memflags; size_t lowmem; size_t highmem; char *baseaddr; char *name; }; #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) static int vm_device_open(const char *name) { int fd, len; char *vmfile; len = strlen("/dev/vmm/") + strlen(name) + 1; vmfile = malloc(len); assert(vmfile != NULL); snprintf(vmfile, len, "/dev/vmm/%s", name); /* Open the device file */ fd = open(vmfile, O_RDWR, 0); free(vmfile); return (fd); } int vm_create(const char *name) { /* Try to load vmm(4) module before creating a guest. 
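 *
 * A front end therefore does not need to load the module itself before
 * calling vm_create().  A minimal caller-side sketch (illustrative only;
 * "testvm" is an arbitrary example name):
 *
 *	struct vmctx *ctx;
 *	struct vcpu *vcpu;
 *
 *	if (vm_create("testvm") != 0)
 *		err(1, "vm_create");
 *	ctx = vm_open("testvm");
 *	vcpu = vm_vcpu_open(ctx, 0);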
*/ if (modfind("vmm") < 0) kldload("vmm"); return (CREATE(name)); } struct vmctx * vm_open(const char *name) { struct vmctx *vm; int saved_errno; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); vm->fd = -1; vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); if ((vm->fd = vm_device_open(vm->name)) < 0) goto err; return (vm); err: saved_errno = errno; free(vm); errno = saved_errno; return (NULL); } void vm_close(struct vmctx *vm) { assert(vm != NULL); close(vm->fd); free(vm); } void vm_destroy(struct vmctx *vm) { assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); DESTROY(vm->name); free(vm); } struct vcpu * vm_vcpu_open(struct vmctx *ctx, int vcpuid) { struct vcpu *vcpu; vcpu = malloc(sizeof(*vcpu)); vcpu->ctx = ctx; vcpu->vcpuid = vcpuid; return (vcpu); } void vm_vcpu_close(struct vcpu *vcpu) { free(vcpu); } int vcpu_id(struct vcpu *vcpu) { return (vcpu->vcpuid); } int vm_parse_memsize(const char *opt, size_t *ret_memsize) { char *endptr; size_t optval; int error; optval = strtoul(opt, &endptr, 0); if (*opt != '\0' && *endptr == '\0') { /* * For the sake of backward compatibility if the memory size * specified on the command line is less than a megabyte then * it is interpreted as being in units of MB. */ if (optval < MB) optval *= MB; *ret_memsize = optval; error = 0; } else error = expand_number(opt, ret_memsize); return (error); } uint32_t vm_get_lowmem_limit(struct vmctx *ctx) { return (ctx->lowmem_limit); } void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { ctx->lowmem_limit = limit; } void vm_set_memflags(struct vmctx *ctx, int flags) { ctx->memflags = flags; } int vm_get_memflags(struct vmctx *ctx) { return (ctx->memflags); } /* * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot) { struct vm_memmap memmap; int error, flags; memmap.gpa = gpa; memmap.segid = segid; memmap.segoff = off; memmap.len = len; memmap.prot = prot; memmap.flags = 0; if (ctx->memflags & VM_MEM_F_WIRED) memmap.flags |= VM_MEMMAP_F_WIRED; /* * If this mapping already exists then don't create it again. This * is the common case for SYSMEM mappings created by bhyveload(8). 
*/ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); if (error == 0 && gpa == memmap.gpa) { if (segid != memmap.segid || off != memmap.segoff || prot != memmap.prot || flags != memmap.flags) { errno = EEXIST; return (-1); } else { return (0); } } error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size) { *guest_baseaddr = ctx->baseaddr; *lowmem_size = ctx->lowmem; *highmem_size = ctx->highmem; return (0); } int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len) { struct vm_munmap munmap; int error; munmap.gpa = gpa; munmap.len = len; error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap); return (error); } int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct vm_memmap memmap; int error; bzero(&memmap, sizeof(struct vm_memmap)); memmap.gpa = *gpa; error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); if (error == 0) { *gpa = memmap.gpa; *segid = memmap.segid; *segoff = memmap.segoff; *len = memmap.len; *prot = memmap.prot; *flags = memmap.flags; } return (error); } /* * Return 0 if the segments are identical and non-zero otherwise. * * This is slightly complicated by the fact that only device memory segments * are named. */ static int cmpseg(size_t len, const char *str, size_t len2, const char *str2) { if (len == len2) { if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) return (0); } return (-1); } static int vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { struct vm_memseg memseg; size_t n; int error; /* * If the memory segment has already been created then just return. * This is the usual case for the SYSMEM segment created by userspace * loaders like bhyveload(8). 
*/ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, sizeof(memseg.name)); if (error) return (error); if (memseg.len != 0) { if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { errno = EINVAL; return (-1); } else { return (0); } } bzero(&memseg, sizeof(struct vm_memseg)); memseg.segid = segid; memseg.len = len; if (name != NULL) { n = strlcpy(memseg.name, name, sizeof(memseg.name)); if (n >= sizeof(memseg.name)) { errno = ENAMETOOLONG; return (-1); } } error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); return (error); } int vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, size_t bufsize) { struct vm_memseg memseg; size_t n; int error; memseg.segid = segid; error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); if (error == 0) { *lenp = memseg.len; n = strlcpy(namebuf, memseg.name, bufsize); if (n >= bufsize) { errno = ENAMETOOLONG; error = -1; } } return (error); } static int setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) { char *ptr; int error, flags; /* Map 'len' bytes starting at 'gpa' in the guest address space */ error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); if (error) return (error); flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap into the process address space on the host */ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); if (ptr == MAP_FAILED) return (-1); return (0); } int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { size_t objsize, len; vm_paddr_t gpa; char *baseaddr, *ptr; int error; assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then * create another 'highmem' segment above 4GB for the remainder. */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; ctx->highmem = memsize - ctx->lowmem_limit; objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; objsize = ctx->lowmem; } error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); if (error) return (error); /* * Stake out a contiguous region covering the guest physical memory * and the adjoining guard regions. */ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (ptr == MAP_FAILED) return (-1); baseaddr = ptr + VM_MMAP_GUARD_SIZE; if (ctx->highmem > 0) { gpa = 4*GB; len = ctx->highmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } if (ctx->lowmem > 0) { gpa = 0; len = ctx->lowmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } ctx->baseaddr = baseaddr; return (0); } /* * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in * the lowmem or highmem regions. * * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. * The instruction emulation code depends on this behavior. 
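 *
 * A caller that needs direct access to a guest buffer can rely on this,
 * for example (illustrative sketch only; EFAULT here is simply the
 * caller's own error convention):
 *
 *	void *host = vm_map_gpa(ctx, gpa, len);
 *
 *	if (host == NULL)
 *		return (EFAULT);
 *	memcpy(host, data, len);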
*/ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { if (ctx->lowmem > 0) { if (gaddr < ctx->lowmem && len <= ctx->lowmem && gaddr + len <= ctx->lowmem) return (ctx->baseaddr + gaddr); } if (ctx->highmem > 0) { if (gaddr >= 4*GB) { if (gaddr < 4*GB + ctx->highmem && len <= ctx->highmem && gaddr + len <= 4*GB + ctx->highmem) return (ctx->baseaddr + gaddr); } } return (NULL); } vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr) { vm_paddr_t offaddr; offaddr = (char *)addr - ctx->baseaddr; if (ctx->lowmem > 0) if (offaddr <= ctx->lowmem) return (offaddr); if (ctx->highmem > 0) if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem) return (offaddr); return ((vm_paddr_t)-1); } const char * vm_get_name(struct vmctx *ctx) { return (ctx->name); } size_t vm_get_lowmem_size(struct vmctx *ctx) { return (ctx->lowmem); } size_t vm_get_highmem_size(struct vmctx *ctx) { return (ctx->highmem); } void * vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) { char pathname[MAXPATHLEN]; size_t len2; char *base, *ptr; int fd, error, flags; fd = -1; ptr = MAP_FAILED; if (name == NULL || strlen(name) == 0) { errno = EINVAL; goto done; } error = vm_alloc_memseg(ctx, segid, len, name); if (error) goto done; strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); strlcat(pathname, ctx->name, sizeof(pathname)); strlcat(pathname, ".", sizeof(pathname)); strlcat(pathname, name, sizeof(pathname)); fd = open(pathname, O_RDWR); if (fd < 0) goto done; /* * Stake out a contiguous region covering the device memory and the * adjoining guard regions. */ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (base == MAP_FAILED) goto done; flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap the devmem region in the host address space */ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); done: if (fd >= 0) close(fd); return (ptr); } static int vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg) { /* * XXX: fragile, handle with care * Assumes that the first field of the ioctl data * is the vcpuid. 
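 *
 * With the vm_run() change in this revision the caller now owns the
 * struct vm_run that is passed through this helper unchanged, apart from
 * the vcpu id written below.  A caller-side sketch, assuming the struct
 * vm_run from the matching vmm_dev.h update carries pointers to
 * caller-provided exit-info and cpuset storage (the field names are
 * assumptions, shown for illustration only):
 *
 *	struct vm_run vmrun;
 *	struct vm_exit vmexit;
 *	cpuset_t dmask;
 *
 *	memset(&vmrun, 0, sizeof(vmrun));
 *	vmrun.vm_exit = &vmexit;
 *	vmrun.cpuset = &dmask;
 *	vmrun.cpusetsize = sizeof(dmask);
 *	error = vm_run(vcpu, &vmrun);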
*/ *(int *)arg = vcpu->vcpuid; return (ioctl(vcpu->ctx->fd, cmd, arg)); } int vm_set_desc(struct vcpu *vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.regnum = reg; vmsegdesc.desc.base = base; vmsegdesc.desc.limit = limit; vmsegdesc.desc.access = access; error = vcpu_ioctl(vcpu, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); return (error); } int vm_get_desc(struct vcpu *vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.regnum = reg; error = vcpu_ioctl(vcpu, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); if (error == 0) { *base = vmsegdesc.desc.base; *limit = vmsegdesc.desc.limit; *access = vmsegdesc.desc.access; } return (error); } int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *seg_desc) { int error; error = vm_get_desc(vcpu, reg, &seg_desc->base, &seg_desc->limit, &seg_desc->access); return (error); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.regnum = reg; vmreg.regval = val; error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg); return (error); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.regnum = reg; error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int vm_set_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset); return (error); } int vm_get_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset); return (error); } int -vm_run(struct vcpu *vcpu, struct vm_exit *vmexit) +vm_run(struct vcpu *vcpu, struct vm_run *vmrun) { - int error; - struct vm_run vmrun; - - bzero(&vmrun, sizeof(vmrun)); - - error = vcpu_ioctl(vcpu, VM_RUN, &vmrun); - bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); - return (error); + return (vcpu_ioctl(vcpu, VM_RUN, vmrun)); } int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { struct vm_suspend vmsuspend; bzero(&vmsuspend, sizeof(vmsuspend)); vmsuspend.how = how; return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); } int vm_reinit(struct vmctx *ctx) { return (ioctl(ctx->fd, VM_REINIT, 0)); } int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vm_exception exc; exc.vector = vector; exc.error_code = errcode; exc.error_code_valid = errcode_valid; exc.restart_instruction = restart_instruction; return (vcpu_ioctl(vcpu, VM_INJECT_EXCEPTION, &exc)); } int vm_apicid2vcpu(struct vmctx *ctx __unused, int apicid) { /* * The apic id associated with the 'vcpu' has the same numerical value * as the 'vcpu' itself. 
*/ return (apicid); } int vm_lapic_irq(struct vcpu *vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.vector = vector; return (vcpu_ioctl(vcpu, VM_LAPIC_IRQ, &vmirq)); } int vm_lapic_local_irq(struct vcpu *vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.vector = vector; return (vcpu_ioctl(vcpu, VM_LAPIC_LOCAL_IRQ, &vmirq)); } int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) { struct vm_lapic_msi vmmsi; bzero(&vmmsi, sizeof(vmmsi)); vmmsi.addr = addr; vmmsi.msg = msg; return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); } int vm_ioapic_assert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); } int vm_ioapic_pincount(struct vmctx *ctx, int *pincount) { return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); } int vm_readwrite_kernemu_device(struct vcpu *vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value) { struct vm_readwrite_kernemu_device irp = { .access_width = fls(size) - 1, .gpa = gpa, .value = write ? *value : ~0ul, }; long cmd = (write ? VM_SET_KERNEMU_DEV : VM_GET_KERNEMU_DEV); int rc; rc = vcpu_ioctl(vcpu, cmd, &irp); if (rc == 0 && !write) *value = irp.value; return (rc); } int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); } int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); } int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); } int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger) { struct vm_isa_irq_trigger isa_irq_trigger; bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); isa_irq_trigger.atpic_irq = atpic_irq; isa_irq_trigger.trigger = trigger; return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); } int vm_inject_nmi(struct vcpu *vcpu) { struct vm_nmi vmnmi; bzero(&vmnmi, sizeof(vmnmi)); return (vcpu_ioctl(vcpu, VM_INJECT_NMI, &vmnmi)); } static const char *capstrmap[] = { [VM_CAP_HALT_EXIT] = "hlt_exit", [VM_CAP_MTRAP_EXIT] = "mtrap_exit", [VM_CAP_PAUSE_EXIT] = "pause_exit", [VM_CAP_UNRESTRICTED_GUEST] = "unrestricted_guest", [VM_CAP_ENABLE_INVPCID] = "enable_invpcid", [VM_CAP_BPT_EXIT] = "bpt_exit", }; int vm_capability_name2type(const char *capname) { int i; for (i = 0; i < (int)nitems(capstrmap); i++) { if (strcmp(capstrmap[i], capname) == 0) return (i); } return (-1); } const char * vm_capability_type2name(int type) { if 
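/*
 * Example (illustrative only): a front end that takes capability names on
 * its command line can combine the helpers above with vm_set_capability():
 *
 *	int cap = vm_capability_name2type("hlt_exit");
 *
 *	if (cap < 0 || vm_set_capability(vcpu, cap, 1) != 0)
 *		warnx("hlt_exit not supported");
 */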
(type >= 0 && type < (int)nitems(capstrmap)) return (capstrmap[type]); return (NULL); } int vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.captype = cap; error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.captype = cap; vmcap.capval = val; return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap)); } int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); } int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); } int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; pptmmio.hpa = hpa; return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); } int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; return (ioctl(ctx->fd, VM_UNMAP_PPTDEV_MMIO, &pptmmio)); } int vm_setup_pptdev_msi(struct vmctx *ctx, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { struct vm_pptdev_msi pptmsi; bzero(&pptmsi, sizeof(pptmsi)); pptmsi.bus = bus; pptmsi.slot = slot; pptmsi.func = func; pptmsi.msg = msg; pptmsi.addr = addr; pptmsi.numvec = numvec; return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } int vm_setup_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct vm_pptdev_msix pptmsix; bzero(&pptmsix, sizeof(pptmsix)); pptmsix.bus = bus; pptmsix.slot = slot; pptmsix.func = func; pptmsix.idx = idx; pptmsix.msg = msg; pptmsix.addr = addr; pptmsix.vector_control = vector_control; return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev ppt; bzero(&ppt, sizeof(ppt)); ppt.bus = bus; ppt.slot = slot; ppt.func = func; return ioctl(ctx->fd, VM_PPTDEV_DISABLE_MSIX, &ppt); } uint64_t * vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv, int *ret_entries) { static _Thread_local uint64_t *stats_buf; static _Thread_local u_int stats_count; uint64_t *new_stats; struct vm_stats vmstats; u_int count, index; bool have_stats; have_stats = false; count = 0; for (index = 0;; index += nitems(vmstats.statbuf)) { vmstats.index = index; if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0) break; if (stats_count < index + vmstats.num_entries) { new_stats = realloc(stats_buf, (index + vmstats.num_entries) * sizeof(uint64_t)); if (new_stats == NULL) { errno = ENOMEM; return (NULL); } stats_count = index + vmstats.num_entries; stats_buf = new_stats; } memcpy(stats_buf + index, vmstats.statbuf, vmstats.num_entries * sizeof(uint64_t)); count += vmstats.num_entries; have_stats = true; if 
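/*
 * Example (illustrative only): dumping every statistic for a vcpu by
 * pairing vm_get_stats() with vm_get_stat_desc():
 *
 *	struct timeval tv;
 *	uint64_t *stats;
 *	int i, nstats;
 *
 *	stats = vm_get_stats(vcpu, &tv, &nstats);
 *	for (i = 0; stats != NULL && i < nstats; i++)
 *		printf("%s: %ju\n", vm_get_stat_desc(ctx, i),
 *		    (uintmax_t)stats[i]);
 */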
(vmstats.num_entries != nitems(vmstats.statbuf)) break; } if (have_stats) { if (ret_entries) *ret_entries = count; if (ret_tv) *ret_tv = vmstats.tv; return (stats_buf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); error = vcpu_ioctl(vcpu, VM_GET_X2APIC_STATE, &x2apic); *state = x2apic.state; return (error); } int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.state = state; error = vcpu_ioctl(vcpu, VM_SET_X2APIC_STATE, &x2apic); return (error); } /* * From Intel Vol 3a: * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT */ int vcpu_reset(struct vcpu *vcpu) { int error; uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; uint32_t desc_access, desc_limit; uint16_t sel; zero = 0; rflags = 0x2; error = vm_set_register(vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; rip = 0xfff0; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; /* * According to Intels Software Developer Manual CR0 should be * initialized with CR0_ET | CR0_NW | CR0_CD but that crashes some * guests like Windows. */ cr0 = CR0_NE; if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR2, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR3, zero)) != 0) goto done; cr4 = 0; if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; /* * CS: present, r/w, accessed, 16-bit, byte granularity, usable */ desc_base = 0xffff0000; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0xf000; if ((error = vm_set_register(vcpu, VM_REG_GUEST_CS, sel)) != 0) goto done; /* * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vcpu, VM_REG_GUEST_SS, sel)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_DS, sel)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_ES, sel)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_FS, sel)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_GS, sel)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_EFER, zero)) != 0) goto done; /* General purpose registers */ rdx = 0xf00; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RAX, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RBX, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RCX, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, 
VM_REG_GUEST_RDX, rdx)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RSI, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RDI, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RBP, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RSP, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R8, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R9, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R10, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R11, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R12, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R13, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R14, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R15, zero)) != 0) goto done; /* GDTR, IDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0; error = vm_set_desc(vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; error = vm_set_desc(vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; /* TR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0000008b; error = vm_set_desc(vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vcpu, VM_REG_GUEST_TR, sel)) != 0) goto done; /* LDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x00000082; error = vm_set_desc(vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_DR6, 0xffff0ff0)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_DR7, 0x400)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, zero)) != 0) goto done; error = 0; done: return (error); } int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) { int error, i; struct vm_gpa_pte gpapte; bzero(&gpapte, sizeof(gpapte)); gpapte.gpa = gpa; error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); if (error == 0) { *num = gpapte.ptenum; for (i = 0; i < gpapte.ptenum; i++) pte[i] = gpapte.pte[i]; } return (error); } int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) { int error; struct vm_hpet_cap cap; bzero(&cap, sizeof(struct vm_hpet_cap)); error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); if (capabilities != NULL) *capabilities = cap.capabilities; return (error); } int vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } #ifndef min #define min(a,b) (((a) < (b)) ? 
(a) : (b)) #endif int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault) { void *va; uint64_t gpa, off; int error, i, n; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; iov[i].iov_len = 0; } while (len) { assert(iovcnt > 0); error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = MIN(len, PAGE_SIZE - off); va = vm_map_gpa(vcpu->ctx, gpa, n); if (va == NULL) return (EFAULT); iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; gla += n; len -= n; } return (0); } void vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused) { /* * Intentionally empty. This is used by the instruction * emulation code shared with the kernel. The in-kernel * version of this is non-empty. */ } void vm_copyin(struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; size_t n; dst = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); src = iov->iov_base; bcopy(src, dst, n); iov++; dst += n; len -= n; } } void vm_copyout(const void *vp, struct iovec *iov, size_t len) { const char *src; char *dst; size_t n; src = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); dst = iov->iov_base; bcopy(src, dst, n); iov++; src += n; len -= n; } } static int vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) { struct vm_cpuset vm_cpuset; int error; bzero(&vm_cpuset, sizeof(struct vm_cpuset)); vm_cpuset.which = which; vm_cpuset.cpusetsize = sizeof(cpuset_t); vm_cpuset.cpus = cpus; error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); return (error); } int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); } int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); } int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); } int vm_activate_cpu(struct vcpu *vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac); return (error); } int vm_suspend_all_cpus(struct vmctx *ctx) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = -1; error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); return (error); } int vm_suspend_cpu(struct vcpu *vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac); return (error); } int vm_resume_cpu(struct vcpu *vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac); return (error); } int vm_resume_all_cpus(struct vmctx *ctx) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = -1; error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); return (error); } int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii); if (error == 0) { *info1 = vmii.info1; *info2 = vmii.info2; } return (error); } int vm_set_intinfo(struct vcpu *vcpu, uint64_t info1) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.info1 = info1; error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii); return (error); } int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) { struct vm_rtc_data rtcdata; int error; 
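/*
 * Example (illustrative only): the RTC helpers in this block operate on
 * the VM's emulated RTC, e.g. to program the current wall-clock time
 * before boot:
 *
 *	if (vm_rtc_settime(ctx, time(NULL)) != 0)
 *		warn("vm_rtc_settime");
 */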
bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; rtcdata.value = value; error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); return (error); } int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); if (error == 0) *retval = rtcdata.value; return (error); } int vm_rtc_settime(struct vmctx *ctx, time_t secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); rtctime.secs = secs; error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); return (error); } int vm_rtc_gettime(struct vmctx *ctx, time_t *secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); if (error == 0) *secs = rtctime.secs; return (error); } int vm_restart_instruction(struct vcpu *vcpu) { int arg; return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg)); } int vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta) { if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { #ifdef SNAPSHOT_DEBUG fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", __func__, meta->dev_name, errno); #endif return (-1); } return (0); } int vm_restore_time(struct vmctx *ctx) { int dummy; dummy = 0; return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); } int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { struct vm_cpu_topology topology; bzero(&topology, sizeof (struct vm_cpu_topology)); topology.sockets = sockets; topology.cores = cores; topology.threads = threads; topology.maxcpus = maxcpus; return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); } int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { struct vm_cpu_topology topology; int error; bzero(&topology, sizeof (struct vm_cpu_topology)); error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); if (error == 0) { *sockets = topology.sockets; *cores = topology.cores; *threads = topology.threads; *maxcpus = topology.maxcpus; } return (error); } /* Keep in sync with machine/vmm_dev.h. 
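 *
 * This table is what allows vm_limit_rights() below to restrict the VM
 * device file descriptor under Capsicum.  A sandboxed consumer would
 * typically do (illustrative sketch only):
 *
 *	ctx = vm_open("testvm");
 *	if (vm_limit_rights(ctx) != 0 || caph_enter() != 0)
 *		err(1, "sandbox");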
*/ static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_GETNEXT, VM_MUNMAP_MEMSEG, VM_SET_REGISTER, VM_GET_REGISTER, VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR, VM_SET_REGISTER_SET, VM_GET_REGISTER_SET, VM_SET_KERNEMU_DEV, VM_GET_KERNEMU_DEV, VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, VM_PPTDEV_MSIX, VM_UNMAP_PPTDEV_MMIO, VM_PPTDEV_DISABLE_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, VM_GLA2GPA_NOFAULT, VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, VM_SET_INTINFO, VM_GET_INTINFO, VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY, VM_SNAPSHOT_REQ, VM_RESTORE_TIME }; int vm_limit_rights(struct vmctx *ctx) { cap_rights_t rights; size_t ncmds; cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); if (caph_rights_limit(ctx->fd, &rights) != 0) return (-1); ncmds = nitems(vm_ioctl_cmds); if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, ncmds) != 0) return (-1); return (0); } /* * Avoid using in new code. Operations on the fd should be wrapped here so that * capability rights can be kept in sync. */ int vm_get_device_fd(struct vmctx *ctx) { return (ctx->fd); } /* Legacy interface, do not use. */ const cap_ioctl_t * vm_get_ioctls(size_t *len) { cap_ioctl_t *cmds; if (len == NULL) { cmds = malloc(sizeof(vm_ioctl_cmds)); if (cmds == NULL) return (NULL); bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); return (cmds); } *len = nitems(vm_ioctl_cmds); return (NULL); } diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index e75f5e471d77..00d48a61fda2 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -1,279 +1,279 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include #include #include #include #include #include /* * API version for out-of-tree consumers like grub-bhyve for making compile * time decisions. */ #define VMMAPI_VERSION 0200 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vcpu; struct vmctx; struct vm_snapshot_meta; enum x2apic_state; /* * Different styles of mapping the memory assigned to a VM into the address * space of the controlling process. */ enum vm_mmap_style { VM_MMAP_NONE, /* no mapping */ VM_MMAP_ALL, /* fully and statically mapped */ VM_MMAP_SPARSE, /* mappings created on-demand */ }; /* * 'flags' value passed to 'vm_set_memflags()'. */ #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ /* * Identifiers for memory segments: * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. * - the remaining identifiers can be used to create devmem segments. */ enum { VM_SYSMEM, VM_BOOTROM, VM_FRAMEBUFFER, VM_PCIROM, }; __BEGIN_DECLS /* * Get the length and name of the memory segment identified by 'segid'. * Note that system memory segments are identified with a nul name. * * Returns 0 on success and non-zero otherwise. */ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, size_t namesiz); /* * Iterate over the guest address space. This function finds an address range * that starts at an address >= *gpa. * * Returns 0 if the next address range was found and non-zero otherwise. */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size); /* * Create a device memory segment identified by 'segid'. * * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. */ void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len); /* * Map the memory segment identified by 'segid' into the guest address space * at [gpa,gpa+len) with protection 'prot'. 
*/ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t segoff, size_t len, int prot); int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len); int vm_create(const char *name); struct vmctx *vm_open(const char *name); void vm_close(struct vmctx *ctx); void vm_destroy(struct vmctx *ctx); int vm_limit_rights(struct vmctx *ctx); struct vcpu *vm_vcpu_open(struct vmctx *ctx, int vcpuid); void vm_vcpu_close(struct vcpu *vcpu); int vcpu_id(struct vcpu *vcpu); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); /* inverse operation to vm_map_gpa - extract guest address from host pointer */ vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); const char *vm_get_name(struct vmctx *ctx); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vcpu *vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); int vm_get_desc(struct vcpu *vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_get_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals); -int vm_run(struct vcpu *vcpu, struct vm_exit *ret_vmexit); +int vm_run(struct vcpu *vcpu, struct vm_run *vmrun); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); int vm_lapic_irq(struct vcpu *vcpu, int vector); int vm_lapic_local_irq(struct vcpu *vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); int vm_readwrite_kernemu_device(struct vcpu *vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value); int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger); int vm_inject_nmi(struct vcpu *vcpu); int vm_capability_name2type(const char *capname); const char *vm_capability_type2name(int type); int vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval); int vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, 
int val); int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len); int vm_setup_pptdev_msi(struct vmctx *ctx, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func); int vm_get_intinfo(struct vcpu *vcpu, uint64_t *i1, uint64_t *i2); int vm_set_intinfo(struct vcpu *vcpu, uint64_t exit_intinfo); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ uint64_t *vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv, int *ret_entries); const char *vm_get_stat_desc(struct vmctx *ctx, int index); int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *s); int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. * The 'iovcnt' should be big enough to accommodate all GPA segments. * * retval fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Error */ int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault); void vm_copyin(struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(const void *host_src, struct iovec *guest_iov, size_t len); void vm_copy_teardown(struct iovec *iov, int iovcnt); /* RTC */ int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); int vm_rtc_settime(struct vmctx *ctx, time_t secs); int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); /* Reset vcpu register state */ int vcpu_reset(struct vcpu *vcpu); int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_all_cpus(struct vmctx *ctx); int vm_suspend_cpu(struct vcpu *vcpu); int vm_resume_all_cpus(struct vmctx *ctx); int vm_resume_cpu(struct vcpu *vcpu); int vm_restart_instruction(struct vcpu *vcpu); /* CPU topology */ int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); /* * FreeBSD specific APIs */ int vm_setup_freebsd_registers(struct vcpu *vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp); int vm_setup_freebsd_registers_i386(struct vcpu *vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); /* * Save and restore */ int vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta); int vm_restore_time(struct vmctx *ctx); /* * Deprecated interfaces, do not use them in new code. 
*/ int vm_get_device_fd(struct vmctx *ctx); const cap_ioctl_t *vm_get_ioctls(size_t *len); __END_DECLS #endif /* _VMMAPI_H_ */ diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index ca7ed25506b0..163fcf669ae0 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -1,795 +1,800 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #include #include #include struct vcpu; struct vm_snapshot_meta; #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_GUEST_DR0, VM_REG_GUEST_DR1, VM_REG_GUEST_DR2, VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, VM_REG_GUEST_ENTRY_INST_LENGTH, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. 
* A prefix and a suffix are added to the name specified by the user. * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. */ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MIN_NAMELEN 6 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); struct vm; struct vm_exception; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; enum snapshot_req; struct vm_eventinfo { cpuset_t *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vcpui, register_t rip, struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef void * (*vmi_vcpu_init_func_t)(void *vmi, struct vcpu *vcpu, int vcpu_id); typedef void (*vmi_vcpu_cleanup_func_t)(void *vcpui); typedef int (*vmi_get_register_t)(void *vcpui, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vcpui, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vcpui, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vcpui, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vcpui, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vcpui, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vcpui); typedef void (*vmi_vlapic_cleanup)(struct vlapic *vlapic); typedef int (*vmi_snapshot_vcpu_t)(void *vcpui, struct vm_snapshot_meta *meta); typedef int (*vmi_restore_tsc_t)(void *vcpui, uint64_t now); struct vmm_ops { vmm_init_func_t modinit; /* module wide initialization */ vmm_cleanup_func_t modcleanup; vmm_resume_func_t modresume; vmi_init_func_t init; /* vm-specific initialization */ vmi_run_func_t run; vmi_cleanup_func_t cleanup; vmi_vcpu_init_func_t vcpu_init; vmi_vcpu_cleanup_func_t vcpu_cleanup; vmi_get_register_t getreg; vmi_set_register_t setreg; vmi_get_desc_t getdesc; vmi_set_desc_t setdesc; vmi_get_cap_t getcap; vmi_set_cap_t setcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; /* checkpoint operations */ vmi_snapshot_vcpu_t vcpu_snapshot; vmi_restore_tsc_t restore_tsc; }; extern const struct vmm_ops vmm_ops_intel; extern const struct vmm_ops vmm_ops_amd; extern u_int vm_maxcpu; /* maximum virtual cpus */ int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); void vm_slock_vcpus(struct vm *vm); void vm_unlock_vcpus(struct vm *vm); void 
vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); /* * APIs that modify the guest memory map require all vcpus to be frozen. */ void vm_slock_memsegs(struct vm *vm); void vm_xlock_memsegs(struct vm *vm); void vm_unlock_memsegs(struct vm *vm); int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot, int flags); int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); /* * APIs that inspect the guest memory map require only a *single* vcpu to * be frozen. This acts like a read lock on the guest memory map since any * modification requires *all* vcpus to be frozen. */ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, struct vm_object **objptr); vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); void *vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int prot, void **cookie); void *vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int prot, void **cookie); void *vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); bool vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc); -int vm_run(struct vcpu *vcpu, struct vm_exit *vme_user); +int vm_run(struct vcpu *vcpu); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vcpu *vcpu); int vm_nmi_pending(struct vcpu *vcpu); void vm_nmi_clear(struct vcpu *vcpu); int vm_inject_extint(struct vcpu *vcpu); int vm_extint_pending(struct vcpu *vcpu); void vm_extint_clear(struct vcpu *vcpu); int vcpu_vcpuid(struct vcpu *vcpu); struct vm *vcpu_vm(struct vcpu *vcpu); struct vcpu *vm_vcpu(struct vm *vm, int cpu); struct vlapic *vm_lapic(struct vcpu *vcpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vcpu *vcpu, int type, int *val); int vm_set_capability(struct vcpu *vcpu, int type, int val); int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); int vm_restart_instruction(struct vcpu *vcpu); struct vm_exit *vm_exitinfo(struct vcpu *vcpu); +cpuset_t *vm_exitinfo_cpuset(struct vcpu *vcpu); void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip); void vm_exit_debug(struct vcpu 
*vcpu, uint64_t rip); void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip); void vm_exit_astpending(struct vcpu *vcpu, uint64_t rip); void vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip); int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); int vm_restore_time(struct vm *vm); #ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. */ typedef void (*vm_rendezvous_func_t)(struct vcpu *vcpu, void *arg); int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); cpuset_t vm_start_cpus(struct vm *vm, const cpuset_t *tostart); void vm_await_start(struct vm *vm, const cpuset_t *waiting); #endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vcpu *vcpu, struct vm_eventinfo *info) { /* * This check isn't done with atomic operations or under a lock because * there's no need to. If the vcpuid bit is set, the vcpu is part of a * rendezvous and the bit won't be cleared until the vcpu enters the * rendezvous. On rendezvous exit, the cpuset is cleared and the vcpu * will see an empty cpuset. So, the races are harmless. */ return (CPU_ISSET(vcpu_vcpuid(vcpu), info->rptr)); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } static __inline int vcpu_reqidle(struct vm_eventinfo *info) { return (*info->iptr); } int vcpu_debugged(struct vcpu *vcpu); /* * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return false otherwise. */ bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vcpu *vcpu, int *hostcpu) { return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vcpu *vcpu) { struct thread *td; td = curthread; return (td->td_ast != 0 || td->td_owepreempt != 0); } #endif void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. 
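 *
 * Illustrative example (editorial note): a fault-like #GP with a zero error
 * code would normally be raised through the wrapper,
 *
 *	vm_inject_gp(vcpu);
 *
 * which is roughly equivalent to vm_inject_exception(vcpu, IDT_GP, 1, 0, 1),
 * i.e. with the error code marked valid and the instruction marked for
 * restart so that %rip still points at the faulting instruction.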
*/ int vm_inject_exception(struct vcpu *vcpu, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vcpu *vcpu, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. */ int vm_entry_intinfo(struct vcpu *vcpu, uint64_t *info); int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2); /* * Function used to keep track of the guest's TSC offset. The * offset is used by the virutalization extensions to provide a consistent * value for the Time Stamp Counter to the guest. */ void vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset); enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * retval is_fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vcpu *vcpu); int vcpu_trap_wbinvd(struct vcpu *vcpu); #endif /* KERNEL */ /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_BPT_EXIT, VM_CAP_RDPID, VM_CAP_RDTSCP, VM_CAP_IPI_EXIT, VM_CAP_MASK_HWINTR, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 
1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, PAGING_MODE_64_LA57, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. MOV) */ uint16_t op_flags; }; _Static_assert(sizeof(struct vie_op) == 4, "ABI"); _Static_assert(_Alignof(struct vie_op) == 2, "ABI"); #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ /* The following fields are all zeroed upon restart. */ #define vie_startzero num_processed uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ vex_present:1, /* VEX prefixed */ vex_l:1, /* L bit */ index:4, /* SIB byte */ base:4; /* SIB byte */ uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; uint8_t vex_reg:4, /* vvvv: first source register specifier */ vex_pp:2, /* pp */ _sparebits:2; uint8_t _sparebytes[2]; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ uint8_t _sparebyte; struct vie_op op; /* opcode description */ }; _Static_assert(sizeof(struct vie) == 64, "ABI"); _Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI"); _Static_assert(__offsetof(struct vie, scale) == 24, "ABI"); _Static_assert(__offsetof(struct vie, base_register) == 28, "ABI"); enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, VM_EXITCODE_BPT, VM_EXITCODE_IPI, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* 
task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. */ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. */ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; struct { int inst_length; } bpt; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; uint64_t intr_status; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct { + /* + * The destination vCPU mask is saved in vcpu->cpuset + * and is copied out to userspace separately to avoid + * ABI concerns. + */ uint32_t mode; uint8_t vector; - cpuset_t dmask; } ipi; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_UD, 0, 0); } static __inline void vm_inject_gp(struct vcpu *vcpu) { vm_inject_fault(vcpu, IDT_GP, 1, 0); } static __inline void vm_inject_ac(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(struct vcpu *vcpu, int errcode) { vm_inject_fault(vcpu, IDT_SS, 1, errcode); } void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2); #endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index def48a9f86fe..42406b115933 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -1,479 +1,481 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ struct vm_snapshot_meta; #ifdef _KERNEL void vmmdev_init(void); int vmmdev_cleanup(void); #endif struct vm_memmap { vm_paddr_t gpa; int segid; /* memory segment */ vm_ooffset_t segoff; /* offset into memory segment */ size_t len; /* mmap length */ int prot; /* RWX */ int flags; }; #define VM_MEMMAP_F_WIRED 0x01 #define VM_MEMMAP_F_IOMMU 0x02 struct vm_munmap { vm_paddr_t gpa; size_t len; }; #define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL) struct vm_memseg { int segid; size_t len; char name[VM_MAX_SUFFIXLEN + 1]; }; struct vm_memseg_fbsd12 { int segid; size_t len; char name[64]; }; _Static_assert(sizeof(struct vm_memseg_fbsd12) == 80, "COMPAT_FREEBSD12 ABI"); struct vm_register { int cpuid; int regnum; /* enum vm_reg_name */ uint64_t regval; }; struct vm_seg_desc { /* data or code segment */ int cpuid; int regnum; /* enum vm_reg_name */ struct seg_desc desc; }; struct vm_register_set { int cpuid; unsigned int count; const int *regnums; /* enum vm_reg_name */ uint64_t *regvals; }; struct vm_run { int cpuid; - struct vm_exit vm_exit; + cpuset_t *cpuset; /* CPU set storage */ + size_t cpusetsize; + struct vm_exit *vm_exit; }; struct vm_exception { int cpuid; int vector; uint32_t error_code; int error_code_valid; int restart_instruction; }; struct vm_lapic_msi { uint64_t msg; uint64_t addr; }; struct vm_lapic_irq { int cpuid; int vector; }; struct vm_ioapic_irq { int irq; }; struct vm_isa_irq { int atpic_irq; int ioapic_irq; }; struct vm_isa_irq_trigger { int atpic_irq; enum vm_intr_trigger trigger; }; struct vm_capability { int cpuid; enum vm_cap_type captype; int capval; int allcpus; }; struct vm_pptdev { int bus; int slot; int func; }; struct vm_pptdev_mmio { int bus; int slot; int func; vm_paddr_t gpa; vm_paddr_t hpa; size_t len; }; struct vm_pptdev_msi { int vcpu; /* unused */ int bus; int slot; int func; int numvec; /* 0 means disabled */ uint64_t msg; uint64_t addr; }; struct vm_pptdev_msix { int vcpu; /* unused */ int bus; int slot; int func; int idx; uint64_t msg; uint32_t vector_control; uint64_t addr; }; struct vm_nmi { int cpuid; }; #define MAX_VM_STATS 64 struct vm_stats { int cpuid; /* in */ int index; /* in */ int num_entries; /* out */ struct timeval tv; uint64_t statbuf[MAX_VM_STATS]; }; struct vm_stat_desc { int index; /* in */ char desc[128]; /* out */ }; struct vm_x2apic { int cpuid; enum x2apic_state state; }; struct vm_gpa_pte { uint64_t gpa; /* in */ uint64_t pte[4]; /* out */ int ptenum; }; struct vm_hpet_cap { uint32_t capabilities; /* lower 32 bits of HPET capabilities */ }; struct vm_suspend { enum vm_suspend_how how; }; struct vm_gla2gpa { int vcpuid; /* inputs */ int prot; /* PROT_READ or PROT_WRITE */ uint64_t gla; struct vm_guest_paging paging; int fault; /* outputs */ uint64_t gpa; }; struct vm_activate_cpu { int vcpuid; }; struct vm_cpuset { int which; int cpusetsize; cpuset_t *cpus; }; #define VM_ACTIVE_CPUS 0 #define VM_SUSPENDED_CPUS 1 #define VM_DEBUG_CPUS 2 struct 
vm_intinfo { int vcpuid; uint64_t info1; uint64_t info2; }; struct vm_rtc_time { time_t secs; }; struct vm_rtc_data { int offset; uint8_t value; }; struct vm_cpu_topology { uint16_t sockets; uint16_t cores; uint16_t threads; uint16_t maxcpus; }; struct vm_readwrite_kernemu_device { int vcpuid; unsigned access_width : 3; unsigned _unused : 29; uint64_t gpa; uint64_t value; }; _Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI"); enum { /* general routines */ IOCNUM_ABIVERS = 0, IOCNUM_RUN = 1, IOCNUM_SET_CAPABILITY = 2, IOCNUM_GET_CAPABILITY = 3, IOCNUM_SUSPEND = 4, IOCNUM_REINIT = 5, /* memory apis */ IOCNUM_MAP_MEMORY = 10, /* deprecated */ IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */ IOCNUM_GET_GPA_PMAP = 12, IOCNUM_GLA2GPA = 13, IOCNUM_ALLOC_MEMSEG = 14, IOCNUM_GET_MEMSEG = 15, IOCNUM_MMAP_MEMSEG = 16, IOCNUM_MMAP_GETNEXT = 17, IOCNUM_GLA2GPA_NOFAULT = 18, IOCNUM_MUNMAP_MEMSEG = 19, /* register/state accessors */ IOCNUM_SET_REGISTER = 20, IOCNUM_GET_REGISTER = 21, IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, IOCNUM_SET_REGISTER_SET = 24, IOCNUM_GET_REGISTER_SET = 25, IOCNUM_GET_KERNEMU_DEV = 26, IOCNUM_SET_KERNEMU_DEV = 27, /* interrupt injection */ IOCNUM_GET_INTINFO = 28, IOCNUM_SET_INTINFO = 29, IOCNUM_INJECT_EXCEPTION = 30, IOCNUM_LAPIC_IRQ = 31, IOCNUM_INJECT_NMI = 32, IOCNUM_IOAPIC_ASSERT_IRQ = 33, IOCNUM_IOAPIC_DEASSERT_IRQ = 34, IOCNUM_IOAPIC_PULSE_IRQ = 35, IOCNUM_LAPIC_MSI = 36, IOCNUM_LAPIC_LOCAL_IRQ = 37, IOCNUM_IOAPIC_PINCOUNT = 38, IOCNUM_RESTART_INSTRUCTION = 39, /* PCI pass-thru */ IOCNUM_BIND_PPTDEV = 40, IOCNUM_UNBIND_PPTDEV = 41, IOCNUM_MAP_PPTDEV_MMIO = 42, IOCNUM_PPTDEV_MSI = 43, IOCNUM_PPTDEV_MSIX = 44, IOCNUM_PPTDEV_DISABLE_MSIX = 45, IOCNUM_UNMAP_PPTDEV_MMIO = 46, /* statistics */ IOCNUM_VM_STATS = 50, IOCNUM_VM_STAT_DESC = 51, /* kernel device state */ IOCNUM_SET_X2APIC_STATE = 60, IOCNUM_GET_X2APIC_STATE = 61, IOCNUM_GET_HPET_CAPABILITIES = 62, /* CPU Topology */ IOCNUM_SET_TOPOLOGY = 63, IOCNUM_GET_TOPOLOGY = 64, /* legacy interrupt injection */ IOCNUM_ISA_ASSERT_IRQ = 80, IOCNUM_ISA_DEASSERT_IRQ = 81, IOCNUM_ISA_PULSE_IRQ = 82, IOCNUM_ISA_SET_IRQ_TRIGGER = 83, /* vm_cpuset */ IOCNUM_ACTIVATE_CPU = 90, IOCNUM_GET_CPUSET = 91, IOCNUM_SUSPEND_CPU = 92, IOCNUM_RESUME_CPU = 93, /* RTC */ IOCNUM_RTC_READ = 100, IOCNUM_RTC_WRITE = 101, IOCNUM_RTC_SETTIME = 102, IOCNUM_RTC_GETTIME = 103, /* checkpoint */ IOCNUM_SNAPSHOT_REQ = 113, IOCNUM_RESTORE_TIME = 115 }; #define VM_RUN \ - _IOWR('v', IOCNUM_RUN, struct vm_run) + _IOW('v', IOCNUM_RUN, struct vm_run) #define VM_SUSPEND \ _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) #define VM_REINIT \ _IO('v', IOCNUM_REINIT) #define VM_ALLOC_MEMSEG_FBSD12 \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_fbsd12) #define VM_ALLOC_MEMSEG \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) #define VM_GET_MEMSEG_FBSD12 \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_fbsd12) #define VM_GET_MEMSEG \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) #define VM_MMAP_MEMSEG \ _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) #define VM_MMAP_GETNEXT \ _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) #define VM_MUNMAP_MEMSEG \ _IOW('v', IOCNUM_MUNMAP_MEMSEG, struct vm_munmap) #define VM_SET_REGISTER \ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) #define VM_GET_REGISTER \ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) #define VM_SET_SEGMENT_DESCRIPTOR \ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_GET_SEGMENT_DESCRIPTOR \ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, 
struct vm_seg_desc) #define VM_SET_REGISTER_SET \ _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) #define VM_GET_REGISTER_SET \ _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) #define VM_SET_KERNEMU_DEV \ _IOW('v', IOCNUM_SET_KERNEMU_DEV, \ struct vm_readwrite_kernemu_device) #define VM_GET_KERNEMU_DEV \ _IOWR('v', IOCNUM_GET_KERNEMU_DEV, \ struct vm_readwrite_kernemu_device) #define VM_INJECT_EXCEPTION \ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) #define VM_LAPIC_IRQ \ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) #define VM_LAPIC_LOCAL_IRQ \ _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq) #define VM_LAPIC_MSI \ _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi) #define VM_IOAPIC_ASSERT_IRQ \ _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_DEASSERT_IRQ \ _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_PULSE_IRQ \ _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_PINCOUNT \ _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int) #define VM_ISA_ASSERT_IRQ \ _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq) #define VM_ISA_DEASSERT_IRQ \ _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq) #define VM_ISA_PULSE_IRQ \ _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq) #define VM_ISA_SET_IRQ_TRIGGER \ _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger) #define VM_SET_CAPABILITY \ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) #define VM_GET_CAPABILITY \ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) #define VM_BIND_PPTDEV \ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) #define VM_UNBIND_PPTDEV \ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) #define VM_MAP_PPTDEV_MMIO \ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) #define VM_PPTDEV_MSI \ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) #define VM_PPTDEV_MSIX \ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) #define VM_PPTDEV_DISABLE_MSIX \ _IOW('v', IOCNUM_PPTDEV_DISABLE_MSIX, struct vm_pptdev) #define VM_UNMAP_PPTDEV_MMIO \ _IOW('v', IOCNUM_UNMAP_PPTDEV_MMIO, struct vm_pptdev_mmio) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) #define VM_STAT_DESC \ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) #define VM_SET_X2APIC_STATE \ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_X2APIC_STATE \ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_HPET_CAPABILITIES \ _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) #define VM_SET_TOPOLOGY \ _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_TOPOLOGY \ _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_GPA_PMAP \ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) #define VM_GLA2GPA \ _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) #define VM_GLA2GPA_NOFAULT \ _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) #define VM_ACTIVATE_CPU \ _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) #define VM_GET_CPUS \ _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) #define VM_SUSPEND_CPU \ _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) #define VM_RESUME_CPU \ _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) #define VM_SET_INTINFO \ _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) #define VM_GET_INTINFO \ _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) #define VM_RTC_WRITE \ _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) #define VM_RTC_READ \ _IOWR('v', 
IOCNUM_RTC_READ, struct vm_rtc_data) #define VM_RTC_SETTIME \ _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) #define VM_RTC_GETTIME \ _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) #define VM_RESTART_INSTRUCTION \ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) #define VM_SNAPSHOT_REQ \ _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta) #define VM_RESTORE_TIME \ _IOWR('v', IOCNUM_RESTORE_TIME, int) #endif diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 45a70e494e62..d36a7738278e 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -1,1911 +1,1911 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vlapic.h" #include "vlapic_priv.h" #include "vioapic.h" #define PRIO(x) ((x) >> 4) #define VLAPIC_VERSION (0x14) #define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) /* * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the * vlapic_callout_handler() and vcpu accesses to: * - timer_freq_bt, timer_period_bt, timer_fire_bt * - timer LVT register */ #define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) /* * APIC timer frequency: * - arbitrary but chosen to be in the ballpark of contemporary hardware. * - power-of-two to avoid loss of precision when converted to a bintime. 
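 *
 * (Editorial example, not part of this change: FREQ2BT(), which this file
 * uses below to derive timer_freq_bt, computes the fractional tick length
 * from 2^64 / freq. A power-of-two rate such as 128 * 1024 * 1024 = 2^27
 * divides 2^64 exactly, giving an exact 2^37 fraction, whereas a rate like
 * 100 MHz would be truncated.)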
*/ #define VLAPIC_BUS_FREQ (128 * 1024 * 1024) static void vlapic_set_error(struct vlapic *, uint32_t, bool); static void vlapic_callout_handler(void *arg); static void vlapic_reset(struct vlapic *vlapic); static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) { if (x2apic(vlapic)) return (vlapic->vcpuid); else return (vlapic->vcpuid << 24); } static uint32_t x2apic_ldr(struct vlapic *vlapic) { int apicid; uint32_t ldr; apicid = vlapic_get_id(vlapic); ldr = 1 << (apicid & 0xf); ldr |= (apicid & 0xffff0) << 12; return (ldr); } void vlapic_dfr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; if (x2apic(vlapic)) { VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", lapic->dfr); lapic->dfr = 0; return; } lapic->dfr &= APIC_DFR_MODEL_MASK; lapic->dfr |= APIC_DFR_RESERVED; if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); else VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); } void vlapic_ldr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; /* LDR is read-only in x2apic mode */ if (x2apic(vlapic)) { VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", lapic->ldr); lapic->ldr = x2apic_ldr(vlapic); } else { lapic->ldr &= ~APIC_LDR_RESERVED; VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); } } void vlapic_id_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; /* * We don't allow the ID register to be modified so reset it back to * its default value. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); } static int vlapic_timer_divisor(uint32_t dcr) { switch (dcr & 0xB) { case APIC_TDCR_1: return (1); case APIC_TDCR_2: return (2); case APIC_TDCR_4: return (4); case APIC_TDCR_8: return (8); case APIC_TDCR_16: return (16); case APIC_TDCR_32: return (32); case APIC_TDCR_64: return (64); case APIC_TDCR_128: return (128); default: panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); } } #if 0 static inline void vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) { printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, *lvt & APIC_LVTT_M); } #endif static uint32_t vlapic_get_ccr(struct vlapic *vlapic) { struct bintime bt_now, bt_rem; struct LAPIC *lapic __diagused; uint32_t ccr; ccr = 0; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); if (callout_active(&vlapic->callout)) { /* * If the timer is scheduled to expire in the future then * compute the value of 'ccr' based on the remaining time. */ binuptime(&bt_now); if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { bt_rem = vlapic->timer_fire_bt; bintime_sub(&bt_rem, &bt_now); ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; } } KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " "icr_timer is %#x", ccr, lapic->icr_timer)); VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", ccr, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); return (ccr); } void vlapic_dcr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; int divisor; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); divisor = vlapic_timer_divisor(lapic->dcr_timer); VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", lapic->dcr_timer, divisor); /* * Update the timer frequency and the timer period. 
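 *
 * Worked example (editorial annotation): with the DCR selecting
 * divide-by-8, the effective rate is VLAPIC_BUS_FREQ / 8 = 16,777,216
 * ticks per second, so timer_freq_bt describes a tick of roughly 59.6 ns
 * and an icr_timer value of 16,777,216 produces a timer_period_bt of one
 * second (timer_period_bt = timer_freq_bt * icr_timer).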
* * XXX changes to the frequency divider will not take effect until * the timer is reloaded. */ FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_esr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->esr = vlapic->esr_pending; vlapic->esr_pending = 0; } int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *irrptr, *tmrptr, mask; int idx; KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); lapic = vlapic->apic_page; if (!(lapic->svr & APIC_SVR_ENABLE)) { VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " "interrupt %d", vector); return (0); } if (vector < 16) { vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", vector); return (1); } if (vlapic->ops.set_intr_ready) return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); idx = (vector / 32) * 4; mask = 1 << (vector % 32); irrptr = &lapic->irr0; atomic_set_int(&irrptr[idx], mask); /* * Verify that the trigger-mode of the interrupt matches with * the vlapic TMR registers. */ tmrptr = &lapic->tmr0; if ((tmrptr[idx] & mask) != (level ? mask : 0)) { VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " "interrupt is %s-triggered", idx / 4, tmrptr[idx], level ? "level" : "edge"); } VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); return (1); } static __inline uint32_t * vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) { struct LAPIC *lapic = vlapic->apic_page; int i; switch (offset) { case APIC_OFFSET_CMCI_LVT: return (&lapic->lvt_cmci); case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; return ((&lapic->lvt_timer) + i); default: panic("vlapic_get_lvt: invalid LVT\n"); } } static __inline int lvt_off_to_idx(uint32_t offset) { int index; switch (offset) { case APIC_OFFSET_CMCI_LVT: index = APIC_LVT_CMCI; break; case APIC_OFFSET_TIMER_LVT: index = APIC_LVT_TIMER; break; case APIC_OFFSET_THERM_LVT: index = APIC_LVT_THERMAL; break; case APIC_OFFSET_PERF_LVT: index = APIC_LVT_PMC; break; case APIC_OFFSET_LINT0_LVT: index = APIC_LVT_LINT0; break; case APIC_OFFSET_LINT1_LVT: index = APIC_LVT_LINT1; break; case APIC_OFFSET_ERROR_LVT: index = APIC_LVT_ERROR; break; default: index = -1; break; } KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " "invalid lvt index %d for offset %#x", index, offset)); return (index); } static __inline uint32_t vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) { int idx; uint32_t val; idx = lvt_off_to_idx(offset); val = atomic_load_acq_32(&vlapic->lvt_last[idx]); return (val); } void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) { uint32_t *lvtptr, mask, val; struct LAPIC *lapic; int idx; lapic = vlapic->apic_page; lvtptr = vlapic_get_lvtptr(vlapic, offset); val = *lvtptr; idx = lvt_off_to_idx(offset); if (!(lapic->svr & APIC_SVR_ENABLE)) val |= APIC_LVT_M; mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; switch (offset) { case APIC_OFFSET_TIMER_LVT: mask |= APIC_LVTT_TM; break; case APIC_OFFSET_ERROR_LVT: break; case APIC_OFFSET_LINT0_LVT: case APIC_OFFSET_LINT1_LVT: mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; /* FALLTHROUGH */ default: mask |= APIC_LVT_DM; break; } val &= mask; *lvtptr = val; atomic_store_rel_32(&vlapic->lvt_last[idx], val); } static void vlapic_mask_lvts(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; lapic->lvt_cmci |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); lapic->lvt_timer |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); lapic->lvt_thermal |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); lapic->lvt_pcint |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); lapic->lvt_lint0 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); lapic->lvt_lint1 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); lapic->lvt_error |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); } static int vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) { uint32_t mode, reg, vec; reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); if (reg & APIC_LVT_M) return (0); vec = reg & APIC_LVT_VECTOR; mode = reg & APIC_LVT_DM; switch (mode) { case APIC_LVT_DM_FIXED: if (vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, lvt == APIC_LVT_ERROR); return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) vcpu_notify_event(vlapic->vcpu, true); break; case APIC_LVT_DM_NMI: vm_inject_nmi(vlapic->vcpu); break; case APIC_LVT_DM_EXTINT: vm_inject_extint(vlapic->vcpu); break; default: // Other modes ignored return (0); } return (1); } #if 1 static void dump_isrvec_stk(struct vlapic *vlapic) { int i; uint32_t *isrptr; isrptr = &vlapic->apic_page->isr0; for (i = 0; i < 8; i++) printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); for (i = 0; i <= vlapic->isrvec_stk_top; i++) printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); } #endif /* * Algorithm adopted from section "Interrupt, Task and Processor Priority" * in Intel Architecture Manual Vol 3a. 
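 *
 * Worked example (editorial, matching the code below rather than quoting
 * the manual): priorities are compared per 16-vector class, PRIO(x) = x >> 4.
 * If TPR = 0x35 and the in-service vector (ISRV) is 0x51, PRIO(0x35) = 3 is
 * below PRIO(0x51) = 5, so PPR becomes ISRV & 0xf0 = 0x50; had TPR been
 * 0x61 instead, PPR would simply be 0x61.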
*/ static void vlapic_update_ppr(struct vlapic *vlapic) { int isrvec, tpr, ppr; /* * Note that the value on the stack at index 0 is always 0. * * This is a placeholder for the value of ISRV when none of the * bits is set in the ISRx registers. */ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; tpr = vlapic->apic_page->tpr; #if 1 { int i, lastprio, curprio, vector, idx; uint32_t *isrptr; if (vlapic->isrvec_stk_top == 0 && isrvec != 0) panic("isrvec_stk is corrupted: %d", isrvec); /* * Make sure that the priority of the nested interrupts is * always increasing. */ lastprio = -1; for (i = 1; i <= vlapic->isrvec_stk_top; i++) { curprio = PRIO(vlapic->isrvec_stk[i]); if (curprio <= lastprio) { dump_isrvec_stk(vlapic); panic("isrvec_stk does not satisfy invariant"); } lastprio = curprio; } /* * Make sure that each bit set in the ISRx registers has a * corresponding entry on the isrvec stack. */ i = 1; isrptr = &vlapic->apic_page->isr0; for (vector = 0; vector < 256; vector++) { idx = (vector / 32) * 4; if (isrptr[idx] & (1 << (vector % 32))) { if (i > vlapic->isrvec_stk_top || vlapic->isrvec_stk[i] != vector) { dump_isrvec_stk(vlapic); panic("ISR and isrvec_stk out of sync"); } i++; } } } #endif if (PRIO(tpr) >= PRIO(isrvec)) ppr = tpr; else ppr = isrvec & 0xf0; vlapic->apic_page->ppr = ppr; VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } void vlapic_sync_tpr(struct vlapic *vlapic) { vlapic_update_ppr(vlapic); } static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); static void vlapic_process_eoi(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *isrptr, *tmrptr; int i, idx, bitpos, vector; isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { if (vlapic->isrvec_stk_top <= 0) { panic("invalid vlapic isrvec_stk_top %d", vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); vector = i * 32 + bitpos; VLAPIC_CTR1(vlapic, "EOI vector %d", vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { vioapic_process_eoi(vlapic->vm, vector); } return; } } VLAPIC_CTR0(vlapic, "Gratuitous EOI"); vmm_stat_incr(vlapic->vcpu, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) { return (lvt & mask); } static __inline int vlapic_periodic_timer(struct vlapic *vlapic) { uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); } static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); static void vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) { vlapic->esr_pending |= mask; /* * Avoid infinite recursion if the error LVT itself is configured with * an illegal vector. 
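 *
 * (Editorial note: the cycle being guarded against is vlapic_set_error() ->
 * vlapic_fire_lvt(APIC_LVT_ERROR) -> illegal vector (< 16) ->
 * vlapic_set_error(..., lvt_error = true); returning early when 'lvt_error'
 * is set breaks that loop after a single round.)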
*/ if (lvt_error) return; if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_ERROR, 1); } } static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { VLAPIC_CTR0(vlapic, "vlapic timer fired"); vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_TIMER, 1); } } static VMM_STAT(VLAPIC_INTR_CMC, "corrected machine check interrupts generated by vlapic"); void vlapic_fire_cmci(struct vlapic *vlapic) { if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_CMC, 1); } } static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, "lvts triggered"); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector) { if (vlapic_enabled(vlapic) == false) { /* * When the local APIC is global/hardware disabled, * LINT[1:0] pins are configured as INTR and NMI pins, * respectively. */ switch (vector) { case APIC_LVT_LINT0: vm_inject_extint(vlapic->vcpu); break; case APIC_LVT_LINT1: vm_inject_nmi(vlapic->vcpu); break; default: break; } return (0); } switch (vector) { case APIC_LVT_LINT0: case APIC_LVT_LINT1: case APIC_LVT_TIMER: case APIC_LVT_ERROR: case APIC_LVT_PMC: case APIC_LVT_THERMAL: case APIC_LVT_CMCI: if (vlapic_fire_lvt(vlapic, vector)) { vmm_stat_array_incr(vlapic->vcpu, LVTS_TRIGGERRED, vector, 1); } break; default: return (EINVAL); } return (0); } static void vlapic_callout_reset(struct vlapic *vlapic, sbintime_t t) { callout_reset_sbt_curcpu(&vlapic->callout, t, 0, vlapic_callout_handler, vlapic, 0); } static void vlapic_callout_handler(void *arg) { struct vlapic *vlapic; struct bintime bt, btnow; sbintime_t rem_sbt; vlapic = arg; VLAPIC_TIMER_LOCK(vlapic); if (callout_pending(&vlapic->callout)) /* callout was reset */ goto done; if (!callout_active(&vlapic->callout)) /* callout was stopped */ goto done; callout_deactivate(&vlapic->callout); vlapic_fire_timer(vlapic); if (vlapic_periodic_timer(vlapic)) { binuptime(&btnow); KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, vlapic->timer_fire_bt.frac)); /* * Compute the delta between when the timer was supposed to * fire and the present time. */ bt = btnow; bintime_sub(&bt, &vlapic->timer_fire_bt); rem_sbt = bttosbt(vlapic->timer_period_bt); if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { /* * Adjust the time until the next countdown downward * to account for the lost time. */ rem_sbt -= bttosbt(bt); } else { /* * If the delta is greater than the timer period then * just reset our time base instead of trying to catch * up. 
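 *
 * Worked example (editorial): with a 10 ms period, a callout that fires
 * 2 ms late is rescheduled 8 ms out so ticks stay on the original grid;
 * one that fires 12 ms late would otherwise compute a negative rem_sbt,
 * so the time base is simply moved to "now" and the next tick is a full
 * period away.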
*/ vlapic->timer_fire_bt = btnow; VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " "usecs, period is %lu usecs - resetting time base", bttosbt(bt) / SBT_1US, bttosbt(vlapic->timer_period_bt) / SBT_1US); } bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); vlapic_callout_reset(vlapic, rem_sbt); } done: VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_icrtmr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; sbintime_t sbt; uint32_t icr_timer; VLAPIC_TIMER_LOCK(vlapic); lapic = vlapic->apic_page; icr_timer = lapic->icr_timer; vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, icr_timer); if (icr_timer != 0) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); sbt = bttosbt(vlapic->timer_period_bt); vlapic_callout_reset(vlapic, sbt); } else callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); } /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ static void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { struct vlapic *vlapic; uint32_t dfr, ldr, ldest, cluster; uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; cpuset_t amask; int vcpuid; if ((x2apic_dest && dest == 0xffffffff) || (!x2apic_dest && dest == 0xff)) { /* * Broadcast in both logical and physical modes. */ *dmask = vm_active_cpus(vm); return; } if (phys) { /* * Physical mode: destination is APIC ID. */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); amask = vm_active_cpus(vm); if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask)) CPU_SET(vcpuid, dmask); } else { /* * In the "Flat Model" the MDA is interpreted as an 8-bit wide * bitmask. This model is only available in the xAPIC mode. */ mda_flat_ldest = dest & 0xff; /* * In the "Cluster Model" the MDA is used to identify a * specific cluster and a set of APICs in that cluster. */ if (x2apic_dest) { mda_cluster_id = dest >> 16; mda_cluster_ldest = dest & 0xffff; } else { mda_cluster_id = (dest >> 4) & 0xf; mda_cluster_ldest = dest & 0xf; } /* * Logical mode: match each APIC that has a bit set * in its LDR that matches a bit in the ldest. */ CPU_ZERO(dmask); amask = vm_active_cpus(vm); CPU_FOREACH_ISSET(vcpuid, &amask) { vlapic = vm_lapic(vm_vcpu(vm, vcpuid)); dfr = vlapic->apic_page->dfr; ldr = vlapic->apic_page->ldr; if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) { ldest = ldr >> 24; mda_ldest = mda_flat_ldest; } else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) { if (x2apic(vlapic)) { cluster = ldr >> 16; ldest = ldr & 0xffff; } else { cluster = ldr >> 28; ldest = (ldr >> 24) & 0xf; } if (cluster != mda_cluster_id) continue; mda_ldest = mda_cluster_ldest; } else { /* * Guest has configured a bad logical * model for this vcpu - skip it. 
*/ VLAPIC_CTR1(vlapic, "vlapic has bad logical " "model %x - cannot deliver interrupt", dfr); continue; } if ((mda_ldest & ldest) != 0) { CPU_SET(vcpuid, dmask); if (lowprio) break; } } } } static VMM_STAT(VLAPIC_IPI_SEND, "ipis sent from vcpu"); static VMM_STAT(VLAPIC_IPI_RECV, "ipis received by vcpu"); static void vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) { struct LAPIC *lapic = vlapic->apic_page; if (lapic->tpr != val) { VLAPIC_CTR2(vlapic, "vlapic TPR changed from %#x to %#x", lapic->tpr, val); lapic->tpr = val; vlapic_update_ppr(vlapic); } } static uint8_t vlapic_get_tpr(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; return (lapic->tpr); } void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) { uint8_t tpr; if (val & ~0xf) { vm_inject_gp(vlapic->vcpu); return; } tpr = val << 4; vlapic_set_tpr(vlapic, tpr); } uint64_t vlapic_get_cr8(struct vlapic *vlapic) { uint8_t tpr; tpr = vlapic_get_tpr(vlapic); return (tpr >> 4); } static bool vlapic_is_icr_valid(uint64_t icrval) { uint32_t mode = icrval & APIC_DELMODE_MASK; uint32_t level = icrval & APIC_LEVEL_MASK; uint32_t trigger = icrval & APIC_TRIGMOD_MASK; uint32_t shorthand = icrval & APIC_DEST_MASK; switch (mode) { case APIC_DELMODE_FIXED: if (trigger == APIC_TRIGMOD_EDGE) return (true); /* * AMD allows a level assert IPI and Intel converts a level * assert IPI into an edge IPI. */ if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT) return (true); break; case APIC_DELMODE_LOWPRIO: case APIC_DELMODE_SMI: case APIC_DELMODE_NMI: case APIC_DELMODE_INIT: if (trigger == APIC_TRIGMOD_EDGE && (shorthand == APIC_DEST_DESTFLD || shorthand == APIC_DEST_ALLESELF)) return (true); /* * AMD allows a level assert IPI and Intel converts a level * assert IPI into an edge IPI. */ if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT && (shorthand == APIC_DEST_DESTFLD || shorthand == APIC_DEST_ALLESELF)) return (true); /* * An level triggered deassert INIT is defined in the Intel * Multiprocessor Specification and the Intel Software Developer * Manual. Due to the MPS it's required to send a level assert * INIT to a cpu and then a level deassert INIT. Some operating * systems e.g. FreeBSD or Linux use that algorithm. According * to the SDM a level deassert INIT is only supported by Pentium * and P6 processors. It's always send to all cpus regardless of * the destination or shorthand field. It resets the arbitration * id register. This register is not software accessible and * only required for the APIC bus arbitration. So, the level * deassert INIT doesn't need any emulation and we should ignore * it. The SDM also defines that newer processors don't support * the level deassert INIT and it's not valid any more. As it's * defined for older systems, it can't be invalid per se. * Otherwise, backward compatibility would be broken. However, * when returning false here, it'll be ignored which is the * desired behaviour. */ if (mode == APIC_DELMODE_INIT && trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_DEASSERT) return (false); break; case APIC_DELMODE_STARTUP: if (shorthand == APIC_DEST_DESTFLD || shorthand == APIC_DEST_ALLESELF) return (true); break; case APIC_DELMODE_RR: /* Only available on AMD! 
*/ if (trigger == APIC_TRIGMOD_EDGE && shorthand == APIC_DEST_DESTFLD) return (true); break; case APIC_DELMODE_RESV: return (false); default: __assert_unreachable(); } return (false); } int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) { int i; bool phys; cpuset_t dmask, ipimask; uint64_t icrval; uint32_t dest, vec, mode, shorthand; struct vcpu *vcpu; struct vm_exit *vmexit; struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; if (x2apic(vlapic)) dest = icrval >> 32; else dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; phys = (icrval & APIC_DESTMODE_LOG) == 0; shorthand = icrval & APIC_DEST_MASK; VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); switch (shorthand) { case APIC_DEST_DESTFLD: vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic)); break; case APIC_DEST_SELF: CPU_SETOF(vlapic->vcpuid, &dmask); break; case APIC_DEST_ALLISELF: dmask = vm_active_cpus(vlapic->vm); break; case APIC_DEST_ALLESELF: dmask = vm_active_cpus(vlapic->vm); CPU_CLR(vlapic->vcpuid, &dmask); break; default: __assert_unreachable(); } /* * Ignore invalid combinations of the icr. */ if (!vlapic_is_icr_valid(icrval)) { VLAPIC_CTR1(vlapic, "Ignoring invalid ICR %016lx", icrval); return (0); } /* * ipimask is a set of vCPUs needing userland handling of the current * IPI. */ CPU_ZERO(&ipimask); switch (mode) { case APIC_DELMODE_FIXED: if (vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } CPU_FOREACH_ISSET(i, &dmask) { vcpu = vm_vcpu(vlapic->vm, i); lapic_intr_edge(vcpu, vec); vmm_stat_incr(vlapic->vcpu, VLAPIC_IPI_SEND, 1); vmm_stat_incr(vcpu, VLAPIC_IPI_RECV, 1); VLAPIC_CTR2(vlapic, "vlapic sending ipi %d to vcpuid %d", vec, i); } break; case APIC_DELMODE_NMI: CPU_FOREACH_ISSET(i, &dmask) { vcpu = vm_vcpu(vlapic->vm, i); vm_inject_nmi(vcpu); VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi to vcpuid %d", i); } break; case APIC_DELMODE_INIT: case APIC_DELMODE_STARTUP: if (!vlapic->ipi_exit) { if (!phys) break; i = vm_apicid2vcpuid(vlapic->vm, dest); if (i >= vm_get_maxcpus(vlapic->vm) || i == vlapic->vcpuid) break; CPU_SETOF(i, &ipimask); break; } CPU_COPY(&dmask, &ipimask); break; default: return (1); } if (!CPU_EMPTY(&ipimask)) { vmexit = vm_exitinfo(vlapic->vcpu); vmexit->exitcode = VM_EXITCODE_IPI; vmexit->u.ipi.mode = mode; vmexit->u.ipi.vector = vec; - vmexit->u.ipi.dmask = ipimask; + *vm_exitinfo_cpuset(vlapic->vcpu) = ipimask; *retu = true; } return (0); } static void vlapic_handle_init(struct vcpu *vcpu, void *arg) { struct vlapic *vlapic = vm_lapic(vcpu); vlapic_reset(vlapic); } int vm_handle_ipi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) { struct vlapic *vlapic = vm_lapic(vcpu); - cpuset_t *dmask = &vme->u.ipi.dmask; + cpuset_t *dmask = vm_exitinfo_cpuset(vcpu); uint8_t vec = vme->u.ipi.vector; *retu = true; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: { cpuset_t active, reinit; active = vm_active_cpus(vcpu_vm(vcpu)); CPU_AND(&reinit, &active, dmask); if (!CPU_EMPTY(&reinit)) { vm_smp_rendezvous(vcpu, reinit, vlapic_handle_init, NULL); } vm_await_start(vcpu_vm(vcpu), dmask); if (!vlapic->ipi_exit) *retu = false; break; } case APIC_DELMODE_STARTUP: /* * Ignore SIPIs in any state other than wait-for-SIPI */ *dmask = vm_start_cpus(vcpu_vm(vcpu), dmask); if (CPU_EMPTY(dmask)) { *retu = false; break; } /* * Old bhyve 
versions don't support the IPI * exit. Translate it into the old style. */ if (!vlapic->ipi_exit) { vme->exitcode = VM_EXITCODE_SPINUP_AP; vme->u.spinup_ap.vcpu = CPU_FFS(dmask) - 1; vme->u.spinup_ap.rip = vec << PAGE_SHIFT; } break; default: __assert_unreachable(); } return (0); } void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) { int vec; KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode")); vec = val & 0xff; lapic_intr_edge(vlapic->vcpu, vec); vmm_stat_incr(vlapic->vcpu, VLAPIC_IPI_SEND, 1); vmm_stat_incr(vlapic->vcpu, VLAPIC_IPI_RECV, 1); VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); } int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) { struct LAPIC *lapic = vlapic->apic_page; int idx, i, bitpos, vector; uint32_t *irrptr, val; vlapic_update_ppr(vlapic); if (vlapic->ops.pending_intr) return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); irrptr = &lapic->irr0; for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); if (bitpos != 0) { vector = i * 32 + (bitpos - 1); if (PRIO(vector) > PRIO(lapic->ppr)) { VLAPIC_CTR1(vlapic, "pending intr %d", vector); if (vecptr != NULL) *vecptr = vector; return (1); } else break; } } return (0); } void vlapic_intr_accepted(struct vlapic *vlapic, int vector) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *irrptr, *isrptr; int idx, stk_top; if (vlapic->ops.intr_accepted) return ((*vlapic->ops.intr_accepted)(vlapic, vector)); /* * clear the ready bit for vector being accepted in irr * and set the vector as in service in isr. */ idx = (vector / 32) * 4; irrptr = &lapic->irr0; atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); isrptr = &lapic->isr0; isrptr[idx] |= 1 << (vector % 32); VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); /* * Update the PPR */ vlapic->isrvec_stk_top++; stk_top = vlapic->isrvec_stk_top; if (stk_top >= ISRVEC_STK_SIZE) panic("isrvec_stk_top overflow %d", stk_top); vlapic->isrvec_stk[stk_top] = vector; } void vlapic_svr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; uint32_t old, new, changed; lapic = vlapic->apic_page; new = lapic->svr; old = vlapic->svr_last; vlapic->svr_last = new; changed = old ^ new; if ((changed & APIC_SVR_ENABLE) != 0) { if ((new & APIC_SVR_ENABLE) == 0) { /* * The apic is now disabled so stop the apic timer * and mask all the LVT entries. */ VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); VLAPIC_TIMER_LOCK(vlapic); callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); vlapic_mask_lvts(vlapic); } else { /* * The apic is now enabled so restart the apic timer * if it is configured in periodic mode. 
*/ VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); if (vlapic_periodic_timer(vlapic)) vlapic_icrtmr_write_handler(vlapic); } } } int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t *data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *reg; int i; /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", offset); *data = 0; goto done; } if (!x2apic(vlapic) && !mmio_access) { /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " "xAPIC mode", offset); *data = 0; goto done; } if (offset > sizeof(*lapic)) { *data = 0; goto done; } offset &= ~3; switch(offset) { case APIC_OFFSET_ID: *data = lapic->id; break; case APIC_OFFSET_VER: *data = lapic->version; break; case APIC_OFFSET_TPR: *data = vlapic_get_tpr(vlapic); break; case APIC_OFFSET_APR: *data = lapic->apr; break; case APIC_OFFSET_PPR: *data = lapic->ppr; break; case APIC_OFFSET_EOI: *data = lapic->eoi; break; case APIC_OFFSET_LDR: *data = lapic->ldr; break; case APIC_OFFSET_DFR: *data = lapic->dfr; break; case APIC_OFFSET_SVR: *data = lapic->svr; break; case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: i = (offset - APIC_OFFSET_ISR0) >> 2; reg = &lapic->isr0; *data = *(reg + i); break; case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: i = (offset - APIC_OFFSET_TMR0) >> 2; reg = &lapic->tmr0; *data = *(reg + i); break; case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: i = (offset - APIC_OFFSET_IRR0) >> 2; reg = &lapic->irr0; *data = atomic_load_acq_int(reg + i); break; case APIC_OFFSET_ESR: *data = lapic->esr; break; case APIC_OFFSET_ICR_LOW: *data = lapic->icr_lo; if (x2apic(vlapic)) *data |= (uint64_t)lapic->icr_hi << 32; break; case APIC_OFFSET_ICR_HI: *data = lapic->icr_hi; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: *data = vlapic_get_lvt(vlapic, offset); #ifdef INVARIANTS reg = vlapic_get_lvtptr(vlapic, offset); KASSERT(*data == *reg, ("inconsistent lvt value at " "offset %#lx: %#lx/%#x", offset, *data, *reg)); #endif break; case APIC_OFFSET_TIMER_ICR: *data = lapic->icr_timer; break; case APIC_OFFSET_TIMER_CCR: *data = vlapic_get_ccr(vlapic); break; case APIC_OFFSET_TIMER_DCR: *data = lapic->dcr_timer; break; case APIC_OFFSET_SELF_IPI: /* * XXX generate a GP fault if vlapic is in x2apic mode */ *data = 0; break; case APIC_OFFSET_RRR: default: *data = 0; break; } done: VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); return 0; } int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *regptr; int retval; KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, ("vlapic_write: invalid offset %#lx", offset)); VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", offset, data); if (offset > sizeof(*lapic)) return (0); /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " "in x2APIC mode", data, offset); return (0); } /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ if (!x2apic(vlapic) && !mmio_access) { VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " "in xAPIC mode", data, offset); return (0); } retval = 0; switch(offset) { case APIC_OFFSET_ID: lapic->id = data; vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_TPR: vlapic_set_tpr(vlapic, data & 0xff); break; case APIC_OFFSET_EOI: vlapic_process_eoi(vlapic); break; case APIC_OFFSET_LDR: lapic->ldr = data; vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: lapic->dfr = data; vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: lapic->svr = data; vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: lapic->icr_lo = data; if (x2apic(vlapic)) lapic->icr_hi = data >> 32; retval = vlapic_icrlo_write_handler(vlapic, retu); break; case APIC_OFFSET_ICR_HI: lapic->icr_hi = data; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: regptr = vlapic_get_lvtptr(vlapic, offset); *regptr = data; vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: lapic->icr_timer = data; vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: lapic->dcr_timer = data; vlapic_dcr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_SELF_IPI: if (x2apic(vlapic)) vlapic_self_ipi_handler(vlapic, data); break; case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_TIMER_CCR: default: // Read only. 
break; } return (retval); } static void vlapic_reset(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; bzero(lapic, sizeof(struct LAPIC)); lapic->id = vlapic_get_id(vlapic); lapic->version = VLAPIC_VERSION; lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); vlapic_reset_tmr(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); vlapic->svr_last = lapic->svr; } void vlapic_init(struct vlapic *vlapic) { KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), ("vlapic_init: vcpuid is not initialized")); KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " "initialized")); /* * If the vlapic is configured in x2apic mode then it will be * accessed in the critical section via the MSR emulation code. * * Therefore the timer mutex must be a spinlock because blockable * mutexes cannot be acquired in a critical section. */ mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); callout_init(&vlapic->callout, 1); vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vlapic->vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; vlapic->ipi_exit = false; vlapic_reset(vlapic); } void vlapic_cleanup(struct vlapic *vlapic) { callout_drain(&vlapic->callout); mtx_destroy(&vlapic->timer_mtx); } uint64_t vlapic_get_apicbase(struct vlapic *vlapic) { return (vlapic->msr_apicbase); } int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) { if (vlapic->msr_apicbase != new) { VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " "not supported", vlapic->msr_apicbase, new); return (-1); } return (0); } void vlapic_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { struct vlapic *vlapic; struct LAPIC *lapic; vlapic = vm_lapic(vcpu); if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; else vlapic->msr_apicbase |= APICBASE_X2APIC; /* * Reset the local APIC registers whose values are mode-dependent. * * XXX this works because the APIC mode can be changed only at vcpu * initialization time. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); if (x2apic(vlapic)) { lapic->ldr = x2apic_ldr(vlapic); lapic->dfr = 0; } else { lapic->ldr = 0; lapic->dfr = 0xffffffff; } if (state == X2APIC_ENABLED) { if (vlapic->ops.enable_x2apic_mode) (*vlapic->ops.enable_x2apic_mode)(vlapic); } } void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec) { struct vcpu *vcpu; bool lowprio; int vcpuid; cpuset_t dmask; if (delmode != IOART_DELFIXED && delmode != IOART_DELLOPRI && delmode != IOART_DELEXINT) { VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); return; } lowprio = (delmode == IOART_DELLOPRI); /* * We don't provide any virtual interrupt redirection hardware so * all interrupts originating from the ioapic or MSI specify the * 'dest' in the legacy xAPIC format. */ vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); CPU_FOREACH_ISSET(vcpuid, &dmask) { vcpu = vm_vcpu(vm, vcpuid); if (delmode == IOART_DELEXINT) { vm_inject_extint(vcpu); } else { lapic_set_intr(vcpu, vec, level); } } } void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) { /* * Post an interrupt to the vcpu currently running on 'hostcpu'. * * This is done by leveraging features like Posted Interrupts (Intel) * Doorbell MSR (AMD AVIC) that avoid a VM exit. 
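/*
 * Illustrative sketch, not part of this patch: vlapic_set_x2apic_state()
 * above resets LDR/DFR when the APIC mode changes because in x2APIC mode
 * the logical ID is derived from the APIC ID rather than being writable.
 * x2apic_ldr() itself is not in this hunk; the arithmetic below is the
 * architectural derivation from the Intel SDM (logical x2APIC ID =
 * (ID[19:4] << 16) | (1 << ID[3:0])), shown only as a standalone example,
 * and x2apic_ldr_sketch() is a hypothetical helper name.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t
x2apic_ldr_sketch(uint32_t apic_id)
{
        uint32_t cluster = apic_id >> 4;          /* upper half: cluster ID */
        uint32_t logical = 1u << (apic_id & 0xf); /* one-hot bit in cluster */

        return ((cluster << 16) | logical);
}

int
main(void)
{
        /* APIC ID 0x25 -> cluster 0x2, bit 5 -> LDR 0x00020020. */
        assert(x2apic_ldr_sketch(0x25) == 0x00020020u);
        return (0);
}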
* * If neither of these features are available then fallback to * sending an IPI to 'hostcpu'. */ if (vlapic->ops.post_intr) (*vlapic->ops.post_intr)(vlapic, hostcpu); else ipi_cpu(hostcpu, ipinum); } bool vlapic_enabled(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && (lapic->svr & APIC_SVR_ENABLE) != 0) return (true); else return (false); } static void vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *tmrptr, mask; int idx; lapic = vlapic->apic_page; tmrptr = &lapic->tmr0; idx = (vector / 32) * 4; mask = 1 << (vector % 32); if (level) tmrptr[idx] |= mask; else tmrptr[idx] &= ~mask; if (vlapic->ops.set_tmr != NULL) (*vlapic->ops.set_tmr)(vlapic, vector, level); } void vlapic_reset_tmr(struct vlapic *vlapic) { int vector; VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); for (vector = 0; vector <= 255; vector++) vlapic_set_tmr(vlapic, vector, false); } void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, int delmode, int vector) { cpuset_t dmask; bool lowprio; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); /* * A level trigger is valid only for fixed and lowprio delivery modes. */ if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " "delivery-mode %d", delmode); return; } lowprio = (delmode == APIC_DELMODE_LOWPRIO); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); if (!CPU_ISSET(vlapic->vcpuid, &dmask)) return; VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); vlapic_set_tmr(vlapic, vector, true); } #ifdef BHYVE_SNAPSHOT static void vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr) { /* The implementation is similar to the one in the * `vlapic_icrtmr_write_handler` function */ sbintime_t sbt; struct bintime bt; VLAPIC_TIMER_LOCK(vlapic); bt = vlapic->timer_freq_bt; bintime_mul(&bt, ccr); if (ccr != 0) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &bt); sbt = bttosbt(bt); vlapic_callout_reset(vlapic, sbt); } else { /* even if the CCR was 0, periodic timers should be reset */ if (vlapic_periodic_timer(vlapic)) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); sbt = bttosbt(vlapic->timer_period_bt); callout_stop(&vlapic->callout); vlapic_callout_reset(vlapic, sbt); } } VLAPIC_TIMER_UNLOCK(vlapic); } int vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta) { int ret; struct vcpu *vcpu; struct vlapic *vlapic; struct LAPIC *lapic; uint32_t ccr; uint16_t i, maxcpus; KASSERT(vm != NULL, ("%s: arg was NULL", __func__)); ret = 0; maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm_vcpu(vm, i); if (vcpu == NULL) continue; vlapic = vm_lapic(vcpu); /* snapshot the page first; timer period depends on icr_timer */ lapic = vlapic->apic_page; SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac, meta, ret, done); /* * Timer period is equal to 'icr_timer' ticks at a frequency of * 'timer_freq_bt'. 
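/*
 * Illustrative sketch, not part of this patch, of two bits of arithmetic
 * used above.  (1) vlapic_set_tmr(): the eight 32-bit TMR words sit 16
 * bytes apart in the APIC page, so in a flat uint32_t view starting at
 * &lapic->tmr0 consecutive TMR words are 4 slots apart, hence
 * idx = (vector / 32) * 4.  (2) The snapshot comment just above: the timer
 * period is icr_timer ticks of timer_freq_bt each.  The vector and the
 * 100 MHz divided timer clock below are made-up example values.
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
        /* (1) Vector 69 is bit 5 of TMR2, i.e. flat index 2 * 4 = 8. */
        int vector = 69;
        int idx = (vector / 32) * 4;
        uint32_t mask = 1u << (vector % 32);
        assert(idx == 8 && mask == (1u << 5));

        /* (2) One tick of a 100 MHz clock is 10 ns, so icr_timer = 1000000
         * gives a 10 ms period -- the same product the restore path
         * computes with bintime_mul(&timer_period_bt, lapic->icr_timer). */
        uint64_t tick_ns = 10;
        uint32_t icr_timer = 1000000;
        assert(tick_ns * icr_timer == 10000000ULL);
        return (0);
}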
*/ if (meta->op == VM_SNAPSHOT_RESTORE) { vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); } SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk, sizeof(vlapic->isrvec_stk), meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last, sizeof(vlapic->lvt_last), meta, ret, done); if (meta->op == VM_SNAPSHOT_SAVE) ccr = vlapic_get_ccr(vlapic); SNAPSHOT_VAR_OR_LEAVE(ccr, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE && vlapic_enabled(vlapic) && lapic->icr_timer != 0) { /* Reset the value of the 'timer_fire_bt' and the vlapic * callout based on the value of the current count * register saved when the VM snapshot was created. * If initial count register is 0, timer is not used. * Look at "10.5.4 APIC Timer" in Software Developer Manual. */ vlapic_reset_callout(vlapic, ccr); } } done: return (ret); } #endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 81d5212ec744..0f29a6845f9e 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1,2988 +1,2993 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int vcpuid; /* (o) */ int hostcpu; /* (o) vcpu's host cpu */ int reqidle; /* (i) request vcpu to idle */ struct vm *vm; /* (o) */ void *cookie; /* (i) cpu-specific data */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ int exception_pending; /* (i) exception pending */ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ + cpuset_t exitinfo_cpuset; /* (x) storage for vmexit handlers */ uint64_t nextrip; /* (x) next instruction to execute */ uint64_t tsc_offset; /* (o) TSC offsetting */ }; #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { size_t len; bool sysmem; struct vm_object *object; }; #define VM_MAX_MEMSEGS 4 struct mem_map { vm_paddr_t gpa; size_t len; vm_ooffset_t segoff; int segid; int prot; int flags; }; #define VM_MAX_MEMMAPS 8 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use * * Locking: * [m] mem_segs_lock * [r] rendezvous_mtx * [v] reads require one frozen vcpu, writes require freezing all vcpus */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ cpuset_t startup_cpus; /* (i) [r] waiting for startup */ int suspend; /* (i) stop VM execution */ bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t 
halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) [r] rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) [r] rendezvous finished */ void *rendezvous_arg; /* (x) [r] rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) [m+v] guest address space */ struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) [m+v] guest memory regions */ struct vmspace *vmspace; /* (o) guest's address space */ char name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (o) guest vcpus */ /* The following describe the vm cpu topology */ uint16_t sockets; /* (o) num of sockets */ uint16_t cores; /* (o) num of cores/socket */ uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max pluggable cpus */ struct sx mem_segs_lock; /* (o) */ struct sx vcpus_init_lock; /* (o) */ }; #define VMM_CTR0(vcpu, format) \ VCPU_CTR0((vcpu)->vm, (vcpu)->vcpuid, format) #define VMM_CTR1(vcpu, format, p1) \ VCPU_CTR1((vcpu)->vm, (vcpu)->vcpuid, format, p1) #define VMM_CTR2(vcpu, format, p1, p2) \ VCPU_CTR2((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2) #define VMM_CTR3(vcpu, format, p1, p2, p3) \ VCPU_CTR3((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3) #define VMM_CTR4(vcpu, format, p1, p2, p3, p4) \ VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4) static int vmm_initialized; static void vmmops_panic(void); static void vmmops_panic(void) { panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()"); } #define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ DEFINE_IFUNC(static, ret_type, vmmops_##opname, args) \ { \ if (vmm_is_intel()) \ return (vmm_ops_intel.opname); \ else if (vmm_is_svm()) \ return (vmm_ops_amd.opname); \ else \ return ((ret_type (*)args)vmmops_panic); \ } DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) DEFINE_VMMOPS_IFUNC(void, modresume, (void)) DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t rip, struct pmap *pmap, struct vm_eventinfo *info)) DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, int vcpu_id)) DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vcpui, int num, struct seg_desc *desc)) DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vcpui, int num, struct seg_desc *desc)) DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, vm_offset_t max)) DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vcpui)) DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (struct vlapic *vlapic)) #ifdef BHYVE_SNAPSHOT DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, struct vm_snapshot_meta *meta)) DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) #endif #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() SDT_PROVIDER_DEFINE(vmm); static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | 
CTLFLAG_MPSAFE, NULL, NULL); /* * Halt the guest if all vcpus are executing a HLT instruction with * interrupts disabled. */ static int halt_detection_enabled = 1; SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static int trace_guest_exceptions; SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); static int trap_wbinvd; SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0, "WBINVD triggers a VM-exit"); u_int vm_maxcpu; SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &vm_maxcpu, 0, "Maximum number of vCPUs"); static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); /* * Upper limit on vm_maxcpu. Limited by use of uint16_t types for CPU * counts as well as range of vpid values for VT-x and by the capacity * of cpuset_t masks. The call to new_unrhdr() in vpid_init() in * vmx.c requires 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below. */ #define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) { switch (state) { case VCPU_IDLE: return ("idle"); case VCPU_FROZEN: return ("frozen"); case VCPU_RUNNING: return ("running"); case VCPU_SLEEPING: return ("sleeping"); default: return ("unknown"); } } #endif static void vcpu_cleanup(struct vcpu *vcpu, bool destroy) { vmmops_vlapic_cleanup(vcpu->vlapic); vmmops_vcpu_cleanup(vcpu->cookie); vcpu->cookie = NULL; if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); free(vcpu, M_VM); } } static struct vcpu * vcpu_alloc(struct vm *vm, int vcpu_id) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = malloc(sizeof(*vcpu), M_VM, M_WAITOK | M_ZERO); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vm = vm; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); vcpu->tsc_offset = 0; return (vcpu); } static void vcpu_init(struct vcpu *vcpu) { vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid); vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie); vm_set_x2apic_state(vcpu, X2APIC_DISABLED); vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } int vcpu_trace_exceptions(struct vcpu *vcpu) { return (trace_guest_exceptions); } int vcpu_trap_wbinvd(struct vcpu *vcpu) { return (trap_wbinvd); } struct vm_exit * vm_exitinfo(struct vcpu *vcpu) { return (&vcpu->exitinfo); } +cpuset_t * +vm_exitinfo_cpuset(struct vcpu *vcpu) +{ + return (&vcpu->exitinfo_cpuset); +} + static int vmm_init(void) { int error; if (!vmm_is_hw_supported()) return (ENXIO); vm_maxcpu = mp_ncpus; TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); if (vm_maxcpu > VM_MAXCPU) { printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); vm_maxcpu = VM_MAXCPU; } if (vm_maxcpu == 0) vm_maxcpu = 1; vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(pti ? 
&IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) return (error); vmm_resume_p = vmmops_modresume; return (vmmops_modinit(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: if (vmm_is_hw_supported()) { vmmdev_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; } else { error = ENXIO; } break; case MOD_UNLOAD: if (vmm_is_hw_supported()) { error = vmmdev_cleanup(); if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) lapic_ipi_free(vmm_ipinum); error = vmmops_modcleanup(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } } else { error = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). */ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); if (create) vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); CPU_ZERO(&vm->startup_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); if (!create) { for (int i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_init(vm->vcpu[i]); } } } void vm_disable_vcpu_creation(struct vm *vm) { sx_xlock(&vm->vcpus_init_lock); vm->dying = true; sx_xunlock(&vm->vcpus_init_lock); } struct vcpu * vm_alloc_vcpu(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) return (NULL); vcpu = atomic_load_ptr(&vm->vcpu[vcpuid]); if (__predict_true(vcpu != NULL)) return (vcpu); sx_xlock(&vm->vcpus_init_lock); vcpu = vm->vcpu[vcpuid]; if (vcpu == NULL && !vm->dying) { vcpu = vcpu_alloc(vm, vcpuid); vcpu_init(vcpu); /* * Ensure vCPU is fully created before updating pointer * to permit unlocked reads above. */ atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid], (uintptr_t)vcpu); } sx_xunlock(&vm->vcpus_init_lock); return (vcpu); } void vm_slock_vcpus(struct vm *vm) { sx_slock(&vm->vcpus_init_lock); } void vm_unlock_vcpus(struct vm *vm) { sx_unlock(&vm->vcpus_init_lock); } /* * The default CPU topology is a single thread per package. */ u_int cores_per_package = 1; u_int threads_per_core = 1; int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. 
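/*
 * Illustrative userland analogue, not part of this patch, of the pattern
 * vm_alloc_vcpu() above uses: readers load the per-vCPU slot without the
 * lock; creators take the lock, re-check, fully construct the object and
 * only then publish the pointer with a release store, so an unlocked
 * reader can never observe a half-initialized vcpu.  All names here
 * (struct object, slot, get_or_create) are hypothetical.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct object { int ready; };

static _Atomic(struct object *) slot;   /* stands in for vm->vcpu[vcpuid] */
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

struct object *
get_or_create(void)
{
        /* Lock-free fast path, like the atomic_load_ptr() above. */
        struct object *o = atomic_load_explicit(&slot, memory_order_acquire);
        if (o != NULL)
                return (o);

        pthread_mutex_lock(&slot_lock);
        o = atomic_load_explicit(&slot, memory_order_relaxed);
        if (o == NULL) {
                o = malloc(sizeof(*o));
                if (o != NULL) {
                        o->ready = 1;   /* construct fully first ... */
                        /* ... then publish; the analogue of
                         * atomic_store_rel_ptr() above. */
                        atomic_store_explicit(&slot, o,
                            memory_order_release);
                }
        }
        pthread_mutex_unlock(&slot_lock);
        return (o);
}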
*/ if (!vmm_initialized) return (ENXIO); if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) == VM_MAX_NAMELEN + 1) return (EINVAL); vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); sx_init(&vm->mem_segs_lock, "vm mem_segs"); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->vcpu = malloc(sizeof(*vm->vcpu) * vm_maxcpu, M_VM, M_WAITOK | M_ZERO); vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ vm->threads = threads_per_core; /* XXX backwards compatibility */ vm->maxcpus = vm_maxcpu; vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus __unused) { /* Ignore maxcpus. */ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); vm->sockets = sockets; vm->cores = cores; vm->threads = threads; return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { struct mem_map *mm; int i; if (destroy) vm_xlock_memsegs(vm); ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); if (destroy) vrtc_cleanup(vm->vrtc); else vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_cleanup(vm->vcpu[i], destroy); } vmmops_cleanup(vm->cookie); /* * System memory is removed from the guest address space only when * the VM is destroyed. This is because the mapping remains the same * across VM reset. * * Device memory can be relocated by the guest (e.g. using PCI BARs) * so those mappings are removed on a VM reset. */ for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (destroy || !sysmem_mapping(vm, mm)) vm_free_memmap(vm, i); } if (destroy) { for (i = 0; i < VM_MAX_MEMSEGS; i++) vm_free_memseg(vm, i); vm_unlock_memsegs(vm); vmmops_vmspace_free(vm->vmspace); vm->vmspace = NULL; free(vm->vcpu, M_VM); sx_destroy(&vm->vcpus_init_lock); sx_destroy(&vm->mem_segs_lock); mtx_destroy(&vm->rendezvous_mtx); } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } void vm_slock_memsegs(struct vm *vm) { sx_slock(&vm->mem_segs_lock); } void vm_xlock_memsegs(struct vm *vm) { sx_xlock(&vm->mem_segs_lock); } void vm_unlock_memsegs(struct vm *vm) { sx_unlock(&vm->mem_segs_lock); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } /* * Return 'true' if 'gpa' is allocated in the guest address space. 
* * This function is called in the context of a running vcpu which acts as * an implicit lock on 'vm->mem_maps[]'. */ bool vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa) { struct vm *vm = vcpu->vm; struct mem_map *mm; int i; #ifdef INVARIANTS int hostcpu, state; state = vcpu_get_state(vcpu, &hostcpu); KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); #endif for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) return (true); /* 'gpa' is sysmem or devmem */ } if (ppt_is_mmio(vm, gpa)) return (true); /* 'gpa' is pci passthru mmio */ return (false); } int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { struct mem_seg *seg; vm_object_t obj; sx_assert(&vm->mem_segs_lock, SX_XLOCKED); if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); if (len == 0 || (len & PAGE_MASK)) return (EINVAL); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { if (seg->len == len && seg->sysmem == sysmem) return (EEXIST); else return (EINVAL); } obj = vm_object_allocate(OBJT_SWAP, len >> PAGE_SHIFT); if (obj == NULL) return (ENOMEM); seg->len = len; seg->object = obj; seg->sysmem = sysmem; return (0); } int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, vm_object_t *objptr) { struct mem_seg *seg; sx_assert(&vm->mem_segs_lock, SX_LOCKED); if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[ident]; if (len) *len = seg->len; if (sysmem) *sysmem = seg->sysmem; if (objptr) *objptr = seg->object; return (0); } void vm_free_memseg(struct vm *vm, int ident) { struct mem_seg *seg; KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, ("%s: invalid memseg ident %d", __func__, ident)); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { vm_object_deallocate(seg->object); bzero(seg, sizeof(struct mem_seg)); } } int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, size_t len, int prot, int flags) { struct mem_seg *seg; struct mem_map *m, *map; vm_ooffset_t last; int i, error; if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) return (EINVAL); if (flags & ~VM_MEMMAP_F_WIRED) return (EINVAL); if (segid < 0 || segid >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[segid]; if (seg->object == NULL) return (EINVAL); last = first + len; if (first < 0 || first >= last || last > seg->len) return (EINVAL); if ((gpa | first | last) & PAGE_MASK) return (EINVAL); map = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->len == 0) { map = m; break; } } if (map == NULL) return (ENOSPC); error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, len, 0, VMFS_NO_SPACE, prot, prot, 0); if (error != KERN_SUCCESS) return (EFAULT); vm_object_reference(seg->object); if (flags & VM_MEMMAP_F_WIRED) { error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); return (error == KERN_RESOURCE_SHORTAGE ? 
ENOMEM : EFAULT); } } map->gpa = gpa; map->len = len; map->segoff = first; map->segid = segid; map->prot = prot; map->flags = flags; return (0); } int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len) { struct mem_map *m; int i; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->gpa == gpa && m->len == len && (m->flags & VM_MEMMAP_F_IOMMU) == 0) { vm_free_memmap(vm, i); return (0); } } return (EINVAL); } int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct mem_map *mm, *mmnext; int i; mmnext = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len == 0 || mm->gpa < *gpa) continue; if (mmnext == NULL || mm->gpa < mmnext->gpa) mmnext = mm; } if (mmnext != NULL) { *gpa = mmnext->gpa; if (segid) *segid = mmnext->segid; if (segoff) *segoff = mmnext->segoff; if (len) *len = mmnext->len; if (prot) *prot = mmnext->prot; if (flags) *flags = mmnext->flags; return (0); } else { return (ENOENT); } } static void vm_free_memmap(struct vm *vm, int ident) { struct mem_map *mm; int error __diagused; mm = &vm->mem_maps[ident]; if (mm->len) { error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, mm->gpa + mm->len); KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", __func__, error)); bzero(mm, sizeof(struct mem_map)); } } static __inline bool sysmem_mapping(struct vm *vm, struct mem_map *mm) { if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) return (true); else return (false); } vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm) { struct mem_map *mm; vm_paddr_t maxaddr; int i; maxaddr = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (sysmem_mapping(vm, mm)) { if (maxaddr < mm->gpa + mm->len) maxaddr = mm->gpa + mm->len; } } return (maxaddr); } static void vm_iommu_modify(struct vm *vm, bool map) { int i, sz; vm_paddr_t gpa, hpa; struct mem_map *mm; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (!sysmem_mapping(vm, mm)) continue; if (map) { KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, ("iommu map found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) continue; mm->flags |= VM_MEMMAP_F_IOMMU; } else { if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) continue; mm->flags &= ~VM_MEMMAP_F_IOMMU; KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, ("iommu unmap found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); } gpa = mm->gpa; while (gpa < mm->gpa + mm->len) { vp = vm_gpa_hold_global(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); vm_gpa_release(cookie); hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); } else { iommu_remove_mapping(vm->iommu, gpa, sz); } gpa += PAGE_SIZE; } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. 
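/*
 * Illustrative sketch, not part of this patch and not compiled anywhere:
 * how the two-level memory model above fits together.  vm_alloc_memseg()
 * creates a backing VM object (a mem_seg); vm_mmap_memseg() installs a
 * page-aligned slice of that segment into the guest physical address
 * space (a mem_map).  map_lowmem_sketch() is a hypothetical caller, the
 * segment id, size and guest address are made up, and error handling is
 * kept minimal.
 */
static int
map_lowmem_sketch(struct vm *vm)
{
        const int segid = 0;
        const size_t len = 256 * 1024 * 1024;   /* 256 MB, page aligned */
        int error;

        vm_xlock_memsegs(vm);
        error = vm_alloc_memseg(vm, segid, len, true);  /* sysmem segment */
        if (error == 0) {
                /* Map the whole segment at guest physical address 0. */
                error = vm_mmap_memseg(vm, 0, segid, 0, len,
                    VM_PROT_ALL, 0);
        }
        vm_unlock_memsegs(vm);
        return (error);
}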
*/ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false) #define vm_iommu_map(vm) vm_iommu_modify((vm), true) int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) vm_iommu_unmap(vm); return (0); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vmm_sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); return (error); } static void * _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { int i, count, pageoff; struct mem_map *mm; vm_page_t m; pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); count = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) { count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); break; } } if (count == 1) { *cookie = m; return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); } else { *cookie = NULL; return (NULL); } } void * vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { #ifdef INVARIANTS /* * The current vcpu should be frozen to ensure 'vm_memmap[]' * stability. */ int state = vcpu_get_state(vcpu, NULL); KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", __func__, state)); #endif return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie)); } void * vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { sx_assert(&vm->mem_segs_lock, SX_LOCKED); return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie)); } void vm_gpa_release(void *cookie) { vm_page_t m = cookie; vm_page_unwire(m, PQ_ACTIVE); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) { if (reg >= VM_REG_LAST) return (EINVAL); return (vmmops_getreg(vcpu->cookie, reg, retval)); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; if (reg >= VM_REG_LAST) return (EINVAL); error = vmmops_setreg(vcpu->cookie, reg, val); if (error || reg != VM_REG_GUEST_RIP) return (error); /* Set 'nextrip' to match the value of %rip */ VMM_CTR1(vcpu, "Setting nextrip to %#lx", val); vcpu->nextrip = val; return (0); } static bool is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (true); default: return (false); } } static bool is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (true); default: return (false); } } int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) { if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_getdesc(vcpu->cookie, reg, desc)); } int vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) { if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (vmmops_setdesc(vcpu->cookie, reg, desc)); } static 
void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu->reqidle = 1; vcpu_notify_event_locked(vcpu, false); VMM_CTR1(vcpu, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); VMM_CTR2(vcpu, "vcpu state changed from %s to %s", vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } static int vm_handle_rendezvous(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; struct thread *td; int error, vcpuid; error = 0; vcpuid = vcpu->vcpuid; td = curthread; mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus); if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VMM_CTR0(vcpu, "Calling rendezvous func"); (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, 
&vm->rendezvous_done_cpus) == 0) { VMM_CTR0(vcpu, "Rendezvous completed"); CPU_ZERO(&vm->rendezvous_req_cpus); vm->rendezvous_func = NULL; wakeup(&vm->rendezvous_func); break; } VMM_CTR0(vcpu, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if (td_ast_pending(td, TDA_SUSPEND)) { mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) return (error); mtx_lock(&vm->rendezvous_mtx); } } mtx_unlock(&vm->rendezvous_mtx); return (0); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vcpu *vcpu, bool intr_disabled, bool *retu) { struct vm *vm = vcpu->vm; const char *wmesg; struct thread *td; int error, t, vcpuid, vcpu_halted, vm_halted; vcpuid = vcpu->vcpuid; vcpu_halted = 0; vm_halted = 0; error = 0; td = curthread; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from vmmops_run() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) break; if (vm_nmi_pending(vcpu)) break; if (!intr_disabled) { if (vm_extint_pending(vcpu) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vcpu)) break; if (vcpu_debugged(vcpu)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; VMM_CTR0(vcpu, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vcpu, VCPU_IDLE_TICKS, ticks - t); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); if (error != 0) { if (vcpu_halted) { CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); } return (error); } vcpu_lock(vcpu); } } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int vm_handle_paging(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int rv, ftype; struct vm_map *map; struct vm_exit *vme; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) { VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx", ftype == VM_PROT_READ ? 
"accessed" : "dirty", vme->u.paging.gpa); goto done; } } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: return (0); } static int vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) { struct vie *vie; struct vm_exit *vme; uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error, fault; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; VMM_CTR1(vcpu, "inst_emul fault accessing gpa %#lx", gpa); /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { error = vmm_fetch_instruction(vcpu, paging, vme->rip + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ error = fault = 0; } if (error || fault) return (error); if (vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d, vie) != 0) { VMM_CTR1(vcpu, "Error decoding instruction at %#lx", vme->rip + cs_base); *retu = true; /* dump instruction bytes in userspace */ return (0); } /* * Update 'nextrip' based on the length of the emulated instruction. */ vme->inst_length = vie->num_processed; vcpu->nextrip += vie->num_processed; VMM_CTR1(vcpu, "nextrip updated to %#lx after instruction decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vcpu, gpa, vie, paging, mread, mwrite, retu); return (error); } static int vm_handle_suspend(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int error, i; struct thread *td; error = 0; td = curthread; CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { VMM_CTR0(vcpu, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { VMM_CTR0(vcpu, "Sleeping during suspend"); vcpu_require_state_locked(vcpu, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } else { VMM_CTR0(vcpu, "Rendezvous during suspend"); vcpu_unlock(vcpu); error = vm_handle_rendezvous(vcpu); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. 
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm_vcpu(vm, i), false); } } *retu = true; return (error); } static int vm_handle_reqidle(struct vcpu *vcpu, bool *retu) { vcpu_lock(vcpu); KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); vcpu->reqidle = 0; vcpu_unlock(vcpu); *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i), false); } return (0); } void vm_exit_suspended(struct vcpu *vcpu, uint64_t rip) { struct vm *vm = vcpu->vm; struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_DEBUG; } void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vcpu, VMEXIT_RENDEZVOUS, 1); } void vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_REQIDLE; vmm_stat_incr(vcpu, VMEXIT_REQIDLE, 1); } void vm_exit_astpending(struct vcpu *vcpu, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vcpu, VMEXIT_ASTPENDING, 1); } int -vm_run(struct vcpu *vcpu, struct vm_exit *vme_user) +vm_run(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; struct vm_eventinfo evinfo; int error, vcpuid; struct pcb *pcb; uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; vcpuid = vcpu->vcpuid; if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_req_cpus; evinfo.sptr = &vm->suspend; evinfo.iptr = &vcpu->reqidle; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_fpustate(vcpu); vcpu_require_state(vcpu, VCPU_RUNNING); error = vmmops_run(vcpu->cookie, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vcpu, VCPU_FROZEN); save_guest_fpustate(vcpu); vmm_stat_incr(vcpu, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_REQIDLE: error = vm_handle_reqidle(vcpu, &retu); break; case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vcpu, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vme->u.ioapic_eoi.vector); break; 
case VM_EXITCODE_RENDEZVOUS: error = vm_handle_rendezvous(vcpu); break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vcpu, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vcpu, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vcpu, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vcpu, vme, &retu); break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: case VM_EXITCODE_VMINSN: vm_inject_ud(vcpu); break; default: retu = true; /* handled in userland */ break; } } /* * VM_EXITCODE_INST_EMUL could access the apic which could transform the * exit code into VM_EXITCODE_IPI. */ if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) error = vm_handle_ipi(vcpu, vme, &retu); if (error == 0 && retu == false) goto restart; vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1); VMM_CTR2(vcpu, "retu %d/%d", error, vme->exitcode); - /* copy the exit information */ - *vme_user = *vme; return (error); } int vm_restart_instruction(struct vcpu *vcpu) { enum vcpu_state state; uint64_t rip; int error __diagused; state = vcpu_get_state(vcpu, NULL); if (state == VCPU_RUNNING) { /* * When a vcpu is "running" the next instruction is determined * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. * Thus setting 'inst_length' to zero will cause the current * instruction to be restarted. */ vcpu->exitinfo.inst_length = 0; VMM_CTR1(vcpu, "restarting instruction at %#lx by " "setting inst_length to zero", vcpu->exitinfo.rip); } else if (state == VCPU_FROZEN) { /* * When a vcpu is "frozen" it is outside the critical section * around vmmops_run() and 'nextrip' points to the next * instruction. Thus instruction restart is achieved by setting * 'nextrip' to the vcpu's %rip. */ error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip); KASSERT(!error, ("%s: error %d getting rip", __func__, error)); VMM_CTR2(vcpu, "restarting instruction by updating " "nextrip from %#lx to %#lx", vcpu->nextrip, rip); vcpu->nextrip = rip; } else { panic("%s: invalid state %d", __func__, state); } return (0); } int vm_exit_intinfo(struct vcpu *vcpu, uint64_t info) { int type, vector; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VMM_CTR2(vcpu, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. 
*/ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vcpu *vcpu, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). */ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VMM_CTR2(vcpu, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vcpu->vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. */ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); } int vm_entry_intinfo(struct vcpu *vcpu, uint64_t *retinfo) { uint64_t info1, info2; int valid; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VMM_CTR2(vcpu, "Exception %d delivered: %#lx", vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vcpu, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VMM_CTR4(vcpu, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) { *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { uint64_t regval; int error __diagused; if (vector < 0 || vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. */ if (vector == IDT_DF) return (EINVAL); if (vcpu->exception_pending) { VMM_CTR2(vcpu, "Unable to inject exception %d due to " "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } if (errcode_valid) { /* * Exceptions don't deliver an error code in real mode. 
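/*
 * Illustrative standalone sketch, not part of this patch, of the outcome
 * of exception_class()/nested_fault() above (Intel SDM Table 6-5): two
 * contributory exceptions, or a page fault followed by anything that is
 * not benign, collapse into #DF; any pairing involving a benign event is
 * delivered serially.  Vector numbers are the usual x86 assignments; the
 * helper names (classify, becomes_double_fault) are hypothetical.
 */
#include <assert.h>
#include <stdbool.h>

enum exc_class_sketch { BENIGN, CONTRIBUTORY, PAGEFAULT };

/* Same buckets as exception_class() above, keyed only by vector here. */
static enum exc_class_sketch
classify(int vector)
{
        switch (vector) {
        case 14: case 20:               /* #PF, #VE */
                return (PAGEFAULT);
        case 0: case 10: case 11: case 12: case 13:
                return (CONTRIBUTORY);  /* #DE, #TS, #NP, #SS, #GP */
        default:
                return (BENIGN);        /* incl. interrupts and NMI */
        }
}

/* 'first' is the event being delivered, 'second' the new exception. */
static bool
becomes_double_fault(int first, int second)
{
        enum exc_class_sketch c1 = classify(first), c2 = classify(second);

        return ((c1 == CONTRIBUTORY && c2 == CONTRIBUTORY) ||
            (c1 == PAGEFAULT && c2 != BENIGN));
}

int
main(void)
{
        assert(becomes_double_fault(13, 13));   /* #GP while delivering #GP */
        assert(becomes_double_fault(14, 13));   /* #GP while delivering #PF */
        assert(!becomes_double_fault(13, 14));  /* #PF during #GP: serial */
        assert(!becomes_double_fault(2, 14));   /* NMI then #PF: serial */
        return (0);
}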
*/ error = vm_get_register(vcpu, VM_REG_GUEST_CR0, &regval); KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); if (!(regval & CR0_PE)) errcode_valid = 0; } /* * From section 26.6.1 "Interruptibility State" in Intel SDM: * * Event blocking by "STI" or "MOV SS" is cleared after guest executes * one instruction or incurs an exception. */ error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, 0); KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", __func__, error)); if (restart_instruction) vm_restart_instruction(vcpu); vcpu->exception_pending = 1; vcpu->exc_vector = vector; vcpu->exc_errcode = errcode; vcpu->exc_errcode_valid = errcode_valid; VMM_CTR1(vcpu, "Exception %d pending", vector); return (0); } void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error __diagused, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); } void vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2) { int error __diagused; VMM_CTR2(vcpu, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vcpu, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vcpu *vcpu) { vcpu->nmi_pending = 1; vcpu_notify_event(vcpu, false); return (0); } int vm_nmi_pending(struct vcpu *vcpu) { return (vcpu->nmi_pending); } void vm_nmi_clear(struct vcpu *vcpu) { if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vcpu, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int vm_inject_extint(struct vcpu *vcpu) { vcpu->extint_pending = 1; vcpu_notify_event(vcpu, false); return (0); } int vm_extint_pending(struct vcpu *vcpu) { return (vcpu->extint_pending); } void vm_extint_clear(struct vcpu *vcpu) { if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vcpu, VCPU_EXTINT_COUNT, 1); } int vm_get_capability(struct vcpu *vcpu, int type, int *retval) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_getcap(vcpu->cookie, type, retval)); } int vm_set_capability(struct vcpu *vcpu, int type, int val) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_setcap(vcpu->cookie, type, val)); } struct vm * vcpu_vm(struct vcpu *vcpu) { return (vcpu->vm); } int vcpu_vcpuid(struct vcpu *vcpu) { return (vcpu->vcpuid); } struct vcpu * vm_vcpu(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid]); } struct vlapic * vm_lapic(struct vcpu *vcpu) { return (vcpu->vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } bool vmm_is_pptdev(int bus, int slot, int func) { int b, f, i, n, s; char *val, *cp, *cp2; bool found; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck!
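/*
 * An illustrative /boot/loader.conf fragment, not from this patch (the
 * PCI addresses are made up), showing how the scan above lets a long
 * passthru list be split across the three variables, each entry in the
 * bus/slot/func form matched by the sscanf("%d/%d/%d") below:
 *
 *      pptdevs="1/0/0 2/0/0 3/0/0 4/0/0"
 *      pptdevs2="5/0/0 6/0/0"
 *      pptdevs3="7/0/0"
 *
 * With that configuration vmm_is_pptdev(6, 0, 0) finds its match in
 * "pptdevs2" and returns true; a device listed nowhere returns false.
 */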
*/ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = false; for (i = 0; names[i] != NULL && !found; i++) { cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = true; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_lock(vcpu); error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu) { enum vcpu_state state; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_activate_cpu(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EBUSY); VMM_CTR0(vcpu, "activated"); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); return (0); } int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { vm->debug_cpus = vm->active_cpus; for (int i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i), false); } } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); vcpu_notify_event(vcpu, false); } return (0); } int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { CPU_ZERO(&vm->debug_cpus); } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) return (EINVAL); CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vcpu *vcpu) { return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } /* * Returns the subset of vCPUs in tostart that are awaiting startup. * These vCPUs are also marked as no longer awaiting startup. */ cpuset_t vm_start_cpus(struct vm *vm, const cpuset_t *tostart) { cpuset_t set; mtx_lock(&vm->rendezvous_mtx); CPU_AND(&set, &vm->startup_cpus, tostart); CPU_ANDNOT(&vm->startup_cpus, &vm->startup_cpus, &set); mtx_unlock(&vm->rendezvous_mtx); return (set); } void vm_await_start(struct vm *vm, const cpuset_t *waiting) { mtx_lock(&vm->rendezvous_mtx); CPU_OR(&vm->startup_cpus, &vm->startup_cpus, waiting); mtx_unlock(&vm->rendezvous_mtx); } void * vcpu_stats(struct vcpu *vcpu) { return (vcpu->stats); } int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state) { *state = vcpu->x2apic_state; return (0); } int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { if (state >= X2APIC_STATE_LAST) return (EINVAL); vcpu->x2apic_state = state; vlapic_set_x2apic_state(vcpu, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. 
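 *
 * A typical caller (editorial sketch, mirroring the NMI path above) queues
 * the event before notifying:
 *
 *	vcpu->nmi_pending = 1;
 *	vcpu_notify_event(vcpu, false);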
*/ static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) { vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } struct vmspace * vm_get_vmspace(struct vm *vm) { return (vm->vmspace); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { struct vm *vm = vcpu->vm; int error, i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpu' is one * of the targets of the rendezvous. */ VMM_CTR0(vcpu, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); error = vm_handle_rendezvous(vcpu); if (error != 0) return (error); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); VMM_CTR0(vcpu, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm->rendezvous_func = func; mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. 
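	 *
	 * (Editorial sketch of the caller side, with a hypothetical
	 * callback: an initiator targeting every active vCPU would issue
	 *
	 *	error = vm_smp_rendezvous(vcpu, vm_active_cpus(vm),
	 *	    my_rendezvous_cb, arg);
	 *
	 * and the loop below then kicks each target so it enters the
	 * handler promptly.)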
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &dest)) vcpu_notify_event(vm_vcpu(vm, i), false); } return (vm_handle_rendezvous(vcpu)); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } struct vpmtmr * vm_pmtmr(struct vm *vm) { return (vm->vpmtmr); } struct vrtc * vm_rtc(struct vm *vm) { return (vm->vrtc); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vcpu, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { vm_copy_teardown(copyinfo, num_copyinfo); return (EFAULT); } else { *fault = 0; return (0); } } void vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. 
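 * (Editorial note: both figures are reported in bytes; the fetch functions
 * below multiply the vmspace page counts by PAGE_SIZE.)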
Since * these are global stats, only return the values for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat) { if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vcpu->vm->vmspace)); } } static void vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat) { if (vcpu->vcpuid == 0) { vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE * pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); #ifdef BHYVE_SNAPSHOT static int vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta) { uint64_t tsc, now; int ret; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm->vcpu[i]; if (vcpu == NULL) continue; SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); /* * Save the absolute TSC value by adding now to tsc_offset. * * It will be turned back into an actual offset when the * TSC restore function is called. */ tsc = now + vcpu->tsc_offset; SNAPSHOT_VAR_OR_LEAVE(tsc, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) vcpu->tsc_offset = tsc; } done: return (ret); } static int vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta) { int ret; ret = vm_snapshot_vcpus(vm, meta); if (ret != 0) goto done; SNAPSHOT_VAR_OR_LEAVE(vm->startup_cpus, meta, ret, done); done: return (ret); } static int vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta) { int error; struct vcpu *vcpu; uint16_t i, maxcpus; error = 0; maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm->vcpu[i]; if (vcpu == NULL) continue; error = vmmops_vcpu_snapshot(vcpu->cookie, meta); if (error != 0) { printf("%s: failed to snapshot vmcs/vmcb data for " "vCPU: %d; error: %d\n", __func__, i, error); goto done; } } done: return (error); } /* * Save kernel-side structures to user-space for snapshotting. 
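 *
 * (Editorial sketch, not part of this change: userland drives this path
 * through the VM_SNAPSHOT_REQ ioctl, with meta->dev_req selecting one of
 * the cases below, e.g.
 *
 *	meta->dev_req = STRUCT_VLAPIC;
 *	ioctl(vmfd, VM_SNAPSHOT_REQ, meta);
 *
 * where 'vmfd' is an assumed open /dev/vmm/<name> descriptor and 'meta'
 * a prepared struct vm_snapshot_meta.)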
*/ int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) { int ret = 0; switch (meta->dev_req) { case STRUCT_VMCX: ret = vm_snapshot_vcpu(vm, meta); break; case STRUCT_VM: ret = vm_snapshot_vm(vm, meta); break; case STRUCT_VIOAPIC: ret = vioapic_snapshot(vm_ioapic(vm), meta); break; case STRUCT_VLAPIC: ret = vlapic_snapshot(vm, meta); break; case STRUCT_VHPET: ret = vhpet_snapshot(vm_hpet(vm), meta); break; case STRUCT_VATPIC: ret = vatpic_snapshot(vm_atpic(vm), meta); break; case STRUCT_VATPIT: ret = vatpit_snapshot(vm_atpit(vm), meta); break; case STRUCT_VPMTMR: ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); break; case STRUCT_VRTC: ret = vrtc_snapshot(vm_rtc(vm), meta); break; default: printf("%s: failed to find the requested type %#x\n", __func__, meta->dev_req); ret = (EINVAL); } return (ret); } void vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset) { vcpu->tsc_offset = offset; } int vm_restore_time(struct vm *vm) { int error; uint64_t now; struct vcpu *vcpu; uint16_t i, maxcpus; now = rdtsc(); error = vhpet_restore_time(vm_hpet(vm)); if (error) return (error); maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm->vcpu[i]; if (vcpu == NULL) continue; error = vmmops_restore_tsc(vcpu->cookie, vcpu->tsc_offset - now); if (error) return (error); } return (0); } #endif diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 85b5bac9efe1..f9fcb32d8e28 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -1,1363 +1,1453 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" #include "io/ppt.h" #include "io/vatpic.h" #include "io/vioapic.h" #include "io/vhpet.h" #include "io/vrtc.h" #ifdef COMPAT_FREEBSD13 struct vm_stats_old { int cpuid; /* in */ int num_entries; /* out */ struct timeval tv; uint64_t statbuf[MAX_VM_STATS]; }; #define VM_STATS_OLD \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old) struct vm_snapshot_meta_old { void *ctx; /* unused */ void *dev_data; const char *dev_name; /* identify userspace devices */ enum snapshot_req dev_req; /* identify kernel structs */ struct vm_snapshot_buffer buffer; enum vm_snapshot_op op; }; #define VM_SNAPSHOT_REQ_OLD \ _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta_old) -#endif + +struct vm_exit_ipi_13 { + uint32_t mode; + uint8_t vector; + __BITSET_DEFINE(, 256) dmask; +}; + +struct vm_exit_13 { + uint32_t exitcode; + int32_t inst_length; + uint64_t rip; + uint64_t u[120 / sizeof(uint64_t)]; +}; + +struct vm_run_13 { + int cpuid; + struct vm_exit_13 vm_exit; +}; + +#define VM_RUN_13 \ + _IOWR('v', IOCNUM_RUN, struct vm_run_13) + +#endif /* COMPAT_FREEBSD13 */ struct devmem_softc { int segid; char *name; struct cdev *cdev; struct vmmdev_softc *sc; SLIST_ENTRY(devmem_softc) link; }; struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; struct ucred *ucred; SLIST_ENTRY(vmmdev_softc) link; SLIST_HEAD(, devmem_softc) devmem; int flags; }; #define VSC_LINKED 0x01 static SLIST_HEAD(, vmmdev_softc) head; static unsigned pr_allow_flag; static struct mtx vmmdev_mtx; MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF); static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); static int vmm_priv_check(struct ucred *ucred); static int devmem_create_cdev(const char *vmname, int id, char *devmem); static void devmem_destroy(void *arg); static int vmm_priv_check(struct ucred *ucred) { if (jailed(ucred) && !(ucred->cr_prison->pr_allow & pr_allow_flag)) return (EPERM); return (0); } static int vcpu_lock_one(struct vcpu *vcpu) { return (vcpu_set_state(vcpu, VCPU_FROZEN, true)); } static void vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid, struct vcpu *vcpu) { enum vcpu_state state; state = vcpu_get_state(vcpu, NULL); if (state != VCPU_FROZEN) { panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm), vcpuid, state); } vcpu_set_state(vcpu, VCPU_IDLE, false); } static int vcpu_lock_all(struct vmmdev_softc *sc) { struct vcpu *vcpu; int error; uint16_t i, j, maxcpus; vm_slock_vcpus(sc->vm); maxcpus = vm_get_maxcpus(sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = vm_vcpu(sc->vm, i); if (vcpu == NULL) continue; error = vcpu_lock_one(vcpu); if (error) break; } if (error) { for (j = 0; j < i; j++) { vcpu = vm_vcpu(sc->vm, j); if (vcpu == NULL) continue; vcpu_unlock_one(sc, j, vcpu); } vm_unlock_vcpus(sc->vm); } return (error); } static void vcpu_unlock_all(struct vmmdev_softc *sc) { struct vcpu *vcpu; uint16_t i, maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = vm_vcpu(sc->vm, i); if (vcpu == NULL) continue; vcpu_unlock_one(sc, i, vcpu); } vm_unlock_vcpus(sc->vm); } static struct vmmdev_softc * vmmdev_lookup(const char *name) { struct vmmdev_softc *sc; #ifdef notyet /* XXX 
kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } if (sc == NULL) return (NULL); if (cr_cansee(curthread->td_ucred, sc->ucred)) return (NULL); return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa, maxaddr; void *hpa, *cookie; struct vmmdev_softc *sc; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); /* * Get a read lock on the guest memory map. */ vm_slock_memsegs(sc->vm); prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); maxaddr = vmm_sysmem_maxaddr(sc->vm); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. * * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ && gpa < maxaddr) error = uiomove(__DECONST(void *, zero_region), c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } vm_unlock_memsegs(sc->vm); return (error); } CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1); static int get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { struct devmem_softc *dsc; int error; bool sysmem; error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); if (error || mseg->len == 0) return (error); if (!sysmem) { SLIST_FOREACH(dsc, &sc->devmem, link) { if (dsc->segid == mseg->segid) break; } KASSERT(dsc != NULL, ("%s: devmem segment %d not found", __func__, mseg->segid)); error = copystr(dsc->name, mseg->name, len, NULL); } else { bzero(mseg->name, len); } return (error); } static int alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { char *name; int error; bool sysmem; error = 0; name = NULL; sysmem = true; /* * The allocation is lengthened by 1 to hold a terminating NUL. It'll * by stripped off when devfs processes the full string. 
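	 *
	 * For example (hypothetical names, editorial sketch): a non-system
	 * segment created from userland roughly as
	 *
	 *	strlcpy(mseg.name, "bootrom", sizeof(mseg.name));
	 *	ioctl(vmfd, VM_ALLOC_MEMSEG, &mseg);
	 *
	 * (with segid and len filled in) is later exposed as
	 * /dev/vmm.io/<vmname>.bootrom by devmem_create_cdev().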
*/ if (VM_MEMSEG_NAME(mseg)) { sysmem = false; name = malloc(len, M_VMMDEV, M_WAITOK); error = copystr(mseg->name, name, len, NULL); if (error) goto done; } error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); if (error) goto done; if (VM_MEMSEG_NAME(mseg)) { error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); if (error) vm_free_memseg(sc->vm, mseg->segid); else name = NULL; /* freed when 'cdev' is destroyed */ } done: free(name, M_VMMDEV); return (error); } static int vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_get_register(vcpu, regnum[i], ®val[i]); if (error) break; } return (error); } static int vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_set_register(vcpu, regnum[i], regval[i]); if (error) break; } return (error); } static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error, vcpuid, size; cpuset_t *cpuset; struct vmmdev_softc *sc; struct vcpu *vcpu; struct vm_register *vmreg; struct vm_seg_desc *vmsegdesc; struct vm_register_set *vmregset; struct vm_run *vmrun; +#ifdef COMPAT_FREEBSD13 + struct vm_run_13 *vmrun_13; +#endif struct vm_exception *vmexc; struct vm_lapic_irq *vmirq; struct vm_lapic_msi *vmmsi; struct vm_ioapic_irq *ioapic_irq; struct vm_isa_irq *isa_irq; struct vm_isa_irq_trigger *isa_irq_trigger; struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; #ifdef COMPAT_FREEBSD13 struct vm_stats_old *vmstats_old; #endif struct vm_stats *vmstats; struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; struct vm_gpa_pte *gpapte; struct vm_suspend *vmsuspend; struct vm_gla2gpa *gg; struct vm_cpuset *vm_cpuset; struct vm_intinfo *vmii; struct vm_rtc_time *rtctime; struct vm_rtc_data *rtcdata; struct vm_memmap *mm; struct vm_munmap *mu; struct vm_cpu_topology *topology; struct vm_readwrite_kernemu_device *kernemu; uint64_t *regvals; int *regnums; enum { NONE, SINGLE, ALL } vcpus_locked; bool memsegs_locked; #ifdef BHYVE_SNAPSHOT struct vm_snapshot_meta *snapshot_meta; #ifdef COMPAT_FREEBSD13 struct vm_snapshot_meta_old *snapshot_old; #endif #endif error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); vcpuid = -1; vcpu = NULL; vcpus_locked = NONE; memsegs_locked = false; /* * For VMM ioctls that operate on a single vCPU, lookup the * vcpu. For VMM ioctls which require one or more vCPUs to * not be running, lock necessary vCPUs. * * XXX fragile, handle with care * Most of these assume that the first field of the ioctl data * is the vcpuid. */ switch (cmd) { case VM_RUN: +#ifdef COMPAT_FREEBSD13 + case VM_RUN_13: +#endif case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: case VM_GET_REGISTER_SET: case VM_SET_REGISTER_SET: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: case VM_GLA2GPA_NOFAULT: case VM_ACTIVATE_CPU: case VM_SET_INTINFO: case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: case VM_GET_KERNEMU_DEV: case VM_SET_KERNEMU_DEV: /* * ioctls that can operate only on vcpus that are not running. 
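	 *
	 * (Editorial example, assuming an open /dev/vmm/<name> descriptor
	 * 'vmfd' and the vm_register layout from vmm_dev.h: a request such
	 * as
	 *
	 *	struct vm_register vmreg = { .cpuid = 0,
	 *	    .regnum = VM_REG_GUEST_RIP, .regval = 0xfff0 };
	 *	ioctl(vmfd, VM_SET_REGISTER, &vmreg);
	 *
	 * therefore freezes vCPU 0 for the duration of the ioctl.)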
*/ vcpuid = *(int *)data; vcpu = vm_alloc_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto done; } error = vcpu_lock_one(vcpu); if (error) goto done; vcpus_locked = SINGLE; break; #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_FBSD12: #endif case VM_ALLOC_MEMSEG: case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: case VM_MMAP_MEMSEG: case VM_MUNMAP_MEMSEG: case VM_REINIT: /* * ioctls that modify the memory map must lock memory * segments exclusively. */ vm_xlock_memsegs(sc->vm); memsegs_locked = true; /* FALLTHROUGH */ case VM_MAP_PPTDEV_MMIO: case VM_UNMAP_PPTDEV_MMIO: #ifdef BHYVE_SNAPSHOT case VM_SNAPSHOT_REQ: #ifdef COMPAT_FREEBSD13 case VM_SNAPSHOT_REQ_OLD: #endif case VM_RESTORE_TIME: #endif /* * ioctls that operate on the entire virtual machine must * prevent all vcpus from running. */ error = vcpu_lock_all(sc); if (error) goto done; vcpus_locked = ALL; break; #ifdef COMPAT_FREEBSD12 case VM_GET_MEMSEG_FBSD12: #endif case VM_GET_MEMSEG: case VM_MMAP_GETNEXT: /* * Lock the memory map while it is being inspected. */ vm_slock_memsegs(sc->vm); memsegs_locked = true; break; #ifdef COMPAT_FREEBSD13 case VM_STATS_OLD: #endif case VM_STATS: case VM_INJECT_NMI: case VM_LAPIC_IRQ: case VM_GET_X2APIC_STATE: /* * These do not need the vCPU locked but do operate on * a specific vCPU. */ vcpuid = *(int *)data; vcpu = vm_alloc_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto done; } break; case VM_LAPIC_LOCAL_IRQ: case VM_SUSPEND_CPU: case VM_RESUME_CPU: /* * These can either operate on all CPUs via a vcpuid of * -1 or on a specific vCPU. */ vcpuid = *(int *)data; if (vcpuid == -1) break; vcpu = vm_alloc_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto done; } break; default: break; } - switch(cmd) { - case VM_RUN: + switch (cmd) { + case VM_RUN: { + struct vm_exit *vme; + vmrun = (struct vm_run *)data; - error = vm_run(vcpu, &vmrun->vm_exit); + vme = vm_exitinfo(vcpu); + + error = vm_run(vcpu); + if (error != 0) + break; + + error = copyout(vme, vmrun->vm_exit, sizeof(*vme)); + if (error != 0) + break; + if (vme->exitcode == VM_EXITCODE_IPI) { + error = copyout(vm_exitinfo_cpuset(vcpu), + vmrun->cpuset, + min(vmrun->cpusetsize, sizeof(cpuset_t))); + if (error != 0) + break; + if (sizeof(cpuset_t) < vmrun->cpusetsize) { + uint8_t *p; + + p = (uint8_t *)vmrun->cpuset + + sizeof(cpuset_t); + while (error == 0 && + p < (uint8_t *)vmrun->cpuset + + vmrun->cpusetsize) { + error = subyte(p++, 0); + } + } + } break; + } +#ifdef COMPAT_FREEBSD13 + case VM_RUN_13: { + struct vm_exit *vme; + struct vm_exit_13 *vme_13; + + vmrun_13 = (struct vm_run_13 *)data; + vme_13 = &vmrun_13->vm_exit; + vme = vm_exitinfo(vcpu); + + error = vm_run(vcpu); + if (error == 0) { + vme_13->exitcode = vme->exitcode; + vme_13->inst_length = vme->inst_length; + vme_13->rip = vme->rip; + memcpy(vme_13->u, &vme->u, sizeof(vme_13->u)); + if (vme->exitcode == VM_EXITCODE_IPI) { + struct vm_exit_ipi_13 *ipi; + cpuset_t *dmask; + int cpu; + + dmask = vm_exitinfo_cpuset(vcpu); + ipi = (struct vm_exit_ipi_13 *)&vme_13->u[0]; + BIT_ZERO(256, &ipi->dmask); + CPU_FOREACH_ISSET(cpu, dmask) { + if (cpu >= 256) + break; + BIT_SET(256, cpu, &ipi->dmask); + } + } + } + break; + } +#endif case VM_SUSPEND: vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; case VM_REINIT: error = vm_reinit(sc->vm); break; case VM_STAT_DESC: { statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } #ifdef 
COMPAT_FREEBSD13 case VM_STATS_OLD: vmstats_old = (struct vm_stats_old *)data; getmicrotime(&vmstats_old->tv); error = vmm_stat_copy(vcpu, 0, nitems(vmstats_old->statbuf), &vmstats_old->num_entries, vmstats_old->statbuf); break; #endif case VM_STATS: { vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(vcpu, vmstats->index, nitems(vmstats->statbuf), &vmstats->num_entries, vmstats->statbuf); break; } case VM_PPTDEV_MSI: pptmsi = (struct vm_pptdev_msi *)data; error = ppt_setup_msi(sc->vm, pptmsi->bus, pptmsi->slot, pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec); break; case VM_PPTDEV_MSIX: pptmsix = (struct vm_pptdev_msix *)data; error = ppt_setup_msix(sc->vm, pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg, pptmsix->vector_control); break; case VM_PPTDEV_DISABLE_MSIX: pptdev = (struct vm_pptdev *)data; error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); break; case VM_UNMAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len); break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_UNBIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_INJECT_EXCEPTION: vmexc = (struct vm_exception *)data; error = vm_inject_exception(vcpu, vmexc->vector, vmexc->error_code_valid, vmexc->error_code, vmexc->restart_instruction); break; case VM_INJECT_NMI: error = vm_inject_nmi(vcpu); break; case VM_LAPIC_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_intr_edge(vcpu, vmirq->vector); break; case VM_LAPIC_LOCAL_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_set_local_intr(sc->vm, vcpu, vmirq->vector); break; case VM_LAPIC_MSI: vmmsi = (struct vm_lapic_msi *)data; error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); break; case VM_IOAPIC_ASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_DEASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PULSE_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PINCOUNT: *(int *)data = vioapic_pincount(sc->vm); break; case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: { mem_region_write_t mwrite; mem_region_read_t mread; bool arg; kernemu = (void *)data; if (kernemu->access_width > 0) size = (1u << kernemu->access_width); else size = 1; if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { error = EINVAL; break; } if (cmd == VM_SET_KERNEMU_DEV) error = mwrite(vcpu, kernemu->gpa, kernemu->value, size, &arg); else error = mread(vcpu, kernemu->gpa, &kernemu->value, size, &arg); break; } case 
VM_ISA_ASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_assert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_DEASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_deassert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_PULSE_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_SET_IRQ_TRIGGER: isa_irq_trigger = (struct vm_isa_irq_trigger *)data; error = vatpic_set_irq_trigger(sc->vm, isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); break; case VM_MMAP_GETNEXT: mm = (struct vm_memmap *)data; error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; case VM_MMAP_MEMSEG: mm = (struct vm_memmap *)data; error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, mm->len, mm->prot, mm->flags); break; case VM_MUNMAP_MEMSEG: mu = (struct vm_munmap *)data; error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); break; #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_FBSD12: error = alloc_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg_fbsd12 *)0)->name)); break; #endif case VM_ALLOC_MEMSEG: error = alloc_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; #ifdef COMPAT_FREEBSD12 case VM_GET_MEMSEG_FBSD12: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg_fbsd12 *)0)->name)); break; #endif case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; case VM_GET_REGISTER: vmreg = (struct vm_register *)data; error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval); break; case VM_SET_REGISTER: vmreg = (struct vm_register *)data; error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval); break; case VM_SET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_set_seg_desc(vcpu, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_get_seg_desc(vcpu, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = vm_get_register_set(vcpu, vmregset->count, regnums, regvals); if (error == 0) error = copyout(regvals, vmregset->regvals, sizeof(regvals[0]) * vmregset->count); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_SET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = copyin(vmregset->regvals, regvals, sizeof(regvals[0]) * vmregset->count); if (error == 0) error = vm_set_register_set(vcpu, vmregset->count, regnums, regvals); 
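		/*
		 * (Editorial sketch of the userland side, assuming an open
		 * vm descriptor 'vmfd' and the vm_register_set layout from
		 * vmm_dev.h:
		 *
		 *	int nums[2] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX };
		 *	uint64_t vals[2] = { 0, 0 };
		 *	struct vm_register_set set = { .cpuid = 0, .count = 2,
		 *	    .regnums = nums, .regvals = vals };
		 *	ioctl(vmfd, VM_SET_REGISTER_SET, &set);
		 *
		 * The scratch arrays allocated above are freed below whether
		 * or not the copyin and register updates succeeded.)
		 */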
free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_GET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval); break; case VM_SET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval); break; case VM_SET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_set_x2apic_state(vcpu, x2apic->state); break; case VM_GET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_get_x2apic_state(vcpu, &x2apic->state); break; case VM_GET_GPA_PMAP: gpapte = (struct vm_gpa_pte *)data; pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), gpapte->gpa, gpapte->pte, &gpapte->ptenum); error = 0; break; case VM_GET_HPET_CAPABILITIES: error = vhpet_getcap((struct vm_hpet_cap *)data); break; case VM_GLA2GPA: { CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa(vcpu, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; } case VM_GLA2GPA_NOFAULT: gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; case VM_ACTIVATE_CPU: error = vm_activate_cpu(vcpu); break; case VM_GET_CPUS: error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else if (vm_cpuset->which == VM_DEBUG_CPUS) *cpuset = vm_debug_cpus(sc->vm); else error = EINVAL; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; case VM_SUSPEND_CPU: error = vm_suspend_cpu(sc->vm, vcpu); break; case VM_RESUME_CPU: error = vm_resume_cpu(sc->vm, vcpu); break; case VM_SET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_exit_intinfo(vcpu, vmii->info1); break; case VM_GET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2); break; case VM_RTC_WRITE: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_write(sc->vm, rtcdata->offset, rtcdata->value); break; case VM_RTC_READ: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_read(sc->vm, rtcdata->offset, &rtcdata->value); break; case VM_RTC_SETTIME: rtctime = (struct vm_rtc_time *)data; error = vrtc_set_time(sc->vm, rtctime->secs); break; case VM_RTC_GETTIME: error = 0; rtctime = (struct vm_rtc_time *)data; rtctime->secs = vrtc_get_time(sc->vm); break; case VM_RESTART_INSTRUCTION: error = vm_restart_instruction(vcpu); break; case VM_SET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; error = vm_set_topology(sc->vm, topology->sockets, topology->cores, topology->threads, topology->maxcpus); break; case VM_GET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; vm_get_topology(sc->vm, &topology->sockets, &topology->cores, &topology->threads, &topology->maxcpus); error = 0; break; #ifdef BHYVE_SNAPSHOT case VM_SNAPSHOT_REQ: snapshot_meta = (struct vm_snapshot_meta *)data; error = vm_snapshot_req(sc->vm, snapshot_meta); break; #ifdef COMPAT_FREEBSD13 case VM_SNAPSHOT_REQ_OLD: /* * The 
old structure just has an additional pointer at * the start that is ignored. */ snapshot_old = (struct vm_snapshot_meta_old *)data; snapshot_meta = (struct vm_snapshot_meta *)&snapshot_old->dev_data; error = vm_snapshot_req(sc->vm, snapshot_meta); break; #endif case VM_RESTORE_TIME: error = vm_restore_time(sc->vm); break; #endif default: error = ENOTTY; break; } if (vcpus_locked == SINGLE) vcpu_unlock_one(sc, vcpuid, vcpu); else if (vcpus_locked == ALL) vcpu_unlock_all(sc); if (memsegs_locked) vm_unlock_memsegs(sc->vm); done: /* * Make sure that no handler returns a kernel-internal * error value to userspace. */ KASSERT(error == ERESTART || error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, struct vm_object **objp, int nprot) { struct vmmdev_softc *sc; vm_paddr_t gpa; size_t len; vm_ooffset_t segoff, first, last; int error, found, segid; bool sysmem; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); first = *offset; last = first + mapsize; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); sc = vmmdev_lookup2(cdev); if (sc == NULL) { /* virtual machine is in the process of being created */ return (EINVAL); } /* * Get a read lock on the guest memory map. */ vm_slock_memsegs(sc->vm); gpa = 0; found = 0; while (!found) { error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, NULL, NULL); if (error) break; if (first >= gpa && last <= gpa + len) found = 1; else gpa += len; } if (found) { error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); KASSERT(error == 0 && *objp != NULL, ("%s: invalid memory segment %d", __func__, segid)); if (sysmem) { vm_object_reference(*objp); *offset = segoff + (first - gpa); } else { error = EINVAL; } } vm_unlock_memsegs(sc->vm); return (error); } static void vmmdev_destroy(void *arg) { struct vmmdev_softc *sc = arg; struct devmem_softc *dsc; int error __diagused; vm_disable_vcpu_creation(sc->vm); error = vcpu_lock_all(sc); KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); vm_unlock_vcpus(sc->vm); while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); SLIST_REMOVE_HEAD(&sc->devmem, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } if (sc->cdev != NULL) destroy_dev(sc->cdev); if (sc->vm != NULL) vm_destroy(sc->vm); if (sc->ucred != NULL) crfree(sc->ucred); if ((sc->flags & VSC_LINKED) != 0) { mtx_lock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); mtx_unlock(&vmmdev_mtx); } free(sc, M_VMMDEV); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL || sc->cdev == NULL) { mtx_unlock(&vmmdev_mtx); error = EINVAL; goto out; } /* * Setting 'sc->cdev' to NULL is used to indicate that the VM * is scheduled for destruction. */ cdev = sc->cdev; sc->cdev = NULL; mtx_unlock(&vmmdev_mtx); /* * Destroy all cdevs: * * - any new operations on the 'cdev' will return an error (ENXIO). 
* * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ SLIST_FOREACH(dsc, &sc->devmem, link) { KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); destroy_dev(dsc->cdev); devmem_destroy(dsc); } destroy_dev(cdev); vmmdev_destroy(sc); error = 0; out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", NULL); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { struct vm *vm; struct cdev *cdev; struct vmmdev_softc *sc, *sc2; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); mtx_unlock(&vmmdev_mtx); if (sc != NULL) { error = EEXIST; goto out; } error = vm_create(buf, &vm); if (error != 0) goto out; sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->ucred = crhold(curthread->td_ucred); sc->vm = vm; SLIST_INIT(&sc->devmem); /* * Lookup the name again just in case somebody sneaked in when we * dropped the lock. */ mtx_lock(&vmmdev_mtx); sc2 = vmmdev_lookup(buf); if (sc2 == NULL) { SLIST_INSERT_HEAD(&head, sc, link); sc->flags |= VSC_LINKED; } mtx_unlock(&vmmdev_mtx); if (sc2 != NULL) { vmmdev_destroy(sc); error = EEXIST; goto out; } error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); if (error != 0) { vmmdev_destroy(sc); goto out; } mtx_lock(&vmmdev_mtx); sc->cdev = cdev; sc->cdev->si_drv1 = sc; mtx_unlock(&vmmdev_mtx); out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", NULL); void vmmdev_init(void) { pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, "Allow use of vmm in a jail."); } int vmmdev_cleanup(void) { int error; if (SLIST_EMPTY(&head)) error = 0; else error = EBUSY; return (error); } static int devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, struct vm_object **objp, int nprot) { struct devmem_softc *dsc; vm_ooffset_t first, last; size_t seglen; int error; bool sysmem; dsc = cdev->si_drv1; if (dsc == NULL) { /* 'cdev' has been created but is not ready for use */ return (ENXIO); } first = *offset; last = *offset + len; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); vm_slock_memsegs(dsc->sc->vm); error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); KASSERT(error == 0 && !sysmem && *objp != NULL, ("%s: invalid devmem segment %d", __func__, dsc->segid)); if (seglen >= last) vm_object_reference(*objp); else error = EINVAL; vm_unlock_memsegs(dsc->sc->vm); return (error); } static struct cdevsw devmemsw = { .d_name = "devmem", .d_version = D_VERSION, .d_mmap_single = devmem_mmap_single, }; static int devmem_create_cdev(const char *vmname, int segid, char *devname) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; int error; error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); if (error) 
return (error); dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(vmname); KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); if (sc->cdev == NULL) { /* virtual machine is being created or destroyed */ mtx_unlock(&vmmdev_mtx); free(dsc, M_VMMDEV); destroy_dev_sched_cb(cdev, NULL, 0); return (ENODEV); } dsc->segid = segid; dsc->name = devname; dsc->cdev = cdev; dsc->sc = sc; SLIST_INSERT_HEAD(&sc->devmem, dsc, link); mtx_unlock(&vmmdev_mtx); /* The 'cdev' is ready for use after 'si_drv1' is initialized */ cdev->si_drv1 = dsc; return (0); } static void devmem_destroy(void *arg) { struct devmem_softc *dsc = arg; KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); dsc->cdev = NULL; dsc->sc = NULL; } diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 25d8937138e6..7be97ecedfdc 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1651 +1,1678 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "bootrom.h" #include "config.h" #include "inout.h" #include "debug.h" #include "e820.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "kernemu_dev.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "qemu_fwcfg.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", 
[EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", [EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; -typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_exit *); +typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_run *); int guest_ncpus; uint16_t cpu_cores, cpu_sockets, cpu_threads; int raw_stdio = 0; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); static struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; static struct vcpu_info { struct vmctx *ctx; struct vcpu *vcpu; int vcpuid; } *vcpu_info; static cpuset_t **vcpumap; static void usage(int code) { fprintf(stderr, "Usage: %s [-AaCDeHhPSuWwxY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" " -A: create ACPI tables\n" " -a: local apic is in xAPIC mode (deprecated)\n" " -C: include guest memory in core file\n" " -c: number of CPUs and/or topology specification\n" " -D: destroy on power-off\n" " -e: exit on unhandled I/O access\n" " -G: start a debug server\n" " -H: vmexit from the guest on HLT\n" " -h: help\n" " -k: key=value flat config file\n" " -K: PS2 keyboard layout\n" " -l: LPC device configuration\n" " -m: memory size\n" " -o: set config 'var' to 'value'\n" " -P: vmexit from the guest on pause\n" " -p: pin 'vcpu' to 'hostcpu'\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" #endif " -S: guest memory cannot be swapped\n" " -s: PCI slot config\n" " -U: UUID\n" " -u: RTC keeps UTC time\n" " -W: force virtio to use single-vector MSI\n" " -w: ignore unimplemented MSRs\n" " -x: local APIC is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. 
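 *
 * Illustrative examples (editorial, not exhaustive):
 *
 *	-c 4					4 vCPUs: 4 sockets x 1 core x 1 thread
 *	-c 4,sockets=1,cores=2,threads=2	4 vCPUs in one 2-core, 2-thread socket
 *	-c ""					1 vCPU (null specification)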
*/ static int topology_parse(const char *opt) { char *cp, *str, *tofree; if (*opt == '\0') { set_config_value("sockets", "1"); set_config_value("cores", "1"); set_config_value("threads", "1"); set_config_value("cpus", "1"); return (0); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) set_config_value("cpus", cp + strlen("cpus=")); else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) set_config_value("sockets", cp + strlen("sockets=")); else if (strncmp(cp, "cores=", strlen("cores=")) == 0) set_config_value("cores", cp + strlen("cores=")); else if (strncmp(cp, "threads=", strlen("threads=")) == 0) set_config_value("threads", cp + strlen("threads=")); #ifdef notyet /* Do not expose this until vmm.ko implements it */ else if (strncmp(cp, "maxcpus=", strlen("maxcpus=")) == 0) set_config_value("maxcpus", cp + strlen("maxcpus=")); #endif else if (strchr(cp, '=') != NULL) goto out; else set_config_value("cpus", cp); } free(tofree); return (0); out: free(tofree); return (-1); } static int parse_int_value(const char *key, const char *value, int minval, int maxval) { char *cp; long lval; errno = 0; lval = strtol(value, &cp, 0); if (errno != 0 || *cp != '\0' || cp == value || lval < minval || lval > maxval) errx(4, "Invalid value for %s: '%s'", key, value); return (lval); } /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. * * The limits of UINT16_MAX are due to the types passed to * vm_set_topology(). vmm.ko may enforce tighter limits. */ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. */ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } static int pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", value != NULL ? 
"," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { return (get_config_bool_default("virtio_msix", true)); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct vcpu_info *vi = param; int error; snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); pthread_set_name_np(pthread_self(), tname); if (vcpumap[vi->vcpuid] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vi->vcpuid]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vi->vcpuid); #endif gdb_cpu_add(vi->vcpu); vm_loop(vi->ctx, vi->vcpu); /* not reached */ exit(1); return (NULL); } static void fbsdrun_addcpu(struct vcpu_info *vi) { pthread_t thr; int error; error = vm_activate_cpu(vi->vcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); CPU_SET_ATOMIC(vi->vcpuid, &cpumask); vm_suspend_cpu(vi->vcpu); error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); assert(error == 0); } static int fbsdrun_deletecpu(int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_exit *vme __unused, uint32_t eax __unused) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int -vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme) +vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { + struct vm_exit *vme; int error; int bytes, port, in, out; + vme = vmrun->vm_exit; 
port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vcpu, vme, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vme->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int -vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) +vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, + struct vm_run *vmrun) { + struct vm_exit *vme; uint64_t val; uint32_t eax, edx; int error; + vme = vmrun->vm_exit; + val = 0; error = emulate_rdmsr(vcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int -vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) +vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, + struct vm_run *vmrun) { + struct vm_exit *vme; int error; + vme = vmrun->vm_exit; + error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int -vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme) +vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { + struct vm_exit *vme; + + vme = vmrun->vm_exit; fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason, vmexit_vmx_desc(vme->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vme->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(vcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } 
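/*
 * Illustrative sketch, not part of this patch: the conversions above all
 * follow one shape -- each handler now receives a wrapper (struct vm_run)
 * carrying the vm_exit pointer plus a caller-provided cpuset buffer,
 * rather than a bare struct vm_exit, and is dispatched from a table keyed
 * by exit code (see handler[] and vm_loop() further below).  The minimal
 * standalone model that follows uses made-up names (run_info, handle_foo,
 * exit_handler_t) and is meant to be compiled on its own.
 */
#include <assert.h>
#include <stdio.h>

enum exitcode { EXIT_FOO, EXIT_BAR, EXIT_MAX };

struct run_info {
	enum exitcode code;	/* stands in for vm_exit->exitcode */
	unsigned long cpuset;	/* caller-provided scratch buffer */
};

typedef int (*exit_handler_t)(struct run_info *);

static int
handle_foo(struct run_info *ri)
{
	printf("foo exit, scratch cpuset at %p\n", (void *)&ri->cpuset);
	return (0);
}

static int
handle_bar(struct run_info *ri)
{
	(void)ri;		/* nothing to do for this exit type */
	return (0);
}

static exit_handler_t table[EXIT_MAX] = {
	[EXIT_FOO] = handle_foo,
	[EXIT_BAR] = handle_bar,
};

int
main(void)
{
	struct run_info ri = { .code = EXIT_FOO, .cpuset = 0 };

	assert(table[ri.code] != NULL);
	return (table[ri.code](&ri));
}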
static int -vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) +vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { + struct vm_exit *vme; + + vme = vmrun->vm_exit; fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, - struct vm_exit *vme) + struct vm_run *vmrun) { - - assert(vme->inst_length == 0); + assert(vmrun->vm_exit->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, - struct vm_exit *vme) + struct vm_run *vmrun) { - - assert(vme->inst_length == 0); + assert(vmrun->vm_exit->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, - struct vm_exit *vme __unused) + struct vm_run *vmrun __unused) { - stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, - struct vm_exit *vme __unused) + struct vm_run *vmrun __unused) { - stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu, - struct vm_exit *vme) + struct vm_run *vmrun) { - - assert(vme->inst_length == 0); + assert(vmrun->vm_exit->inst_length == 0); stats.vmexit_mtrap++; #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_mtrap(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, - struct vm_exit *vme) + struct vm_run *vmrun) { - int err, i, cs_d; + struct vm_exit *vme; struct vie *vie; + int err, i, cs_d; enum vm_cpu_mode mode; + vme = vmrun->vm_exit; + stats.vmexit_inst_emul++; vie = &vme->u.inst_emul.vie; if (!vie->decoded) { /* * Attempt to decode in userspace as a fallback. This allows * updating instruction decode in bhyve without rebooting the * kernel (rapid prototyping), albeit with much slower * emulation. 
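 *
 * The fallback below resets the partially decoded instruction with
 * vie_restart(), runs the userspace decoder, and advances %rip past the
 * decoded bytes before handing the access to emulate_mem(); if any of
 * those steps fails, the raw instruction bytes are dumped and the vCPU
 * is aborted.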
*/ vie_restart(vie); mode = vme->u.inst_emul.paging.cpu_mode; cs_d = vme->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; if (vm_set_register(vcpu, VM_REG_GUEST_RIP, vme->rip + vie->num_processed) != 0) goto fail; } err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vme->rip); return (VMEXIT_ABORT); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int -vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme) +vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { + struct vm_exit *vme; enum vm_suspend_how how; int vcpuid = vcpu_id(vcpu); + vme = vmrun->vm_exit; + how = vme->u.suspended.how; fbsdrun_deletecpu(vcpuid); if (vcpuid != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu, - struct vm_exit *vme __unused) + struct vm_run *vmrun __unused) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_suspend(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif /* * XXX-MJ sleep for a short period to avoid chewing up the CPU in the * window between activation of the vCPU thread and the STARTUP IPI. 
*/ usleep(1000); return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu, - struct vm_exit *vme) + struct vm_run *vmrun) { - - gdb_cpu_breakpoint(vcpu, vme); + gdb_cpu_breakpoint(vcpu, vmrun->vm_exit); return (VMEXIT_CONTINUE); } static int vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, - struct vm_exit *vme) + struct vm_run *vmrun) { + struct vm_exit *vme; + cpuset_t *dmask; int error = -1; int i; + + dmask = vmrun->cpuset; + vme = vmrun->vm_exit; + switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: - CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) { + CPU_FOREACH_ISSET(i, dmask) { error = vm_suspend_cpu(vcpu_info[i].vcpu); if (error) { warnx("%s: failed to suspend cpu %d\n", __func__, i); break; } } break; case APIC_DELMODE_STARTUP: - CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) { + CPU_FOREACH_ISSET(i, dmask) { spinup_ap(vcpu_info[i].vcpu, vme->u.ipi.vector << PAGE_SHIFT); } error = 0; break; default: break; } return (error); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, [VM_EXITCODE_IPI] = vmexit_ipi, }; static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu) { struct vm_exit vme; + struct vm_run vmrun; int error, rc; enum vm_exitcode exitcode; - cpuset_t active_cpus; + cpuset_t active_cpus, dmask; error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); + vmrun.vm_exit = &vme; + vmrun.cpuset = &dmask; + vmrun.cpusetsize = sizeof(dmask); + while (1) { - error = vm_run(vcpu, &vme); + error = vm_run(vcpu, &vmrun); if (error != 0) break; exitcode = vme.exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } - rc = (*handler[exitcode])(ctx, vcpu, &vme); + rc = (*handler[exitcode])(ctx, vcpu, &vmrun); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. 
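 *
 * If the capability probe fails, a single vCPU is reported as the
 * limit; otherwise the limit is the 'maxcpus' value returned by
 * vm_get_topology(), again falling back to one vCPU if the topology
 * cannot be read.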
*/ error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static void fbsdrun_set_capabilities(struct vcpu *vcpu, bool bsp) { int err, tmp; if (get_config_bool_default("x86.vmexit_on_hlt", false)) { err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1); if (bsp) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1); if (bsp) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (get_config_bool_default("x86.x2apic", false)) err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1); err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1); assert(err == 0); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0 /* maxcpus, unimplemented */); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } static void spinup_vcpu(struct vcpu_info *vi, bool bsp) { int error; if (!bsp) { fbsdrun_set_capabilities(vi->vcpu, false); /* * Enable the 'unrestricted guest' mode for APs. * * APs startup in power-on 16-bit mode. 
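 *
 * The BSP's capabilities are configured from main() before any vCPU is
 * spun up, so only APs are handled here; unrestricted guest mode is
 * what allows them to begin execution in that real-mode startup state.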
*/ error = vm_set_capability(vi->vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); } fbsdrun_addcpu(vi); } static bool parse_config_option(const char *option) { const char *value; char *path; value = strchr(option, '='); if (value == NULL || value[1] == '\0') return (false); path = strndup(option, value - option); if (path == NULL) err(4, "Failed to allocate memory"); set_config_value(path, value + 1); return (true); } static void parse_simple_config_file(const char *path) { FILE *fp; char *line, *cp; size_t linecap; unsigned int lineno; fp = fopen(path, "r"); if (fp == NULL) err(4, "Failed to open configuration file %s", path); line = NULL; linecap = 0; lineno = 1; for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { if (*line == '#' || *line == '\n') continue; cp = strchr(line, '\n'); if (cp != NULL) *cp = '\0'; if (!parse_config_option(line)) errx(4, "%s line %u: invalid config option '%s'", path, lineno, line); } free(line); fclose(fp); } static void parse_gdb_options(const char *opt) { const char *sport; char *colon; if (opt[0] == 'w') { set_config_bool("gdb.wait", true); opt++; } colon = strrchr(opt, ':'); if (colon == NULL) { sport = opt; } else { *colon = '\0'; colon++; sport = colon; set_config_value("gdb.address", opt); } set_config_value("gdb.port", sport); } static void set_defaults(void) { set_config_bool("acpi_tables", false); set_config_value("memory.size", "256M"); set_config_bool("x86.strictmsr", true); set_config_value("lpc.fwcfg", "bhyve"); } int main(int argc, char *argv[]) { int c, error; int max_vcpus, memflags; struct vcpu *bsp; struct vmctx *ctx; struct qemu_fwcfg_item *e820_fwcfg_item; uint64_t rip; size_t memsize; const char *optstr, *value, *vmname; #ifdef BHYVE_SNAPSHOT char *restore_file; struct restore_state rstate; restore_file = NULL; #endif init_config(); set_defaults(); progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:"; #else optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': set_config_bool("x86.x2apic", false); break; case 'A': set_config_bool("acpi_tables", true); break; case 'D': set_config_bool("destroy_on_poweroff", true); break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': set_config_bool("memory.guest_in_core", true); break; case 'f': if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) { errx(EX_USAGE, "invalid fwcfg item '%s'", optarg); } break; case 'G': parse_gdb_options(optarg); break; case 'k': parse_simple_config_file(optarg); break; case 'K': set_config_value("keyboard.layout", optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': set_config_bool("memory.wired", true); break; case 'm': set_config_value("memory.size", optarg); break; case 'o': if (!parse_config_option(optarg)) errx(EX_USAGE, "invalid configuration option '%s'", optarg); break; case 'H': set_config_bool("x86.vmexit_on_hlt", 
true); break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': set_config_bool("x86.vmexit_on_pause", true); break; case 'e': set_config_bool("x86.strictio", true); break; case 'u': set_config_bool("rtc.use_localtime", false); break; case 'U': set_config_value("uuid", optarg); break; case 'w': set_config_bool("x86.strictmsr", false); break; case 'W': set_config_bool("virtio_msix", false); break; case 'x': set_config_bool("x86.x2apic", true); break; case 'Y': set_config_bool("x86.mptable", false); break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc > 1) usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif bsp = vm_vcpu_open(ctx, BSP); max_vcpus = num_vcpus_allowed(ctx, bsp); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(bsp, true); /* Allocate per-VCPU resources. */ vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { vcpu_info[vcpuid].ctx = ctx; vcpu_info[vcpuid].vcpuid = vcpuid; if (vcpuid == BSP) vcpu_info[vcpuid].vcpu = bsp; else vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); } memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(guest_ncpus); init_inout(); kernemu_dev_init(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx); sci_init(ctx); if (qemu_fwcfg_init(ctx) != 0) { fprintf(stderr, "qemu fwcfg initialization error"); exit(4); } if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), &guest_ncpus) != 0) { fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu"); exit(4); } if (e820_init(ctx) != 0) { fprintf(stderr, "Unable to setup E820"); exit(4); } /* * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. 
*/ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); init_gdb(ctx); if (lpc_bootrom()) { if (vm_set_capability(bsp, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(bsp); assert(error == 0); } /* * Add all vCPUs. */ for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) spinup_vcpu(&vcpu_info[vcpuid], vcpuid == BSP); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { fprintf(stdout, "Pausing pci devs...\r\n"); if (vm_pause_user_devs() != 0) { fprintf(stderr, "Failed to pause PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring vm mem...\r\n"); if (restore_vm_mem(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore VM memory.\n"); exit(1); } fprintf(stdout, "Restoring pci devs...\r\n"); if (vm_restore_user_devs(&rstate) != 0) { fprintf(stderr, "Failed to restore PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring kernel structs...\r\n"); if (vm_restore_kern_structs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore kernel structs.\n"); exit(1); } fprintf(stdout, "Resuming pci devs...\r\n"); if (vm_resume_user_devs() != 0) { fprintf(stderr, "Failed to resume PCI device state.\n"); exit(1); } } #endif error = vm_get_register(bsp, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); if (error != 0) exit(4); if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } e820_fwcfg_item = e820_get_fwcfg_item(); if (e820_fwcfg_item == NULL) { fprintf(stderr, "invalid e820 table"); exit(4); } if (qemu_fwcfg_add_file("etc/e820", e820_fwcfg_item->size, e820_fwcfg_item->data) != 0) { fprintf(stderr, "could not add qemu fwcfg etc/e820"); exit(4); } free(e820_fwcfg_item); if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) { fwctl_init(); } /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifdef BHYVE_SNAPSHOT /* initialize mutex/cond variables */ init_snapshot(); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) != 0) errx(EX_OSERR, "Failed to start checkpoint thread"); #endif #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { destroy_restore_state(&rstate); if (vm_restore_time(ctx) < 0) err(EX_OSERR, "Unable to restore time"); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) vm_resume_cpu(vcpu_info[vcpuid].vcpu); } else #endif vm_resume_cpu(bsp); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h index f45b97e779c8..47f9d5071044 100644 --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -1,53 +1,53 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _FBSDRUN_H_ #define _FBSDRUN_H_ #define VMEXIT_CONTINUE (0) #define VMEXIT_ABORT (-1) extern int guest_ncpus; extern uint16_t cpu_cores, cpu_sockets, cpu_threads; struct vcpu; struct vmctx; -struct vm_exit; +struct vm_run; void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr); #endif int fbsdrun_virtio_msix(void); -int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_exit *); +int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_run *); #endif diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c index cf4c2356e155..17b311d8f211 100644 --- a/usr.sbin/bhyve/task_switch.c +++ b/usr.sbin/bhyve/task_switch.c @@ -1,941 +1,943 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" /* * Using 'struct i386tss' is tempting but causes myriad sign extension * issues because all of its fields are defined as signed integers. 
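 *
 * Illustrative aside, not part of this patch: the surprise referred to
 * above is that a negative-valued signed field sign-extends when it is
 * widened, while the unsigned fields of struct tss32 below do not.  A
 * standalone demonstration, to be compiled separately:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int32_t  s = (int32_t)0xfffff000;	/* signed field, as in i386tss */
	uint32_t u = 0xfffff000;		/* unsigned field, as in tss32 */

	/* prints 0xfffffffffffff000, then 0xfffff000 */
	printf("%#jx\n%#jx\n", (uintmax_t)(uint64_t)s, (uintmax_t)u);
	return (0);
}
/*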
*/ struct tss32 { uint16_t tss_link; uint16_t rsvd1; uint32_t tss_esp0; uint16_t tss_ss0; uint16_t rsvd2; uint32_t tss_esp1; uint16_t tss_ss1; uint16_t rsvd3; uint32_t tss_esp2; uint16_t tss_ss2; uint16_t rsvd4; uint32_t tss_cr3; uint32_t tss_eip; uint32_t tss_eflags; uint32_t tss_eax; uint32_t tss_ecx; uint32_t tss_edx; uint32_t tss_ebx; uint32_t tss_esp; uint32_t tss_ebp; uint32_t tss_esi; uint32_t tss_edi; uint16_t tss_es; uint16_t rsvd5; uint16_t tss_cs; uint16_t rsvd6; uint16_t tss_ss; uint16_t rsvd7; uint16_t tss_ds; uint16_t rsvd8; uint16_t tss_fs; uint16_t rsvd9; uint16_t tss_gs; uint16_t rsvd10; uint16_t tss_ldt; uint16_t rsvd11; uint16_t tss_trap; uint16_t tss_iomap; }; static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed"); #define SEL_START(sel) (((sel) & ~0x7)) #define SEL_LIMIT(sel) (((sel) | 0x7)) #define TSS_BUSY(type) (((type) & 0x2) != 0) static uint64_t GETREG(struct vcpu *vcpu, int reg) { uint64_t val; int error; error = vm_get_register(vcpu, reg, &val); assert(error == 0); return (val); } static void SETREG(struct vcpu *vcpu, int reg, uint64_t val) { int error; error = vm_set_register(vcpu, reg, val); assert(error == 0); } static struct seg_desc usd_to_seg_desc(struct user_segment_descriptor *usd) { struct seg_desc seg_desc; seg_desc.base = (u_int)USD_GETBASE(usd); if (usd->sd_gran) seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; else seg_desc.limit = (u_int)USD_GETLIMIT(usd); seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; seg_desc.access |= usd->sd_xx << 12; seg_desc.access |= usd->sd_def32 << 14; seg_desc.access |= usd->sd_gran << 15; return (seg_desc); } /* * Inject an exception with an error code that is a segment selector. * The format of the error code is described in section 6.13, "Error Code", * Intel SDM volume 3. * * Bit 0 (EXT) denotes whether the exception occurred during delivery * of an external event like an interrupt. * * Bit 1 (IDT) indicates whether the selector points to a gate descriptor * in the IDT. * * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). */ static void sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext) { /* * Bit 2 from the selector is retained as-is in the error code. * * Bit 1 can be safely cleared because none of the selectors * encountered during task switch emulation refer to a task * gate in the IDT. * * Bit 0 is set depending on the value of 'ext'. */ sel &= ~0x3; if (ext) sel |= 0x1; vm_inject_fault(vcpu, vector, 1, sel); } /* * Return 0 if the selector 'sel' in within the limits of the GDT/LDT * and non-zero otherwise. */ static int desc_table_limit_check(struct vcpu *vcpu, uint16_t sel) { uint64_t base; uint32_t limit, access; int error, reg; reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; error = vm_get_desc(vcpu, reg, &base, &limit, &access); assert(error == 0); if (reg == VM_REG_GUEST_LDTR) { if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) return (-1); } if (limit < SEL_LIMIT(sel)) return (-1); else return (0); } /* * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced * by the selector 'sel'. * * Returns 0 on success. * Returns 1 if an exception was injected into the guest. * Returns -1 otherwise. */ static int desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging, uint16_t sel, struct user_segment_descriptor *desc, bool doread, int *faultptr) { struct iovec iov[2]; uint64_t base; uint32_t limit, access; int error, reg; reg = ISLDT(sel) ? 
VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; error = vm_get_desc(vcpu, reg, &base, &limit, &access); assert(error == 0); assert(limit >= SEL_LIMIT(sel)); error = vm_copy_setup(vcpu, paging, base + SEL_START(sel), sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov), faultptr); if (error || *faultptr) return (error); if (doread) vm_copyin(iov, desc, sizeof(*desc)); else vm_copyout(desc, iov, sizeof(*desc)); return (0); } static int desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging, uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) { return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr)); } static int desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging, uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) { return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr)); } /* * Read the TSS descriptor referenced by 'sel' into 'desc'. * * Returns 0 on success. * Returns 1 if an exception was injected into the guest. * Returns -1 otherwise. */ static int read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts, uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) { struct vm_guest_paging sup_paging; int error; assert(!ISLDT(sel)); assert(IDXSEL(sel) != 0); /* Fetch the new TSS descriptor */ if (desc_table_limit_check(vcpu, sel)) { if (ts->reason == TSR_IRET) sel_exception(vcpu, IDT_TS, sel, ts->ext); else sel_exception(vcpu, IDT_GP, sel, ts->ext); return (1); } sup_paging = ts->paging; sup_paging.cpl = 0; /* implicit supervisor mode */ error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr); return (error); } static bool code_desc(int sd_type) { /* code descriptor */ return ((sd_type & 0x18) == 0x18); } static bool stack_desc(int sd_type) { /* writable data descriptor */ return ((sd_type & 0x1A) == 0x12); } static bool data_desc(int sd_type) { /* data descriptor or a readable code descriptor */ return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); } static bool ldt_desc(int sd_type) { return (sd_type == SDT_SYSLDT); } /* * Validate the descriptor 'seg_desc' associated with 'segment'. 
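 *
 * Illustrative aside, not part of this patch: the predicates above
 * operate on the 5-bit descriptor type, where bit 4 is the S bit
 * (1 = code/data segment), bit 3 distinguishes code from data, bit 1 is
 * writable (data) or readable (code) and bit 0 is the accessed bit.
 * A standalone check of a few representative type values, mirroring
 * code_desc()/stack_desc()/data_desc() and compiled separately:
 */
#include <assert.h>

static int code_t(int t)  { return ((t & 0x18) == 0x18); }
static int stack_t(int t) { return ((t & 0x1a) == 0x12); }
static int data_t(int t)  { return ((t & 0x18) == 0x10 || (t & 0x1a) == 0x1a); }

int
main(void)
{
	assert(code_t(0x1b) && data_t(0x1b));	/* readable code, accessed */
	assert(!code_t(0x13) && data_t(0x13) && stack_t(0x13)); /* r/w data */
	assert(data_t(0x11) && !stack_t(0x11));	/* read-only data */
	assert(code_t(0x18) && !data_t(0x18));	/* execute-only code */
	return (0);
}
/*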
*/ static int validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts, int segment, struct seg_desc *seg_desc, int *faultptr) { struct vm_guest_paging sup_paging; struct user_segment_descriptor usd; int error, idtvec; int cpl, dpl, rpl; uint16_t sel, cs; bool ldtseg, codeseg, stackseg, dataseg, conforming; ldtseg = codeseg = stackseg = dataseg = false; switch (segment) { case VM_REG_GUEST_LDTR: ldtseg = true; break; case VM_REG_GUEST_CS: codeseg = true; break; case VM_REG_GUEST_SS: stackseg = true; break; case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: dataseg = true; break; default: assert(0); } /* Get the segment selector */ sel = GETREG(vcpu, segment); /* LDT selector must point into the GDT */ if (ldtseg && ISLDT(sel)) { sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } /* Descriptor table limit check */ if (desc_table_limit_check(vcpu, sel)) { sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } /* NULL selector */ if (IDXSEL(sel) == 0) { /* Code and stack segment selectors cannot be NULL */ if (codeseg || stackseg) { sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } seg_desc->base = 0; seg_desc->limit = 0; seg_desc->access = 0x10000; /* unusable */ return (0); } /* Read the descriptor from the GDT/LDT */ sup_paging = ts->paging; sup_paging.cpl = 0; /* implicit supervisor mode */ error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr); if (error || *faultptr) return (error); /* Verify that the descriptor type is compatible with the segment */ if ((ldtseg && !ldt_desc(usd.sd_type)) || (codeseg && !code_desc(usd.sd_type)) || (dataseg && !data_desc(usd.sd_type)) || (stackseg && !stack_desc(usd.sd_type))) { sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } /* Segment must be marked present */ if (!usd.sd_p) { if (ldtseg) idtvec = IDT_TS; else if (stackseg) idtvec = IDT_SS; else idtvec = IDT_NP; sel_exception(vcpu, idtvec, sel, ts->ext); return (1); } cs = GETREG(vcpu, VM_REG_GUEST_CS); cpl = cs & SEL_RPL_MASK; rpl = sel & SEL_RPL_MASK; dpl = usd.sd_dpl; if (stackseg && (rpl != cpl || dpl != cpl)) { sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } if (codeseg) { conforming = (usd.sd_type & 0x4) ? true : false; if ((conforming && (cpl < dpl)) || (!conforming && (cpl != dpl))) { sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } } if (dataseg) { /* * A data segment is always non-conforming except when it's * descriptor is a readable, conforming code segment. 
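 *
 * For the non-conforming case the check below requires both RPL <= DPL
 * and CPL <= DPL (numerically), i.e. neither the selector's requested
 * privilege nor the current privilege level may exceed what the
 * descriptor's DPL permits.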
*/ if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) conforming = true; else conforming = false; if (!conforming && (rpl > dpl || cpl > dpl)) { sel_exception(vcpu, IDT_TS, sel, ts->ext); return (1); } } *seg_desc = usd_to_seg_desc(&usd); return (0); } static void tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch, uint32_t eip, struct tss32 *tss, struct iovec *iov) { /* General purpose registers */ tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX); tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX); tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX); tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX); tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP); tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP); tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI); tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI); /* Segment selectors */ tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES); tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS); tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS); tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS); tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS); tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS); /* eflags and eip */ tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS); if (task_switch->reason == TSR_IRET) tss->tss_eflags &= ~PSL_NT; tss->tss_eip = eip; /* Copy updated old TSS into guest memory */ vm_copyout(tss, iov, sizeof(struct tss32)); } static void update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd) { int error; error = vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access); assert(error == 0); } /* * Update the vcpu registers to reflect the state of the new task. */ static int tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts, uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr) { struct seg_desc seg_desc, seg_desc2; uint64_t *pdpte, maxphyaddr, reserved; uint32_t eflags; int error, i; bool nested; nested = false; if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { tss->tss_link = ot_sel; nested = true; } eflags = tss->tss_eflags; if (nested) eflags |= PSL_NT; /* LDTR */ SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); /* PBDR */ if (ts->paging.paging_mode != PAGING_MODE_FLAT) { if (ts->paging.paging_mode == PAGING_MODE_PAE) { /* * XXX Assuming 36-bit MAXPHYADDR. */ maxphyaddr = (1UL << 36) - 1; pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); for (i = 0; i < 4; i++) { /* Check reserved bits if the PDPTE is valid */ if (!(pdpte[i] & 0x1)) continue; /* * Bits 2:1, 8:5 and bits above the processor's * maximum physical address are reserved. 
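 *
 * Bits 2:1 contribute 0x006 and bits 8:5 contribute 0x1e0, giving the
 * 0x1E6 constant below; ~maxphyaddr adds bits 36 and above under the
 * 36-bit MAXPHYADDR assumption made earlier.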
*/ reserved = ~maxphyaddr | 0x1E6; if (pdpte[i] & reserved) { vm_inject_gp(vcpu); return (1); } } SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); } SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); ts->paging.cr3 = tss->tss_cr3; } /* eflags and eip */ SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags); SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip); /* General purpose registers */ SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax); SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx); SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp); SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi); SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi); /* Segment selectors */ SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es); SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs); SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss); SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds); SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs); SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs); /* * If this is a nested task then write out the new TSS to update * the previous link field. */ if (nested) vm_copyout(tss, iov, sizeof(*tss)); /* Validate segment descriptors */ error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, faultptr); if (error || *faultptr) return (error); update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc); /* * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. * * The SS and CS attribute checks on VM-entry are inter-dependent so * we need to make sure that both segments are valid before updating * either of them. This ensures that the VMCS state can pass the * VM-entry checks so the guest can handle any exception injected * during task switch emulation. */ error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc, faultptr); if (error || *faultptr) return (error); error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, faultptr); if (error || *faultptr) return (error); update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc); update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2); ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc, faultptr); if (error || *faultptr) return (error); update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc); error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc, faultptr); if (error || *faultptr) return (error); update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc); error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc, faultptr); if (error || *faultptr) return (error); update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc); error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc, faultptr); if (error || *faultptr) return (error); update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc); return (0); } /* * Push an error code on the stack of the new task. This is needed if the * task switch was triggered by a hardware exception that causes an error * code to be saved (e.g. #PF). 
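 *
 * Illustrative aside, not part of this patch: push_errcode() below makes
 * two independent size decisions -- the error code itself is pushed as
 * 4 bytes when the target is a 386-style TSS and 2 bytes for a 286-style
 * one (the SDT_SYS386* check), while the width of the stack pointer used
 * for the address calculation follows the B/D bit of SS (the
 * SEG_DESC_DEF32() check).  A standalone model with hypothetical helper
 * names, compiled separately:
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static int
errcode_bytes(bool task_gate_386)
{
	/* Doubleword push for a 386 target, word push for a 286 one. */
	return (task_gate_386 ? 4 : 2);
}

static int
stackptr_size(bool ss_def32)
{
	/* The B/D bit of SS selects %esp vs. %sp for the effective address. */
	return (ss_def32 ? 4 : 2);
}

int
main(void)
{
	uint32_t esp = 0x2000;

	assert(errcode_bytes(true) == 4 && errcode_bytes(false) == 2);
	assert(stackptr_size(false) == 2);
	esp -= errcode_bytes(true);	/* the push grows the stack downward */
	assert(esp == 0x1ffc);
	return (0);
}
/*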
*/ static int push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging, int task_type, uint32_t errcode, int *faultptr) { struct iovec iov[2]; struct seg_desc seg_desc; int stacksize, bytes, error; uint64_t gla, cr0, rflags; uint32_t esp; uint16_t stacksel; *faultptr = 0; cr0 = GETREG(vcpu, VM_REG_GUEST_CR0); rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS); stacksel = GETREG(vcpu, VM_REG_GUEST_SS); error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base, &seg_desc.limit, &seg_desc.access); assert(error == 0); /* * Section "Error Code" in the Intel SDM vol 3: the error code is * pushed on the stack as a doubleword or word (depending on the * default interrupt, trap or task gate size). */ if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) bytes = 4; else bytes = 2; /* * PUSH instruction from Intel SDM vol 2: the 'B' flag in the * stack-segment descriptor determines the size of the stack * pointer outside of 64-bit mode. */ if (SEG_DESC_DEF32(seg_desc.access)) stacksize = 4; else stacksize = 2; esp = GETREG(vcpu, VM_REG_GUEST_RSP); esp -= bytes; if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { sel_exception(vcpu, IDT_SS, stacksel, 1); *faultptr = 1; return (0); } if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { vm_inject_ac(vcpu, 1); *faultptr = 1; return (0); } error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE, iov, nitems(iov), faultptr); if (error || *faultptr) return (error); vm_copyout(&errcode, iov, bytes); SETREG(vcpu, VM_REG_GUEST_RSP, esp); return (0); } /* * Evaluate return value from helper functions and potentially return to * the VM run loop. */ #define CHKERR(error,fault) \ do { \ assert((error == 0) || (error == EFAULT)); \ if (error) \ return (VMEXIT_ABORT); \ else if (fault) \ return (VMEXIT_CONTINUE); \ } while (0) int -vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit) +vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct seg_desc nt; struct tss32 oldtss, newtss; struct vm_task_switch *task_switch; struct vm_guest_paging *paging, sup_paging; struct user_segment_descriptor nt_desc, ot_desc; struct iovec nt_iov[2], ot_iov[2]; + struct vm_exit *vmexit; uint64_t cr0, ot_base; uint32_t eip, ot_lim, access; int error, ext, fault, minlimit, nt_type, ot_type; enum task_switch_reason reason; uint16_t nt_sel, ot_sel; + vmexit = vmrun->vm_exit; task_switch = &vmexit->u.task_switch; nt_sel = task_switch->tsssel; ext = vmexit->u.task_switch.ext; reason = vmexit->u.task_switch.reason; paging = &vmexit->u.task_switch.paging; assert(paging->cpu_mode == CPU_MODE_PROTECTED); /* * Calculate the instruction pointer to store in the old TSS. */ eip = vmexit->rip + vmexit->inst_length; /* * Section 4.6, "Access Rights" in Intel SDM Vol 3. 
* The following page table accesses are implicitly supervisor mode: * - accesses to GDT or LDT to load segment descriptors * - accesses to the task state segment during task switch */ sup_paging = *paging; sup_paging.cpl = 0; /* implicit supervisor mode */ /* Fetch the new TSS descriptor */ error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc, &fault); CHKERR(error, fault); nt = usd_to_seg_desc(&nt_desc); /* Verify the type of the new TSS */ nt_type = SEG_DESC_TYPE(nt.access); if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { sel_exception(vcpu, IDT_TS, nt_sel, ext); goto done; } /* TSS descriptor must have present bit set */ if (!SEG_DESC_PRESENT(nt.access)) { sel_exception(vcpu, IDT_NP, nt_sel, ext); goto done; } /* * TSS must have a minimum length of 104 bytes for a 32-bit TSS and * 44 bytes for a 16-bit TSS. */ if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) minlimit = 104 - 1; else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) minlimit = 44 - 1; else minlimit = 0; assert(minlimit > 0); if (nt.limit < (unsigned int)minlimit) { sel_exception(vcpu, IDT_TS, nt_sel, ext); goto done; } /* TSS must be busy if task switch is due to IRET */ if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { sel_exception(vcpu, IDT_TS, nt_sel, ext); goto done; } /* * TSS must be available (not busy) if task switch reason is * CALL, JMP, exception or interrupt. */ if (reason != TSR_IRET && TSS_BUSY(nt_type)) { sel_exception(vcpu, IDT_GP, nt_sel, ext); goto done; } /* Fetch the new TSS */ error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1, PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault); CHKERR(error, fault); vm_copyin(nt_iov, &newtss, minlimit + 1); /* Get the old TSS selector from the guest's task register */ ot_sel = GETREG(vcpu, VM_REG_GUEST_TR); if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { /* * This might happen if a task switch was attempted without * ever loading the task register with LTR. In this case the * TR would contain the values from power-on: * (sel = 0, base = 0, limit = 0xffff). */ sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext); goto done; } /* Get the old TSS base and limit from the guest's task register */ error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, &access); assert(error == 0); assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); ot_type = SEG_DESC_TYPE(access); assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); /* Fetch the old TSS descriptor */ error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc, &fault); CHKERR(error, fault); /* Get the old TSS */ error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1, PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault); CHKERR(error, fault); vm_copyin(ot_iov, &oldtss, minlimit + 1); /* * Clear the busy bit in the old TSS descriptor if the task switch * due to an IRET or JMP instruction. */ if (reason == TSR_IRET || reason == TSR_JMP) { ot_desc.sd_type &= ~0x2; error = desc_table_write(vcpu, &sup_paging, ot_sel, &ot_desc, &fault); CHKERR(error, fault); } if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { EPRINTLN("Task switch to 16-bit TSS not supported"); return (VMEXIT_ABORT); } /* Save processor state in old TSS */ tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov); /* * If the task switch was triggered for any reason other than IRET * then set the busy bit in the new TSS descriptor. 
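 *
 * On IRET the "new" TSS is the task being returned to, which is already
 * marked busy, so its descriptor is left untouched here; the outgoing
 * task's busy bit was cleared earlier for the IRET and JMP cases.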
*/ if (reason != TSR_IRET) { nt_desc.sd_type |= 0x2; error = desc_table_write(vcpu, &sup_paging, nt_sel, &nt_desc, &fault); CHKERR(error, fault); } /* Update task register to point at the new TSS */ SETREG(vcpu, VM_REG_GUEST_TR, nt_sel); /* Update the hidden descriptor state of the task register */ nt = usd_to_seg_desc(&nt_desc); update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt); /* Set CR0.TS */ cr0 = GETREG(vcpu, VM_REG_GUEST_CR0); SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); /* * We are now committed to the task switch. Any exceptions encountered * after this point will be handled in the context of the new task and * the saved instruction pointer will belong to the new task. */ error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); assert(error == 0); /* Load processor state from new TSS */ error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov, &fault); CHKERR(error, fault); /* * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception * caused an error code to be generated, this error code is copied * to the stack of the new task. */ if (task_switch->errcode_valid) { assert(task_switch->ext); assert(task_switch->reason == TSR_IDT_GATE); error = push_errcode(vcpu, &task_switch->paging, nt_type, task_switch->errcode, &fault); CHKERR(error, fault); } /* * Treatment of virtual-NMI blocking if NMI is delivered through * a task gate. * * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: * If the virtual NMIs VM-execution control is 1, VM entry injects * an NMI, and delivery of the NMI causes a task switch that causes * a VM exit, virtual-NMI blocking is in effect before the VM exit * commences. * * Thus, virtual-NMI blocking is in effect at the time of the task * switch VM exit. */ /* * Treatment of virtual-NMI unblocking on IRET from NMI handler task. * * Section "Changes to Instruction Behavior in VMX Non-Root Operation" * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. * This unblocking of virtual-NMI occurs even if IRET causes a fault. * * Thus, virtual-NMI blocking is cleared at the time of the task switch * VM exit. */ /* * If the task switch was triggered by an event delivered through * the IDT then extinguish the pending event from the vcpu's * exitintinfo. */ if (task_switch->reason == TSR_IDT_GATE) { error = vm_set_intinfo(vcpu, 0); assert(error == 0); } /* * XXX should inject debug exception if 'T' bit is 1 */ done: return (VMEXIT_CONTINUE); }
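/*
 * Illustrative sketch, not part of this patch: a compact restatement of
 * the busy-bit handling enforced by vmexit_task_switch() above, written
 * as standalone predicates with a few asserts.  The reason values are
 * stand-ins for the TSR_* constants; compile separately.
 */
#include <assert.h>
#include <stdbool.h>

enum reason { R_CALL, R_JMP, R_IRET, R_IDT_GATE };

/* The incoming TSS must already be busy only when returning via IRET. */
static bool
new_tss_must_be_busy(enum reason r)
{
	return (r == R_IRET);
}

/* The outgoing TSS descriptor loses its busy bit on IRET and JMP. */
static bool
old_busy_cleared(enum reason r)
{
	return (r == R_IRET || r == R_JMP);
}

/* The incoming TSS descriptor gains the busy bit except on IRET. */
static bool
new_busy_set(enum reason r)
{
	return (r != R_IRET);
}

int
main(void)
{
	assert(new_tss_must_be_busy(R_IRET) && !new_tss_must_be_busy(R_CALL));
	assert(old_busy_cleared(R_JMP) && !old_busy_cleared(R_IDT_GATE));
	assert(new_busy_set(R_CALL) && !new_busy_set(R_IRET));
	return (0);
}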