Index: head/lib/libvmmapi/vmmapi.c =================================================================== --- head/lib/libvmmapi/vmmapi.c (revision 368002) +++ head/lib/libvmmapi/vmmapi.c (revision 368003) @@ -1,1663 +1,1677 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmmapi.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) /* * Size of the guard region before and after the virtual address space * mapping the guest physical memory. This must be a multiple of the * superpage size for performance reasons. */ #define VM_MMAP_GUARD_SIZE (4 * MB) #define PROT_RW (PROT_READ | PROT_WRITE) #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) struct vmctx { int fd; uint32_t lowmem_limit; int memflags; size_t lowmem; size_t highmem; char *baseaddr; char *name; }; #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) static int vm_device_open(const char *name) { int fd, len; char *vmfile; len = strlen("/dev/vmm/") + strlen(name) + 1; vmfile = malloc(len); assert(vmfile != NULL); snprintf(vmfile, len, "/dev/vmm/%s", name); /* Open the device file */ fd = open(vmfile, O_RDWR, 0); free(vmfile); return (fd); } int vm_create(const char *name) { /* Try to load vmm(4) module before creating a guest. */ if (modfind("vmm") < 0) kldload("vmm"); return (CREATE((char *)name)); } struct vmctx * vm_open(const char *name) { struct vmctx *vm; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); vm->fd = -1; vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); if ((vm->fd = vm_device_open(vm->name)) < 0) goto err; return (vm); err: vm_destroy(vm); return (NULL); } void vm_destroy(struct vmctx *vm) { assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); DESTROY(vm->name); free(vm); } int vm_parse_memsize(const char *optarg, size_t *ret_memsize) { char *endptr; size_t optval; int error; optval = strtoul(optarg, &endptr, 0); if (*optarg != '\0' && *endptr == '\0') { /* * For the sake of backward compatibility if the memory size * specified on the command line is less than a megabyte then * it is interpreted as being in units of MB. */ if (optval < MB) optval *= MB; *ret_memsize = optval; error = 0; } else error = expand_number(optarg, ret_memsize); return (error); } uint32_t vm_get_lowmem_limit(struct vmctx *ctx) { return (ctx->lowmem_limit); } void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { ctx->lowmem_limit = limit; } void vm_set_memflags(struct vmctx *ctx, int flags) { ctx->memflags = flags; } int vm_get_memflags(struct vmctx *ctx) { return (ctx->memflags); } /* * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot) { struct vm_memmap memmap; int error, flags; memmap.gpa = gpa; memmap.segid = segid; memmap.segoff = off; memmap.len = len; memmap.prot = prot; memmap.flags = 0; if (ctx->memflags & VM_MEM_F_WIRED) memmap.flags |= VM_MEMMAP_F_WIRED; /* * If this mapping already exists then don't create it again. This * is the common case for SYSMEM mappings created by bhyveload(8). */ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); if (error == 0 && gpa == memmap.gpa) { if (segid != memmap.segid || off != memmap.segoff || prot != memmap.prot || flags != memmap.flags) { errno = EEXIST; return (-1); } else { return (0); } } error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size) { *guest_baseaddr = ctx->baseaddr; *lowmem_size = ctx->lowmem; *highmem_size = ctx->highmem; return (0); } int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct vm_memmap memmap; int error; bzero(&memmap, sizeof(struct vm_memmap)); memmap.gpa = *gpa; error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); if (error == 0) { *gpa = memmap.gpa; *segid = memmap.segid; *segoff = memmap.segoff; *len = memmap.len; *prot = memmap.prot; *flags = memmap.flags; } return (error); } /* * Return 0 if the segments are identical and non-zero otherwise. * * This is slightly complicated by the fact that only device memory segments * are named. */ static int cmpseg(size_t len, const char *str, size_t len2, const char *str2) { if (len == len2) { if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) return (0); } return (-1); } static int vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { struct vm_memseg memseg; size_t n; int error; /* * If the memory segment has already been created then just return. * This is the usual case for the SYSMEM segment created by userspace * loaders like bhyveload(8). */ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, sizeof(memseg.name)); if (error) return (error); if (memseg.len != 0) { if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { errno = EINVAL; return (-1); } else { return (0); } } bzero(&memseg, sizeof(struct vm_memseg)); memseg.segid = segid; memseg.len = len; if (name != NULL) { n = strlcpy(memseg.name, name, sizeof(memseg.name)); if (n >= sizeof(memseg.name)) { errno = ENAMETOOLONG; return (-1); } } error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); return (error); } int vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, size_t bufsize) { struct vm_memseg memseg; size_t n; int error; memseg.segid = segid; error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); if (error == 0) { *lenp = memseg.len; n = strlcpy(namebuf, memseg.name, bufsize); if (n >= bufsize) { errno = ENAMETOOLONG; error = -1; } } return (error); } static int setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) { char *ptr; int error, flags; /* Map 'len' bytes starting at 'gpa' in the guest address space */ error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); if (error) return (error); flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap into the process address space on the host */ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); if (ptr == MAP_FAILED) return (-1); return (0); } int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { size_t objsize, len; vm_paddr_t gpa; char *baseaddr, *ptr; int error; assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then * create another 'highmem' segment above 4GB for the remainder. */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; ctx->highmem = memsize - ctx->lowmem_limit; objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; objsize = ctx->lowmem; } error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); if (error) return (error); /* * Stake out a contiguous region covering the guest physical memory * and the adjoining guard regions. */ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (ptr == MAP_FAILED) return (-1); baseaddr = ptr + VM_MMAP_GUARD_SIZE; if (ctx->highmem > 0) { gpa = 4*GB; len = ctx->highmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } if (ctx->lowmem > 0) { gpa = 0; len = ctx->lowmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } ctx->baseaddr = baseaddr; return (0); } /* * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in * the lowmem or highmem regions. * * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. * The instruction emulation code depends on this behavior. */ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { if (ctx->lowmem > 0) { if (gaddr < ctx->lowmem && len <= ctx->lowmem && gaddr + len <= ctx->lowmem) return (ctx->baseaddr + gaddr); } if (ctx->highmem > 0) { if (gaddr >= 4*GB) { if (gaddr < 4*GB + ctx->highmem && len <= ctx->highmem && gaddr + len <= 4*GB + ctx->highmem) return (ctx->baseaddr + gaddr); } } return (NULL); } vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr) { vm_paddr_t offaddr; offaddr = (char *)addr - ctx->baseaddr; if (ctx->lowmem > 0) if (offaddr >= 0 && offaddr <= ctx->lowmem) return (offaddr); if (ctx->highmem > 0) if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem) return (offaddr); return ((vm_paddr_t)-1); } /* TODO: maximum size for vmname */ int vm_get_name(struct vmctx *ctx, char *buf, size_t max_len) { if (strlcpy(buf, ctx->name, max_len) >= max_len) return (EINVAL); return (0); } size_t vm_get_lowmem_size(struct vmctx *ctx) { return (ctx->lowmem); } size_t vm_get_highmem_size(struct vmctx *ctx) { return (ctx->highmem); } void * vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) { char pathname[MAXPATHLEN]; size_t len2; char *base, *ptr; int fd, error, flags; fd = -1; ptr = MAP_FAILED; if (name == NULL || strlen(name) == 0) { errno = EINVAL; goto done; } error = vm_alloc_memseg(ctx, segid, len, name); if (error) goto done; strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); strlcat(pathname, ctx->name, sizeof(pathname)); strlcat(pathname, ".", sizeof(pathname)); strlcat(pathname, name, sizeof(pathname)); fd = open(pathname, O_RDWR); if (fd < 0) goto done; /* * Stake out a contiguous region covering the device memory and the * adjoining guard regions. */ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (base == MAP_FAILED) goto done; flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap the devmem region in the host address space */ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); done: if (fd >= 0) close(fd); return (ptr); } int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; vmsegdesc.desc.base = base; vmsegdesc.desc.limit = limit; vmsegdesc.desc.access = access; error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); return (error); } int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); if (error == 0) { *base = vmsegdesc.desc.base; *limit = vmsegdesc.desc.limit; *access = vmsegdesc.desc.access; } return (error); } int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) { int error; error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, &seg_desc->access); return (error); } int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; vmreg.regval = val; error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); return (error); } int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.cpuid = vcpu; vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset); return (error); } int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.cpuid = vcpu; vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset); return (error); } int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { int error; struct vm_run vmrun; bzero(&vmrun, sizeof(vmrun)); vmrun.cpuid = vcpu; error = ioctl(ctx->fd, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); return (error); } int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { struct vm_suspend vmsuspend; bzero(&vmsuspend, sizeof(vmsuspend)); vmsuspend.how = how; return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); } int vm_reinit(struct vmctx *ctx) { return (ioctl(ctx->fd, VM_REINIT, 0)); } int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vm_exception exc; exc.cpuid = vcpu; exc.vector = vector; exc.error_code = errcode; exc.error_code_valid = errcode_valid; exc.restart_instruction = restart_instruction; return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); } int vm_apicid2vcpu(struct vmctx *ctx, int apicid) { /* * The apic id associated with the 'vcpu' has the same numerical value * as the 'vcpu' itself. */ return (apicid); } int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); } int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq)); } int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) { struct vm_lapic_msi vmmsi; bzero(&vmmsi, sizeof(vmmsi)); vmmsi.addr = addr; vmmsi.msg = msg; return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); } int vm_ioapic_assert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); } int vm_ioapic_pincount(struct vmctx *ctx, int *pincount) { return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); } int vm_readwrite_kernemu_device(struct vmctx *ctx, int vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value) { struct vm_readwrite_kernemu_device irp = { .vcpuid = vcpu, .access_width = fls(size) - 1, .gpa = gpa, .value = write ? *value : ~0ul, }; long cmd = (write ? VM_SET_KERNEMU_DEV : VM_GET_KERNEMU_DEV); int rc; rc = ioctl(ctx->fd, cmd, &irp); if (rc == 0 && !write) *value = irp.value; return (rc); } int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); } int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); } int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); } int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger) { struct vm_isa_irq_trigger isa_irq_trigger; bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); isa_irq_trigger.atpic_irq = atpic_irq; isa_irq_trigger.trigger = trigger; return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); } int vm_inject_nmi(struct vmctx *ctx, int vcpu) { struct vm_nmi vmnmi; bzero(&vmnmi, sizeof(vmnmi)); vmnmi.cpuid = vcpu; return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); } static const char *capstrmap[] = { [VM_CAP_HALT_EXIT] = "hlt_exit", [VM_CAP_MTRAP_EXIT] = "mtrap_exit", [VM_CAP_PAUSE_EXIT] = "pause_exit", [VM_CAP_UNRESTRICTED_GUEST] = "unrestricted_guest", [VM_CAP_ENABLE_INVPCID] = "enable_invpcid", [VM_CAP_BPT_EXIT] = "bpt_exit", }; int vm_capability_name2type(const char *capname) { int i; for (i = 0; i < nitems(capstrmap); i++) { if (strcmp(capstrmap[i], capname) == 0) return (i); } return (-1); } const char * vm_capability_type2name(int type) { if (type >= 0 && type < nitems(capstrmap)) return (capstrmap[type]); return (NULL); } int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); } int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); } int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; pptmmio.hpa = hpa; return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); } int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { struct vm_pptdev_msi pptmsi; bzero(&pptmsi, sizeof(pptmsi)); pptmsi.vcpu = vcpu; pptmsi.bus = bus; pptmsi.slot = slot; pptmsi.func = func; pptmsi.msg = msg; pptmsi.addr = addr; pptmsi.numvec = numvec; return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct vm_pptdev_msix pptmsix; bzero(&pptmsix, sizeof(pptmsix)); pptmsix.vcpu = vcpu; pptmsix.bus = bus; pptmsix.slot = slot; pptmsix.func = func; pptmsix.idx = idx; pptmsix.msg = msg; pptmsix.addr = addr; pptmsix.vector_control = vector_control; return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } +int +vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev ppt; + + bzero(&ppt, sizeof(ppt)); + ppt.bus = bus; + ppt.slot = slot; + ppt.func = func; + + return ioctl(ctx->fd, VM_PPTDEV_DISABLE_MSIX, &ppt); +} + uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) { int error; static struct vm_stats vmstats; vmstats.cpuid = vcpu; error = ioctl(ctx->fd, VM_STATS, &vmstats); if (error == 0) { if (ret_entries) *ret_entries = vmstats.num_entries; if (ret_tv) *ret_tv = vmstats.tv; return (vmstats.statbuf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); *state = x2apic.state; return (error); } int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; x2apic.state = state; error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); return (error); } /* * From Intel Vol 3a: * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT */ int vcpu_reset(struct vmctx *vmctx, int vcpu) { int error; uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; uint32_t desc_access, desc_limit; uint16_t sel; zero = 0; rflags = 0x2; error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; rip = 0xfff0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; cr0 = CR0_NE; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) goto done; cr4 = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; /* * CS: present, r/w, accessed, 16-bit, byte granularity, usable */ desc_base = 0xffff0000; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0xf000; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) goto done; /* * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) goto done; /* General purpose registers */ rdx = 0xf00; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) goto done; /* GDTR, IDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; /* TR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0000008b; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) goto done; /* LDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x00000082; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; /* XXX cr2, debug registers */ error = 0; done: return (error); } int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) { int error, i; struct vm_gpa_pte gpapte; bzero(&gpapte, sizeof(gpapte)); gpapte.gpa = gpa; error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); if (error == 0) { *num = gpapte.ptenum; for (i = 0; i < gpapte.ptenum; i++) pte[i] = gpapte.pte[i]; } return (error); } int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) { int error; struct vm_hpet_cap cap; bzero(&cap, sizeof(struct vm_hpet_cap)); error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); if (capabilities != NULL) *capabilities = cap.capabilities; return (error); } int vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = ioctl(ctx->fd, VM_GLA2GPA, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } int vm_gla2gpa_nofault(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } #ifndef min #define min(a,b) (((a) < (b)) ? (a) : (b)) #endif int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault) { void *va; uint64_t gpa; int error, i, n, off; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; iov[i].iov_len = 0; } while (len) { assert(iovcnt > 0); error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(len, PAGE_SIZE - off); va = vm_map_gpa(ctx, gpa, n); if (va == NULL) return (EFAULT); iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; gla += n; len -= n; } return (0); } void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt) { return; } void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; size_t n; dst = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); src = iov->iov_base; bcopy(src, dst, n); iov++; dst += n; len -= n; } } void vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, size_t len) { const char *src; char *dst; size_t n; src = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); dst = iov->iov_base; bcopy(src, dst, n); iov++; src += n; len -= n; } } static int vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) { struct vm_cpuset vm_cpuset; int error; bzero(&vm_cpuset, sizeof(struct vm_cpuset)); vm_cpuset.which = which; vm_cpuset.cpusetsize = sizeof(cpuset_t); vm_cpuset.cpus = cpus; error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); return (error); } int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); } int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); } int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); } int vm_activate_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); return (error); } int vm_suspend_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); return (error); } int vm_resume_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); return (error); } int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); if (error == 0) { *info1 = vmii.info1; *info2 = vmii.info2; } return (error); } int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; vmii.info1 = info1; error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); return (error); } int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; rtcdata.value = value; error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); return (error); } int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); if (error == 0) *retval = rtcdata.value; return (error); } int vm_rtc_settime(struct vmctx *ctx, time_t secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); rtctime.secs = secs; error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); return (error); } int vm_rtc_gettime(struct vmctx *ctx, time_t *secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); if (error == 0) *secs = rtctime.secs; return (error); } int vm_restart_instruction(void *arg, int vcpu) { struct vmctx *ctx = arg; return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } int vm_snapshot_req(struct vm_snapshot_meta *meta) { if (ioctl(meta->ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { #ifdef SNAPSHOT_DEBUG fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", __func__, meta->dev_name, errno); #endif return (-1); } return (0); } int vm_restore_time(struct vmctx *ctx) { int dummy; dummy = 0; return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); } int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { struct vm_cpu_topology topology; bzero(&topology, sizeof (struct vm_cpu_topology)); topology.sockets = sockets; topology.cores = cores; topology.threads = threads; topology.maxcpus = maxcpus; return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); } int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { struct vm_cpu_topology topology; int error; bzero(&topology, sizeof (struct vm_cpu_topology)); error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); if (error == 0) { *sockets = topology.sockets; *cores = topology.cores; *threads = topology.threads; *maxcpus = topology.maxcpus; } return (error); } int vm_get_device_fd(struct vmctx *ctx) { return (ctx->fd); } const cap_ioctl_t * vm_get_ioctls(size_t *len) { cap_ioctl_t *cmds; /* keep in sync with machine/vmm_dev.h */ static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_GETNEXT, VM_SET_REGISTER, VM_GET_REGISTER, VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR, VM_SET_REGISTER_SET, VM_GET_REGISTER_SET, VM_SET_KERNEMU_DEV, VM_GET_KERNEMU_DEV, VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, - VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, + VM_PPTDEV_MSIX, VM_PPTDEV_DISABLE_MSIX, + VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, VM_GLA2GPA_NOFAULT, VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, VM_SET_INTINFO, VM_GET_INTINFO, VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY }; if (len == NULL) { cmds = malloc(sizeof(vm_ioctl_cmds)); if (cmds == NULL) return (NULL); bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); return (cmds); } *len = nitems(vm_ioctl_cmds); return (NULL); } Index: head/lib/libvmmapi/vmmapi.h =================================================================== --- head/lib/libvmmapi/vmmapi.h (revision 368002) +++ head/lib/libvmmapi/vmmapi.h (revision 368003) @@ -1,273 +1,274 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include #include #include #include /* * API version for out-of-tree consumers like grub-bhyve for making compile * time decisions. */ #define VMMAPI_VERSION 0103 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vmctx; struct vm_snapshot_meta; enum x2apic_state; /* * Different styles of mapping the memory assigned to a VM into the address * space of the controlling process. */ enum vm_mmap_style { VM_MMAP_NONE, /* no mapping */ VM_MMAP_ALL, /* fully and statically mapped */ VM_MMAP_SPARSE, /* mappings created on-demand */ }; /* * 'flags' value passed to 'vm_set_memflags()'. */ #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ /* * Identifiers for memory segments: * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. * - the remaining identifiers can be used to create devmem segments. */ enum { VM_SYSMEM, VM_BOOTROM, VM_FRAMEBUFFER, }; /* * Get the length and name of the memory segment identified by 'segid'. * Note that system memory segments are identified with a nul name. * * Returns 0 on success and non-zero otherwise. */ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, size_t namesiz); /* * Iterate over the guest address space. This function finds an address range * that starts at an address >= *gpa. * * Returns 0 if the next address range was found and non-zero otherwise. */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size); /* * Create a device memory segment identified by 'segid'. * * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. */ void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len); /* * Map the memory segment identified by 'segid' into the guest address space * at [gpa,gpa+len) with protection 'prot'. */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t segoff, size_t len, int prot); int vm_create(const char *name); int vm_get_device_fd(struct vmctx *ctx); struct vmctx *vm_open(const char *name); void vm_destroy(struct vmctx *ctx); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); /* inverse operation to vm_map_gpa - extract guest address from host pointer */ vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); int vm_gla2gpa_nofault(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); int vm_get_name(struct vmctx *ctx, char *buffer, size_t max_len); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); int vm_readwrite_kernemu_device(struct vmctx *ctx, int vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value); int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger); int vm_inject_nmi(struct vmctx *ctx, int vcpu); int vm_capability_name2type(const char *capname); const char *vm_capability_type2name(int type); int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval); int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val); int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func); int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); const cap_ioctl_t *vm_get_ioctls(size_t *len); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries); const char *vm_get_stat_desc(struct vmctx *ctx, int index); int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s); int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. * The 'iovcnt' should be big enough to accommodate all GPA segments. * * retval fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Error */ int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault); void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, struct iovec *guest_iov, size_t len); void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt); /* RTC */ int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); int vm_rtc_settime(struct vmctx *ctx, time_t secs); int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vmctx *ctx, int vcpu); int vm_suspend_cpu(struct vmctx *ctx, int vcpu); int vm_resume_cpu(struct vmctx *ctx, int vcpu); /* CPU topology */ int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); /* * FreeBSD specific APIs */ int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp); int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); /* * Save and restore */ #define MAX_SNAPSHOT_VMNAME 100 enum checkpoint_opcodes { START_CHECKPOINT = 0, START_SUSPEND = 1, }; struct checkpoint_op { unsigned int op; char snapshot_filename[MAX_SNAPSHOT_VMNAME]; }; int vm_snapshot_req(struct vm_snapshot_meta *meta); int vm_restore_time(struct vmctx *ctx); #endif /* _VMMAPI_H_ */ Index: head/sys/amd64/include/vmm_dev.h =================================================================== --- head/sys/amd64/include/vmm_dev.h (revision 368002) +++ head/sys/amd64/include/vmm_dev.h (revision 368003) @@ -1,464 +1,467 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ struct vm_snapshot_meta; #ifdef _KERNEL void vmmdev_init(void); int vmmdev_cleanup(void); #endif struct vm_memmap { vm_paddr_t gpa; int segid; /* memory segment */ vm_ooffset_t segoff; /* offset into memory segment */ size_t len; /* mmap length */ int prot; /* RWX */ int flags; }; #define VM_MEMMAP_F_WIRED 0x01 #define VM_MEMMAP_F_IOMMU 0x02 #define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL) struct vm_memseg { int segid; size_t len; char name[VM_MAX_SUFFIXLEN + 1]; }; struct vm_memseg_fbsd12 { int segid; size_t len; char name[64]; }; _Static_assert(sizeof(struct vm_memseg_fbsd12) == 80, "COMPAT_FREEBSD12 ABI"); struct vm_register { int cpuid; int regnum; /* enum vm_reg_name */ uint64_t regval; }; struct vm_seg_desc { /* data or code segment */ int cpuid; int regnum; /* enum vm_reg_name */ struct seg_desc desc; }; struct vm_register_set { int cpuid; unsigned int count; const int *regnums; /* enum vm_reg_name */ uint64_t *regvals; }; struct vm_run { int cpuid; struct vm_exit vm_exit; }; struct vm_exception { int cpuid; int vector; uint32_t error_code; int error_code_valid; int restart_instruction; }; struct vm_lapic_msi { uint64_t msg; uint64_t addr; }; struct vm_lapic_irq { int cpuid; int vector; }; struct vm_ioapic_irq { int irq; }; struct vm_isa_irq { int atpic_irq; int ioapic_irq; }; struct vm_isa_irq_trigger { int atpic_irq; enum vm_intr_trigger trigger; }; struct vm_capability { int cpuid; enum vm_cap_type captype; int capval; int allcpus; }; struct vm_pptdev { int bus; int slot; int func; }; struct vm_pptdev_mmio { int bus; int slot; int func; vm_paddr_t gpa; vm_paddr_t hpa; size_t len; }; struct vm_pptdev_msi { int vcpu; int bus; int slot; int func; int numvec; /* 0 means disabled */ uint64_t msg; uint64_t addr; }; struct vm_pptdev_msix { int vcpu; int bus; int slot; int func; int idx; uint64_t msg; uint32_t vector_control; uint64_t addr; }; struct vm_nmi { int cpuid; }; #define MAX_VM_STATS 64 struct vm_stats { int cpuid; /* in */ int num_entries; /* out */ struct timeval tv; uint64_t statbuf[MAX_VM_STATS]; }; struct vm_stat_desc { int index; /* in */ char desc[128]; /* out */ }; struct vm_x2apic { int cpuid; enum x2apic_state state; }; struct vm_gpa_pte { uint64_t gpa; /* in */ uint64_t pte[4]; /* out */ int ptenum; }; struct vm_hpet_cap { uint32_t capabilities; /* lower 32 bits of HPET capabilities */ }; struct vm_suspend { enum vm_suspend_how how; }; struct vm_gla2gpa { int vcpuid; /* inputs */ int prot; /* PROT_READ or PROT_WRITE */ uint64_t gla; struct vm_guest_paging paging; int fault; /* outputs */ uint64_t gpa; }; struct vm_activate_cpu { int vcpuid; }; struct vm_cpuset { int which; int cpusetsize; cpuset_t *cpus; }; #define VM_ACTIVE_CPUS 0 #define VM_SUSPENDED_CPUS 1 #define VM_DEBUG_CPUS 2 struct vm_intinfo { int vcpuid; uint64_t info1; uint64_t info2; }; struct vm_rtc_time { time_t secs; }; struct vm_rtc_data { int offset; uint8_t value; }; struct vm_cpu_topology { uint16_t sockets; uint16_t cores; uint16_t threads; uint16_t maxcpus; }; struct vm_readwrite_kernemu_device { int vcpuid; unsigned access_width : 3; unsigned _unused : 29; uint64_t gpa; uint64_t value; }; _Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI"); enum { /* general routines */ IOCNUM_ABIVERS = 0, IOCNUM_RUN = 1, IOCNUM_SET_CAPABILITY = 2, IOCNUM_GET_CAPABILITY = 3, IOCNUM_SUSPEND = 4, IOCNUM_REINIT = 5, /* memory apis */ IOCNUM_MAP_MEMORY = 10, /* deprecated */ IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */ IOCNUM_GET_GPA_PMAP = 12, IOCNUM_GLA2GPA = 13, IOCNUM_ALLOC_MEMSEG = 14, IOCNUM_GET_MEMSEG = 15, IOCNUM_MMAP_MEMSEG = 16, IOCNUM_MMAP_GETNEXT = 17, IOCNUM_GLA2GPA_NOFAULT = 18, /* register/state accessors */ IOCNUM_SET_REGISTER = 20, IOCNUM_GET_REGISTER = 21, IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, IOCNUM_SET_REGISTER_SET = 24, IOCNUM_GET_REGISTER_SET = 25, IOCNUM_GET_KERNEMU_DEV = 26, IOCNUM_SET_KERNEMU_DEV = 27, /* interrupt injection */ IOCNUM_GET_INTINFO = 28, IOCNUM_SET_INTINFO = 29, IOCNUM_INJECT_EXCEPTION = 30, IOCNUM_LAPIC_IRQ = 31, IOCNUM_INJECT_NMI = 32, IOCNUM_IOAPIC_ASSERT_IRQ = 33, IOCNUM_IOAPIC_DEASSERT_IRQ = 34, IOCNUM_IOAPIC_PULSE_IRQ = 35, IOCNUM_LAPIC_MSI = 36, IOCNUM_LAPIC_LOCAL_IRQ = 37, IOCNUM_IOAPIC_PINCOUNT = 38, IOCNUM_RESTART_INSTRUCTION = 39, /* PCI pass-thru */ IOCNUM_BIND_PPTDEV = 40, IOCNUM_UNBIND_PPTDEV = 41, IOCNUM_MAP_PPTDEV_MMIO = 42, IOCNUM_PPTDEV_MSI = 43, IOCNUM_PPTDEV_MSIX = 44, + IOCNUM_PPTDEV_DISABLE_MSIX = 45, /* statistics */ IOCNUM_VM_STATS = 50, IOCNUM_VM_STAT_DESC = 51, /* kernel device state */ IOCNUM_SET_X2APIC_STATE = 60, IOCNUM_GET_X2APIC_STATE = 61, IOCNUM_GET_HPET_CAPABILITIES = 62, /* CPU Topology */ IOCNUM_SET_TOPOLOGY = 63, IOCNUM_GET_TOPOLOGY = 64, /* legacy interrupt injection */ IOCNUM_ISA_ASSERT_IRQ = 80, IOCNUM_ISA_DEASSERT_IRQ = 81, IOCNUM_ISA_PULSE_IRQ = 82, IOCNUM_ISA_SET_IRQ_TRIGGER = 83, /* vm_cpuset */ IOCNUM_ACTIVATE_CPU = 90, IOCNUM_GET_CPUSET = 91, IOCNUM_SUSPEND_CPU = 92, IOCNUM_RESUME_CPU = 93, /* RTC */ IOCNUM_RTC_READ = 100, IOCNUM_RTC_WRITE = 101, IOCNUM_RTC_SETTIME = 102, IOCNUM_RTC_GETTIME = 103, /* checkpoint */ IOCNUM_SNAPSHOT_REQ = 113, IOCNUM_RESTORE_TIME = 115 }; #define VM_RUN \ _IOWR('v', IOCNUM_RUN, struct vm_run) #define VM_SUSPEND \ _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) #define VM_REINIT \ _IO('v', IOCNUM_REINIT) #define VM_ALLOC_MEMSEG_FBSD12 \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_fbsd12) #define VM_ALLOC_MEMSEG \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) #define VM_GET_MEMSEG_FBSD12 \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_fbsd12) #define VM_GET_MEMSEG \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) #define VM_MMAP_MEMSEG \ _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) #define VM_MMAP_GETNEXT \ _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) #define VM_SET_REGISTER \ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) #define VM_GET_REGISTER \ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) #define VM_SET_SEGMENT_DESCRIPTOR \ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_GET_SEGMENT_DESCRIPTOR \ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_SET_REGISTER_SET \ _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) #define VM_GET_REGISTER_SET \ _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) #define VM_SET_KERNEMU_DEV \ _IOW('v', IOCNUM_SET_KERNEMU_DEV, \ struct vm_readwrite_kernemu_device) #define VM_GET_KERNEMU_DEV \ _IOWR('v', IOCNUM_GET_KERNEMU_DEV, \ struct vm_readwrite_kernemu_device) #define VM_INJECT_EXCEPTION \ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) #define VM_LAPIC_IRQ \ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) #define VM_LAPIC_LOCAL_IRQ \ _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq) #define VM_LAPIC_MSI \ _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi) #define VM_IOAPIC_ASSERT_IRQ \ _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_DEASSERT_IRQ \ _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_PULSE_IRQ \ _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_PINCOUNT \ _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int) #define VM_ISA_ASSERT_IRQ \ _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq) #define VM_ISA_DEASSERT_IRQ \ _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq) #define VM_ISA_PULSE_IRQ \ _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq) #define VM_ISA_SET_IRQ_TRIGGER \ _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger) #define VM_SET_CAPABILITY \ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) #define VM_GET_CAPABILITY \ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) #define VM_BIND_PPTDEV \ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) #define VM_UNBIND_PPTDEV \ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) #define VM_MAP_PPTDEV_MMIO \ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) #define VM_PPTDEV_MSI \ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) #define VM_PPTDEV_MSIX \ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) +#define VM_PPTDEV_DISABLE_MSIX \ + _IOW('v', IOCNUM_PPTDEV_DISABLE_MSIX, struct vm_pptdev) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) #define VM_STAT_DESC \ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) #define VM_SET_X2APIC_STATE \ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_X2APIC_STATE \ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_HPET_CAPABILITIES \ _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) #define VM_SET_TOPOLOGY \ _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_TOPOLOGY \ _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_GPA_PMAP \ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) #define VM_GLA2GPA \ _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) #define VM_GLA2GPA_NOFAULT \ _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) #define VM_ACTIVATE_CPU \ _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) #define VM_GET_CPUS \ _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) #define VM_SUSPEND_CPU \ _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) #define VM_RESUME_CPU \ _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) #define VM_SET_INTINFO \ _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) #define VM_GET_INTINFO \ _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) #define VM_RTC_WRITE \ _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) #define VM_RTC_READ \ _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data) #define VM_RTC_SETTIME \ _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) #define VM_RTC_GETTIME \ _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) #define VM_RESTART_INSTRUCTION \ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) #define VM_SNAPSHOT_REQ \ _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta) #define VM_RESTORE_TIME \ _IOWR('v', IOCNUM_RESTORE_TIME, int) #endif Index: head/sys/amd64/vmm/io/ppt.c =================================================================== --- head/sys/amd64/vmm/io/ppt.c (revision 368002) +++ head/sys/amd64/vmm/io/ppt.c (revision 368003) @@ -1,702 +1,725 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_ktr.h" #include "iommu.h" #include "ppt.h" /* XXX locking */ #define MAX_MSIMSGS 32 /* * If the MSI-X table is located in the middle of a BAR then that MMIO * region gets split into two segments - one segment above the MSI-X table * and the other segment below the MSI-X table - with a hole in place of * the MSI-X table so accesses to it can be trapped and emulated. * * So, allocate a MMIO segment for each BAR register + 1 additional segment. */ #define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1) MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); struct pptintr_arg { /* pptintr(pptintr_arg) */ struct pptdev *pptdev; uint64_t addr; uint64_t msg_data; }; struct pptseg { vm_paddr_t gpa; size_t len; int wired; }; struct pptdev { device_t dev; struct vm *vm; /* owner of this device */ TAILQ_ENTRY(pptdev) next; struct pptseg mmio[MAX_MMIOSEGS]; struct { int num_msgs; /* guest state */ int startrid; /* host state */ struct resource *res[MAX_MSIMSGS]; void *cookie[MAX_MSIMSGS]; struct pptintr_arg arg[MAX_MSIMSGS]; } msi; struct { int num_msgs; int startrid; int msix_table_rid; int msix_pba_rid; struct resource *msix_table_res; struct resource *msix_pba_res; struct resource **res; void **cookie; struct pptintr_arg *arg; } msix; }; SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "bhyve passthru devices"); static int num_pptdevs; SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0, "number of pci passthru devices"); static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list); static int ppt_probe(device_t dev) { int bus, slot, func; struct pci_devinfo *dinfo; dinfo = (struct pci_devinfo *)device_get_ivars(dev); bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); /* * To qualify as a pci passthrough device a device must: * - be allowed by administrator to be used in this role * - be an endpoint device */ if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL) return (ENXIO); else if (vmm_is_pptdev(bus, slot, func)) return (0); else /* * Returning BUS_PROBE_NOWILDCARD here matches devices that the * SR-IOV infrastructure specified as "ppt" passthrough devices. * All normal devices that did not have "ppt" specified as their * driver will not be matched by this. */ return (BUS_PROBE_NOWILDCARD); } static int ppt_attach(device_t dev) { struct pptdev *ppt; ppt = device_get_softc(dev); iommu_remove_device(iommu_host_domain(), pci_get_rid(dev)); num_pptdevs++; TAILQ_INSERT_TAIL(&pptdev_list, ppt, next); ppt->dev = dev; if (bootverbose) device_printf(dev, "attached\n"); return (0); } static int ppt_detach(device_t dev) { struct pptdev *ppt; ppt = device_get_softc(dev); if (ppt->vm != NULL) return (EBUSY); num_pptdevs--; TAILQ_REMOVE(&pptdev_list, ppt, next); pci_disable_busmaster(dev); iommu_add_device(iommu_host_domain(), pci_get_rid(dev)); return (0); } static device_method_t ppt_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ppt_probe), DEVMETHOD(device_attach, ppt_attach), DEVMETHOD(device_detach, ppt_detach), {0, 0} }; static devclass_t ppt_devclass; DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev)); DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL); static struct pptdev * ppt_find(int bus, int slot, int func) { device_t dev; struct pptdev *ppt; int b, s, f; TAILQ_FOREACH(ppt, &pptdev_list, next) { dev = ppt->dev; b = pci_get_bus(dev); s = pci_get_slot(dev); f = pci_get_function(dev); if (bus == b && slot == s && func == f) return (ppt); } return (NULL); } static void ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) { int i; struct pptseg *seg; for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) continue; (void)vm_unmap_mmio(vm, seg->gpa, seg->len); bzero(seg, sizeof(struct pptseg)); } } static void ppt_teardown_msi(struct pptdev *ppt) { int i, rid; void *cookie; struct resource *res; if (ppt->msi.num_msgs == 0) return; for (i = 0; i < ppt->msi.num_msgs; i++) { rid = ppt->msi.startrid + i; res = ppt->msi.res[i]; cookie = ppt->msi.cookie[i]; if (cookie != NULL) bus_teardown_intr(ppt->dev, res, cookie); if (res != NULL) bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); ppt->msi.res[i] = NULL; ppt->msi.cookie[i] = NULL; } if (ppt->msi.startrid == 1) pci_release_msi(ppt->dev); ppt->msi.num_msgs = 0; } static void ppt_teardown_msix_intr(struct pptdev *ppt, int idx) { int rid; struct resource *res; void *cookie; rid = ppt->msix.startrid + idx; res = ppt->msix.res[idx]; cookie = ppt->msix.cookie[idx]; if (cookie != NULL) bus_teardown_intr(ppt->dev, res, cookie); if (res != NULL) bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); ppt->msix.res[idx] = NULL; ppt->msix.cookie[idx] = NULL; } static void ppt_teardown_msix(struct pptdev *ppt) { int i; if (ppt->msix.num_msgs == 0) return; for (i = 0; i < ppt->msix.num_msgs; i++) ppt_teardown_msix_intr(ppt, i); free(ppt->msix.res, M_PPTMSIX); free(ppt->msix.cookie, M_PPTMSIX); free(ppt->msix.arg, M_PPTMSIX); pci_release_msi(ppt->dev); if (ppt->msix.msix_table_res) { bus_release_resource(ppt->dev, SYS_RES_MEMORY, ppt->msix.msix_table_rid, ppt->msix.msix_table_res); ppt->msix.msix_table_res = NULL; ppt->msix.msix_table_rid = 0; } if (ppt->msix.msix_pba_res) { bus_release_resource(ppt->dev, SYS_RES_MEMORY, ppt->msix.msix_pba_rid, ppt->msix.msix_pba_res); ppt->msix.msix_pba_res = NULL; ppt->msix.msix_pba_rid = 0; } ppt->msix.num_msgs = 0; } int ppt_avail_devices(void) { return (num_pptdevs); } int ppt_assigned_devices(struct vm *vm) { struct pptdev *ppt; int num; num = 0; TAILQ_FOREACH(ppt, &pptdev_list, next) { if (ppt->vm == vm) num++; } return (num); } bool ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) { int i; struct pptdev *ppt; struct pptseg *seg; TAILQ_FOREACH(ppt, &pptdev_list, next) { if (ppt->vm != vm) continue; for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) continue; if (gpa >= seg->gpa && gpa < seg->gpa + seg->len) return (true); } } return (false); } static void ppt_pci_reset(device_t dev) { if (pcie_flr(dev, max(pcie_get_max_completion_timeout(dev) / 1000, 10), true)) return; pci_power_reset(dev); } int ppt_assign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; ppt = ppt_find(bus, slot, func); if (ppt != NULL) { /* * If this device is owned by a different VM then we * cannot change its owner. */ if (ppt->vm != NULL && ppt->vm != vm) return (EBUSY); pci_save_state(ppt->dev); ppt_pci_reset(ppt->dev); pci_restore_state(ppt->dev); ppt->vm = vm; iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev)); return (0); } return (ENOENT); } int ppt_unassign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; ppt = ppt_find(bus, slot, func); if (ppt != NULL) { /* * If this device is not owned by this 'vm' then bail out. */ if (ppt->vm != vm) return (EBUSY); pci_save_state(ppt->dev); ppt_pci_reset(ppt->dev); pci_restore_state(ppt->dev); ppt_unmap_mmio(vm, ppt); ppt_teardown_msi(ppt); ppt_teardown_msix(ppt); iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev)); ppt->vm = NULL; return (0); } return (ENOENT); } int ppt_unassign_all(struct vm *vm) { struct pptdev *ppt; int bus, slot, func; device_t dev; TAILQ_FOREACH(ppt, &pptdev_list, next) { if (ppt->vm == vm) { dev = ppt->dev; bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); vm_unassign_pptdev(vm, bus, slot, func); } } return (0); } int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { int i, error; struct pptseg *seg; struct pptdev *ppt; ppt = ppt_find(bus, slot, func); if (ppt != NULL) { if (ppt->vm != vm) return (EBUSY); for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) { error = vm_map_mmio(vm, gpa, len, hpa); if (error == 0) { seg->gpa = gpa; seg->len = len; } return (error); } } return (ENOSPC); } return (ENOENT); } static int pptintr(void *arg) { struct pptdev *ppt; struct pptintr_arg *pptarg; pptarg = arg; ppt = pptarg->pptdev; if (ppt->vm != NULL) lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data); else { /* * XXX * This is not expected to happen - panic? */ } /* * For legacy interrupts give other filters a chance in case * the interrupt was not generated by the passthrough device. */ if (ppt->msi.startrid == 0) return (FILTER_STRAY); else return (FILTER_HANDLED); } int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { int i, rid, flags; int msi_count, startrid, error, tmp; struct pptdev *ppt; if (numvec < 0 || numvec > MAX_MSIMSGS) return (EINVAL); ppt = ppt_find(bus, slot, func); if (ppt == NULL) return (ENOENT); if (ppt->vm != vm) /* Make sure we own this device */ return (EBUSY); + /* Reject attempts to enable MSI while MSI-X is active. */ + if (ppt->msix.num_msgs != 0 && numvec != 0) + return (EBUSY); + /* Free any allocated resources */ ppt_teardown_msi(ppt); if (numvec == 0) /* nothing more to do */ return (0); flags = RF_ACTIVE; msi_count = pci_msi_count(ppt->dev); if (msi_count == 0) { startrid = 0; /* legacy interrupt */ msi_count = 1; flags |= RF_SHAREABLE; } else startrid = 1; /* MSI */ /* * The device must be capable of supporting the number of vectors * the guest wants to allocate. */ if (numvec > msi_count) return (EINVAL); /* * Make sure that we can allocate all the MSI vectors that are needed * by the guest. */ if (startrid == 1) { tmp = numvec; error = pci_alloc_msi(ppt->dev, &tmp); if (error) return (error); else if (tmp != numvec) { pci_release_msi(ppt->dev); return (ENOSPC); } else { /* success */ } } ppt->msi.startrid = startrid; /* * Allocate the irq resource and attach it to the interrupt handler. */ for (i = 0; i < numvec; i++) { ppt->msi.num_msgs = i + 1; ppt->msi.cookie[i] = NULL; rid = startrid + i; ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, &rid, flags); if (ppt->msi.res[i] == NULL) break; ppt->msi.arg[i].pptdev = ppt; ppt->msi.arg[i].addr = addr; ppt->msi.arg[i].msg_data = msg + i; error = bus_setup_intr(ppt->dev, ppt->msi.res[i], INTR_TYPE_NET | INTR_MPSAFE, pptintr, NULL, &ppt->msi.arg[i], &ppt->msi.cookie[i]); if (error != 0) break; } if (i < numvec) { ppt_teardown_msi(ppt); return (ENXIO); } return (0); } int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct pptdev *ppt; struct pci_devinfo *dinfo; int numvec, alloced, rid, error; size_t res_size, cookie_size, arg_size; ppt = ppt_find(bus, slot, func); if (ppt == NULL) return (ENOENT); if (ppt->vm != vm) /* Make sure we own this device */ return (EBUSY); + /* Reject attempts to enable MSI-X while MSI is active. */ + if (ppt->msi.num_msgs != 0) + return (EBUSY); + dinfo = device_get_ivars(ppt->dev); if (!dinfo) return (ENXIO); /* * First-time configuration: * Allocate the MSI-X table * Allocate the IRQ resources * Set up some variables in ppt->msix */ if (ppt->msix.num_msgs == 0) { numvec = pci_msix_count(ppt->dev); if (numvec <= 0) return (EINVAL); ppt->msix.startrid = 1; ppt->msix.num_msgs = numvec; res_size = numvec * sizeof(ppt->msix.res[0]); cookie_size = numvec * sizeof(ppt->msix.cookie[0]); arg_size = numvec * sizeof(ppt->msix.arg[0]); ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO); ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, M_WAITOK | M_ZERO); ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO); rid = dinfo->cfg.msix.msix_table_bar; ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (ppt->msix.msix_table_res == NULL) { ppt_teardown_msix(ppt); return (ENOSPC); } ppt->msix.msix_table_rid = rid; if (dinfo->cfg.msix.msix_table_bar != dinfo->cfg.msix.msix_pba_bar) { rid = dinfo->cfg.msix.msix_pba_bar; ppt->msix.msix_pba_res = bus_alloc_resource_any( ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (ppt->msix.msix_pba_res == NULL) { ppt_teardown_msix(ppt); return (ENOSPC); } ppt->msix.msix_pba_rid = rid; } alloced = numvec; error = pci_alloc_msix(ppt->dev, &alloced); if (error || alloced != numvec) { ppt_teardown_msix(ppt); return (error == 0 ? ENOSPC: error); } } if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* Tear down the IRQ if it's already set up */ ppt_teardown_msix_intr(ppt, idx); /* Allocate the IRQ resource */ ppt->msix.cookie[idx] = NULL; rid = ppt->msix.startrid + idx; ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (ppt->msix.res[idx] == NULL) return (ENXIO); ppt->msix.arg[idx].pptdev = ppt; ppt->msix.arg[idx].addr = addr; ppt->msix.arg[idx].msg_data = msg; /* Setup the MSI-X interrupt */ error = bus_setup_intr(ppt->dev, ppt->msix.res[idx], INTR_TYPE_NET | INTR_MPSAFE, pptintr, NULL, &ppt->msix.arg[idx], &ppt->msix.cookie[idx]); if (error != 0) { bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]); ppt->msix.cookie[idx] = NULL; ppt->msix.res[idx] = NULL; return (ENXIO); } } else { /* Masked, tear it down if it's already been set up */ ppt_teardown_msix_intr(ppt, idx); } + return (0); +} + +int +ppt_disable_msix(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + ppt_teardown_msix(ppt); return (0); } Index: head/sys/amd64/vmm/io/ppt.h =================================================================== --- head/sys/amd64/vmm/io/ppt.h (revision 368002) +++ head/sys/amd64/vmm/io/ppt.h (revision 368003) @@ -1,56 +1,57 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _IO_PPT_H_ #define _IO_PPT_H_ int ppt_unassign_all(struct vm *vm); int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int ppt_disable_msix(struct vm *vm, int bus, int slot, int func); int ppt_assigned_devices(struct vm *vm); bool ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); /* * Returns the number of devices sequestered by the ppt driver for assignment * to virtual machines. */ int ppt_avail_devices(void); /* * The following functions should never be called directly. * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. */ int ppt_assign_device(struct vm *vm, int bus, int slot, int func); int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); #endif Index: head/sys/amd64/vmm/vmm_dev.c =================================================================== --- head/sys/amd64/vmm/vmm_dev.c (revision 368002) +++ head/sys/amd64/vmm/vmm_dev.c (revision 368003) @@ -1,1238 +1,1243 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" #include "io/ppt.h" #include "io/vatpic.h" #include "io/vioapic.h" #include "io/vhpet.h" #include "io/vrtc.h" struct devmem_softc { int segid; char *name; struct cdev *cdev; struct vmmdev_softc *sc; SLIST_ENTRY(devmem_softc) link; }; struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; SLIST_ENTRY(vmmdev_softc) link; SLIST_HEAD(, devmem_softc) devmem; int flags; }; #define VSC_LINKED 0x01 static SLIST_HEAD(, vmmdev_softc) head; static unsigned pr_allow_flag; static struct mtx vmmdev_mtx; static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); static int vmm_priv_check(struct ucred *ucred); static int devmem_create_cdev(const char *vmname, int id, char *devmem); static void devmem_destroy(void *arg); static int vmm_priv_check(struct ucred *ucred) { if (jailed(ucred) && !(ucred->cr_prison->pr_allow & pr_allow_flag)) return (EPERM); return (0); } static int vcpu_lock_one(struct vmmdev_softc *sc, int vcpu) { int error; if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm)) return (EINVAL); error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); return (error); } static void vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu) { enum vcpu_state state; state = vcpu_get_state(sc->vm, vcpu, NULL); if (state != VCPU_FROZEN) { panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm), vcpu, state); } vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); } static int vcpu_lock_all(struct vmmdev_softc *sc) { int error, vcpu; uint16_t maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (vcpu = 0; vcpu < maxcpus; vcpu++) { error = vcpu_lock_one(sc, vcpu); if (error) break; } if (error) { while (--vcpu >= 0) vcpu_unlock_one(sc, vcpu); } return (error); } static void vcpu_unlock_all(struct vmmdev_softc *sc) { int vcpu; uint16_t maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (vcpu = 0; vcpu < maxcpus; vcpu++) vcpu_unlock_one(sc, vcpu); } static struct vmmdev_softc * vmmdev_lookup(const char *name) { struct vmmdev_softc *sc; #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa, maxaddr; void *hpa, *cookie; struct vmmdev_softc *sc; uint16_t lastcpu; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); /* * Get a read lock on the guest memory map by freezing any vcpu. */ lastcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, lastcpu); if (error) return (error); prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); maxaddr = vmm_sysmem_maxaddr(sc->vm); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. * * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ && gpa < maxaddr) error = uiomove(__DECONST(void *, zero_region), c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } vcpu_unlock_one(sc, lastcpu); return (error); } CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1); static int get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { struct devmem_softc *dsc; int error; bool sysmem; error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); if (error || mseg->len == 0) return (error); if (!sysmem) { SLIST_FOREACH(dsc, &sc->devmem, link) { if (dsc->segid == mseg->segid) break; } KASSERT(dsc != NULL, ("%s: devmem segment %d not found", __func__, mseg->segid)); error = copystr(dsc->name, mseg->name, len, NULL); } else { bzero(mseg->name, len); } return (error); } static int alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { char *name; int error; bool sysmem; error = 0; name = NULL; sysmem = true; /* * The allocation is lengthened by 1 to hold a terminating NUL. It'll * by stripped off when devfs processes the full string. */ if (VM_MEMSEG_NAME(mseg)) { sysmem = false; name = malloc(len, M_VMMDEV, M_WAITOK); error = copystr(mseg->name, name, len, NULL); if (error) goto done; } error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); if (error) goto done; if (VM_MEMSEG_NAME(mseg)) { error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); if (error) vm_free_memseg(sc->vm, mseg->segid); else name = NULL; /* freed when 'cdev' is destroyed */ } done: free(name, M_VMMDEV); return (error); } static int vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_get_register(vm, vcpu, regnum[i], ®val[i]); if (error) break; } return (error); } static int vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_set_register(vm, vcpu, regnum[i], regval[i]); if (error) break; } return (error); } static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error, vcpu, state_changed, size; cpuset_t *cpuset; struct vmmdev_softc *sc; struct vm_register *vmreg; struct vm_seg_desc *vmsegdesc; struct vm_register_set *vmregset; struct vm_run *vmrun; struct vm_exception *vmexc; struct vm_lapic_irq *vmirq; struct vm_lapic_msi *vmmsi; struct vm_ioapic_irq *ioapic_irq; struct vm_isa_irq *isa_irq; struct vm_isa_irq_trigger *isa_irq_trigger; struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; struct vm_nmi *vmnmi; struct vm_stats *vmstats; struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; struct vm_gpa_pte *gpapte; struct vm_suspend *vmsuspend; struct vm_gla2gpa *gg; struct vm_activate_cpu *vac; struct vm_cpuset *vm_cpuset; struct vm_intinfo *vmii; struct vm_rtc_time *rtctime; struct vm_rtc_data *rtcdata; struct vm_memmap *mm; struct vm_cpu_topology *topology; struct vm_readwrite_kernemu_device *kernemu; uint64_t *regvals; int *regnums; #ifdef BHYVE_SNAPSHOT struct vm_snapshot_meta *snapshot_meta; #endif error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); vcpu = -1; state_changed = 0; /* * Some VMM ioctls can operate only on vcpus that are not running. */ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: case VM_GET_REGISTER_SET: case VM_SET_REGISTER_SET: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: case VM_GLA2GPA_NOFAULT: case VM_ACTIVATE_CPU: case VM_SET_INTINFO: case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: /* * XXX fragile, handle with care * Assumes that the first field of the ioctl data is the vcpu. */ vcpu = *(int *)data; error = vcpu_lock_one(sc, vcpu); if (error) goto done; state_changed = 1; break; case VM_MAP_PPTDEV_MMIO: case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_FBSD12: #endif case VM_ALLOC_MEMSEG: case VM_MMAP_MEMSEG: case VM_REINIT: /* * ioctls that operate on the entire virtual machine must * prevent all vcpus from running. */ error = vcpu_lock_all(sc); if (error) goto done; state_changed = 2; break; #ifdef COMPAT_FREEBSD12 case VM_GET_MEMSEG_FBSD12: #endif case VM_GET_MEMSEG: case VM_MMAP_GETNEXT: /* * Lock a vcpu to make sure that the memory map cannot be * modified while it is being inspected. */ vcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, vcpu); if (error) goto done; state_changed = 1; break; default: break; } switch(cmd) { case VM_RUN: vmrun = (struct vm_run *)data; error = vm_run(sc->vm, vmrun); break; case VM_SUSPEND: vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; case VM_REINIT: error = vm_reinit(sc->vm); break; case VM_STAT_DESC: { statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } case VM_STATS: { CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(sc->vm, vmstats->cpuid, &vmstats->num_entries, vmstats->statbuf); break; } case VM_PPTDEV_MSI: pptmsi = (struct vm_pptdev_msi *)data; error = ppt_setup_msi(sc->vm, pptmsi->vcpu, pptmsi->bus, pptmsi->slot, pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec); break; case VM_PPTDEV_MSIX: pptmsix = (struct vm_pptdev_msix *)data; error = ppt_setup_msix(sc->vm, pptmsix->vcpu, pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg, pptmsix->vector_control); break; + case VM_PPTDEV_DISABLE_MSIX: + pptdev = (struct vm_pptdev *)data; + error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_UNBIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_INJECT_EXCEPTION: vmexc = (struct vm_exception *)data; error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc->vector, vmexc->error_code_valid, vmexc->error_code, vmexc->restart_instruction); break; case VM_INJECT_NMI: vmnmi = (struct vm_nmi *)data; error = vm_inject_nmi(sc->vm, vmnmi->cpuid); break; case VM_LAPIC_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); break; case VM_LAPIC_LOCAL_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_set_local_intr(sc->vm, vmirq->cpuid, vmirq->vector); break; case VM_LAPIC_MSI: vmmsi = (struct vm_lapic_msi *)data; error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); break; case VM_IOAPIC_ASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_DEASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PULSE_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PINCOUNT: *(int *)data = vioapic_pincount(sc->vm); break; case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: { mem_region_write_t mwrite; mem_region_read_t mread; bool arg; kernemu = (void *)data; if (kernemu->access_width > 0) size = (1u << kernemu->access_width); else size = 1; if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { error = EINVAL; break; } if (cmd == VM_SET_KERNEMU_DEV) error = mwrite(sc->vm, kernemu->vcpuid, kernemu->gpa, kernemu->value, size, &arg); else error = mread(sc->vm, kernemu->vcpuid, kernemu->gpa, &kernemu->value, size, &arg); break; } case VM_ISA_ASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_assert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_DEASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_deassert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_PULSE_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_SET_IRQ_TRIGGER: isa_irq_trigger = (struct vm_isa_irq_trigger *)data; error = vatpic_set_irq_trigger(sc->vm, isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); break; case VM_MMAP_GETNEXT: mm = (struct vm_memmap *)data; error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; case VM_MMAP_MEMSEG: mm = (struct vm_memmap *)data; error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, mm->len, mm->prot, mm->flags); break; #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_FBSD12: error = alloc_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg_fbsd12 *)0)->name)); break; #endif case VM_ALLOC_MEMSEG: error = alloc_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; #ifdef COMPAT_FREEBSD12 case VM_GET_MEMSEG_FBSD12: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg_fbsd12 *)0)->name)); break; #endif case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; case VM_GET_REGISTER: vmreg = (struct vm_register *)data; error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, &vmreg->regval); break; case VM_SET_REGISTER: vmreg = (struct vm_register *)data; error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, vmreg->regval); break; case VM_SET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = vm_get_register_set(sc->vm, vmregset->cpuid, vmregset->count, regnums, regvals); if (error == 0) error = copyout(regvals, vmregset->regvals, sizeof(regvals[0]) * vmregset->count); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_SET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = copyin(vmregset->regvals, regvals, sizeof(regvals[0]) * vmregset->count); if (error == 0) error = vm_set_register_set(sc->vm, vmregset->cpuid, vmregset->count, regnums, regvals); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_GET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_get_capability(sc->vm, vmcap->cpuid, vmcap->captype, &vmcap->capval); break; case VM_SET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_set_capability(sc->vm, vmcap->cpuid, vmcap->captype, vmcap->capval); break; case VM_SET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_set_x2apic_state(sc->vm, x2apic->cpuid, x2apic->state); break; case VM_GET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_get_x2apic_state(sc->vm, x2apic->cpuid, &x2apic->state); break; case VM_GET_GPA_PMAP: gpapte = (struct vm_gpa_pte *)data; pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), gpapte->gpa, gpapte->pte, &gpapte->ptenum); error = 0; break; case VM_GET_HPET_CAPABILITIES: error = vhpet_getcap((struct vm_hpet_cap *)data); break; case VM_GLA2GPA: { CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; } case VM_GLA2GPA_NOFAULT: gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; case VM_ACTIVATE_CPU: vac = (struct vm_activate_cpu *)data; error = vm_activate_cpu(sc->vm, vac->vcpuid); break; case VM_GET_CPUS: error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else if (vm_cpuset->which == VM_DEBUG_CPUS) *cpuset = vm_debug_cpus(sc->vm); else error = EINVAL; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; case VM_SUSPEND_CPU: vac = (struct vm_activate_cpu *)data; error = vm_suspend_cpu(sc->vm, vac->vcpuid); break; case VM_RESUME_CPU: vac = (struct vm_activate_cpu *)data; error = vm_resume_cpu(sc->vm, vac->vcpuid); break; case VM_SET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1); break; case VM_GET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1, &vmii->info2); break; case VM_RTC_WRITE: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_write(sc->vm, rtcdata->offset, rtcdata->value); break; case VM_RTC_READ: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_read(sc->vm, rtcdata->offset, &rtcdata->value); break; case VM_RTC_SETTIME: rtctime = (struct vm_rtc_time *)data; error = vrtc_set_time(sc->vm, rtctime->secs); break; case VM_RTC_GETTIME: error = 0; rtctime = (struct vm_rtc_time *)data; rtctime->secs = vrtc_get_time(sc->vm); break; case VM_RESTART_INSTRUCTION: error = vm_restart_instruction(sc->vm, vcpu); break; case VM_SET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; error = vm_set_topology(sc->vm, topology->sockets, topology->cores, topology->threads, topology->maxcpus); break; case VM_GET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; vm_get_topology(sc->vm, &topology->sockets, &topology->cores, &topology->threads, &topology->maxcpus); error = 0; break; #ifdef BHYVE_SNAPSHOT case VM_SNAPSHOT_REQ: snapshot_meta = (struct vm_snapshot_meta *)data; error = vm_snapshot_req(sc->vm, snapshot_meta); break; case VM_RESTORE_TIME: error = vm_restore_time(sc->vm); break; #endif default: error = ENOTTY; break; } if (state_changed == 1) vcpu_unlock_one(sc, vcpu); else if (state_changed == 2) vcpu_unlock_all(sc); done: /* * Make sure that no handler returns a kernel-internal * error value to userspace. */ KASSERT(error == ERESTART || error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, struct vm_object **objp, int nprot) { struct vmmdev_softc *sc; vm_paddr_t gpa; size_t len; vm_ooffset_t segoff, first, last; int error, found, segid; uint16_t lastcpu; bool sysmem; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); first = *offset; last = first + mapsize; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); sc = vmmdev_lookup2(cdev); if (sc == NULL) { /* virtual machine is in the process of being created */ return (EINVAL); } /* * Get a read lock on the guest memory map by freezing any vcpu. */ lastcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, lastcpu); if (error) return (error); gpa = 0; found = 0; while (!found) { error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, NULL, NULL); if (error) break; if (first >= gpa && last <= gpa + len) found = 1; else gpa += len; } if (found) { error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); KASSERT(error == 0 && *objp != NULL, ("%s: invalid memory segment %d", __func__, segid)); if (sysmem) { vm_object_reference(*objp); *offset = segoff + (first - gpa); } else { error = EINVAL; } } vcpu_unlock_one(sc, lastcpu); return (error); } static void vmmdev_destroy(void *arg) { struct vmmdev_softc *sc = arg; struct devmem_softc *dsc; int error; error = vcpu_lock_all(sc); KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); SLIST_REMOVE_HEAD(&sc->devmem, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } if (sc->cdev != NULL) destroy_dev(sc->cdev); if (sc->vm != NULL) vm_destroy(sc->vm); if ((sc->flags & VSC_LINKED) != 0) { mtx_lock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); mtx_unlock(&vmmdev_mtx); } free(sc, M_VMMDEV); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL || sc->cdev == NULL) { mtx_unlock(&vmmdev_mtx); error = EINVAL; goto out; } /* * The 'cdev' will be destroyed asynchronously when 'si_threadcount' * goes down to 0 so we should not do it again in the callback. * * Setting 'sc->cdev' to NULL is also used to indicate that the VM * is scheduled for destruction. */ cdev = sc->cdev; sc->cdev = NULL; mtx_unlock(&vmmdev_mtx); /* * Schedule all cdevs to be destroyed: * * - any new operations on the 'cdev' will return an error (ENXIO). * * - when the 'si_threadcount' dwindles down to zero the 'cdev' will * be destroyed and the callback will be invoked in a taskqueue * context. * * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ SLIST_FOREACH(dsc, &sc->devmem, link) { KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc); } destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); error = 0; out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", NULL); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { struct vm *vm; struct cdev *cdev; struct vmmdev_softc *sc, *sc2; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); mtx_unlock(&vmmdev_mtx); if (sc != NULL) { error = EEXIST; goto out; } error = vm_create(buf, &vm); if (error != 0) goto out; sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->vm = vm; SLIST_INIT(&sc->devmem); /* * Lookup the name again just in case somebody sneaked in when we * dropped the lock. */ mtx_lock(&vmmdev_mtx); sc2 = vmmdev_lookup(buf); if (sc2 == NULL) { SLIST_INSERT_HEAD(&head, sc, link); sc->flags |= VSC_LINKED; } mtx_unlock(&vmmdev_mtx); if (sc2 != NULL) { vmmdev_destroy(sc); error = EEXIST; goto out; } error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); if (error != 0) { vmmdev_destroy(sc); goto out; } mtx_lock(&vmmdev_mtx); sc->cdev = cdev; sc->cdev->si_drv1 = sc; mtx_unlock(&vmmdev_mtx); out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", NULL); void vmmdev_init(void) { mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, "Allow use of vmm in a jail."); } int vmmdev_cleanup(void) { int error; if (SLIST_EMPTY(&head)) error = 0; else error = EBUSY; return (error); } static int devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, struct vm_object **objp, int nprot) { struct devmem_softc *dsc; vm_ooffset_t first, last; size_t seglen; int error; uint16_t lastcpu; bool sysmem; dsc = cdev->si_drv1; if (dsc == NULL) { /* 'cdev' has been created but is not ready for use */ return (ENXIO); } first = *offset; last = *offset + len; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1; error = vcpu_lock_one(dsc->sc, lastcpu); if (error) return (error); error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); KASSERT(error == 0 && !sysmem && *objp != NULL, ("%s: invalid devmem segment %d", __func__, dsc->segid)); vcpu_unlock_one(dsc->sc, lastcpu); if (seglen >= last) { vm_object_reference(*objp); return (0); } else { return (EINVAL); } } static struct cdevsw devmemsw = { .d_name = "devmem", .d_version = D_VERSION, .d_mmap_single = devmem_mmap_single, }; static int devmem_create_cdev(const char *vmname, int segid, char *devname) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; int error; error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); if (error) return (error); dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(vmname); KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); if (sc->cdev == NULL) { /* virtual machine is being created or destroyed */ mtx_unlock(&vmmdev_mtx); free(dsc, M_VMMDEV); destroy_dev_sched_cb(cdev, NULL, 0); return (ENODEV); } dsc->segid = segid; dsc->name = devname; dsc->cdev = cdev; dsc->sc = sc; SLIST_INSERT_HEAD(&sc->devmem, dsc, link); mtx_unlock(&vmmdev_mtx); /* The 'cdev' is ready for use after 'si_drv1' is initialized */ cdev->si_drv1 = dsc; return (0); } static void devmem_destroy(void *arg) { struct devmem_softc *dsc = arg; KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); dsc->cdev = NULL; dsc->sc = NULL; } Index: head/usr.sbin/bhyve/pci_passthru.c =================================================================== --- head/usr.sbin/bhyve/pci_passthru.c (revision 368002) +++ head/usr.sbin/bhyve/pci_passthru.c (revision 368003) @@ -1,960 +1,965 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include "pci_emul.h" #include "mem.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" #endif #ifndef _PATH_DEVIO #define _PATH_DEVIO "/dev/io" #endif #ifndef _PATH_MEM #define _PATH_MEM "/dev/mem" #endif #define LEGACY_SUPPORT 1 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 static int pcifd = -1; static int iofd = -1; static int memfd = -1; struct passthru_softc { struct pci_devinst *psc_pi; struct pcibar psc_bar[PCI_BARMAX + 1]; struct { int capoff; int msgctrl; int emulated; } psc_msi; struct { int capoff; } psc_msix; struct pcisel psc_sel; }; static int msi_caplen(int msgctrl) { int len; len = 10; /* minimum length of msi capability */ if (msgctrl & PCIM_MSICTRL_64BIT) len += 4; #if 0 /* * Ignore the 'mask' and 'pending' bits in the MSI capability. * We'll let the guest manipulate them directly. */ if (msgctrl & PCIM_MSICTRL_VECTOR) len += 10; #endif return (len); } static uint32_t read_config(const struct pcisel *sel, long reg, int width) { struct pci_io pi; bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; if (ioctl(pcifd, PCIOCREAD, &pi) < 0) return (0); /* XXX */ else return (pi.pi_data); } static void write_config(const struct pcisel *sel, long reg, int width, uint32_t data) { struct pci_io pi; bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; pi.pi_data = data; (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ } #ifdef LEGACY_SUPPORT static int passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) { int capoff, i; struct msicap msicap; u_char *capdata; pci_populate_msicap(&msicap, msgnum, nextptr); /* * XXX * Copy the msi capability structure in the last 16 bytes of the * config space. This is wrong because it could shadow something * useful to the device. */ capoff = 256 - roundup(sizeof(msicap), 4); capdata = (u_char *)&msicap; for (i = 0; i < sizeof(msicap); i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); return (capoff); } #endif /* LEGACY_SUPPORT */ static int cfginitmsi(struct passthru_softc *sc) { int i, ptr, capptr, cap, sts, caplen, table_size; uint32_t u32; struct pcisel sel; struct pci_devinst *pi; struct msixcap msixcap; uint32_t *msixcap_ptr; pi = sc->psc_pi; sel = sc->psc_sel; /* * Parse the capabilities and cache the location of the MSI * and MSI-X capabilities. */ sts = read_config(&sel, PCIR_STATUS, 2); if (sts & PCIM_STATUS_CAPPRESENT) { ptr = read_config(&sel, PCIR_CAP_PTR, 1); while (ptr != 0 && ptr != 0xff) { cap = read_config(&sel, ptr + PCICAP_ID, 1); if (cap == PCIY_MSI) { /* * Copy the MSI capability into the config * space of the emulated pci device */ sc->psc_msi.capoff = ptr; sc->psc_msi.msgctrl = read_config(&sel, ptr + 2, 2); sc->psc_msi.emulated = 0; caplen = msi_caplen(sc->psc_msi.msgctrl); capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; } } else if (cap == PCIY_MSIX) { /* * Copy the MSI-X capability */ sc->psc_msix.capoff = ptr; caplen = 12; msixcap_ptr = (uint32_t*) &msixcap; capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); *msixcap_ptr = u32; pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; msixcap_ptr++; } } ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); } } if (sc->psc_msix.capoff != 0) { pi->pi_msix.pba_bar = msixcap.pba_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.pba_offset = msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_bar = msixcap.table_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.table_offset = msixcap.table_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); /* Allocate the emulated MSI-X table array */ table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* Mask all table entries */ for (i = 0; i < pi->pi_msix.table_count; i++) { pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } } #ifdef LEGACY_SUPPORT /* * If the passthrough device does not support MSI then craft a * MSI capability for it. We link the new MSI capability at the * head of the list of capabilities. */ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { int origptr, msiptr; origptr = read_config(&sel, PCIR_CAP_PTR, 1); msiptr = passthru_add_msicap(pi, 1, origptr); sc->psc_msi.capoff = msiptr; sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); sc->psc_msi.emulated = 1; pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); } #endif /* Make sure one of the capabilities is present */ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) return (-1); else return (0); } static uint64_t msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *src8; uint16_t *src16; uint32_t *src32; uint64_t *src64; uint64_t data; size_t entry_offset; int index; pi = sc->psc_pi; if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset && offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { switch(size) { case 1: src8 = (uint8_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src8; break; case 2: src16 = (uint16_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src16; break; case 4: src32 = (uint32_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src32; break; case 8: src64 = (uint64_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src64; break; default: return (-1); } return (data); } if (offset < pi->pi_msix.table_offset) return (-1); offset -= pi->pi_msix.table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; if (index >= pi->pi_msix.table_count) return (-1); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; switch(size) { case 1: src8 = (uint8_t *)((void *)entry + entry_offset); data = *src8; break; case 2: src16 = (uint16_t *)((void *)entry + entry_offset); data = *src16; break; case 4: src32 = (uint32_t *)((void *)entry + entry_offset); data = *src32; break; case 8: src64 = (uint64_t *)((void *)entry + entry_offset); data = *src64; break; default: return (-1); } return (data); } static void msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, uint64_t offset, int size, uint64_t data) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *dest8; uint16_t *dest16; uint32_t *dest32; uint64_t *dest64; size_t entry_offset; uint32_t vector_control; int index; pi = sc->psc_pi; if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset && offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { switch(size) { case 1: dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest8 = data; break; case 2: dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest16 = data; break; case 4: dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest32 = data; break; case 8: dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest64 = data; break; default: break; } return; } if (offset < pi->pi_msix.table_offset) return; offset -= pi->pi_msix.table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; if (index >= pi->pi_msix.table_count) return; entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* Only 4 byte naturally-aligned writes are supported */ assert(size == 4); assert(entry_offset % 4 == 0); vector_control = entry->vector_control; dest32 = (uint32_t *)((void *)entry + entry_offset); *dest32 = data; /* If MSI-X hasn't been enabled, do nothing */ if (pi->pi_msix.enabled) { /* If the entry is masked, don't set it up */ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { (void)vm_setup_pptdev_msix(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, index, entry->addr, entry->msg_data, entry->vector_control); } } } static int init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) { int b, s, f; int error, idx; size_t len, remaining; uint32_t table_size, table_offset; uint32_t pba_size, pba_offset; vm_paddr_t start; struct pci_devinst *pi = sc->psc_pi; assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); b = sc->psc_sel.pc_bus; s = sc->psc_sel.pc_dev; f = sc->psc_sel.pc_func; /* * If the MSI-X table BAR maps memory intended for * other uses, it is at least assured that the table * either resides in its own page within the region, * or it resides in a page shared with only the PBA. */ table_offset = rounddown2(pi->pi_msix.table_offset, 4096); table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); idx = pi->pi_msix.table_bar; start = pi->pi_bar[idx].addr; remaining = pi->pi_bar[idx].size; if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { pba_offset = pi->pi_msix.pba_offset; pba_size = pi->pi_msix.pba_size; if (pba_offset >= table_offset + table_size || table_offset >= pba_offset + pba_size) { /* * If the PBA does not share a page with the MSI-x * tables, no PBA emulation is required. */ pi->pi_msix.pba_page = NULL; pi->pi_msix.pba_page_offset = 0; } else { /* * The PBA overlaps with either the first or last * page of the MSI-X table region. Map the * appropriate page. */ if (pba_offset <= table_offset) pi->pi_msix.pba_page_offset = table_offset; else pi->pi_msix.pba_page_offset = table_offset + table_size - 4096; pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, start + pi->pi_msix.pba_page_offset); if (pi->pi_msix.pba_page == MAP_FAILED) { warn( "Failed to map PBA page for MSI-X on %d/%d/%d", b, s, f); return (-1); } } } /* Map everything before the MSI-X table */ if (table_offset > 0) { len = table_offset; error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); if (error) return (error); base += len; start += len; remaining -= len; } /* Skip the MSI-X table */ base += table_size; start += table_size; remaining -= table_size; /* Map everything beyond the end of the MSI-X table */ if (remaining > 0) { len = remaining; error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); if (error) return (error); } return (0); } static int cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) { int i, error; struct pci_devinst *pi; struct pci_bar_io bar; enum pcibar_type bartype; uint64_t base, size; pi = sc->psc_pi; /* * Initialize BAR registers */ for (i = 0; i <= PCI_BARMAX; i++) { bzero(&bar, sizeof(bar)); bar.pbi_sel = sc->psc_sel; bar.pbi_reg = PCIR_BAR(i); if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) continue; if (PCI_BAR_IO(bar.pbi_base)) { bartype = PCIBAR_IO; base = bar.pbi_base & PCIM_BAR_IO_BASE; } else { switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { case PCIM_BAR_MEM_64: bartype = PCIBAR_MEM64; break; default: bartype = PCIBAR_MEM32; break; } base = bar.pbi_base & PCIM_BAR_MEM_BASE; } size = bar.pbi_length; if (bartype != PCIBAR_IO) { if (((base | size) & PAGE_MASK) != 0) { warnx("passthru device %d/%d/%d BAR %d: " "base %#lx or size %#lx not page aligned\n", sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, base, size); return (-1); } } /* Cache information about the "real" BAR */ sc->psc_bar[i].type = bartype; sc->psc_bar[i].size = size; sc->psc_bar[i].addr = base; /* Allocate the BAR in the guest I/O or MMIO space */ error = pci_emul_alloc_bar(pi, i, bartype, size); if (error) return (-1); /* The MSI-X table needs special handling */ if (i == pci_msix_table_bar(pi)) { error = init_msix_table(ctx, sc, base); if (error) return (-1); } else if (bartype != PCIBAR_IO) { /* Map the physical BAR in the guest MMIO space */ error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_bar[i].addr, pi->pi_bar[i].size, base); if (error) return (-1); } /* * 64-bit BAR takes up two slots so skip the next one. */ if (bartype == PCIBAR_MEM64) { i++; assert(i <= PCI_BARMAX); sc->psc_bar[i].type = PCIBAR_MEMHI64; } } return (0); } static int cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) { int error; struct passthru_softc *sc; error = 1; sc = pi->pi_arg; bzero(&sc->psc_sel, sizeof(struct pcisel)); sc->psc_sel.pc_bus = bus; sc->psc_sel.pc_dev = slot; sc->psc_sel.pc_func = func; if (cfginitmsi(sc) != 0) { warnx("failed to initialize MSI for PCI %d/%d/%d", bus, slot, func); goto done; } if (cfginitbar(ctx, sc) != 0) { warnx("failed to initialize BARs for PCI %d/%d/%d", bus, slot, func); goto done; } pci_set_cfgdata16(pi, PCIR_COMMAND, read_config(&sc->psc_sel, PCIR_COMMAND, 2)); error = 0; /* success */ done: return (error); } static int passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { int bus, slot, func, error, memflags; struct passthru_softc *sc; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR }; cap_ioctl_t io_ioctls[] = { IODEV_PIO }; #endif sc = NULL; error = 1; #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE); #endif memflags = vm_get_memflags(ctx); if (!(memflags & VM_MEM_F_WIRED)) { warnx("passthru requires guest memory to be wired"); return (error); } if (pcifd < 0) { pcifd = open(_PATH_DEVPCI, O_RDWR, 0); if (pcifd < 0) { warn("failed to open %s", _PATH_DEVPCI); return (error); } } #ifndef WITHOUT_CAPSICUM if (caph_rights_limit(pcifd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (iofd < 0) { iofd = open(_PATH_DEVIO, O_RDWR, 0); if (iofd < 0) { warn("failed to open %s", _PATH_DEVIO); return (error); } } #ifndef WITHOUT_CAPSICUM if (caph_rights_limit(iofd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (memfd < 0) { memfd = open(_PATH_MEM, O_RDWR, 0); if (memfd < 0) { warn("failed to open %s", _PATH_MEM); return (error); } } #ifndef WITHOUT_CAPSICUM cap_rights_clear(&rights, CAP_IOCTL); cap_rights_set(&rights, CAP_MMAP_RW); if (caph_rights_limit(memfd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (opts == NULL || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) { warnx("invalid passthru options"); return (error); } if (vm_assign_pptdev(ctx, bus, slot, func) != 0) { warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", bus, slot, func); goto done; } sc = calloc(1, sizeof(struct passthru_softc)); pi->pi_arg = sc; sc->psc_pi = pi; /* initialize config space */ error = cfginit(ctx, pi, bus, slot, func); done: if (error) { free(sc); vm_unassign_pptdev(ctx, bus, slot, func); } return (error); } static int bar_access(int coff) { if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) return (1); else return (0); } static int msicap_access(struct passthru_softc *sc, int coff) { int caplen; if (sc->psc_msi.capoff == 0) return (0); caplen = msi_caplen(sc->psc_msi.msgctrl); if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) return (1); else return (0); } static int msixcap_access(struct passthru_softc *sc, int coff) { if (sc->psc_msix.capoff == 0) return (0); return (coff >= sc->psc_msix.capoff && coff < sc->psc_msix.capoff + MSIX_CAPLEN); } static int passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t *rv) { struct passthru_softc *sc; sc = pi->pi_arg; /* * PCI BARs and MSI capability is emulated. */ if (bar_access(coff) || msicap_access(sc, coff)) return (-1); #ifdef LEGACY_SUPPORT /* * Emulate PCIR_CAP_PTR if this device does not support MSI capability * natively. */ if (sc->psc_msi.emulated) { if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) return (-1); } #endif /* * Emulate the command register. If a single read reads both the * command and status registers, read the status register from the * device's config space. */ if (coff == PCIR_COMMAND) { if (bytes <= 2) return (-1); *rv = read_config(&sc->psc_sel, PCIR_STATUS, 2) << 16 | pci_get_cfgdata16(pi, PCIR_COMMAND); return (0); } /* Everything else just read from the device's config space */ *rv = read_config(&sc->psc_sel, coff, bytes); return (0); } static int passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int error, msix_table_entries, i; struct passthru_softc *sc; uint16_t cmd_old; sc = pi->pi_arg; /* * PCI BARs are emulated */ if (bar_access(coff)) return (-1); /* * MSI capability is emulated */ if (msicap_access(sc, coff)) { pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff, PCIY_MSI); error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum); if (error != 0) err(1, "vm_setup_pptdev_msi"); return (0); } if (msixcap_access(sc, coff)) { pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff, PCIY_MSIX); if (pi->pi_msix.enabled) { msix_table_entries = pi->pi_msix.table_count; for (i = 0; i < msix_table_entries; i++) { error = vm_setup_pptdev_msix(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, pi->pi_msix.table[i].addr, pi->pi_msix.table[i].msg_data, pi->pi_msix.table[i].vector_control); if (error) err(1, "vm_setup_pptdev_msix"); } + } else { + error = vm_disable_pptdev_msix(ctx, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func); + if (error) + err(1, "vm_disable_pptdev_msix"); } return (0); } #ifdef LEGACY_SUPPORT /* * If this device does not support MSI natively then we cannot let * the guest disable legacy interrupts from the device. It is the * legacy interrupt that is triggering the virtual MSI to the guest. */ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { if (coff == PCIR_COMMAND && bytes == 2) val &= ~PCIM_CMD_INTxDIS; } #endif write_config(&sc->psc_sel, coff, bytes, val); if (coff == PCIR_COMMAND) { cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND); if (bytes == 1) pci_set_cfgdata8(pi, PCIR_COMMAND, val); else if (bytes == 2) pci_set_cfgdata16(pi, PCIR_COMMAND, val); pci_emul_cmd_changed(pi, cmd_old); } return (0); } static void passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct passthru_softc *sc; struct iodev_pio_req pio; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { msix_table_write(ctx, vcpu, sc, offset, size, value); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); bzero(&pio, sizeof(struct iodev_pio_req)); pio.access = IODEV_PIO_WRITE; pio.port = sc->psc_bar[baridx].addr + offset; pio.width = size; pio.val = value; (void)ioctl(iofd, IODEV_PIO, &pio); } } static uint64_t passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct passthru_softc *sc; struct iodev_pio_req pio; uint64_t val; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { val = msix_table_read(sc, offset, size); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); bzero(&pio, sizeof(struct iodev_pio_req)); pio.access = IODEV_PIO_READ; pio.port = sc->psc_bar[baridx].addr + offset; pio.width = size; pio.val = 0; (void)ioctl(iofd, IODEV_PIO, &pio); val = pio.val; } return (val); } struct pci_devemu passthru = { .pe_emu = "passthru", .pe_init = passthru_init, .pe_cfgwrite = passthru_cfgwrite, .pe_cfgread = passthru_cfgread, .pe_barwrite = passthru_write, .pe_barread = passthru_read, }; PCI_EMUL_SET(passthru);