diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 77f0f8f5c581..ede46dce73b3 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -1,1319 +1,1321 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef WITH_VMMAPI_SNAPSHOT #include #endif #include #include "vmmapi.h" #include "internal.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) #ifdef __amd64__ #define VM_LOWMEM_LIMIT (3 * GB) #else #define VM_LOWMEM_LIMIT 0 #endif #define VM_HIGHMEM_BASE (4 * GB) /* * Size of the guard region before and after the virtual address space * mapping the guest physical memory. This must be a multiple of the * superpage size for performance reasons. 
*/ #define VM_MMAP_GUARD_SIZE (4 * MB) #define PROT_RW (PROT_READ | PROT_WRITE) #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) static int vm_device_open(const char *name) { char devpath[PATH_MAX]; assert(strlen(name) <= VM_MAX_NAMELEN); (void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name); return (open(devpath, O_RDWR)); } static int vm_ctl_open(void) { if (modfind("vmm") < 0) (void)kldload("vmm"); return (open("/dev/vmmctl", O_RDWR, 0)); } static int -vm_ctl_create(const char *name, int ctlfd) +vm_ctl_create(const char *name, int flags, int ctlfd) { struct vmmctl_vm_create vmc; memset(&vmc, 0, sizeof(vmc)); + if ((flags & VMMAPI_OPEN_CREATE_DESTROY_ON_CLOSE) != 0) + vmc.flags |= VMMCTL_CREATE_DESTROY_ON_CLOSE; if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) { errno = ENAMETOOLONG; return (-1); } return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc)); } int vm_create(const char *name) { int error, fd; fd = vm_ctl_open(); if (fd < 0) return (-1); - error = vm_ctl_create(name, fd); + error = vm_ctl_create(name, 0, fd); if (error != 0) { error = errno; (void)close(fd); errno = error; return (-1); } (void)close(fd); return (0); } struct vmctx * vm_open(const char *name) { return (vm_openf(name, 0)); } struct vmctx * vm_openf(const char *name, int flags) { struct vmctx *vm; int saved_errno; bool created; created = false; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); vm->fd = vm->ctlfd = -1; vm->memflags = 0; vm->name = (char *)(vm + 1); strcpy(vm->name, name); memset(vm->memsegs, 0, sizeof(vm->memsegs)); if ((vm->ctlfd = vm_ctl_open()) < 0) goto err; vm->fd = vm_device_open(vm->name); if (vm->fd < 0 && errno == ENOENT) { if (flags & VMMAPI_OPEN_CREATE) { - if (vm_ctl_create(vm->name, vm->ctlfd) != 0) + if (vm_ctl_create(vm->name, flags, vm->ctlfd) != 0) goto err; vm->fd = vm_device_open(vm->name); created = true; } } if (vm->fd < 0) goto err; if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0) goto err; return (vm); err: saved_errno = errno; if (created) vm_destroy(vm); else vm_close(vm); errno = saved_errno; return (NULL); } void vm_close(struct vmctx *vm) { assert(vm != NULL); if (vm->fd >= 0) (void)close(vm->fd); if (vm->ctlfd >= 0) (void)close(vm->ctlfd); free(vm); } void vm_destroy(struct vmctx *vm) { struct vmmctl_vm_destroy vmd; memset(&vmd, 0, sizeof(vmd)); (void)strlcpy(vmd.name, vm->name, sizeof(vmd.name)); if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0) warn("ioctl(VMMCTL_VM_DESTROY)"); vm_close(vm); } struct vcpu * vm_vcpu_open(struct vmctx *ctx, int vcpuid) { struct vcpu *vcpu; vcpu = malloc(sizeof(*vcpu)); vcpu->ctx = ctx; vcpu->vcpuid = vcpuid; return (vcpu); } void vm_vcpu_close(struct vcpu *vcpu) { free(vcpu); } int vcpu_id(struct vcpu *vcpu) { return (vcpu->vcpuid); } int vm_parse_memsize(const char *opt, size_t *ret_memsize) { char *endptr; size_t optval; int error; optval = strtoul(opt, &endptr, 0); if (*opt != '\0' && *endptr == '\0') { /* * For the sake of backward compatibility if the memory size * specified on the command line is less than a megabyte then * it is interpreted as being in units of MB. 
*/ if (optval < MB) optval *= MB; *ret_memsize = optval; error = 0; } else error = expand_number(opt, ret_memsize); return (error); } uint32_t vm_get_lowmem_limit(struct vmctx *ctx __unused) { return (VM_LOWMEM_LIMIT); } void vm_set_memflags(struct vmctx *ctx, int flags) { ctx->memflags = flags; } int vm_get_memflags(struct vmctx *ctx) { return (ctx->memflags); } /* * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot) { struct vm_memmap memmap; int error, flags; memmap.gpa = gpa; memmap.segid = segid; memmap.segoff = off; memmap.len = len; memmap.prot = prot; memmap.flags = 0; if (ctx->memflags & VM_MEM_F_WIRED) memmap.flags |= VM_MEMMAP_F_WIRED; /* * If this mapping already exists then don't create it again. This * is the common case for SYSMEM mappings created by bhyveload(8). */ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); if (error == 0 && gpa == memmap.gpa) { if (segid != memmap.segid || off != memmap.segoff || prot != memmap.prot || flags != memmap.flags) { errno = EEXIST; return (-1); } else { return (0); } } error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size) { *guest_baseaddr = ctx->baseaddr; *lowmem_size = ctx->lowmem_size; *highmem_size = ctx->highmem_size; return (0); } int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len) { struct vm_munmap munmap; int error; munmap.gpa = gpa; munmap.len = len; error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap); return (error); } int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct vm_memmap memmap; int error; bzero(&memmap, sizeof(struct vm_memmap)); memmap.gpa = *gpa; error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); if (error == 0) { *gpa = memmap.gpa; *segid = memmap.segid; *segoff = memmap.segoff; *len = memmap.len; *prot = memmap.prot; *flags = memmap.flags; } return (error); } /* * Return 0 if the segments are identical and non-zero otherwise. * * This is slightly complicated by the fact that only device memory segments * are named. */ static int cmpseg(size_t len, const char *str, size_t len2, const char *str2) { if (len == len2) { if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) return (0); } return (-1); } static int vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name, int ds_policy, domainset_t *ds_mask, size_t ds_size) { struct vm_memseg memseg; size_t n; int error; /* * If the memory segment has already been created then just return. * This is the usual case for the SYSMEM segment created by userspace * loaders like bhyveload(8). 
*/ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, sizeof(memseg.name)); if (error) return (error); if (memseg.len != 0) { if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { errno = EINVAL; return (-1); } else { return (0); } } bzero(&memseg, sizeof(struct vm_memseg)); memseg.segid = segid; memseg.len = len; if (ds_mask == NULL) { memseg.ds_policy = DOMAINSET_POLICY_INVALID; } else { memseg.ds_policy = ds_policy; memseg.ds_mask = ds_mask; memseg.ds_mask_size = ds_size; } if (name != NULL) { n = strlcpy(memseg.name, name, sizeof(memseg.name)); if (n >= sizeof(memseg.name)) { errno = ENAMETOOLONG; return (-1); } } error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); return (error); } int vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, size_t bufsize) { struct vm_memseg memseg; size_t n; int error; bzero(&memseg, sizeof(memseg)); memseg.segid = segid; error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); if (error == 0) { *lenp = memseg.len; n = strlcpy(namebuf, memseg.name, bufsize); if (n >= bufsize) { errno = ENAMETOOLONG; error = -1; } } return (error); } static int map_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len, size_t segoff, char *base) { char *ptr; int error, flags; /* Map 'len' bytes starting at 'gpa' in the guest address space */ error = vm_mmap_memseg(ctx, gpa, segid, segoff, len, PROT_ALL); if (error) return (error); flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap into the process address space on the host */ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); if (ptr == MAP_FAILED) return (-1); return (0); } /* * Allocates and maps virtual machine memory segments according * to the NUMA topology specified by the 'doms' array. * * The domains are laid out sequentially in the guest's physical address space. * The [VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE) address range is skipped and * left unmapped. */ int vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style vms, struct vm_mem_domain *doms, int ndoms) { size_t low_len, len, totalsize; struct vm_mem_domain *dom; struct vm_memseg memseg; char *baseaddr, *ptr; int error, i, segid; vm_paddr_t gpa; /* Sanity checks. */ assert(vms == VM_MMAP_ALL); if (doms == NULL || ndoms <= 0 || ndoms > VM_MAXMEMDOM) { errno = EINVAL; return (-1); } /* Calculate total memory size. */ totalsize = 0; for (i = 0; i < ndoms; i++) totalsize += doms[i].size; if (totalsize > VM_LOWMEM_LIMIT) totalsize = VM_HIGHMEM_BASE + (totalsize - VM_LOWMEM_LIMIT); /* * Stake out a contiguous region covering the guest physical memory * and the adjoining guard regions. */ len = VM_MMAP_GUARD_SIZE + totalsize + VM_MMAP_GUARD_SIZE; ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (ptr == MAP_FAILED) return (-1); baseaddr = ptr + VM_MMAP_GUARD_SIZE; /* * Allocate and map memory segments for the virtual machine. */ gpa = VM_LOWMEM_LIMIT > 0 ? 0 : VM_HIGHMEM_BASE; ctx->lowmem_size = 0; ctx->highmem_size = 0; for (i = 0; i < ndoms; i++) { segid = VM_SYSMEM + i; dom = &doms[i]; /* * Check if the memory segment already exists. * If 'ndoms' is greater than one, refuse to proceed if the * memseg already exists. If only one domain was requested, use * the existing segment to preserve the behaviour of the previous * implementation. * * Splitting existing memory segments is tedious and * error-prone, which is why we don't support NUMA * domains for bhyveload(8)-loaded VMs. 
*/ error = vm_get_memseg(ctx, segid, &len, memseg.name, sizeof(memseg.name)); if (error == 0 && len != 0) { if (ndoms != 1) { errno = EEXIST; return (-1); } else doms[0].size = len; } else { error = vm_alloc_memseg(ctx, segid, dom->size, NULL, dom->ds_policy, dom->ds_mask, dom->ds_size); if (error) return (error); } /* * If a domain is split by VM_LOWMEM_LIMIT then break * its segment mapping into two parts, one below VM_LOWMEM_LIMIT * and one above VM_HIGHMEM_BASE. */ if (gpa <= VM_LOWMEM_LIMIT && gpa + dom->size > VM_LOWMEM_LIMIT) { low_len = VM_LOWMEM_LIMIT - gpa; error = map_memory_segment(ctx, segid, gpa, low_len, 0, baseaddr); if (error) return (error); ctx->lowmem_size = VM_LOWMEM_LIMIT; /* Map the remainder. */ gpa = VM_HIGHMEM_BASE; len = dom->size - low_len; error = map_memory_segment(ctx, segid, gpa, len, low_len, baseaddr); if (error) return (error); } else { len = dom->size; error = map_memory_segment(ctx, segid, gpa, len, 0, baseaddr); if (error) return (error); } if (gpa <= VM_LOWMEM_LIMIT) ctx->lowmem_size += len; else ctx->highmem_size += len; gpa += len; } ctx->baseaddr = baseaddr; return (0); } int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { struct vm_mem_domain dom0; memset(&dom0, 0, sizeof(dom0)); dom0.ds_policy = DOMAINSET_POLICY_INVALID; dom0.size = memsize; return (vm_setup_memory_domains(ctx, vms, &dom0, 1)); } /* * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in * the lowmem or highmem regions. * * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. * The instruction emulation code depends on this behavior. */ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { vm_size_t lowsize, highsize; lowsize = ctx->lowmem_size; if (lowsize > 0) { if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize) return (ctx->baseaddr + gaddr); } highsize = ctx->highmem_size; if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) { if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize && gaddr + len <= VM_HIGHMEM_BASE + highsize) return (ctx->baseaddr + gaddr); } return (NULL); } vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr) { vm_paddr_t offaddr; vm_size_t lowsize, highsize; offaddr = (char *)addr - ctx->baseaddr; lowsize = ctx->lowmem_size; if (lowsize > 0) if (offaddr <= lowsize) return (offaddr); highsize = ctx->highmem_size; if (highsize > 0) if (offaddr >= VM_HIGHMEM_BASE && offaddr < VM_HIGHMEM_BASE + highsize) return (offaddr); return ((vm_paddr_t)-1); } const char * vm_get_name(struct vmctx *ctx) { return (ctx->name); } size_t vm_get_lowmem_size(struct vmctx *ctx) { return (ctx->lowmem_size); } vm_paddr_t vm_get_highmem_base(struct vmctx *ctx __unused) { return (VM_HIGHMEM_BASE); } size_t vm_get_highmem_size(struct vmctx *ctx) { return (ctx->highmem_size); } void * vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) { char pathname[MAXPATHLEN]; size_t len2; char *base, *ptr; int fd, error, flags; fd = -1; ptr = MAP_FAILED; if (name == NULL || strlen(name) == 0) { errno = EINVAL; goto done; } error = vm_alloc_memseg(ctx, segid, len, name, 0, NULL, 0); if (error) goto done; strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); strlcat(pathname, ctx->name, sizeof(pathname)); strlcat(pathname, ".", sizeof(pathname)); strlcat(pathname, name, sizeof(pathname)); fd = open(pathname, O_RDWR); if (fd < 0) goto done; /* * Stake out a contiguous region covering the device memory and the * adjoining guard regions. 
*/ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (base == MAP_FAILED) goto done; flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap the devmem region in the host address space */ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); done: if (fd >= 0) close(fd); return (ptr); } int vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg) { /* * XXX: fragile, handle with care * Assumes that the first field of the ioctl data * is the vcpuid. */ *(int *)arg = vcpu->vcpuid; return (ioctl(vcpu->ctx->fd, cmd, arg)); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.regnum = reg; vmreg.regval = val; error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg); return (error); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.regnum = reg; error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int vm_set_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset); return (error); } int vm_get_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset); return (error); } int vm_run(struct vcpu *vcpu, struct vm_run *vmrun) { return (vcpu_ioctl(vcpu, VM_RUN, vmrun)); } int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { struct vm_suspend vmsuspend; bzero(&vmsuspend, sizeof(vmsuspend)); vmsuspend.how = how; return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); } int vm_reinit(struct vmctx *ctx) { return (ioctl(ctx->fd, VM_REINIT, 0)); } int vm_capability_name2type(const char *capname) { int i; for (i = 0; i < VM_CAP_MAX; i++) { if (vm_capstrmap[i] != NULL && strcmp(vm_capstrmap[i], capname) == 0) return (i); } return (-1); } const char * vm_capability_type2name(int type) { if (type >= 0 && type < VM_CAP_MAX) return (vm_capstrmap[type]); return (NULL); } int vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.captype = cap; error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.captype = cap; vmcap.capval = val; return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap)); } uint64_t * vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv, int *ret_entries) { static _Thread_local uint64_t *stats_buf; static _Thread_local u_int stats_count; uint64_t *new_stats; struct vm_stats vmstats; u_int count, index; bool have_stats; have_stats = false; count = 0; for (index = 0;; index += nitems(vmstats.statbuf)) { vmstats.index = index; if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0) break; if (stats_count < index + vmstats.num_entries) { new_stats = realloc(stats_buf, (index + vmstats.num_entries) * 
sizeof(uint64_t)); if (new_stats == NULL) { errno = ENOMEM; return (NULL); } stats_count = index + vmstats.num_entries; stats_buf = new_stats; } memcpy(stats_buf + index, vmstats.statbuf, vmstats.num_entries * sizeof(uint64_t)); count += vmstats.num_entries; have_stats = true; if (vmstats.num_entries != nitems(vmstats.statbuf)) break; } if (have_stats) { if (ret_entries) *ret_entries = count; if (ret_tv) *ret_tv = vmstats.tv; return (stats_buf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } #ifdef __amd64__ int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) { int error, i; struct vm_gpa_pte gpapte; bzero(&gpapte, sizeof(gpapte)); gpapte.gpa = gpa; error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); if (error == 0) { *num = gpapte.ptenum; for (i = 0; i < gpapte.ptenum; i++) pte[i] = gpapte.pte[i]; } return (error); } int vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } #endif int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } #ifndef min #define min(a,b) (((a) < (b)) ? (a) : (b)) #endif #ifdef __amd64__ int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault) { void *va; uint64_t gpa, off; int error, i, n; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; iov[i].iov_len = 0; } while (len) { assert(iovcnt > 0); error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = MIN(len, PAGE_SIZE - off); va = vm_map_gpa(vcpu->ctx, gpa, n); if (va == NULL) return (EFAULT); iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; gla += n; len -= n; } return (0); } #endif void vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused) { /* * Intentionally empty. This is used by the instruction * emulation code shared with the kernel. The in-kernel * version of this is non-empty. 
*/ } void vm_copyin(struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; size_t n; dst = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); src = iov->iov_base; bcopy(src, dst, n); iov++; dst += n; len -= n; } } void vm_copyout(const void *vp, struct iovec *iov, size_t len) { const char *src; char *dst; size_t n; src = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); dst = iov->iov_base; bcopy(src, dst, n); iov++; src += n; len -= n; } } static int vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) { struct vm_cpuset vm_cpuset; int error; bzero(&vm_cpuset, sizeof(struct vm_cpuset)); vm_cpuset.which = which; vm_cpuset.cpusetsize = sizeof(cpuset_t); vm_cpuset.cpus = cpus; error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); return (error); } int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); } int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); } int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); } int vm_activate_cpu(struct vcpu *vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac); return (error); } int vm_suspend_all_cpus(struct vmctx *ctx) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = -1; error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); return (error); } int vm_suspend_cpu(struct vcpu *vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac); return (error); } int vm_resume_cpu(struct vcpu *vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac); return (error); } int vm_resume_all_cpus(struct vmctx *ctx) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = -1; error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); return (error); } #ifdef __amd64__ int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii); if (error == 0) { *info1 = vmii.info1; *info2 = vmii.info2; } return (error); } int vm_set_intinfo(struct vcpu *vcpu, uint64_t info1) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.info1 = info1; error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii); return (error); } #endif #ifdef WITH_VMMAPI_SNAPSHOT int vm_restart_instruction(struct vcpu *vcpu) { int arg; return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg)); } int vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta) { if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { #ifdef SNAPSHOT_DEBUG fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", __func__, meta->dev_name, errno); #endif return (-1); } return (0); } int vm_restore_time(struct vmctx *ctx) { int dummy; dummy = 0; return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); } #endif int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { struct vm_cpu_topology topology; bzero(&topology, sizeof (struct vm_cpu_topology)); topology.sockets = sockets; topology.cores = cores; topology.threads = threads; topology.maxcpus = maxcpus; return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); } int vm_get_topology(struct vmctx 
*ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { struct vm_cpu_topology topology; int error; bzero(&topology, sizeof (struct vm_cpu_topology)); error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); if (error == 0) { *sockets = topology.sockets; *cores = topology.cores; *threads = topology.threads; *maxcpus = topology.maxcpus; } return (error); } int vm_limit_rights(struct vmctx *ctx) { cap_rights_t rights; cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); if (caph_rights_limit(ctx->fd, &rights) != 0) return (-1); if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0) return (-1); return (0); } /* * Avoid using in new code. Operations on the fd should be wrapped here so that * capability rights can be kept in sync. */ int vm_get_device_fd(struct vmctx *ctx) { return (ctx->fd); } /* Legacy interface, do not use. */ const cap_ioctl_t * vm_get_ioctls(size_t *len) { cap_ioctl_t *cmds; size_t sz; if (len == NULL) { sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]); cmds = malloc(sz); if (cmds == NULL) return (NULL); bcopy(vm_ioctl_cmds, cmds, sz); return (cmds); } *len = vm_ioctl_ncmds; return (NULL); } diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index b637c45d1eff..5d3495a128d9 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -1,301 +1,302 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include #include #include #include #include /* * API version for out-of-tree consumers like grub-bhyve for making compile * time decisions. */ #define VMMAPI_VERSION 0300 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vcpu; struct vmctx; struct vm_snapshot_meta; enum x2apic_state; /* * Different styles of mapping the memory assigned to a VM into the address * space of the controlling process. */ enum vm_mmap_style { VM_MMAP_NONE, /* no mapping */ VM_MMAP_ALL, /* fully and statically mapped */ VM_MMAP_SPARSE, /* mappings created on-demand */ }; /* * 'flags' value passed to 'vm_set_memflags()'. */ #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ /* Memory size and allocation policy for a single NUMA domain. 
*/ struct vm_mem_domain { size_t size; int ds_policy; domainset_t *ds_mask; size_t ds_size; }; __BEGIN_DECLS /* * Get the length and name of the memory segment identified by 'segid'. * Note that system memory segments are identified with a nul name. * * Returns 0 on success and non-zero otherwise. */ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, size_t namesiz); /* * Iterate over the guest address space. This function finds an address range * that starts at an address >= *gpa. * * Returns 0 if the next address range was found and non-zero otherwise. */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size); /* * Create a device memory segment identified by 'segid'. * * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. */ void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len); /* * Map the memory segment identified by 'segid' into the guest address space * at [gpa,gpa+len) with protection 'prot'. */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t segoff, size_t len, int prot); int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len); int vm_create(const char *name); struct vmctx *vm_open(const char *name); #define VMMAPI_OPEN_CREATE 0x01 /* create if the VM does not exist */ #define VMMAPI_OPEN_REINIT 0x02 /* reinitialize the VM if it exists */ +#define VMMAPI_OPEN_CREATE_DESTROY_ON_CLOSE 0x04 /* Destroy the VM when closing vmm_ctl */ struct vmctx *vm_openf(const char *name, int flags); void vm_close(struct vmctx *ctx); void vm_destroy(struct vmctx *ctx); int vm_limit_rights(struct vmctx *ctx); struct vcpu *vm_vcpu_open(struct vmctx *ctx, int vcpuid); void vm_vcpu_close(struct vcpu *vcpu); int vcpu_id(struct vcpu *vcpu); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); int vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style s, struct vm_mem_domain *doms, int ndoms); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); /* inverse operation to vm_map_gpa - extract guest address from host pointer */ vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr); #ifdef __amd64__ int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); #endif int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); const char *vm_get_name(struct vmctx *ctx); size_t vm_get_lowmem_size(struct vmctx *ctx); vm_paddr_t vm_get_highmem_base(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); #ifdef __amd64__ int vm_set_desc(struct vcpu *vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); int vm_get_desc(struct vcpu *vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *seg_desc); #endif int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register_set(struct vcpu *vcpu, unsigned int 
count, const int *regnums, uint64_t *regvals); int vm_get_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_run(struct vcpu *vcpu, struct vm_run *vmrun); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_raise_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg, int bus, int slot, int func); #if defined(__aarch64__) int vm_attach_vgic(struct vmctx *ctx, uint64_t dist_start, size_t dist_size, uint64_t redist_start, size_t redist_size); int vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far); #elif defined(__riscv) int vm_attach_aplic(struct vmctx *ctx, uint64_t mem_start, size_t mem_size); int vm_inject_exception(struct vcpu *vcpu, uint64_t scause); #endif #if defined(__aarch64__) || defined(__riscv) int vm_assert_irq(struct vmctx *ctx, uint32_t irq); int vm_deassert_irq(struct vmctx *ctx, uint32_t irq); #endif #ifdef __amd64__ int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); int vm_lapic_irq(struct vcpu *vcpu, int vector); int vm_lapic_local_irq(struct vcpu *vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger); int vm_inject_nmi(struct vcpu *vcpu); int vm_readwrite_kernemu_device(struct vcpu *vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value); #endif int vm_capability_name2type(const char *capname); const char *vm_capability_type2name(int type); int vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval); int vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val); int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len); int vm_setup_pptdev_msi(struct vmctx *ctx, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func); int vm_get_intinfo(struct vcpu *vcpu, uint64_t *i1, uint64_t *i2); int vm_set_intinfo(struct vcpu *vcpu, uint64_t exit_intinfo); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. 
*/ uint64_t *vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv, int *ret_entries); const char *vm_get_stat_desc(struct vmctx *ctx, int index); #ifdef __amd64__ int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *s); int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. * The 'iovcnt' should be big enough to accommodate all GPA segments. * * retval fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Error */ int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault); #endif void vm_copyin(struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(const void *host_src, struct iovec *guest_iov, size_t len); void vm_copy_teardown(struct iovec *iov, int iovcnt); #ifdef __amd64__ /* RTC */ int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); int vm_rtc_settime(struct vmctx *ctx, time_t secs); int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); #endif /* Reset vcpu register state */ int vcpu_reset(struct vcpu *vcpu); int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_all_cpus(struct vmctx *ctx); int vm_suspend_cpu(struct vcpu *vcpu); int vm_resume_all_cpus(struct vmctx *ctx); int vm_resume_cpu(struct vcpu *vcpu); int vm_restart_instruction(struct vcpu *vcpu); /* CPU topology */ int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); /* * FreeBSD specific APIs */ int vm_setup_freebsd_registers(struct vcpu *vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp); int vm_setup_freebsd_registers_i386(struct vcpu *vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); /* * Save and restore */ int vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta); int vm_restore_time(struct vmctx *ctx); /* * Deprecated interfaces, do not use them in new code. */ int vm_get_device_fd(struct vmctx *ctx); const cap_ioctl_t *vm_get_ioctls(size_t *len); __END_DECLS #endif /* _VMMAPI_H_ */ diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index 3a86a8f966ef..840e810a39fb 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -1,1319 +1,1389 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * Copyright (C) 2015 Mihai Carabas * All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __amd64__ #ifdef COMPAT_FREEBSD12 struct vm_memseg_12 { int segid; size_t len; char name[64]; }; _Static_assert(sizeof(struct vm_memseg_12) == 80, "COMPAT_FREEBSD12 ABI"); #define VM_ALLOC_MEMSEG_12 \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_12) #define VM_GET_MEMSEG_12 \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_12) #endif /* COMPAT_FREEBSD12 */ #ifdef COMPAT_FREEBSD14 struct vm_memseg_14 { int segid; size_t len; char name[VM_MAX_SUFFIXLEN + 1]; }; _Static_assert(sizeof(struct vm_memseg_14) == (VM_MAX_SUFFIXLEN + 1 + 16), "COMPAT_FREEBSD14 ABI"); #define VM_ALLOC_MEMSEG_14 \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg_14) #define VM_GET_MEMSEG_14 \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg_14) #endif /* COMPAT_FREEBSD14 */ #endif /* __amd64__ */ struct devmem_softc { int segid; char *name; struct cdev *cdev; struct vmmdev_softc *sc; SLIST_ENTRY(devmem_softc) link; }; struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; struct ucred *ucred; SLIST_ENTRY(vmmdev_softc) link; + LIST_ENTRY(vmmdev_softc) priv_link; SLIST_HEAD(, devmem_softc) devmem; int flags; }; +struct vmmctl_priv { + LIST_HEAD(, vmmdev_softc) softcs; +}; + static bool vmm_initialized = false; static SLIST_HEAD(, vmmdev_softc) head; static unsigned pr_allow_flag; static struct sx vmmdev_mtx; SX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex"); static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); u_int vm_maxcpu; SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &vm_maxcpu, 0, "Maximum number of vCPUs"); u_int vm_maxvmms; SYSCTL_UINT(_hw_vmm, OID_AUTO, maxvmms, CTLFLAG_RWTUN, &vm_maxvmms, 0, "Maximum number of VMM instances per user"); static void devmem_destroy(void *arg); static int devmem_create_cdev(struct vmmdev_softc *sc, int id, char *devmem); +static void vmmdev_destroy(struct vmmdev_softc *sc); static int vmm_priv_check(struct ucred *ucred) { if (jailed(ucred) && !(ucred->cr_prison->pr_allow & pr_allow_flag)) return (EPERM); return (0); } static int vcpu_lock_one(struct vcpu *vcpu) { return (vcpu_set_state(vcpu, VCPU_FROZEN, true)); } static void vcpu_unlock_one(struct vcpu *vcpu) { enum vcpu_state state; state = vcpu_get_state(vcpu, NULL); if (state != VCPU_FROZEN) { panic("vcpu %s(%d) has invalid state %d", vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state); } vcpu_set_state(vcpu, VCPU_IDLE, false); } #ifndef __amd64__ static int vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate) { struct vcpu *vcpu; int error; uint16_t i, j, maxcpus; error = 0; maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vcpu = vm_vcpu(vm, i); if (vcpu == NULL) continue; error = vcpu_lock_one(vcpu); if (error) break; } if (error) { for (j = 0; j < i; j++) { vcpu = vm_vcpu(vm, j); if (vcpu == NULL) continue; vcpu_unlock_one(vcpu); } } return (error); } #endif static int vcpu_lock_all(struct vmmdev_softc *sc) { int error; /* * Serialize vcpu_lock_all() callers. Individual vCPUs are not locked * in a consistent order so we need to serialize to avoid deadlocks. 
*/ vm_lock_vcpus(sc->vm); error = vcpu_set_state_all(sc->vm, VCPU_FROZEN); if (error != 0) vm_unlock_vcpus(sc->vm); return (error); } static void vcpu_unlock_all(struct vmmdev_softc *sc) { struct vcpu *vcpu; uint16_t i, maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = vm_vcpu(sc->vm, i); if (vcpu == NULL) continue; vcpu_unlock_one(vcpu); } vm_unlock_vcpus(sc->vm); } static struct vmmdev_softc * vmmdev_lookup(const char *name, struct ucred *cred) { struct vmmdev_softc *sc; sx_assert(&vmmdev_mtx, SA_XLOCKED); SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } if (sc == NULL) return (NULL); if (cr_cansee(cred, sc->ucred)) return (NULL); return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa, maxaddr; void *hpa, *cookie; struct vmmdev_softc *sc; sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); /* * Get a read lock on the guest memory map. */ vm_slock_memsegs(sc->vm); error = 0; prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); maxaddr = vmm_sysmem_maxaddr(sc->vm); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. * * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ && gpa < maxaddr) error = uiomove(__DECONST(void *, zero_region), c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } vm_unlock_memsegs(sc->vm); return (error); } CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1); static int get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { struct devmem_softc *dsc; int error; bool sysmem; error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); if (error || mseg->len == 0) return (error); if (!sysmem) { SLIST_FOREACH(dsc, &sc->devmem, link) { if (dsc->segid == mseg->segid) break; } KASSERT(dsc != NULL, ("%s: devmem segment %d not found", __func__, mseg->segid)); error = copystr(dsc->name, mseg->name, len, NULL); } else { bzero(mseg->name, len); } return (error); } static int alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len, struct domainset *domainset) { char *name; int error; bool sysmem; error = 0; name = NULL; sysmem = true; /* * The allocation is lengthened by 1 to hold a terminating NUL. It'll * by stripped off when devfs processes the full string. */ if (VM_MEMSEG_NAME(mseg)) { sysmem = false; name = malloc(len, M_VMMDEV, M_WAITOK); error = copystr(mseg->name, name, len, NULL); if (error) goto done; } error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem, domainset); if (error) goto done; if (VM_MEMSEG_NAME(mseg)) { error = devmem_create_cdev(sc, mseg->segid, name); if (error) vm_free_memseg(sc->vm, mseg->segid); else name = NULL; /* freed when 'cdev' is destroyed */ } done: free(name, M_VMMDEV); return (error); } #if defined(__amd64__) && \ (defined(COMPAT_FREEBSD14) || defined(COMPAT_FREEBSD12)) /* * Translate pre-15.0 memory segment identifiers into their 15.0 counterparts. 
*/ static void adjust_segid(struct vm_memseg *mseg) { if (mseg->segid != VM_SYSMEM) { mseg->segid += (VM_BOOTROM - 1); } } #endif static int vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_get_register(vcpu, regnum[i], ®val[i]); if (error) break; } return (error); } static int vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_set_register(vcpu, regnum[i], regval[i]); if (error) break; } return (error); } static int vmmdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { int error; /* * A jail without vmm access shouldn't be able to access vmm device * files at all, but check here just to be thorough. */ error = vmm_priv_check(td->td_ucred); if (error != 0) return (error); return (0); } static const struct vmmdev_ioctl vmmdev_ioctls[] = { VMMDEV_IOCTL(VM_GET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_SET_REGISTER, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_GET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_SET_REGISTER_SET, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_GET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_SET_CAPABILITY, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_ACTIVATE_CPU, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_STATS, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_STAT_DESC, 0), #ifdef __amd64__ #ifdef COMPAT_FREEBSD12 VMMDEV_IOCTL(VM_ALLOC_MEMSEG_12, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #endif #ifdef COMPAT_FREEBSD14 VMMDEV_IOCTL(VM_ALLOC_MEMSEG_14, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #endif #endif /* __amd64__ */ VMMDEV_IOCTL(VM_ALLOC_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_MMAP_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_MUNMAP_MEMSEG, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), VMMDEV_IOCTL(VM_REINIT, VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), #ifdef __amd64__ #if defined(COMPAT_FREEBSD12) VMMDEV_IOCTL(VM_GET_MEMSEG_12, VMMDEV_IOCTL_SLOCK_MEMSEGS), #endif #ifdef COMPAT_FREEBSD14 VMMDEV_IOCTL(VM_GET_MEMSEG_14, VMMDEV_IOCTL_SLOCK_MEMSEGS), #endif #endif /* __amd64__ */ VMMDEV_IOCTL(VM_GET_MEMSEG, VMMDEV_IOCTL_SLOCK_MEMSEGS), VMMDEV_IOCTL(VM_MMAP_GETNEXT, VMMDEV_IOCTL_SLOCK_MEMSEGS), VMMDEV_IOCTL(VM_SUSPEND_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU), VMMDEV_IOCTL(VM_RESUME_CPU, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU), VMMDEV_IOCTL(VM_SUSPEND, 0), VMMDEV_IOCTL(VM_GET_CPUS, 0), VMMDEV_IOCTL(VM_GET_TOPOLOGY, 0), VMMDEV_IOCTL(VM_SET_TOPOLOGY, 0), }; static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct vmmdev_softc *sc; struct vcpu *vcpu; const struct vmmdev_ioctl *ioctl; struct vm_memseg *mseg; int error, vcpuid; sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); ioctl = NULL; for (size_t i = 0; i < nitems(vmmdev_ioctls); i++) { if (vmmdev_ioctls[i].cmd == cmd) { ioctl = &vmmdev_ioctls[i]; break; } } if (ioctl == NULL) { for (size_t i = 0; i < vmmdev_machdep_ioctl_count; i++) { if (vmmdev_machdep_ioctls[i].cmd == cmd) { ioctl = &vmmdev_machdep_ioctls[i]; break; } } } if (ioctl == NULL) return (ENOTTY); if ((ioctl->flags & VMMDEV_IOCTL_PRIV_CHECK_DRIVER) != 0) { error = priv_check(td, PRIV_DRIVER); if (error != 0) return (error); } if 
((ioctl->flags & VMMDEV_IOCTL_XLOCK_MEMSEGS) != 0) vm_xlock_memsegs(sc->vm); else if ((ioctl->flags & VMMDEV_IOCTL_SLOCK_MEMSEGS) != 0) vm_slock_memsegs(sc->vm); vcpu = NULL; vcpuid = -1; if ((ioctl->flags & (VMMDEV_IOCTL_LOCK_ONE_VCPU | VMMDEV_IOCTL_ALLOC_VCPU | VMMDEV_IOCTL_MAYBE_ALLOC_VCPU)) != 0) { vcpuid = *(int *)data; if (vcpuid == -1) { if ((ioctl->flags & VMMDEV_IOCTL_MAYBE_ALLOC_VCPU) == 0) { error = EINVAL; goto lockfail; } } else { vcpu = vm_alloc_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto lockfail; } if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) { error = vcpu_lock_one(vcpu); if (error) goto lockfail; } } } if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) { error = vcpu_lock_all(sc); if (error) goto lockfail; } switch (cmd) { case VM_SUSPEND: { struct vm_suspend *vmsuspend; vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; } case VM_REINIT: error = vm_reinit(sc->vm); break; case VM_STAT_DESC: { struct vm_stat_desc *statdesc; statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } case VM_STATS: { struct vm_stats *vmstats; vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(vcpu, vmstats->index, nitems(vmstats->statbuf), &vmstats->num_entries, vmstats->statbuf); break; } case VM_MMAP_GETNEXT: { struct vm_memmap *mm; mm = (struct vm_memmap *)data; error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; } case VM_MMAP_MEMSEG: { struct vm_memmap *mm; mm = (struct vm_memmap *)data; error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, mm->len, mm->prot, mm->flags); break; } case VM_MUNMAP_MEMSEG: { struct vm_munmap *mu; mu = (struct vm_munmap *)data; error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); break; } #ifdef __amd64__ #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_12: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = alloc_memseg(sc, mseg, sizeof(((struct vm_memseg_12 *)0)->name), NULL); break; case VM_GET_MEMSEG_12: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = get_memseg(sc, mseg, sizeof(((struct vm_memseg_12 *)0)->name)); break; #endif /* COMPAT_FREEBSD12 */ #ifdef COMPAT_FREEBSD14 case VM_ALLOC_MEMSEG_14: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = alloc_memseg(sc, mseg, sizeof(((struct vm_memseg_14 *)0)->name), NULL); break; case VM_GET_MEMSEG_14: mseg = (struct vm_memseg *)data; adjust_segid(mseg); error = get_memseg(sc, mseg, sizeof(((struct vm_memseg_14 *)0)->name)); break; #endif /* COMPAT_FREEBSD14 */ #endif /* __amd64__ */ case VM_ALLOC_MEMSEG: { domainset_t *mask; struct domainset *domainset, domain; domainset = NULL; mseg = (struct vm_memseg *)data; if (mseg->ds_policy != DOMAINSET_POLICY_INVALID && mseg->ds_mask != NULL) { if (mseg->ds_mask_size < sizeof(domainset_t) || mseg->ds_mask_size > DOMAINSET_MAXSIZE / NBBY) { error = ERANGE; break; } memset(&domain, 0, sizeof(domain)); mask = malloc(mseg->ds_mask_size, M_VMMDEV, M_WAITOK); error = copyin(mseg->ds_mask, mask, mseg->ds_mask_size); if (error) { free(mask, M_VMMDEV); break; } error = domainset_populate(&domain, mask, mseg->ds_policy, mseg->ds_mask_size); free(mask, M_VMMDEV); if (error) break; domainset = domainset_create(&domain); if (domainset == NULL) { error = EINVAL; break; } } error = alloc_memseg(sc, mseg, sizeof(mseg->name), domainset); break; } case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data, 
sizeof(((struct vm_memseg *)0)->name)); break; case VM_GET_REGISTER: { struct vm_register *vmreg; vmreg = (struct vm_register *)data; error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval); break; } case VM_SET_REGISTER: { struct vm_register *vmreg; vmreg = (struct vm_register *)data; error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval); break; } case VM_GET_REGISTER_SET: { struct vm_register_set *vmregset; uint64_t *regvals; int *regnums; vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = mallocarray(vmregset->count, sizeof(regvals[0]), M_VMMDEV, M_WAITOK); regnums = mallocarray(vmregset->count, sizeof(regnums[0]), M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = vm_get_register_set(vcpu, vmregset->count, regnums, regvals); if (error == 0) error = copyout(regvals, vmregset->regvals, sizeof(regvals[0]) * vmregset->count); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; } case VM_SET_REGISTER_SET: { struct vm_register_set *vmregset; uint64_t *regvals; int *regnums; vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = mallocarray(vmregset->count, sizeof(regvals[0]), M_VMMDEV, M_WAITOK); regnums = mallocarray(vmregset->count, sizeof(regnums[0]), M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = copyin(vmregset->regvals, regvals, sizeof(regvals[0]) * vmregset->count); if (error == 0) error = vm_set_register_set(vcpu, vmregset->count, regnums, regvals); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; } case VM_GET_CAPABILITY: { struct vm_capability *vmcap; vmcap = (struct vm_capability *)data; error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval); break; } case VM_SET_CAPABILITY: { struct vm_capability *vmcap; vmcap = (struct vm_capability *)data; error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval); break; } case VM_ACTIVATE_CPU: error = vm_activate_cpu(vcpu); break; case VM_GET_CPUS: { struct vm_cpuset *vm_cpuset; cpuset_t *cpuset; int size; error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < 1 || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else if (vm_cpuset->which == VM_DEBUG_CPUS) *cpuset = vm_debug_cpus(sc->vm); else error = EINVAL; if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY)) error = ERANGE; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; } case VM_SUSPEND_CPU: error = vm_suspend_cpu(sc->vm, vcpu); break; case VM_RESUME_CPU: error = vm_resume_cpu(sc->vm, vcpu); break; case VM_SET_TOPOLOGY: { struct vm_cpu_topology *topology; topology = (struct vm_cpu_topology *)data; error = vm_set_topology(sc->vm, topology->sockets, topology->cores, topology->threads, topology->maxcpus); break; } case VM_GET_TOPOLOGY: { struct vm_cpu_topology *topology; topology = (struct vm_cpu_topology *)data; vm_get_topology(sc->vm, &topology->sockets, &topology->cores, &topology->threads, &topology->maxcpus); error = 0; break; } default: error = vmmdev_machdep_ioctl(sc->vm, vcpu, cmd, data, fflag, td); break; } if ((ioctl->flags & (VMMDEV_IOCTL_XLOCK_MEMSEGS | 
VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0) vm_unlock_memsegs(sc->vm); if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ALL_VCPUS) != 0) vcpu_unlock_all(sc); else if ((ioctl->flags & VMMDEV_IOCTL_LOCK_ONE_VCPU) != 0) vcpu_unlock_one(vcpu); /* * Make sure that no handler returns a kernel-internal * error value to userspace. */ KASSERT(error == ERESTART || error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); lockfail: if ((ioctl->flags & (VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_SLOCK_MEMSEGS)) != 0) vm_unlock_memsegs(sc->vm); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, struct vm_object **objp, int nprot) { struct vmmdev_softc *sc; vm_paddr_t gpa; size_t len; vm_ooffset_t segoff, first, last; int error, found, segid; bool sysmem; first = *offset; last = first + mapsize; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); sc = vmmdev_lookup2(cdev); if (sc == NULL) { /* virtual machine is in the process of being created */ return (EINVAL); } /* * Get a read lock on the guest memory map. */ vm_slock_memsegs(sc->vm); gpa = 0; found = 0; while (!found) { error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, NULL, NULL); if (error) break; if (first >= gpa && last <= gpa + len) found = 1; else gpa += len; } if (found) { error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); KASSERT(error == 0 && *objp != NULL, ("%s: invalid memory segment %d", __func__, segid)); if (sysmem) { vm_object_reference(*objp); *offset = segoff + (first - gpa); } else { error = EINVAL; } } vm_unlock_memsegs(sc->vm); return (error); } static void vmmdev_destroy(struct vmmdev_softc *sc) { struct devmem_softc *dsc; int error __diagused; KASSERT(sc->cdev == NULL, ("%s: cdev not free", __func__)); KASSERT(sc->ucred != NULL, ("%s: missing ucred", __func__)); /* * Destroy all cdevs: * * - any new operations on the 'cdev' will return an error (ENXIO). * * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ SLIST_FOREACH(dsc, &sc->devmem, link) { KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); devmem_destroy(dsc); } vm_disable_vcpu_creation(sc->vm); error = vcpu_lock_all(sc); KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); vm_unlock_vcpus(sc->vm); while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); SLIST_REMOVE_HEAD(&sc->devmem, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } if (sc->vm != NULL) vm_destroy(sc->vm); chgvmmcnt(sc->ucred->cr_ruidinfo, -1, 0); crfree(sc->ucred); sx_xlock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); + if ((sc->flags & VMMCTL_CREATE_DESTROY_ON_CLOSE) != 0) + LIST_REMOVE(sc, priv_link); sx_xunlock(&vmmdev_mtx); + wakeup(sc); free(sc, M_VMMDEV); } static int vmmdev_lookup_and_destroy(const char *name, struct ucred *cred) { struct cdev *cdev; struct vmmdev_softc *sc; sx_xlock(&vmmdev_mtx); sc = vmmdev_lookup(name, cred); if (sc == NULL || sc->cdev == NULL) { sx_xunlock(&vmmdev_mtx); return (EINVAL); } /* * Setting 'sc->cdev' to NULL is used to indicate that the VM * is scheduled for destruction. 
*/ cdev = sc->cdev; sc->cdev = NULL; sx_xunlock(&vmmdev_mtx); - vm_suspend(sc->vm, VM_SUSPEND_DESTROY); + (void)vm_suspend(sc->vm, VM_SUSPEND_DESTROY); destroy_dev(cdev); vmmdev_destroy(sc); return (0); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); error = sysctl_handle_string(oidp, buf, buflen, req); if (error == 0 && req->newptr != NULL) error = vmmdev_lookup_and_destroy(buf, req->td->td_ucred); free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", "Destroy a vmm(4) instance (legacy interface)"); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_open = vmmdev_open, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static struct vmmdev_softc * vmmdev_alloc(struct vm *vm, struct ucred *cred) { struct vmmdev_softc *sc; sc = malloc(sizeof(*sc), M_VMMDEV, M_WAITOK | M_ZERO); SLIST_INIT(&sc->devmem); sc->vm = vm; sc->ucred = crhold(cred); return (sc); } static int -vmmdev_create(const char *name, struct ucred *cred) +vmmdev_create(const char *name, uint32_t flags, struct ucred *cred) { struct make_dev_args mda; struct cdev *cdev; struct vmmdev_softc *sc; + struct vmmctl_priv *priv; struct vm *vm; int error; if (name == NULL || strlen(name) > VM_MAX_NAMELEN) return (EINVAL); + if ((flags & ~VMMCTL_FLAGS_MASK) != 0) + return (EINVAL); + error = devfs_get_cdevpriv((void **)&priv); + if (error) + return (error); + sx_xlock(&vmmdev_mtx); sc = vmmdev_lookup(name, cred); if (sc != NULL) { sx_xunlock(&vmmdev_mtx); return (EEXIST); } error = vm_create(name, &vm); if (error != 0) { sx_xunlock(&vmmdev_mtx); return (error); } sc = vmmdev_alloc(vm, cred); SLIST_INSERT_HEAD(&head, sc, link); + sc->flags = flags; + if ((flags & VMMCTL_CREATE_DESTROY_ON_CLOSE) != 0) + LIST_INSERT_HEAD(&priv->softcs, sc, priv_link); make_dev_args_init(&mda); mda.mda_devsw = &vmmdevsw; mda.mda_cr = sc->ucred; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; error = make_dev_s(&mda, &cdev, "vmm/%s", name); if (error != 0) { sx_xunlock(&vmmdev_mtx); vmmdev_destroy(sc); return (error); } if (!chgvmmcnt(cred->cr_ruidinfo, 1, vm_maxvmms)) { sx_xunlock(&vmmdev_mtx); destroy_dev(cdev); vmmdev_destroy(sc); return (ENOMEM); } sc->cdev = cdev; sx_xunlock(&vmmdev_mtx); return (0); } static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { char *buf; int error, buflen; if (!vmm_initialized) return (ENXIO); error = vmm_priv_check(req->td->td_ucred); if (error != 0) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); error = sysctl_handle_string(oidp, buf, buflen, req); if (error == 0 && req->newptr != NULL) - error = vmmdev_create(buf, req->td->td_ucred); + error = vmmdev_create(buf, 0, req->td->td_ucred); free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", "Create a vmm(4) instance (legacy interface)"); +static void +vmmctl_dtor(void *arg) +{ + struct cdev *sc_cdev; + struct vmmdev_softc *sc; + struct vmmctl_priv *priv = arg; + + /* + * Scan the softc list for any VMs associated with + * the 
current descriptor and destroy them. + */ + sx_xlock(&vmmdev_mtx); + while (!LIST_EMPTY(&priv->softcs)) { + sc = LIST_FIRST(&priv->softcs); + sc_cdev = sc->cdev; + if (sc_cdev != NULL) { + sc->cdev = NULL; + } else { + /* + * Another thread has already + * started the removal process. + * Sleep until 'vmmdev_destroy' notifies us + * that the removal has finished. + */ + sx_sleep(sc, &vmmdev_mtx, 0, "vmmctl_dtor", 0); + continue; + } + /* + * Temporarily drop the lock to allow vmmdev_destroy to run. + */ + sx_xunlock(&vmmdev_mtx); + (void)vm_suspend(sc->vm, VM_SUSPEND_DESTROY); + destroy_dev(sc_cdev); + /* vmmdev_destroy will unlink the 'priv_link' entry. */ + vmmdev_destroy(sc); + sx_xlock(&vmmdev_mtx); + } + sx_xunlock(&vmmdev_mtx); + + free(priv, M_VMMDEV); +} + static int vmmctl_open(struct cdev *cdev, int flags, int fmt, struct thread *td) { int error; + struct vmmctl_priv *priv; error = vmm_priv_check(td->td_ucred); if (error != 0) return (error); if ((flags & FWRITE) == 0) return (EPERM); + priv = malloc(sizeof(*priv), M_VMMDEV, M_WAITOK | M_ZERO); + LIST_INIT(&priv->softcs); + error = devfs_set_cdevpriv(priv, vmmctl_dtor); + if (error != 0) { + free(priv, M_VMMDEV); + return (error); + } + return (0); } static int vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error; switch (cmd) { case VMMCTL_VM_CREATE: { struct vmmctl_vm_create *vmc; vmc = (struct vmmctl_vm_create *)data; vmc->name[VM_MAX_NAMELEN] = '\0'; for (size_t i = 0; i < nitems(vmc->reserved); i++) { if (vmc->reserved[i] != 0) { error = EINVAL; return (error); } } - error = vmmdev_create(vmc->name, td->td_ucred); + error = vmmdev_create(vmc->name, vmc->flags, td->td_ucred); break; } case VMMCTL_VM_DESTROY: { struct vmmctl_vm_destroy *vmd; vmd = (struct vmmctl_vm_destroy *)data; vmd->name[VM_MAX_NAMELEN] = '\0'; for (size_t i = 0; i < nitems(vmd->reserved); i++) { if (vmd->reserved[i] != 0) { error = EINVAL; return (error); } } error = vmmdev_lookup_and_destroy(vmd->name, td->td_ucred); break; } default: error = ENOTTY; break; } return (error); } static struct cdev *vmmctl_cdev; static struct cdevsw vmmctlsw = { .d_name = "vmmctl", .d_version = D_VERSION, .d_open = vmmctl_open, .d_ioctl = vmmctl_ioctl, }; static int vmmdev_init(void) { int error; sx_xlock(&vmmdev_mtx); error = make_dev_p(MAKEDEV_CHECKNAME, &vmmctl_cdev, &vmmctlsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmmctl"); if (error == 0) pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, "Allow use of vmm in a jail."); sx_xunlock(&vmmdev_mtx); return (error); } static int vmmdev_cleanup(void) { sx_xlock(&vmmdev_mtx); if (!SLIST_EMPTY(&head)) { sx_xunlock(&vmmdev_mtx); return (EBUSY); } if (vmmctl_cdev != NULL) { destroy_dev(vmmctl_cdev); vmmctl_cdev = NULL; } sx_xunlock(&vmmdev_mtx); return (0); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: error = vmmdev_init(); if (error != 0) break; vm_maxcpu = mp_ncpus; TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); if (vm_maxcpu > VM_MAXCPU) { printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); vm_maxcpu = VM_MAXCPU; } if (vm_maxcpu == 0) vm_maxcpu = 1; vm_maxvmms = 4 * mp_ncpus; error = vmm_modinit(); if (error == 0) vmm_initialized = true; else { error = vmmdev_cleanup(); KASSERT(error == 0, ("%s: vmmdev_cleanup failed: %d", __func__, error)); } break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0 && vmm_initialized) { error = vmm_modcleanup(); if (error) { /* * Something bad happened - prevent new * VMs from being 
				 * created
				 */
				vmm_initialized = false;
			}
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - Initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 * - vmm device initialization requires an initialized devfs.
 */
DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
    struct vm_object **objp, int nprot)
{
	struct devmem_softc *dsc;
	vm_ooffset_t first, last;
	size_t seglen;
	int error;
	bool sysmem;

	dsc = cdev->si_drv1;
	if (dsc == NULL) {
		/* 'cdev' has been created but is not ready for use */
		return (ENXIO);
	}

	first = *offset;
	last = *offset + len;
	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
		return (EINVAL);

	vm_slock_memsegs(dsc->sc->vm);

	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
	KASSERT(error == 0 && !sysmem && *objp != NULL,
	    ("%s: invalid devmem segment %d", __func__, dsc->segid));

	if (seglen >= last)
		vm_object_reference(*objp);
	else
		error = EINVAL;

	vm_unlock_memsegs(dsc->sc->vm);

	return (error);
}

static struct cdevsw devmemsw = {
	.d_name = "devmem",
	.d_version = D_VERSION,
	.d_mmap_single = devmem_mmap_single,
};

static int
devmem_create_cdev(struct vmmdev_softc *sc, int segid, char *devname)
{
	struct make_dev_args mda;
	struct devmem_softc *dsc;
	int error;

	sx_xlock(&vmmdev_mtx);

	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
	dsc->segid = segid;
	dsc->name = devname;
	dsc->sc = sc;
	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);

	make_dev_args_init(&mda);
	mda.mda_devsw = &devmemsw;
	mda.mda_cr = sc->ucred;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = dsc;
	mda.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
	error = make_dev_s(&mda, &dsc->cdev, "vmm.io/%s.%s", vm_name(sc->vm),
	    devname);
	if (error != 0) {
		SLIST_REMOVE(&sc->devmem, dsc, devmem_softc, link);
		free(dsc->name, M_VMMDEV);
		free(dsc, M_VMMDEV);
	}

	sx_xunlock(&vmmdev_mtx);

	return (error);
}

static void
devmem_destroy(void *arg)
{
	struct devmem_softc *dsc = arg;

	destroy_dev(dsc->cdev);
	dsc->cdev = NULL;
	dsc->sc = NULL;
}
diff --git a/sys/dev/vmm/vmm_dev.h b/sys/dev/vmm/vmm_dev.h
index f14176c8afad..f8f637fda687 100644
--- a/sys/dev/vmm/vmm_dev.h
+++ b/sys/dev/vmm/vmm_dev.h
@@ -1,86 +1,90 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * Copyright (C) 2015 Mihai Carabas
 * All rights reserved.
 */

#ifndef _DEV_VMM_DEV_H_
#define _DEV_VMM_DEV_H_

#include #include #include #include

#ifdef _KERNEL
struct thread;
struct vm;
struct vcpu;

int vmm_modinit(void);
int vmm_modcleanup(void);
int vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd,
    caddr_t data, int fflag, struct thread *td);

/*
 * Entry in an ioctl handler table. A number of generic ioctls are defined,
 * plus a table of machine-dependent ioctls. The flags indicate the
 * required preconditions for a given ioctl.
 *
 * Some ioctls encode a vcpuid as the first member of their ioctl structure.
 * These ioctls must specify one of the following flags:
 * - ALLOC_VCPU: create the vCPU if it does not already exist
 * - LOCK_ONE_VCPU: create the vCPU if it does not already exist
 *   and lock the vCPU for the duration of the ioctl
 * - MAYBE_ALLOC_VCPU: if the vcpuid is -1, do nothing, otherwise
 *   create the vCPU if it does not already exist
 */
struct vmmdev_ioctl {
	unsigned long cmd;
#define	VMMDEV_IOCTL_SLOCK_MEMSEGS	0x01
#define	VMMDEV_IOCTL_XLOCK_MEMSEGS	0x02
#define	VMMDEV_IOCTL_LOCK_ONE_VCPU	0x04
#define	VMMDEV_IOCTL_LOCK_ALL_VCPUS	0x08
#define	VMMDEV_IOCTL_ALLOC_VCPU		0x10
#define	VMMDEV_IOCTL_MAYBE_ALLOC_VCPU	0x20
#define	VMMDEV_IOCTL_PRIV_CHECK_DRIVER	0x40
	int flags;
};

#define	VMMDEV_IOCTL(_cmd, _flags) { .cmd = (_cmd), .flags = (_flags) }

extern const struct vmmdev_ioctl vmmdev_machdep_ioctls[];
extern const size_t vmmdev_machdep_ioctl_count;

/*
 * Upper limit on vm_maxcpu. Limited by use of uint16_t types for CPU counts as
 * well as range of vpid values for VT-x on amd64 and by the capacity of
 * cpuset_t masks. The call to new_unrhdr() in vpid_init() in vmx.c requires
 * 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)

/* Maximum number of vCPUs in a single VM. */
extern u_int vm_maxcpu;

#endif /* _KERNEL */

+#define	VMMCTL_CREATE_DESTROY_ON_CLOSE	0x1
+#define	VMMCTL_FLAGS_MASK	(VMMCTL_CREATE_DESTROY_ON_CLOSE)
+
struct vmmctl_vm_create {
	char name[VM_MAX_NAMELEN + 1];
-	int reserved[16];
+	uint32_t flags;
+	int reserved[15];
};

struct vmmctl_vm_destroy {
	char name[VM_MAX_NAMELEN + 1];
	int reserved[16];
};

#define	VMMCTL_VM_CREATE	_IOWR('V', 0, struct vmmctl_vm_create)
#define	VMMCTL_VM_DESTROY	_IOWR('V', 1, struct vmmctl_vm_destroy)

#endif /* _DEV_VMM_DEV_H_ */
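For reviewers, here is a minimal userspace sketch (illustrative only, not part of this patch) showing how the new create flag is meant to be consumed: the program opens /dev/vmmctl, creates a VM with VMMCTL_CREATE_DESTROY_ON_CLOSE set in struct vmmctl_vm_create, and relies on vmmctl_dtor() to tear the VM down once the control descriptor is closed. The include path and the VM name are assumptions, and error handling is kept minimal.

/*
 * Sketch: create a VM that is destroyed automatically when the
 * /dev/vmmctl descriptor is closed (or the process exits).
 */
#include <sys/ioctl.h>

#include <dev/vmm/vmm_dev.h>	/* assumed to be visible on the include path */

#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct vmmctl_vm_create vmc;
	int ctlfd;

	/* vmmctl_open() requires FWRITE, so open read-write. */
	ctlfd = open("/dev/vmmctl", O_RDWR);
	if (ctlfd < 0)
		err(1, "open(/dev/vmmctl)");

	/* Zeroing the structure keeps the reserved fields valid. */
	memset(&vmc, 0, sizeof(vmc));
	if (strlcpy(vmc.name, "demo-vm", sizeof(vmc.name)) >= sizeof(vmc.name))
		errx(1, "VM name too long");
	vmc.flags = VMMCTL_CREATE_DESTROY_ON_CLOSE;

	if (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc) != 0)
		err(1, "ioctl(VMMCTL_VM_CREATE)");

	/*
	 * ... use /dev/vmm/demo-vm here ...
	 *
	 * Closing ctlfd (explicitly or at process exit) runs
	 * vmmctl_dtor(), which destroys every VM created on this
	 * descriptor with VMMCTL_CREATE_DESTROY_ON_CLOSE set.
	 */
	(void)close(ctlfd);
	return (0);
}

Keeping the per-descriptor VM list in cdevpriv means the cleanup in vmmctl_dtor() also runs when a client exits abnormally, which appears to be the point of the flag; a plain VMMCTL_VM_CREATE without the flag behaves as before.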