diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 11f38c926cc3..7347c41dd311 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -1,1578 +1,1643 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include +#include #include #include +#include #include "vmmapi.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) /* * Size of the guard region before and after the virtual address space * mapping the guest physical memory. This must be a multiple of the * superpage size for performance reasons. */ #define VM_MMAP_GUARD_SIZE (4 * MB) #define PROT_RW (PROT_READ | PROT_WRITE) #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) struct vmctx { int fd; uint32_t lowmem_limit; int memflags; size_t lowmem; size_t highmem; char *baseaddr; char *name; }; #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) static int vm_device_open(const char *name) { int fd, len; char *vmfile; len = strlen("/dev/vmm/") + strlen(name) + 1; vmfile = malloc(len); assert(vmfile != NULL); snprintf(vmfile, len, "/dev/vmm/%s", name); /* Open the device file */ fd = open(vmfile, O_RDWR, 0); free(vmfile); return (fd); } int vm_create(const char *name) { /* Try to load vmm(4) module before creating a guest. 
*/ if (modfind("vmm") < 0) kldload("vmm"); return (CREATE((char *)name)); } struct vmctx * vm_open(const char *name) { struct vmctx *vm; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); vm->fd = -1; vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); if ((vm->fd = vm_device_open(vm->name)) < 0) goto err; return (vm); err: vm_destroy(vm); return (NULL); } void vm_destroy(struct vmctx *vm) { assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); DESTROY(vm->name); free(vm); } int vm_parse_memsize(const char *optarg, size_t *ret_memsize) { char *endptr; size_t optval; int error; optval = strtoul(optarg, &endptr, 0); if (*optarg != '\0' && *endptr == '\0') { /* * For the sake of backward compatibility if the memory size * specified on the command line is less than a megabyte then * it is interpreted as being in units of MB. */ if (optval < MB) optval *= MB; *ret_memsize = optval; error = 0; } else error = expand_number(optarg, ret_memsize); return (error); } uint32_t vm_get_lowmem_limit(struct vmctx *ctx) { return (ctx->lowmem_limit); } void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { ctx->lowmem_limit = limit; } void vm_set_memflags(struct vmctx *ctx, int flags) { ctx->memflags = flags; } int vm_get_memflags(struct vmctx *ctx) { return (ctx->memflags); } /* * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot) { struct vm_memmap memmap; int error, flags; memmap.gpa = gpa; memmap.segid = segid; memmap.segoff = off; memmap.len = len; memmap.prot = prot; memmap.flags = 0; if (ctx->memflags & VM_MEM_F_WIRED) memmap.flags |= VM_MEMMAP_F_WIRED; /* * If this mapping already exists then don't create it again. This * is the common case for SYSMEM mappings created by bhyveload(8). */ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); if (error == 0 && gpa == memmap.gpa) { if (segid != memmap.segid || off != memmap.segoff || prot != memmap.prot || flags != memmap.flags) { errno = EEXIST; return (-1); } else { return (0); } } error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } +int +vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, + size_t *lowmem_size, size_t *highmem_size) +{ + + *guest_baseaddr = ctx->baseaddr; + *lowmem_size = ctx->lowmem; + *highmem_size = ctx->highmem; + return (0); +} + int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct vm_memmap memmap; int error; bzero(&memmap, sizeof(struct vm_memmap)); memmap.gpa = *gpa; error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); if (error == 0) { *gpa = memmap.gpa; *segid = memmap.segid; *segoff = memmap.segoff; *len = memmap.len; *prot = memmap.prot; *flags = memmap.flags; } return (error); } /* * Return 0 if the segments are identical and non-zero otherwise. * * This is slightly complicated by the fact that only device memory segments * are named. */ static int cmpseg(size_t len, const char *str, size_t len2, const char *str2) { if (len == len2) { if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) return (0); } return (-1); } static int vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { struct vm_memseg memseg; size_t n; int error; /* * If the memory segment has already been created then just return. 
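[Editor's note: a minimal, hypothetical lifecycle sketch, not part of this change, tying together vm_create(), vm_open(), vm_parse_memsize(), vm_setup_memory() and vm_destroy() as declared above. The guest name "demo", the "512M" size string, the demo_* function name and the err(3) error handling are illustrative assumptions only; header order follows what in-tree consumers such as bhyve(8) typically use.]

#include <sys/types.h>

#include <machine/vmm.h>
#include <vmmapi.h>

#include <err.h>
#include <stdlib.h>

/* Create a VM named "demo", map 512MB of guest memory, then tear it down. */
static void
demo_vm_lifecycle(void)
{
        struct vmctx *ctx;
        size_t memsize;

        if (vm_parse_memsize("512M", &memsize) != 0)
                err(1, "vm_parse_memsize");
        if (vm_create("demo") != 0)
                err(1, "vm_create");
        if ((ctx = vm_open("demo")) == NULL)
                err(1, "vm_open");
        if (vm_setup_memory(ctx, memsize, VM_MMAP_ALL) != 0)
                err(1, "vm_setup_memory");

        /* ... configure vcpus and run the guest here ... */

        vm_destroy(ctx);        /* closes the fd and destroys the kernel VM */
}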
* This is the usual case for the SYSMEM segment created by userspace * loaders like bhyveload(8). */ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, sizeof(memseg.name)); if (error) return (error); if (memseg.len != 0) { if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { errno = EINVAL; return (-1); } else { return (0); } } bzero(&memseg, sizeof(struct vm_memseg)); memseg.segid = segid; memseg.len = len; if (name != NULL) { n = strlcpy(memseg.name, name, sizeof(memseg.name)); if (n >= sizeof(memseg.name)) { errno = ENAMETOOLONG; return (-1); } } error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); return (error); } int vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, size_t bufsize) { struct vm_memseg memseg; size_t n; int error; memseg.segid = segid; error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); if (error == 0) { *lenp = memseg.len; n = strlcpy(namebuf, memseg.name, bufsize); if (n >= bufsize) { errno = ENAMETOOLONG; error = -1; } } return (error); } static int setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) { char *ptr; int error, flags; /* Map 'len' bytes starting at 'gpa' in the guest address space */ error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); if (error) return (error); flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap into the process address space on the host */ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); if (ptr == MAP_FAILED) return (-1); return (0); } int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { size_t objsize, len; vm_paddr_t gpa; char *baseaddr, *ptr; int error; assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then * create another 'highmem' segment above 4GB for the remainder. */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; ctx->highmem = memsize - ctx->lowmem_limit; objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; objsize = ctx->lowmem; } error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); if (error) return (error); /* * Stake out a contiguous region covering the guest physical memory * and the adjoining guard regions. */ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (ptr == MAP_FAILED) return (-1); baseaddr = ptr + VM_MMAP_GUARD_SIZE; if (ctx->highmem > 0) { gpa = 4*GB; len = ctx->highmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } if (ctx->lowmem > 0) { gpa = 0; len = ctx->lowmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } ctx->baseaddr = baseaddr; return (0); } /* * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in * the lowmem or highmem regions. * * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. * The instruction emulation code depends on this behavior. 
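[Editor's note: an illustrative sketch, not part of this change, of the address-space layout established above and of the new vm_rev_map_gpa() reverse mapping added just below. It assumes a guest configured with the default 3 GB lowmem limit plus memory above 4 GB, so GPAs [0, 3GB) and [4GB, top) are backed and the [3GB, 4GB) MMIO hole is not; the demo_* name and the example GPAs are made up.]

#include <assert.h>
#include <stdio.h>

#include <machine/vmm.h>
#include <vmmapi.h>

/* Assumes 3GB of lowmem and additional guest memory above 4GB. */
static void
demo_layout_and_reverse_map(struct vmctx *ctx)
{
        void *hva;
        vm_paddr_t gpa;

        assert(vm_map_gpa(ctx, 0, 4096) != NULL);               /* lowmem */
        assert(vm_map_gpa(ctx, 3UL << 30, 4096) == NULL);       /* MMIO hole */
        assert(vm_map_gpa(ctx, 4UL << 30, 4096) != NULL);       /* highmem */

        /* Round-trip: map a guest page, then recover its GPA from the HVA. */
        hva = vm_map_gpa(ctx, 0x1000, 4096);
        gpa = vm_rev_map_gpa(ctx, hva);
        printf("hva %p -> gpa 0x%lx\n", hva, (unsigned long)gpa);
}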
*/ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { if (ctx->lowmem > 0) { if (gaddr < ctx->lowmem && len <= ctx->lowmem && gaddr + len <= ctx->lowmem) return (ctx->baseaddr + gaddr); } if (ctx->highmem > 0) { if (gaddr >= 4*GB) { if (gaddr < 4*GB + ctx->highmem && len <= ctx->highmem && gaddr + len <= 4*GB + ctx->highmem) return (ctx->baseaddr + gaddr); } } return (NULL); } +vm_paddr_t +vm_rev_map_gpa(struct vmctx *ctx, void *addr) +{ + vm_paddr_t offaddr; + + offaddr = (char *)addr - ctx->baseaddr; + + if (ctx->lowmem > 0) + if (offaddr >= 0 && offaddr <= ctx->lowmem) + return (offaddr); + + if (ctx->highmem > 0) + if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem) + return (offaddr); + + return ((vm_paddr_t)-1); +} + +/* TODO: maximum size for vmname */ +int +vm_get_name(struct vmctx *ctx, char *buf, size_t max_len) +{ + + if (strlcpy(buf, ctx->name, max_len) >= max_len) + return (EINVAL); + return (0); +} + size_t vm_get_lowmem_size(struct vmctx *ctx) { return (ctx->lowmem); } size_t vm_get_highmem_size(struct vmctx *ctx) { return (ctx->highmem); } void * vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) { char pathname[MAXPATHLEN]; size_t len2; char *base, *ptr; int fd, error, flags; fd = -1; ptr = MAP_FAILED; if (name == NULL || strlen(name) == 0) { errno = EINVAL; goto done; } error = vm_alloc_memseg(ctx, segid, len, name); if (error) goto done; strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); strlcat(pathname, ctx->name, sizeof(pathname)); strlcat(pathname, ".", sizeof(pathname)); strlcat(pathname, name, sizeof(pathname)); fd = open(pathname, O_RDWR); if (fd < 0) goto done; /* * Stake out a contiguous region covering the device memory and the * adjoining guard regions. */ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (base == MAP_FAILED) goto done; flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap the devmem region in the host address space */ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); done: if (fd >= 0) close(fd); return (ptr); } int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; vmsegdesc.desc.base = base; vmsegdesc.desc.limit = limit; vmsegdesc.desc.access = access; error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); return (error); } int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); if (error == 0) { *base = vmsegdesc.desc.base; *limit = vmsegdesc.desc.limit; *access = vmsegdesc.desc.access; } return (error); } int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) { int error; error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, &seg_desc->access); return (error); } int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; vmreg.regval = val; error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); return (error); } int vm_get_register(struct vmctx *ctx, int vcpu, int reg, 
uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.cpuid = vcpu; vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset); return (error); } int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.cpuid = vcpu; vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset); return (error); } int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { int error; struct vm_run vmrun; bzero(&vmrun, sizeof(vmrun)); vmrun.cpuid = vcpu; error = ioctl(ctx->fd, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); return (error); } int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { struct vm_suspend vmsuspend; bzero(&vmsuspend, sizeof(vmsuspend)); vmsuspend.how = how; return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); } int vm_reinit(struct vmctx *ctx) { return (ioctl(ctx->fd, VM_REINIT, 0)); } int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vm_exception exc; exc.cpuid = vcpu; exc.vector = vector; exc.error_code = errcode; exc.error_code_valid = errcode_valid; exc.restart_instruction = restart_instruction; return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); } int vm_apicid2vcpu(struct vmctx *ctx, int apicid) { /* * The apic id associated with the 'vcpu' has the same numerical value * as the 'vcpu' itself. 
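[Editor's note: a hedged sketch, not part of this change, of the basic vcpu run loop built on vm_set_register()/vm_run() above. The 0xfff0 reset vector matches what vcpu_reset() below programs; real consumers dispatch every vm_exit reason to device emulation, which is elided here, and the demo_* name is an assumption.]

#include <machine/vmm.h>
#include <vmmapi.h>

#include <err.h>
#include <stdio.h>

/* Run one vcpu until the guest executes HLT. */
static void
demo_run_vcpu(struct vmctx *ctx, int vcpu)
{
        struct vm_exit vmexit;

        (void)vm_activate_cpu(ctx, vcpu);       /* no-op if already active */
        if (vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, 0xfff0) != 0)
                err(1, "vm_set_register");
        for (;;) {
                if (vm_run(ctx, vcpu, &vmexit) != 0)
                        err(1, "vm_run");
                if (vmexit.exitcode == VM_EXITCODE_HLT)
                        break;
                /* hand other exit reasons to device emulation here */
        }
        printf("vcpu %d halted at rip 0x%lx\n", vcpu, (unsigned long)vmexit.rip);
}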
*/ return (apicid); } int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); } int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq)); } int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) { struct vm_lapic_msi vmmsi; bzero(&vmmsi, sizeof(vmmsi)); vmmsi.addr = addr; vmmsi.msg = msg; return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); } int vm_ioapic_assert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); } int vm_ioapic_pincount(struct vmctx *ctx, int *pincount) { return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); } int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); } int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); } int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); } int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger) { struct vm_isa_irq_trigger isa_irq_trigger; bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); isa_irq_trigger.atpic_irq = atpic_irq; isa_irq_trigger.trigger = trigger; return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); } int vm_inject_nmi(struct vmctx *ctx, int vcpu) { struct vm_nmi vmnmi; bzero(&vmnmi, sizeof(vmnmi)); vmnmi.cpuid = vcpu; return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); } static const char *capstrmap[] = { [VM_CAP_HALT_EXIT] = "hlt_exit", [VM_CAP_MTRAP_EXIT] = "mtrap_exit", [VM_CAP_PAUSE_EXIT] = "pause_exit", [VM_CAP_UNRESTRICTED_GUEST] = "unrestricted_guest", [VM_CAP_ENABLE_INVPCID] = "enable_invpcid", [VM_CAP_BPT_EXIT] = "bpt_exit", }; int vm_capability_name2type(const char *capname) { int i; for (i = 0; i < nitems(capstrmap); i++) { if (strcmp(capstrmap[i], capname) == 0) return (i); } return (-1); } const char * vm_capability_type2name(int type) { if (type >= 0 && type < nitems(capstrmap)) return (capstrmap[type]); return (NULL); } int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; error = ioctl(ctx->fd, 
VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); } int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); } int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; pptmmio.hpa = hpa; return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); } int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { struct vm_pptdev_msi pptmsi; bzero(&pptmsi, sizeof(pptmsi)); pptmsi.vcpu = vcpu; pptmsi.bus = bus; pptmsi.slot = slot; pptmsi.func = func; pptmsi.msg = msg; pptmsi.addr = addr; pptmsi.numvec = numvec; return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct vm_pptdev_msix pptmsix; bzero(&pptmsix, sizeof(pptmsix)); pptmsix.vcpu = vcpu; pptmsix.bus = bus; pptmsix.slot = slot; pptmsix.func = func; pptmsix.idx = idx; pptmsix.msg = msg; pptmsix.addr = addr; pptmsix.vector_control = vector_control; return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) { int error; static struct vm_stats vmstats; vmstats.cpuid = vcpu; error = ioctl(ctx->fd, VM_STATS, &vmstats); if (error == 0) { if (ret_entries) *ret_entries = vmstats.num_entries; if (ret_tv) *ret_tv = vmstats.tv; return (vmstats.statbuf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); *state = x2apic.state; return (error); } int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; x2apic.state = state; error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); return (error); } /* * From Intel Vol 3a: * Table 9-1. 
IA-32 Processor States Following Power-up, Reset or INIT */ int vcpu_reset(struct vmctx *vmctx, int vcpu) { int error; uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; uint32_t desc_access, desc_limit; uint16_t sel; zero = 0; rflags = 0x2; error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; rip = 0xfff0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; cr0 = CR0_NE; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) goto done; cr4 = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; /* * CS: present, r/w, accessed, 16-bit, byte granularity, usable */ desc_base = 0xffff0000; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0xf000; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) goto done; /* * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) goto done; /* General purpose registers */ rdx = 0xf00; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) goto done; /* GDTR, IDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; /* TR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0000008b; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) goto done; /* LDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x00000082; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, 
desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; /* XXX cr2, debug registers */ error = 0; done: return (error); } int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) { int error, i; struct vm_gpa_pte gpapte; bzero(&gpapte, sizeof(gpapte)); gpapte.gpa = gpa; error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); if (error == 0) { *num = gpapte.ptenum; for (i = 0; i < gpapte.ptenum; i++) pte[i] = gpapte.pte[i]; } return (error); } int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) { int error; struct vm_hpet_cap cap; bzero(&cap, sizeof(struct vm_hpet_cap)); error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); if (capabilities != NULL) *capabilities = cap.capabilities; return (error); } int vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = ioctl(ctx->fd, VM_GLA2GPA, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } int vm_gla2gpa_nofault(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } #ifndef min #define min(a,b) (((a) < (b)) ? (a) : (b)) #endif int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault) { void *va; uint64_t gpa; int error, i, n, off; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; iov[i].iov_len = 0; } while (len) { assert(iovcnt > 0); error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(len, PAGE_SIZE - off); va = vm_map_gpa(ctx, gpa, n); if (va == NULL) return (EFAULT); iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; gla += n; len -= n; } return (0); } void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt) { return; } void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; size_t n; dst = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); src = iov->iov_base; bcopy(src, dst, n); iov++; dst += n; len -= n; } } void vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, size_t len) { const char *src; char *dst; size_t n; src = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); dst = iov->iov_base; bcopy(src, dst, n); iov++; src += n; len -= n; } } static int vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) { struct vm_cpuset vm_cpuset; int error; bzero(&vm_cpuset, sizeof(struct vm_cpuset)); vm_cpuset.which = which; vm_cpuset.cpusetsize = sizeof(cpuset_t); vm_cpuset.cpus = cpus; error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); return (error); } int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); } int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); } int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, 
VM_DEBUG_CPUS, cpus)); } int vm_activate_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); return (error); } int vm_suspend_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); return (error); } int vm_resume_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); return (error); } int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); if (error == 0) { *info1 = vmii.info1; *info2 = vmii.info2; } return (error); } int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; vmii.info1 = info1; error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); return (error); } int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; rtcdata.value = value; error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); return (error); } int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); if (error == 0) *retval = rtcdata.value; return (error); } int vm_rtc_settime(struct vmctx *ctx, time_t secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); rtctime.secs = secs; error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); return (error); } int vm_rtc_gettime(struct vmctx *ctx, time_t *secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); if (error == 0) *secs = rtctime.secs; return (error); } int vm_restart_instruction(void *arg, int vcpu) { struct vmctx *ctx = arg; return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } +int +vm_snapshot_req(struct vm_snapshot_meta *meta) +{ + + if (ioctl(meta->ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { +#ifdef SNAPSHOT_DEBUG + fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", + __func__, meta->dev_name, errno); +#endif + return (-1); + } + return (0); +} + +int +vm_restore_time(struct vmctx *ctx) +{ + int dummy; + + dummy = 0; + return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); +} + int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { struct vm_cpu_topology topology; bzero(&topology, sizeof (struct vm_cpu_topology)); topology.sockets = sockets; topology.cores = cores; topology.threads = threads; topology.maxcpus = maxcpus; return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); } int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { struct vm_cpu_topology topology; int error; bzero(&topology, sizeof (struct vm_cpu_topology)); error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); if (error == 0) { *sockets = topology.sockets; *cores = topology.cores; *threads = topology.threads; *maxcpus = topology.maxcpus; } return (error); } int 
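[Editor's note: a sketch, not part of this change, of how a save/restore path might use the new vm_snapshot_req() and vm_restore_time() wrappers added above. It assumes struct vm_snapshot_meta comes from machine/vmm_snapshot.h as introduced by this series, and that the caller's snapshot machinery has already filled in the metadata (context, device name, data buffer); only fields shown in this diff are touched.]

#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
#include <vmmapi.h>

#include <stdio.h>

/* Forward one device's snapshot request to the kernel. */
static int
demo_snapshot_dev(struct vm_snapshot_meta *meta)
{
        if (vm_snapshot_req(meta) != 0) {
                fprintf(stderr, "snapshot of %s failed\n", meta->dev_name);
                return (-1);
        }
        return (0);
}

/* After all devices are restored, let the kernel fix up guest timers. */
static int
demo_finish_restore(struct vmctx *ctx)
{
        return (vm_restore_time(ctx));
}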
vm_get_device_fd(struct vmctx *ctx) { return (ctx->fd); } const cap_ioctl_t * vm_get_ioctls(size_t *len) { cap_ioctl_t *cmds; /* keep in sync with machine/vmm_dev.h */ static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_GETNEXT, VM_SET_REGISTER, VM_GET_REGISTER, VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR, VM_SET_REGISTER_SET, VM_GET_REGISTER_SET, VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, VM_GLA2GPA_NOFAULT, VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, VM_SET_INTINFO, VM_GET_INTINFO, VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY }; if (len == NULL) { cmds = malloc(sizeof(vm_ioctl_cmds)); if (cmds == NULL) return (NULL); bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); return (cmds); } *len = nitems(vm_ioctl_cmds); return (NULL); } diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index 9819cda16bd9..2b026031b50f 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -1,240 +1,269 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include #include +#include /* * API version for out-of-tree consumers like grub-bhyve for making compile * time decisions. */ #define VMMAPI_VERSION 0103 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vmctx; +struct vm_snapshot_meta; enum x2apic_state; /* * Different styles of mapping the memory assigned to a VM into the address * space of the controlling process. 
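[Editor's note: an illustrative sketch, not part of this change, of the vm_get_ioctls() calling convention above, roughly as a Capsicum-sandboxed consumer would use it: a non-NULL argument only reports the count, a NULL argument returns a malloc'd copy of the command list. cap_ioctls_limit() is the standard Capsicum call; the demo_* name and the simplified error handling are assumptions.]

#include <sys/types.h>
#include <sys/capsicum.h>
#include <sys/cdefs.h>

#include <machine/vmm.h>
#include <vmmapi.h>

#include <err.h>
#include <stdlib.h>

/* Restrict the VM device descriptor to the whitelisted ioctls. */
static void
demo_limit_vm_fd(struct vmctx *ctx)
{
        const cap_ioctl_t *cmds;
        size_t ncmds;

        (void)vm_get_ioctls(&ncmds);            /* fetch the count */
        if ((cmds = vm_get_ioctls(NULL)) == NULL)
                err(1, "vm_get_ioctls");
        if (cap_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) != 0)
                err(1, "cap_ioctls_limit");
        free(__DECONST(void *, cmds));
}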
*/ enum vm_mmap_style { VM_MMAP_NONE, /* no mapping */ VM_MMAP_ALL, /* fully and statically mapped */ VM_MMAP_SPARSE, /* mappings created on-demand */ }; /* * 'flags' value passed to 'vm_set_memflags()'. */ #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ /* * Identifiers for memory segments: * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. * - the remaining identifiers can be used to create devmem segments. */ enum { VM_SYSMEM, VM_BOOTROM, VM_FRAMEBUFFER, }; /* * Get the length and name of the memory segment identified by 'segid'. * Note that system memory segments are identified with a nul name. * * Returns 0 on success and non-zero otherwise. */ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, size_t namesiz); /* * Iterate over the guest address space. This function finds an address range * that starts at an address >= *gpa. * * Returns 0 if the next address range was found and non-zero otherwise. */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); + +int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, + size_t *lowmem_size, size_t *highmem_size); + /* * Create a device memory segment identified by 'segid'. * * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. */ void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len); /* * Map the memory segment identified by 'segid' into the guest address space * at [gpa,gpa+len) with protection 'prot'. */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t segoff, size_t len, int prot); int vm_create(const char *name); int vm_get_device_fd(struct vmctx *ctx); struct vmctx *vm_open(const char *name); void vm_destroy(struct vmctx *ctx); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +/* inverse operation to vm_map_gpa - extract guest address from host pointer */ +vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); int vm_gla2gpa_nofault(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); +int vm_get_name(struct vmctx *ctx, char *buffer, size_t max_len); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_get_register_set(struct vmctx *ctx, int vcpu, 
unsigned int count, const int *regnums, uint64_t *regvals); int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger); int vm_inject_nmi(struct vmctx *ctx, int vcpu); int vm_capability_name2type(const char *capname); const char *vm_capability_type2name(int type); int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval); int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val); int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); const cap_ioctl_t *vm_get_ioctls(size_t *len); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries); const char *vm_get_stat_desc(struct vmctx *ctx, int index); int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s); int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. * The 'iovcnt' should be big enough to accommodate all GPA segments. 
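[Editor's note: a hedged sketch, not part of this change, of the GLA-to-GPA translation interface described in the comment that continues below: read a 32-bit value from a guest linear address via vm_copy_setup()/vm_copyin()/vm_copy_teardown(). The caller is assumed to already hold the vcpu's vm_guest_paging state; DEMO_MAXIOV and the demo_* name are made up for the example.]

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <machine/vmm.h>
#include <vmmapi.h>

#define DEMO_MAXIOV     2       /* a 4-byte read spans at most two pages */

/*
 * Returns 0 on success, -1 if the translation failed or an exception was
 * injected into the guest.
 */
static int
demo_read_guest_u32(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, uint32_t *valp)
{
        struct iovec iov[DEMO_MAXIOV];
        int error, fault;

        error = vm_copy_setup(ctx, vcpu, paging, gla, sizeof(*valp),
            PROT_READ, iov, DEMO_MAXIOV, &fault);
        if (error != 0 || fault)
                return (-1);
        vm_copyin(ctx, vcpu, iov, valp, sizeof(*valp));
        vm_copy_teardown(ctx, vcpu, iov, DEMO_MAXIOV);
        return (0);
}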
* * retval fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Error */ int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault); void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, struct iovec *guest_iov, size_t len); void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt); /* RTC */ int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); int vm_rtc_settime(struct vmctx *ctx, time_t secs); int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vmctx *ctx, int vcpu); int vm_suspend_cpu(struct vmctx *ctx, int vcpu); int vm_resume_cpu(struct vmctx *ctx, int vcpu); /* CPU topology */ int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); /* * FreeBSD specific APIs */ int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp); int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); + +/* + * Save and restore + */ + +#define MAX_SNAPSHOT_VMNAME 100 + +enum checkpoint_opcodes { + START_CHECKPOINT = 0, + START_SUSPEND = 1, +}; + +struct checkpoint_op { + unsigned int op; + char snapshot_filename[MAX_SNAPSHOT_VMNAME]; +}; + +int vm_snapshot_req(struct vm_snapshot_meta *meta); +int vm_restore_time(struct vmctx *ctx); + #endif /* _VMMAPI_H_ */ diff --git a/share/man/man5/src.conf.5 b/share/man/man5/src.conf.5 index da3a1f9c3044..4d28f019e100 100644 --- a/share/man/man5/src.conf.5 +++ b/share/man/man5/src.conf.5 @@ -1,1781 +1,1788 @@ .\" DO NOT EDIT-- this file is @generated by tools/build/options/makeman. .\" $FreeBSD$ -.Dd April 30, 2020 +.Dd May 4, 2020 .Dt SRC.CONF 5 .Os .Sh NAME .Nm src.conf .Nd "source build options" .Sh DESCRIPTION The .Nm file contains settings that will apply to every build involving the .Fx source tree; see .Xr build 7 . .Pp The .Nm file uses the standard makefile syntax. However, .Nm should not specify any dependencies to .Xr make 1 . Instead, .Nm is to set .Xr make 1 variables that control the aspects of how the system builds. .Pp The default location of .Nm is .Pa /etc/src.conf , though an alternative location can be specified in the .Xr make 1 variable .Va SRCCONF . Overriding the location of .Nm may be necessary if the system-wide settings are not suitable for a particular build. For instance, setting .Va SRCCONF to .Pa /dev/null effectively resets all build controls to their defaults. .Pp The only purpose of .Nm is to control the compilation of the .Fx source code, which is usually located in .Pa /usr/src . As a rule, the system administrator creates .Nm when the values of certain control variables need to be changed from their defaults. 
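[Editor's note: returning to the save/restore declarations added to vmmapi.h above, a hypothetical sketch of filling in the new struct checkpoint_op, the control message a management tool (e.g. bhyvectl) would prepare when requesting a snapshot. How the request is delivered to the bhyve process, for example over a control socket, is outside this diff and omitted; the demo_* name is an assumption.]

#include <string.h>

#include <machine/vmm.h>
#include <vmmapi.h>

/* Prepare a checkpoint (or suspend) request for the running VM. */
static void
demo_prepare_checkpoint(struct checkpoint_op *op, const char *file, int suspend)
{
        memset(op, 0, sizeof(*op));
        op->op = suspend ? START_SUSPEND : START_CHECKPOINT;
        (void)strlcpy(op->snapshot_filename, file,
            sizeof(op->snapshot_filename));
}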
.Pp In addition, control variables can be specified for a particular build via the .Fl D option of .Xr make 1 or in its environment; see .Xr environ 7 . .Pp The environment of .Xr make 1 for the build can be controlled via the .Va SRC_ENV_CONF variable, which defaults to .Pa /etc/src-env.conf . Some examples that may only be set in this file are .Va WITH_DIRDEPS_BUILD , and .Va WITH_META_MODE , and .Va MAKEOBJDIRPREFIX as they are environment-only variables. .Pp The values of variables are ignored regardless of their setting; even if they would be set to .Dq Li FALSE or .Dq Li NO . The presence of an option causes it to be honored by .Xr make 1 . .Pp This list provides a name and short description for variables that can be used for source builds. .Bl -tag -width indent .It Va WITHOUT_ACCT Set to not build process accounting tools such as .Xr accton 8 and .Xr sa 8 . .It Va WITHOUT_ACPI Set to not build .Xr acpiconf 8 , .Xr acpidump 8 and related programs. .It Va WITHOUT_APM Set to not build .Xr apm 8 , .Xr apmd 8 and related programs. .It Va WITHOUT_ASSERT_DEBUG Set to compile programs and libraries without the .Xr assert 3 checks. .It Va WITHOUT_AT Set to not build .Xr at 1 and related utilities. .It Va WITHOUT_ATM Set to not build programs and libraries related to ATM networking. .It Va WITHOUT_AUDIT Set to not build audit support into system programs. .It Va WITHOUT_AUTHPF Set to not build .Xr authpf 8 . .It Va WITHOUT_AUTOFS Set to not build .Xr autofs 5 related programs, libraries, and kernel modules. .It Va WITHOUT_AUTO_OBJ Disable automatic creation of objdirs. This is enabled by default if the wanted OBJDIR is writable by the current user. .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITH_BEARSSL Build the BearSSL library. .Pp BearSSL is a tiny SSL library suitable for embedded environments. For details see .Lk http://www.BearSSL.org/ .Pp This library is currently only used to perform signature verification and related operations for Verified Exec and .Xr loader 8 . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_LOADER_EFI_SECUREBOOT (unless .Va WITHOUT_LOADER_EFI_SECUREBOOT is set explicitly) .It Va WITH_LOADER_VERIEXEC (unless .Va WITHOUT_LOADER_VERIEXEC is set explicitly) .It Va WITH_LOADER_VERIEXEC_VECTX (unless .Va WITHOUT_LOADER_VERIEXEC_VECTX is set explicitly) .It Va WITH_VERIEXEC (unless .Va WITHOUT_VERIEXEC is set explicitly) .El .It Va WITHOUT_BHYVE Set to not build or install .Xr bhyve 8 , associated utilities, and examples. .Pp This option only affects amd64/amd64. +.It Va WITH_BHYVE_SNAPSHOT +Set to include support for save and restore (snapshots) in +.Xr bhyve 8 +and +.Xr bhyvectl 8 . +.Pp +This option only affects amd64/amd64. .It Va WITH_BIND_NOW Build all binaries with the .Dv DF_BIND_NOW flag set to indicate that the run-time loader should perform all relocation processing at process startup rather than on demand. .It Va WITHOUT_BINUTILS Do not build or install GNU .Xr as 1 and .Xr objdump 1 as part of the normal system build. .Pp This is a default setting on arm64/aarch64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_BINUTILS Build and install GNU .Xr as 1 on i386 and amd64, and .Xr objdump 1 as part of the normal system build. .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, i386/i386, mips/mips, mips/mips64, powerpc/powerpc and powerpc/powerpc64. 
.It Va WITHOUT_BINUTILS_BOOTSTRAP Do not build GNU binutils as part of the bootstrap process. .Pp This is a default setting on arm/armv6, arm/armv7, arm64/aarch64, mips/mips, mips/mips64, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_BINUTILS_BOOTSTRAP Build GNU binutils as part of the bootstrap process. .Pp This is a default setting on amd64/amd64 and i386/i386. .It Va WITHOUT_BLACKLIST Set this if you do not want to build .Xr blacklistd 8 and .Xr blacklistctl 8 . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_BLACKLIST_SUPPORT (unless .Va WITH_BLACKLIST_SUPPORT is set explicitly) .El .It Va WITHOUT_BLACKLIST_SUPPORT Set to build some programs without .Xr libblacklist 3 support, like .Xr fingerd 8 , .Xr ftpd 8 , and .Xr sshd 8 . .It Va WITHOUT_BLUETOOTH Set to not build Bluetooth related kernel modules, programs and libraries. .It Va WITHOUT_BOOT Set to not build the boot blocks and loader. .It Va WITHOUT_BOOTPARAMD Set to not build or install .Xr bootparamd 8 . .It Va WITHOUT_BOOTPD Set to not build or install .Xr bootpd 8 . .It Va WITHOUT_BSDINSTALL Set to not build .Xr bsdinstall 8 , .Xr sade 8 , and related programs. .It Va WITHOUT_BSD_CPIO Set to not build the BSD licensed version of cpio based on .Xr libarchive 3 . .It Va WITH_BSD_GREP Install BSD-licensed grep as '[ef]grep' instead of GNU grep. .It Va WITHOUT_BSNMP Set to not build or install .Xr bsnmpd 1 and related libraries and data files. .It Va WITHOUT_BZIP2 Set to not build contributed bzip2 software as a part of the base system. .Bf -symbolic The option has no effect yet. .Ef When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_BZIP2_SUPPORT (unless .Va WITH_BZIP2_SUPPORT is set explicitly) .El .It Va WITHOUT_BZIP2_SUPPORT Set to build some programs without optional bzip2 support. .It Va WITHOUT_CALENDAR Set to not build .Xr calendar 1 . .It Va WITHOUT_CAPSICUM Set to not build Capsicum support into system programs. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CASPER .El .It Va WITHOUT_CAROOT Set to not add the trusted certificates from the Mozilla NSS bundle to base. .It Va WITHOUT_CASPER Set to not build Casper program and related libraries. .It Va WITH_CCACHE_BUILD Set to use .Xr ccache 1 for the build. No configuration is required except to install the .Sy devel/ccache package. When using with .Xr distcc 1 , set .Sy CCACHE_PREFIX=/usr/local/bin/distcc . The default cache directory of .Pa $HOME/.ccache will be used, which can be overridden by setting .Sy CCACHE_DIR . The .Sy CCACHE_COMPILERCHECK option defaults to .Sy content when using the in-tree bootstrap compiler, and .Sy mtime when using an external compiler. The .Sy CCACHE_CPP2 option is used for Clang but not GCC. .Pp Sharing a cache between multiple work directories requires using a layout similar to .Pa /some/prefix/src .Pa /some/prefix/obj and an environment such as: .Bd -literal -offset indent CCACHE_BASEDIR='${SRCTOP:H}' MAKEOBJDIRPREFIX='${SRCTOP:H}/obj' .Ed .Pp See .Xr ccache 1 for more configuration options. .It Va WITHOUT_CCD Set to not build .Xr geom_ccd 4 and related utilities. .It Va WITHOUT_CDDL Set to not build code licensed under Sun's CDDL. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CTF .It .Va WITHOUT_LOADER_ZFS .It .Va WITHOUT_ZFS .El .It Va WITHOUT_CLANG Set to not build the Clang C/C++ compiler during the regular phase of the build. 
When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CLANG_EXTRAS .It .Va WITHOUT_CLANG_FULL .It .Va WITHOUT_LLVM_COV .El .It Va WITHOUT_CLANG_BOOTSTRAP Set to not build the Clang C/C++ compiler during the bootstrap phase of the build. To be able to build the system, either gcc or clang bootstrap must be enabled unless an alternate compiler is provided via XCC. .It Va WITH_CLANG_EXTRAS Set to build additional clang and llvm tools, such as bugpoint and clang-format. .It Va WITHOUT_CLANG_FULL Set to avoid building the ARCMigrate, Rewriter and StaticAnalyzer components of the Clang C/C++ compiler. .It Va WITHOUT_CLANG_IS_CC Do not install links to the Clang C/C++ compiler as .Pa /usr/bin/cc , .Pa /usr/bin/c++ and .Pa /usr/bin/cpp . .It Va WITHOUT_CPP Set to not build .Xr cpp 1 . .It Va WITHOUT_CROSS_COMPILER Set to not build any cross compiler in the cross-tools stage of buildworld. When compiling a different version of .Fx than what is installed on the system, provide an alternate compiler with XCC to ensure success. When compiling with an identical version of .Fx to the host, this option may be safely used. This option may also be safe when the host version of .Fx is close to the sources being built, but all bets are off if there have been any changes to the toolchain between the versions. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_BINUTILS_BOOTSTRAP .It .Va WITHOUT_CLANG_BOOTSTRAP .It .Va WITHOUT_ELFTOOLCHAIN_BOOTSTRAP .It .Va WITHOUT_LLD_BOOTSTRAP .El .It Va WITHOUT_CRYPT Set to not build any crypto code. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_DMAGENT .It .Va WITHOUT_KERBEROS .It .Va WITHOUT_KERBEROS_SUPPORT .It .Va WITHOUT_LDNS .It .Va WITHOUT_LDNS_UTILS .It .Va WITHOUT_OPENSSH .It .Va WITHOUT_OPENSSL .It .Va WITHOUT_PKGBOOTSTRAP .It .Va WITHOUT_SVN .It .Va WITHOUT_SVNLITE .It .Va WITHOUT_UNBOUND .It .Va WITHOUT_WIRELESS .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GSSAPI (unless .Va WITH_GSSAPI is set explicitly) .El .It Va WITH_CTF Set to compile with CTF (Compact C Type Format) data. CTF data encapsulates a reduced form of debugging information similar to DWARF and the venerable stabs and is required for DTrace. .It Va WITHOUT_CUSE Set to not build CUSE-related programs and libraries. .It Va WITHOUT_CXGBETOOL Set to not build .Xr cxgbetool 8 .Pp This is a default setting on arm/armv6, arm/armv7, mips/mips, mips/mips64, powerpc/powerpc, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_CXGBETOOL Set to build .Xr cxgbetool 8 .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386 and powerpc/powerpc64. .It Va WITHOUT_CXX Set to not build .Xr c++ 1 and related libraries. It will also prevent building of .Xr gperf 1 and .Xr devd 8 . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CLANG .It .Va WITHOUT_CLANG_EXTRAS .It .Va WITHOUT_CLANG_FULL .It .Va WITHOUT_DTRACE_TESTS .It .Va WITHOUT_GOOGLETEST .It .Va WITHOUT_LLVM_COV .It .Va WITHOUT_TESTS .El .It Va WITHOUT_DEBUG_FILES Set to avoid building or installing standalone debug files for each executable binary and shared library. .It Va WITHOUT_DIALOG Set to not build .Xr dialog 1 , .Xr dialog 3 , .Xr dpv 1 , and .Xr dpv 3 . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_BSDINSTALL .El .It Va WITHOUT_DICT Set to not build the Webster dictionary files. .It Va WITH_DIRDEPS_BUILD This is an experimental build system. 
For details see http://www.crufty.net/sjg/docs/freebsd-meta-mode.htm. Build commands can be seen from the top-level with: .Dl make show-valid-targets The build is driven by dirdeps.mk using .Va DIRDEPS stored in Makefile.depend files found in each directory. .Pp The build can be started from anywhere, and behaves the same. The initial instance of .Xr make 1 recursively reads .Va DIRDEPS from .Pa Makefile.depend , computing a graph of tree dependencies from the current origin. Setting .Va NO_DIRDEPS skips checking dirdep dependencies and will only build in the current and child directories. .Va NO_DIRDEPS_BELOW skips building any dirdeps and only build the current directory. .Pp This also utilizes the .Va WITH_META_MODE logic for incremental builds. .Pp The build hides commands executed unless .Va NO_SILENT is defined. .Pp Note that there is currently no mass install feature for this. .Pp When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITH_INSTALL_AS_USER .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_META_MODE (unless .Va WITHOUT_META_MODE is set explicitly) .It Va WITH_STAGING (unless .Va WITHOUT_STAGING is set explicitly) .It Va WITH_STAGING_MAN (unless .Va WITHOUT_STAGING_MAN is set explicitly) .It Va WITH_STAGING_PROG (unless .Va WITHOUT_STAGING_PROG is set explicitly) .It Va WITH_SYSROOT (unless .Va WITHOUT_SYSROOT is set explicitly) .El .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITH_DIRDEPS_CACHE Cache result of dirdeps.mk which can save significant time for subsequent builds. Depends on .Va WITH_DIRDEPS_BUILD . .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_DMAGENT Set to not build dma Mail Transport Agent. .It Va WITHOUT_DOCCOMPRESS Set to not install compressed system documentation. Only the uncompressed version will be installed. .It Va WITH_DTRACE_TESTS Set to build and install the DTrace test suite in .Pa /usr/tests/cddl/usr.sbin/dtrace . This test suite is considered experimental on architectures other than amd64/amd64 and running it may cause system instability. .It Va WITHOUT_DYNAMICROOT Set this if you do not want to link .Pa /bin and .Pa /sbin dynamically. .It Va WITHOUT_EE Set to not build and install .Xr edit 1 , .Xr ee 1 , and related programs. .It Va WITHOUT_EFI Set not to build .Xr efivar 3 and .Xr efivar 8 . .Pp This is a default setting on mips/mips, mips/mips64, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_EFI Set to build .Xr efivar 3 and .Xr efivar 8 . .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64 and i386/i386. .It Va WITHOUT_ELFTOOLCHAIN_BOOTSTRAP Set to not build ELF Tool Chain tools (addr2line, nm, size, strings and strip) as part of the bootstrap process. .Bf -symbolic An alternate bootstrap tool chain must be provided. .Ef .It Va WITHOUT_EXAMPLES Set to avoid installing examples to .Pa /usr/share/examples/ . .It Va WITH_EXPERIMENTAL Set to include experimental features in the build. .It Va WITH_EXTRA_TCP_STACKS Set to build extra TCP stack modules. .It Va WITHOUT_FDT Set to not build Flattened Device Tree support as part of the base system. This includes the device tree compiler (dtc) and libfdt support library. .It Va WITHOUT_FILE Set to not build .Xr file 1 and related programs. 
When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_SVNLITE .El .It Va WITHOUT_FINGER Set to not build or install .Xr finger 1 and .Xr fingerd 8 . .It Va WITHOUT_FLOPPY Set to not build or install programs for operating floppy disk driver. .It Va WITHOUT_FMTREE Set to not build and install .Pa /usr/sbin/fmtree . .It Va WITHOUT_FORMAT_EXTENSIONS Set to not enable .Fl fformat-extensions when compiling the kernel. Also disables all format checking. .It Va WITHOUT_FORTH Set to build bootloaders without Forth support. .It Va WITHOUT_FP_LIBC Set to build .Nm libc without floating-point support. .It Va WITHOUT_FREEBSD_UPDATE Set to not build .Xr freebsd-update 8 . .It Va WITHOUT_FTP Set to not build or install .Xr ftp 1 and .Xr ftpd 8 . .It Va WITHOUT_GAMES Set to not build games. .It Va WITHOUT_GDB Set to not build .Xr gdb 1 . .Pp This is a default setting on arm64/aarch64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_GDB Set to build .Xr gdb 1 . .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, i386/i386, mips/mips, mips/mips64, powerpc/powerpc and powerpc/powerpc64. .It Va WITHOUT_GNU_DIFF Set to not build GNU .Xr diff 1 and .Xr diff3 1 . .It Va WITHOUT_GNU_GREP Set to not build GNU .Xr grep 1 . .It Va WITH_GNU_GREP_COMPAT Set this option to include GNU extensions in .Xr bsdgrep 1 by linking against libgnuregex. .It Va WITHOUT_GOOGLETEST Set to neither build nor install .Lb libgmock , .Lb libgtest , and dependent tests. .Pp This is a default setting on mips/mips and mips/mips64. .It Va WITH_GOOGLETEST Set to build and install .Lb libgmock , .Lb libgtest , and dependent tests. .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITHOUT_GPIO Set to not build .Xr gpioctl 8 as part of the base system. .It Va WITHOUT_GSSAPI Set to not build libgssapi. .It Va WITHOUT_HAST Set to not build .Xr hastd 8 and related utilities. .It Va WITH_HESIOD Set to build Hesiod support. .It Va WITHOUT_HTML Set to not build HTML docs. .It Va WITHOUT_HYPERV Set to not build or install HyperV utilities. .Pp This is a default setting on arm/armv6, arm/armv7, arm64/aarch64, mips/mips, mips/mips64, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_HYPERV Set to build or install HyperV utilities. .Pp This is a default setting on amd64/amd64 and i386/i386. .It Va WITHOUT_ICONV Set to not build iconv as part of libc. .It Va WITHOUT_INCLUDES Set to not install header files. This option used to be spelled .Va NO_INCS . .Bf -symbolic The option does not work for build targets. .Ef .It Va WITHOUT_INET Set to not build programs and libraries related to IPv4 networking. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_INET_SUPPORT .El .It Va WITHOUT_INET6 Set to not build programs and libraries related to IPv6 networking. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_INET6_SUPPORT .El .It Va WITHOUT_INET6_SUPPORT Set to build libraries, programs, and kernel modules without IPv6 support. .It Va WITHOUT_INETD Set to not build .Xr inetd 8 . .It Va WITHOUT_INET_SUPPORT Set to build libraries, programs, and kernel modules without IPv4 support. .It Va WITHOUT_INSTALLLIB Set this to not install optional libraries. For example, when creating a .Xr nanobsd 8 image. .Bf -symbolic The option does not work for build targets. 
.Ef .It Va WITH_INSTALL_AS_USER Set to make install targets succeed for non-root users by installing files with owner and group attributes set to that of the user running the .Xr make 1 command. The user still must set the .Va DESTDIR variable to point to a directory where the user has write permissions. .It Va WITHOUT_IPFILTER Set to not build IP Filter package. .It Va WITHOUT_IPFW Set to not build IPFW tools. .It Va WITHOUT_IPSEC_SUPPORT Set to not build the kernel with .Xr ipsec 4 support. This option is needed for .Xr ipsec 4 and .Xr tcpmd5 4 . .It Va WITHOUT_ISCSI Set to not build .Xr iscsid 8 and related utilities. .It Va WITHOUT_JAIL Set to not build tools for the support of jails; e.g., .Xr jail 8 . .It Va WITHOUT_KDUMP Set to not build .Xr kdump 1 and .Xr truss 1 . .It Va WITHOUT_KERBEROS Set this to not build Kerberos 5 (KTH Heimdal). When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GSSAPI (unless .Va WITH_GSSAPI is set explicitly) .It Va WITHOUT_KERBEROS_SUPPORT (unless .Va WITH_KERBEROS_SUPPORT is set explicitly) .El .It Va WITHOUT_KERBEROS_SUPPORT Set to build some programs without Kerberos support, like .Xr ssh 1 , .Xr telnet 1 , .Xr sshd 8 , and .Xr telnetd 8 . .It Va WITH_KERNEL_RETPOLINE Set to enable the "retpoline" mitigation for CVE-2017-5715 in the kernel build. .It Va WITHOUT_KERNEL_SYMBOLS Set to not install kernel symbol files. .Bf -symbolic This option is recommended for those people who have small root partitions. .Ef .It Va WITHOUT_KVM Set to not build the .Nm libkvm library as a part of the base system. .Bf -symbolic The option has no effect yet. .Ef When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_KVM_SUPPORT (unless .Va WITH_KVM_SUPPORT is set explicitly) .El .It Va WITHOUT_KVM_SUPPORT Set to build some programs without optional .Nm libkvm support. .It Va WITHOUT_LDNS Setting this variable will prevent the LDNS library from being built. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_LDNS_UTILS .It .Va WITHOUT_UNBOUND .El .It Va WITHOUT_LDNS_UTILS Setting this variable will prevent building the LDNS utilities .Xr drill 1 and .Xr host 1 . .It Va WITHOUT_LEGACY_CONSOLE Set to not build programs that support a legacy PC console; e.g., .Xr kbdcontrol 1 and .Xr vidcontrol 1 . .It Va WITHOUT_LIB32 On 64-bit platforms, set to not build 32-bit library set and a .Nm ld-elf32.so.1 runtime linker. .Pp This is a default setting on arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mips, powerpc/powerpc, riscv/riscv64 and riscv/riscv64sf. .It Va WITHOUT_LIBCPLUSPLUS Set to avoid building libcxxrt and libc++. .It Va WITHOUT_LIBPTHREAD Set to not build the .Nm libpthread providing library, .Nm libthr . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_LIBTHR .El .It Va WITH_LIBSOFT On armv6 only, set to enable soft float ABI compatibility libraries. This option is for transitioning to the new hard float ABI. .It Va WITHOUT_LIBTHR Set to not build the .Nm libthr (1:1 threading) library. .It Va WITHOUT_LLD Set to not build LLVM's lld linker. .It Va WITHOUT_LLDB Set to not build the LLDB debugger. .Pp This is a default setting on arm/armv6, arm/armv7, mips/mips, mips/mips64, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_LLDB Set to build the LLDB debugger. .Pp This is a default setting on amd64/amd64, arm64/aarch64 and i386/i386. 
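.Pp
Whether one of these per-architecture defaults applies to the machine at hand can usually be checked from the top of the source tree, for example:
.Dl make -C /usr/src showconfig | grep LLDB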
.It Va WITHOUT_LLD_BOOTSTRAP Set to not build the LLD linker during the bootstrap phase of the build. To be able to build the system, either Binutils or LLD bootstrap must be enabled unless an alternate linker is provided via XLD. .It Va WITHOUT_LLD_IS_LD Set to use GNU binutils ld as the system linker, instead of LLVM's LLD. .It Va WITHOUT_LLVM_ASSERTIONS Set to disable debugging assertions in LLVM. .It Va WITHOUT_LLVM_COV Set to not build the .Xr llvm-cov 1 tool. .It Va WITHOUT_LLVM_TARGET_AARCH64 Set to not build LLVM target support for AArch64. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_ALL Set to only build the required LLVM target support. This option is preferred to specific target support options. When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_LLVM_TARGET_AARCH64 (unless .Va WITH_LLVM_TARGET_AARCH64 is set explicitly) .It Va WITHOUT_LLVM_TARGET_ARM (unless .Va WITH_LLVM_TARGET_ARM is set explicitly) .It Va WITHOUT_LLVM_TARGET_MIPS (unless .Va WITH_LLVM_TARGET_MIPS is set explicitly) .It Va WITHOUT_LLVM_TARGET_POWERPC (unless .Va WITH_LLVM_TARGET_POWERPC is set explicitly) .It Va WITHOUT_LLVM_TARGET_RISCV (unless .Va WITH_LLVM_TARGET_RISCV is set explicitly) .El .It Va WITHOUT_LLVM_TARGET_ARM Set to not build LLVM target support for ARM. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITH_LLVM_TARGET_BPF Set to build LLVM target support for BPF. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_MIPS Set to not build LLVM target support for MIPS. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_POWERPC Set to not build LLVM target support for PowerPC. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_RISCV Set to not build LLVM target support for RISC-V. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_X86 Set to not build LLVM target support for X86. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITH_LOADER_EFI_SECUREBOOT Enable building .Xr loader 8 with support for verification based on certificates obtained from UEFI. .Pp .It Va WITH_LOADER_FIREWIRE Enable firewire support in /boot/loader on x86. This option is a nop on all other platforms. .It Va WITH_LOADER_FORCE_LE Set to force the powerpc boot loader to launch the kernel in little endian mode. .It Va WITHOUT_LOADER_GELI Disable inclusion of GELI crypto support in the boot chain binaries. .Pp This is a default setting on powerpc/powerpc and powerpc/powerpc64. .It Va WITH_LOADER_GELI Set to build GELI bootloader support. .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mips, mips/mips64, riscv/riscv64 and riscv/riscv64sf. .It Va WITHOUT_LOADER_LUA Set to not build LUA bindings for the boot loader. .Pp This is a default setting on powerpc/powerpc and powerpc/powerpc64. .It Va WITH_LOADER_LUA Set to build LUA bindings for the boot loader. .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mips, mips/mips64, riscv/riscv64 and riscv/riscv64sf. .It Va WITHOUT_LOADER_OFW Disable building of openfirmware bootloader components. 
.Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mips, mips/mips64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_LOADER_OFW Set to build openfirmware bootloader components. .Pp This is a default setting on powerpc/powerpc and powerpc/powerpc64. .It Va WITHOUT_LOADER_UBOOT Disable building of ubldr. .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_LOADER_UBOOT Set to build ubldr. .Pp This is a default setting on arm/armv6, arm/armv7, mips/mips, mips/mips64, powerpc/powerpc and powerpc/powerpc64. .It Va WITH_LOADER_VERBOSE Set to build with extra verbose debugging in the loader. May explode already nearly too large loader over the limit. Use with care. .It Va WITH_LOADER_VERIEXEC Enable building .Xr loader 8 with support for verification similar to Verified Exec. .Pp Depends on .Va WITH_BEARSSL . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_LOADER_EFI_SECUREBOOT (unless .Va WITHOUT_LOADER_EFI_SECUREBOOT is set explicitly) .It Va WITH_LOADER_VERIEXEC_VECTX (unless .Va WITHOUT_LOADER_VERIEXEC_VECTX is set explicitly) .El .It Va WITH_LOADER_VERIEXEC_PASS_MANIFEST Enable building .Xr loader 8 with support to pass a verified manifest to the kernel. The kernel has to be built with a module to parse the manifest. .Pp Depends on .Va WITH_LOADER_VERIEXEC . .It Va WITHOUT_LOADER_ZFS Set to not build ZFS file system boot loader support. .It Va WITHOUT_LOCALES Set to not build localization files; see .Xr locale 1 . .It Va WITHOUT_LOCATE Set to not build .Xr locate 1 and related programs. .It Va WITHOUT_LPR Set to not build .Xr lpr 1 and related programs. .It Va WITHOUT_LS_COLORS Set to build .Xr ls 1 without support for colors to distinguish file types. .It Va WITHOUT_LZMA_SUPPORT Set to build some programs without optional lzma compression support. .It Va WITHOUT_MAIL Set to not build any mail support (MUA or MTA). When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_DMAGENT .It .Va WITHOUT_MAILWRAPPER .It .Va WITHOUT_SENDMAIL .El .It Va WITHOUT_MAILWRAPPER Set to not build the .Xr mailwrapper 8 MTA selector. .It Va WITHOUT_MAKE Set to not install .Xr make 1 and related support files. .It Va WITHOUT_MAKE_CHECK_USE_SANDBOX Set to not execute .Dq Li "make check" in limited sandbox mode. This option should be paired with .Va WITH_INSTALL_AS_USER if executed as an unprivileged user. See .Xr tests 7 for more details. .It Va WITHOUT_MAN Set to not build manual pages. When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_MAN_UTILS (unless .Va WITH_MAN_UTILS is set explicitly) .El .It Va WITHOUT_MANCOMPRESS Set to not to install compressed man pages. Only the uncompressed versions will be installed. .It Va WITHOUT_MAN_UTILS Set to not build utilities for manual pages, .Xr apropos 1 , .Xr makewhatis 1 , .Xr man 1 , .Xr whatis 1 , .Xr manctl 8 , and related support files. .It Va WITH_META_MODE Create .Xr make 1 meta files when building, which can provide a reliable incremental build when using .Xr filemon 4 . The meta file is created in OBJDIR as .Pa target.meta . These meta files track the command that was executed, its output, and the current directory. The .Xr filemon 4 module is required unless .Va NO_FILEMON is defined. When the module is loaded, any files used by the commands executed are tracked as dependencies for the target in its meta file. 
The target is considered out-of-date and rebuilt if any of these conditions are true compared to the last build: .Bl -bullet -compact .It The command to execute changes. .It The current working directory changes. .It The target's meta file is missing. .It The target's meta file is missing filemon data when filemon is loaded and a previous run did not have it loaded. .It [requires .Xr filemon 4 ] Files read, executed or linked to are newer than the target. .It [requires .Xr filemon 4 ] Files read, written, executed or linked are missing. .El The meta files can also be useful for debugging. .Pp The build hides commands that are executed unless .Va NO_SILENT is defined. Errors cause .Xr make 1 to show some of its environment for further debugging. .Pp The build operates as it normally would otherwise. This option originally invoked a different build system but that was renamed to .Va WITH_DIRDEPS_BUILD . .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_MLX5TOOL Set to not build .Xr mlx5tool 8 .Pp This is a default setting on arm/armv6, arm/armv7, mips/mips, mips/mips64, powerpc/powerpc, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_MLX5TOOL Set to build .Xr mlx5tool 8 .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386 and powerpc/powerpc64. .It Va WITHOUT_NDIS Set to not build programs and libraries related to NDIS emulation support. .It Va WITHOUT_NETCAT Set to not build .Xr nc 1 utility. .It Va WITHOUT_NETGRAPH Set to not build applications to support .Xr netgraph 4 . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_ATM .It .Va WITHOUT_BLUETOOTH .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_NETGRAPH_SUPPORT (unless .Va WITH_NETGRAPH_SUPPORT is set explicitly) .El .It Va WITHOUT_NETGRAPH_SUPPORT Set to build libraries, programs, and kernel modules without netgraph support. .It Va WITHOUT_NIS Set to not build .Xr NIS 8 support and related programs. If set, you might need to adopt your .Xr nsswitch.conf 5 and remove .Sq nis entries. .It Va WITHOUT_NLS Set to not build NLS catalogs. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_NLS_CATALOGS .El .It Va WITHOUT_NLS_CATALOGS Set to not build NLS catalog support for .Xr csh 1 . .It Va WITHOUT_NS_CACHING Set to disable name caching in the .Pa nsswitch subsystem. The generic caching daemon, .Xr nscd 8 , will not be built either if this option is set. .It Va WITHOUT_NTP Set to not build .Xr ntpd 8 and related programs. .It Va WITHOUT_NVME Set to not build nvme related tools and kernel modules. .Pp This is a default setting on arm/armv6, arm/armv7, mips/mips, mips/mips64, powerpc/powerpc, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_NVME Set to build nvme related tools and kernel modules. .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386 and powerpc/powerpc64. .It Va WITH_OFED Set to build the .Dq "OpenFabrics Enterprise Distribution" Infiniband software stack. .It Va WITH_OFED_EXTRA Set to build the non-essential components of the .Dq "OpenFabrics Enterprise Distribution" Infiniband software stack, mostly examples. .It Va WITH_OPENLDAP Enable building openldap support for kerberos. .It Va WITHOUT_OPENMP Set to not build LLVM's OpenMP runtime. .Pp This is a default setting on arm/armv6, arm/armv7, arm64/aarch64, mips/mips, mips/mips64, powerpc/powerpc, riscv/riscv64 and riscv/riscv64sf. 
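.Pp
For example, on a platform where the OpenMP runtime is built by default, adding
.Dq Li WITHOUT_OPENMP=yes
to
.Pa /etc/src.conf
skips it on the next
.Dq Li "make buildworld" .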
.It Va WITH_OPENMP Set to build LLVM's OpenMP runtime. .Pp This is a default setting on amd64/amd64, i386/i386 and powerpc/powerpc64. .It Va WITHOUT_OPENSSH Set to not build OpenSSH. .It Va WITHOUT_OPENSSL Set to not build OpenSSL. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_DMAGENT .It .Va WITHOUT_KERBEROS .It .Va WITHOUT_KERBEROS_SUPPORT .It .Va WITHOUT_LDNS .It .Va WITHOUT_LDNS_UTILS .It .Va WITHOUT_OPENSSH .It .Va WITHOUT_PKGBOOTSTRAP .It .Va WITHOUT_SVN .It .Va WITHOUT_SVNLITE .It .Va WITHOUT_UNBOUND .It .Va WITHOUT_WIRELESS .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GSSAPI (unless .Va WITH_GSSAPI is set explicitly) .El .It Va WITHOUT_PAM Set to not build PAM library and modules. .Bf -symbolic This option is deprecated and does nothing. .Ef When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_PAM_SUPPORT (unless .Va WITH_PAM_SUPPORT is set explicitly) .El .It Va WITHOUT_PAM_SUPPORT Set to build some programs without PAM support, particularly .Xr ftpd 8 and .Xr ppp 8 . .It Va WITHOUT_PF Set to not build PF firewall package. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_AUTHPF .El .It Va WITH_PIE Build dynamically linked binaries as Position-Independent Executable (PIE). .It Va WITHOUT_PKGBOOTSTRAP Set to not build .Xr pkg 7 bootstrap tool. .It Va WITHOUT_PMC Set to not build .Xr pmccontrol 8 and related programs. .It Va WITHOUT_PORTSNAP Set to not build or install .Xr portsnap 8 and related files. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_FREEBSD_UPDATE .El .It Va WITHOUT_PPP Set to not build .Xr ppp 8 and related programs. .It Va WITHOUT_PROFILE Set to not build profiled libraries for use with .Xr gprof 8 . .Pp This is a default setting on mips/mips64. .It Va WITH_PROFILE Set to build profiled libraries for use with .Xr gprof 8 . .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mips, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITHOUT_QUOTAS Set to not build .Xr quota 1 and related programs. .It Va WITHOUT_RADIUS_SUPPORT Set to not build radius support into various applications, like .Xr pam_radius 8 and .Xr ppp 8 . .It Va WITH_RATELIMIT Set to build the system with rate limit support. .Pp This makes .Dv SO_MAX_PACING_RATE effective in .Xr getsockopt 2 , and .Ar txrlimit support in .Xr ifconfig 8 , by proxy. .It Va WITHOUT_RBOOTD Set to not build or install .Xr rbootd 8 . .It Va WITH_REPRODUCIBLE_BUILD Set to exclude build metadata (such as the build time, user, or host) from the kernel, boot loaders, and uname output, so that builds produce bit-for-bit identical output. .It Va WITHOUT_RESCUE Set to not build .Xr rescue 8 . .It Va WITH_RETPOLINE Set to build the base system with the retpoline speculative execution vulnerability mitigation for CVE-2017-5715. .It Va WITHOUT_ROUTED Set to not build .Xr routed 8 utility. .It Va WITH_RPCBIND_WARMSTART_SUPPORT Set to build .Xr rpcbind 8 with warmstart support. .It Va WITHOUT_SENDMAIL Set to not build .Xr sendmail 8 and related programs. .It Va WITHOUT_SERVICESDB Set to not install .Pa /var/db/services.db . .It Va WITHOUT_SETUID_LOGIN Set this to disable the installation of .Xr login 1 as a set-user-ID root program. .It Va WITHOUT_SHAREDOCS Set to not build the .Bx 4.4 legacy docs. .It Va WITHOUT_SHARED_TOOLCHAIN Set to build the toolchain binaries as statically linked executables. 
The set includes .Xr cc 1 , .Xr make 1 and necessary utilities like assembler, linker and library archive manager. .It Va WITH_SORT_THREADS Set to enable threads in .Xr sort 1 . .It Va WITHOUT_SOURCELESS Set to not build kernel modules that include sourceless code (either microcode or native code for host CPU). When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_SOURCELESS_HOST .It .Va WITHOUT_SOURCELESS_UCODE .El .It Va WITHOUT_SOURCELESS_HOST Set to not build kernel modules that include sourceless native code for host CPU. .It Va WITHOUT_SOURCELESS_UCODE Set to not build kernel modules that include sourceless microcode. .It Va WITHOUT_SSP Set to not build world with propolice stack smashing protection. .Pp This is a default setting on mips/mips and mips/mips64. .It Va WITH_SSP Set to build world with propolice stack smashing protection. .Pp This is a default setting on amd64/amd64, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, powerpc/powerpc, powerpc/powerpc64, riscv/riscv64 and riscv/riscv64sf. .It Va WITH_STAGING Enable staging of files to a stage tree. This can be best thought of as auto-install to .Va DESTDIR with some extra meta data to ensure dependencies can be tracked. Depends on .Va WITH_DIRDEPS_BUILD . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_STAGING_MAN (unless .Va WITHOUT_STAGING_MAN is set explicitly) .It Va WITH_STAGING_PROG (unless .Va WITHOUT_STAGING_PROG is set explicitly) .El .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITH_STAGING_MAN Enable staging of man pages to stage tree. .It Va WITH_STAGING_PROG Enable staging of PROGs to stage tree. .It Va WITH_STALE_STAGED Check staged files are not stale. .It Va WITHOUT_STATS Set to neither build nor install .Lb libstats and dependent binaries. .It Va WITH_SVN Set to install .Xr svnlite 1 as .Xr svn 1 . .It Va WITHOUT_SVNLITE Set to not build .Xr svnlite 1 and related programs. .It Va WITHOUT_SYSCONS Set to not build .Xr syscons 4 support files such as keyboard maps, fonts, and screen output maps. .It Va WITH_SYSROOT Enable use of sysroot during build. Depends on .Va WITH_DIRDEPS_BUILD . .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_SYSTEM_COMPILER Set to not opportunistically skip building a cross-compiler during the bootstrap phase of the build. Normally, if the currently installed compiler matches the planned bootstrap compiler type and revision, then it will not be built. This does not prevent a compiler from being built for installation though, only for building one for the build itself. The .Va WITHOUT_CLANG option controls that. .It Va WITHOUT_SYSTEM_LINKER Set to not opportunistically skip building a cross-linker during the bootstrap phase of the build. Normally, if the currently installed linker matches the planned bootstrap linker type and revision, then it will not be built. This does not prevent a linker from being built for installation though, only for building one for the build itself. The .Va WITHOUT_LLD and .Va WITHOUT_BINUTILS options control those. .Pp This option is only relevant when .Va WITH_LLD_BOOTSTRAP is set. .It Va WITHOUT_TALK Set to not build or install .Xr talk 1 and .Xr talkd 8 . .It Va WITHOUT_TCP_WRAPPERS Set to not build or install .Xr tcpd 8 , and related utilities. .It Va WITHOUT_TCSH Set to not build and install .Pa /bin/csh (which is .Xr tcsh 1 ) . 
.It Va WITHOUT_TELNET Set to not build .Xr telnet 1 and related programs. .It Va WITHOUT_TESTS Set to not build or install the .Fx Test Suite in .Pa /usr/tests/ . See .Xr tests 7 for more details. This also disables the build of all test-related dependencies, including ATF. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_DTRACE_TESTS .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GOOGLETEST (unless .Va WITH_GOOGLETEST is set explicitly) .It Va WITHOUT_TESTS_SUPPORT (unless .Va WITH_TESTS_SUPPORT is set explicitly) .El .It Va WITHOUT_TESTS_SUPPORT Set to disable the build of all test-related dependencies, including ATF. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_GOOGLETEST .El .It Va WITHOUT_TEXTPROC Set to not build programs used for text processing. .It Va WITHOUT_TFTP Set to not build or install .Xr tftp 1 and .Xr tftpd 8 . .It Va WITHOUT_TOOLCHAIN Set to not install headers or programs used for program development, such as compilers and debuggers. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_BINUTILS .It .Va WITHOUT_CLANG .It .Va WITHOUT_CLANG_EXTRAS .It .Va WITHOUT_CLANG_FULL .It .Va WITHOUT_GDB .It .Va WITHOUT_INCLUDES .It .Va WITHOUT_LLD .It .Va WITHOUT_LLDB .It .Va WITHOUT_LLVM_COV .El .It Va WITHOUT_UNBOUND Set to not build .Xr unbound 8 and related programs. .It Va WITHOUT_UNIFIED_OBJDIR Set to use the historical object directory format for .Xr build 7 targets. For native builds and builds done directly in sub-directories the format .Pa ${MAKEOBJDIRPREFIX}/${.CURDIR} is used, while for cross-builds .Pa ${MAKEOBJDIRPREFIX}/${TARGET}.${TARGET_ARCH}/${.CURDIR} is used. .Pp This option is transitional and will be removed before the 12.0 release, at which time .Va WITH_UNIFIED_OBJDIR will be enabled permanently. .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_USB Set to not build USB-related programs and libraries. .It Va WITHOUT_USB_GADGET_EXAMPLES Set to not build USB gadget kernel modules. .It Va WITHOUT_UTMPX Set to not build user accounting tools such as .Xr last 1 , .Xr users 1 , .Xr who 1 , .Xr ac 8 , .Xr lastlogin 8 and .Xr utx 8 . .It Va WITH_VERIEXEC Enable building .Xr veriexec 8 , which loads the contents of verified manifests into the kernel for use by .Xr mac_veriexec 4 . .Pp Depends on .Va WITH_BEARSSL . .It Va WITHOUT_VI Set to not build and install vi, view, ex and related programs. .It Va WITHOUT_VT Set to not build .Xr vt 4 support files (fonts and keymaps). .It Va WITHOUT_WARNS Set this to not add warning flags to the compiler invocations. Useful as a temporary workaround when code enters the tree which triggers warnings in environments that differ from the original developer's. .It Va WITHOUT_WIRELESS Set to not build programs used for 802.11 wireless networks, especially .Xr wpa_supplicant 8 and .Xr hostapd 8 . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_WIRELESS_SUPPORT (unless .Va WITH_WIRELESS_SUPPORT is set explicitly) .El .It Va WITHOUT_WIRELESS_SUPPORT Set to build libraries, programs, and kernel modules without 802.11 wireless support. .It Va WITHOUT_WPA_SUPPLICANT_EAPOL Build .Xr wpa_supplicant 8 without support for the IEEE 802.1X protocol and without support for EAP-PEAP, EAP-TLS, EAP-LEAP, and EAP-TTLS protocols (usable only via 802.1X).
.It Va WITHOUT_ZFS Set to not build ZFS file system kernel module, libraries, and user commands. .It Va WITHOUT_ZONEINFO Set to not build the timezone database. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_ZONEINFO_LEAPSECONDS_SUPPORT .El .It Va WITH_ZONEINFO_LEAPSECONDS_SUPPORT Set to build leapsecond information in to the timezone database. .El .Sh FILES .Bl -tag -compact -width Pa .It Pa /etc/src.conf .It Pa /etc/src-env.conf .It Pa /usr/share/mk/bsd.own.mk .El .Sh SEE ALSO .Xr make 1 , .Xr make.conf 5 , .Xr build 7 , .Xr ports 7 .Sh HISTORY The .Nm file appeared in .Fx 7.0 . .Sh AUTHORS This manual page was autogenerated by .An tools/build/options/makeman . diff --git a/share/mk/src.opts.mk b/share/mk/src.opts.mk index fc03abf50c61..5b2c76452a27 100644 --- a/share/mk/src.opts.mk +++ b/share/mk/src.opts.mk @@ -1,519 +1,520 @@ # $FreeBSD$ # # Option file for FreeBSD /usr/src builds. # # Users define WITH_FOO and WITHOUT_FOO on the command line or in /etc/src.conf # and /etc/make.conf files. These translate in the build system to MK_FOO={yes,no} # with sensible (usually) defaults. # # Makefiles must include bsd.opts.mk after defining specific MK_FOO options that # are applicable for that Makefile (typically there are none, but sometimes there # are exceptions). Recursive makes usually add MK_FOO=no for options that they wish # to omit from that make. # # Makefiles must include bsd.mkopt.mk before they test the value of any MK_FOO # variable. # # Makefiles may also assume that this file is included by src.opts.mk should it # need variables defined there prior to the end of the Makefile where # bsd.{subdir,lib.bin}.mk is traditionally included. # # The old-style YES_FOO and NO_FOO are being phased out. No new instances of them # should be added. Old instances should be removed since they were just to # bridge the gap between FreeBSD 4 and FreeBSD 5. # # Makefiles should never test WITH_FOO or WITHOUT_FOO directly (although an # exception is made for _WITHOUT_SRCONF which turns off this mechanism # completely inside bsd.*.mk files). # .if !target(____) ____: .include # # Define MK_* variables (which are either "yes" or "no") for users # to set via WITH_*/WITHOUT_* in /etc/src.conf and override in the # make(1) environment. # These should be tested with `== "no"' or `!= "no"' in makefiles. # The NO_* variables should only be set by makefiles for variables # that haven't been converted over. # # These options are used by the src builds. Those listed in # __DEFAULT_YES_OPTIONS default to 'yes' and will build unless turned # off. __DEFAULT_NO_OPTIONS will default to 'no' and won't build # unless turned on. Any options listed in 'BROKEN_OPTIONS' will be # hard-wired to 'no'. "Broken" here means not working or # not-appropriate and/or not supported. It doesn't imply something is # wrong with the code. There's not a single good word for this, so # BROKEN was selected as the least imperfect one considered at the # time. Options are added to BROKEN_OPTIONS list on a per-arch basis. # At this time, there's no provision for mutually incompatible options. 
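#
# As an illustration of the mechanism described above (a hypothetical
# consumer, not part of this file): a Makefile never tests WITH_FOO or
# WITHOUT_FOO itself; it includes this file and branches on the derived
# MK_FOO value, along these lines:
#
#	.include <src.opts.mk>
#	.if ${MK_BHYVE_SNAPSHOT} != "no"
#	CFLAGS+=	-DBHYVE_SNAPSHOT
#	.endif
#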
__DEFAULT_YES_OPTIONS = \ ACCT \ ACPI \ APM \ AT \ ATM \ AUDIT \ AUTHPF \ AUTOFS \ BHYVE \ BINUTILS \ BLACKLIST \ BLUETOOTH \ BOOT \ BOOTPARAMD \ BOOTPD \ BSD_CPIO \ BSDINSTALL \ BSNMP \ BZIP2 \ CALENDAR \ CAPSICUM \ CAROOT \ CASPER \ CCD \ CDDL \ CLANG \ CLANG_BOOTSTRAP \ CLANG_IS_CC \ CPP \ CROSS_COMPILER \ CRYPT \ CUSE \ CXX \ CXGBETOOL \ DIALOG \ DICT \ DMAGENT \ DYNAMICROOT \ EE \ EFI \ ELFTOOLCHAIN_BOOTSTRAP \ EXAMPLES \ FDT \ FILE \ FINGER \ FLOPPY \ FMTREE \ FORTH \ FP_LIBC \ FREEBSD_UPDATE \ FTP \ GAMES \ GDB \ GNU_DIFF \ GNU_GREP \ GOOGLETEST \ GPIO \ HAST \ HTML \ HYPERV \ ICONV \ INET \ INET6 \ INETD \ IPFILTER \ IPFW \ ISCSI \ JAIL \ KDUMP \ KVM \ LDNS \ LDNS_UTILS \ LEGACY_CONSOLE \ LIBCPLUSPLUS \ LIBPTHREAD \ LIBTHR \ LLD \ LLD_BOOTSTRAP \ LLD_IS_LD \ LLVM_ASSERTIONS \ LLVM_COV \ LLVM_TARGET_ALL \ LOADER_GELI \ LOADER_LUA \ LOADER_OFW \ LOADER_UBOOT \ LOCALES \ LOCATE \ LPR \ LS_COLORS \ LZMA_SUPPORT \ MAIL \ MAILWRAPPER \ MAKE \ MLX5TOOL \ NDIS \ NETCAT \ NETGRAPH \ NLS_CATALOGS \ NS_CACHING \ NTP \ NVME \ OFED \ OPENSSL \ PAM \ PF \ PKGBOOTSTRAP \ PMC \ PORTSNAP \ PPP \ QUOTAS \ RADIUS_SUPPORT \ RBOOTD \ RESCUE \ ROUTED \ SENDMAIL \ SERVICESDB \ SETUID_LOGIN \ SHARED_TOOLCHAIN \ SHAREDOCS \ SOURCELESS \ SOURCELESS_HOST \ SOURCELESS_UCODE \ STATS \ SVNLITE \ SYSCONS \ SYSTEM_COMPILER \ SYSTEM_LINKER \ TALK \ TCP_WRAPPERS \ TCSH \ TELNET \ TEXTPROC \ TFTP \ UNBOUND \ USB \ UTMPX \ VI \ VT \ WIRELESS \ WPA_SUPPLICANT_EAPOL \ ZFS \ LOADER_ZFS \ ZONEINFO __DEFAULT_NO_OPTIONS = \ BEARSSL \ + BHYVE_SNAPSHOT \ BSD_GREP \ CLANG_EXTRAS \ DTRACE_TESTS \ EXPERIMENTAL \ GNU_GREP_COMPAT \ HESIOD \ LIBSOFT \ LOADER_FIREWIRE \ LOADER_FORCE_LE \ LOADER_VERBOSE \ LOADER_VERIEXEC_PASS_MANIFEST \ OFED_EXTRA \ OPENLDAP \ REPRODUCIBLE_BUILD \ RPCBIND_WARMSTART_SUPPORT \ SORT_THREADS \ SVN \ ZONEINFO_LEAPSECONDS_SUPPORT \ # LEFT/RIGHT. Left options which default to "yes" unless their corresponding # RIGHT option is disabled. __DEFAULT_DEPENDENT_OPTIONS= \ CLANG_FULL/CLANG \ LOADER_VERIEXEC/BEARSSL \ LOADER_EFI_SECUREBOOT/LOADER_VERIEXEC \ LOADER_VERIEXEC_VECTX/LOADER_VERIEXEC \ VERIEXEC/BEARSSL \ # MK_*_SUPPORT options which default to "yes" unless their corresponding # MK_* variable is set to "no". # .for var in \ BLACKLIST \ BZIP2 \ INET \ INET6 \ KERBEROS \ KVM \ NETGRAPH \ PAM \ TESTS \ WIRELESS __DEFAULT_DEPENDENT_OPTIONS+= ${var}_SUPPORT/${var} .endfor # # Default behaviour of some options depends on the architecture. Unfortunately # this means that we have to test TARGET_ARCH (the buildworld case) as well # as MACHINE_ARCH (the non-buildworld case). Normally TARGET_ARCH is not # used at all in bsd.*.mk, but we have to make an exception here if we want # to allow defaults for some things like clang to vary by target architecture. # Additional, per-target behavior should be rarely added only after much # gnashing of teeth and grinding of gears. # .if defined(TARGET_ARCH) __T=${TARGET_ARCH} .else __T=${MACHINE_ARCH} .endif # All supported backends for LLVM_TARGET_XXX __LLVM_TARGETS= \ aarch64 \ arm \ mips \ powerpc \ riscv \ x86 __LLVM_TARGET_FILT= C/(amd64|i386)/x86/:C/powerpc.*/powerpc/:C/armv[67]/arm/:C/riscv.*/riscv/:C/mips.*/mips/ .for __llt in ${__LLVM_TARGETS} # Default enable the given TARGET's LLVM_TARGET support .if ${__T:${__LLVM_TARGET_FILT}} == ${__llt} __DEFAULT_YES_OPTIONS+= LLVM_TARGET_${__llt:${__LLVM_TARGET_FILT}:tu} # aarch64 needs arm for -m32 support. 
.elif ${__T} == "aarch64" && ${__llt:Marm*} != "" __DEFAULT_DEPENDENT_OPTIONS+= LLVM_TARGET_ARM/LLVM_TARGET_AARCH64 # Default the rest of the LLVM_TARGETs to the value of MK_LLVM_TARGET_ALL. .else __DEFAULT_DEPENDENT_OPTIONS+= LLVM_TARGET_${__llt:${__LLVM_TARGET_FILT}:tu}/LLVM_TARGET_ALL .endif .endfor __DEFAULT_NO_OPTIONS+=LLVM_TARGET_BPF .include # In-tree binutils/gcc are older versions without modern architecture support. .if ${__T} == "aarch64" || ${__T:Mriscv*} != "" BROKEN_OPTIONS+=BINUTILS BINUTILS_BOOTSTRAP GDB .endif .if ${__T} == "amd64" || ${__T} == "i386" __DEFAULT_YES_OPTIONS+=BINUTILS_BOOTSTRAP .else __DEFAULT_NO_OPTIONS+=BINUTILS_BOOTSTRAP .endif .if ${__T:Mriscv*} != "" BROKEN_OPTIONS+=OFED .endif .if ${__T} == "aarch64" || ${__T} == "amd64" || ${__T} == "i386" __DEFAULT_YES_OPTIONS+=LLDB .else __DEFAULT_NO_OPTIONS+=LLDB .endif # LIB32 is supported on amd64, mips64, and powerpc64 .if (${__T} == "amd64" || ${__T:Mmips64*} || ${__T} == "powerpc64") __DEFAULT_YES_OPTIONS+=LIB32 .else BROKEN_OPTIONS+=LIB32 .endif # Only doing soft float API stuff on armv6 and armv7 .if ${__T} != "armv6" && ${__T} != "armv7" BROKEN_OPTIONS+=LIBSOFT .endif .if ${__T:Mmips*} # GOOGLETEST cannot currently be compiled on mips due to external circumstances. # Notably, the freebsd-gcc port isn't linking in libgcc so we end up trying ot # link to a hidden symbol. LLVM would successfully link this in, but some of # the mips variants are broken under LLVM until LLVM 10. GOOGLETEST should be # marked no longer broken with the switch to LLVM. BROKEN_OPTIONS+=GOOGLETEST SSP .endif # EFI doesn't exist on mips, powerpc, or riscv. .if ${__T:Mmips*} || ${__T:Mpowerpc*} || ${__T:Mriscv*} BROKEN_OPTIONS+=EFI .endif # OFW is only for powerpc, exclude others .if ${__T:Mpowerpc*} == "" BROKEN_OPTIONS+=LOADER_OFW .endif # UBOOT is only for arm, mips and powerpc, exclude others .if ${__T:Marm*} == "" && ${__T:Mmips*} == "" && ${__T:Mpowerpc*} == "" BROKEN_OPTIONS+=LOADER_UBOOT .endif # GELI and Lua in loader currently cause boot failures on powerpc. # Further debugging is required -- probably they are just broken on big # endian systems generically (they jump to null pointers or try to read # crazy high addresses, which is typical of endianness problems). .if ${__T:Mpowerpc*} BROKEN_OPTIONS+=LOADER_GELI LOADER_LUA .endif .if ${__T:Mmips64*} # profiling won't work on MIPS64 because there is only assembly for o32 BROKEN_OPTIONS+=PROFILE .endif .if ${__T} != "aarch64" && ${__T} != "amd64" && ${__T} != "i386" && \ ${__T} != "powerpc64" BROKEN_OPTIONS+=CXGBETOOL BROKEN_OPTIONS+=MLX5TOOL .endif # HyperV is currently x86-only .if ${__T} != "amd64" && ${__T} != "i386" BROKEN_OPTIONS+=HYPERV .endif # NVME is only aarch64, x86 and powerpc64 .if ${__T} != "aarch64" && ${__T} != "amd64" && ${__T} != "i386" && \ ${__T} != "powerpc64" BROKEN_OPTIONS+=NVME .endif .if ${__T} == "amd64" || ${__T} == "i386" || ${__T} == "powerpc64" __DEFAULT_YES_OPTIONS+=OPENMP .else __DEFAULT_NO_OPTIONS+=OPENMP .endif .include # # Force some options off if their dependencies are off. # Order is somewhat important. 
# .if ${MK_CAPSICUM} == "no" MK_CASPER:= no .endif .if ${MK_LIBPTHREAD} == "no" MK_LIBTHR:= no .endif .if ${MK_SOURCELESS} == "no" MK_SOURCELESS_HOST:= no MK_SOURCELESS_UCODE:= no .endif .if ${MK_CDDL} == "no" MK_ZFS:= no MK_LOADER_ZFS:= no MK_CTF:= no .endif .if ${MK_CRYPT} == "no" MK_OPENSSL:= no MK_OPENSSH:= no MK_KERBEROS:= no MK_KERBEROS_SUPPORT:= no .endif .if ${MK_CXX} == "no" MK_CLANG:= no MK_GOOGLETEST:= no MK_TESTS:= no .endif .if ${MK_DIALOG} == "no" MK_BSDINSTALL:= no .endif .if ${MK_FILE} == "no" MK_SVNLITE:= no .endif .if ${MK_MAIL} == "no" MK_MAILWRAPPER:= no MK_SENDMAIL:= no MK_DMAGENT:= no .endif .if ${MK_NETGRAPH} == "no" MK_ATM:= no MK_BLUETOOTH:= no .endif .if ${MK_NLS} == "no" MK_NLS_CATALOGS:= no .endif .if ${MK_OPENSSL} == "no" MK_DMAGENT:= no MK_OPENSSH:= no MK_KERBEROS:= no MK_KERBEROS_SUPPORT:= no MK_LDNS:= no MK_PKGBOOTSTRAP:= no MK_SVN:= no MK_SVNLITE:= no MK_WIRELESS:= no .endif .if ${MK_LDNS} == "no" MK_LDNS_UTILS:= no MK_UNBOUND:= no .endif .if ${MK_PF} == "no" MK_AUTHPF:= no .endif .if ${MK_OFED} == "no" MK_OFED_EXTRA:= no .endif .if ${MK_PORTSNAP} == "no" # freebsd-update depends on phttpget from portsnap MK_FREEBSD_UPDATE:= no .endif .if ${MK_TESTS} == "no" MK_DTRACE_TESTS:= no .endif .if ${MK_TESTS_SUPPORT} == "no" MK_GOOGLETEST:= no .endif .if ${MK_ZONEINFO} == "no" MK_ZONEINFO_LEAPSECONDS_SUPPORT:= no .endif .if ${MK_CROSS_COMPILER} == "no" MK_BINUTILS_BOOTSTRAP:= no MK_CLANG_BOOTSTRAP:= no MK_ELFTOOLCHAIN_BOOTSTRAP:= no MK_LLD_BOOTSTRAP:= no .endif .if ${MK_TOOLCHAIN} == "no" MK_BINUTILS:= no MK_CLANG:= no MK_GDB:= no MK_INCLUDES:= no MK_LLD:= no MK_LLDB:= no .endif .if ${MK_CLANG} == "no" MK_CLANG_EXTRAS:= no MK_CLANG_FULL:= no MK_LLVM_COV:= no .endif .if ${MK_LOADER_VERIEXEC} == "no" MK_LOADER_VERIEXEC_PASS_MANIFEST := no .endif # # MK_* options whose default value depends on another option. # .for vv in \ GSSAPI/KERBEROS \ MAN_UTILS/MAN .if defined(WITH_${vv:H}) MK_${vv:H}:= yes .elif defined(WITHOUT_${vv:H}) MK_${vv:H}:= no .else MK_${vv:H}:= ${MK_${vv:T}} .endif .endfor # # Set defaults for the MK_*_SUPPORT variables. # .endif # !target(____) diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index a08c90ed20be..70909510c983 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -1,746 +1,770 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_H_ #define _VMM_H_ #include #include +struct vm_snapshot_meta; + #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; /* * Identifiers for architecturally defined registers. */ enum vm_reg_name { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_DR7, VM_REG_GUEST_RSP, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS, VM_REG_GUEST_LDTR, VM_REG_GUEST_TR, VM_REG_GUEST_IDTR, VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, VM_REG_GUEST_PDPTE0, VM_REG_GUEST_PDPTE1, VM_REG_GUEST_PDPTE2, VM_REG_GUEST_PDPTE3, VM_REG_GUEST_INTR_SHADOW, VM_REG_GUEST_DR0, VM_REG_GUEST_DR1, VM_REG_GUEST_DR2, VM_REG_GUEST_DR3, VM_REG_GUEST_DR6, VM_REG_GUEST_ENTRY_INST_LENGTH, VM_REG_LAST }; enum x2apic_state { X2APIC_DISABLED, X2APIC_ENABLED, X2APIC_STATE_LAST }; #define VM_INTINFO_VECTOR(info) ((info) & 0xff) #define VM_INTINFO_DEL_ERRCODE 0x800 #define VM_INTINFO_RSVD 0x7ffff000 #define VM_INTINFO_VALID 0x80000000 #define VM_INTINFO_TYPE 0x700 #define VM_INTINFO_HWINTR (0 << 8) #define VM_INTINFO_NMI (2 << 8) #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) /* * The VM name has to fit into the pathname length constraints of devfs, * governed primarily by SPECNAMELEN. The length is the total number of * characters in the full path, relative to the mount point and not * including any leading '/' characters. * A prefix and a suffix are added to the name specified by the user. * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters * longer for future use. * The suffix is a string that identifies a bootrom image or some similar * image that is attached to the VM. A separator character gets added to * the suffix automatically when generating the full path, so it must be * accounted for, reducing the effective length by 1. * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 * bytes for FreeBSD 12. A minimum length is set for safety and supports * a SPECNAMELEN as small as 32 on old systems. 
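 *
 * For example (assuming SPECNAMELEN is 255, as on FreeBSD 13):
 * 255 - VM_MAX_PREFIXLEN (10) - VM_MAX_SUFFIXLEN (15) - 1 == 229.
 * With the older SPECNAMELEN of 63 the same formula yields 37.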
*/ #define VM_MAX_PREFIXLEN 10 #define VM_MAX_SUFFIXLEN 15 #define VM_MIN_NAMELEN 6 #define VM_MAX_NAMELEN \ (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) #ifdef _KERNEL CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); struct vm; struct vm_exception; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; struct vmspace; struct vm_object; struct vm_guest_paging; struct pmap; +enum snapshot_req; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ int *sptr; /* suspend cookie */ int *iptr; /* reqidle cookie */ }; typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, uint64_t val); typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +typedef int (*vmi_snapshot_t)(void *vmi, struct vm_snapshot_meta *meta); +typedef int (*vmi_snapshot_vmcx_t)(void *vmi, struct vm_snapshot_meta *meta, + int vcpu); +typedef int (*vmi_restore_tsc_t)(void *vmi, int vcpuid, uint64_t now); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; vmm_resume_func_t resume; vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; + + /* checkpoint operations */ + vmi_snapshot_t vmsnapshot; + vmi_snapshot_vmcx_t vmcx_snapshot; + vmi_restore_tsc_t vm_restore_tsc; }; extern struct vmm_ops vmm_ops_intel; extern struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); uint16_t vm_get_maxcpus(struct vm *vm); void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); /* * APIs that modify the guest memory map require all vcpus to be frozen. 
*/ int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot, int flags); int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); /* * APIs that inspect the guest memory map require only a *single* vcpu to * be frozen. This acts like a read lock on the guest memory map since any * modification requires *all* vcpus to be frozen. */ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, struct vm_object **objptr); vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); int vm_inject_extint(struct vm *vm, int vcpu); int vm_extint_pending(struct vm *vm, int vcpuid); void vm_extint_clear(struct vm *vm, int vcpuid); struct vlapic *vm_lapic(struct vm *vm, int cpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); int vm_set_capability(struct vm *vm, int vcpu, int type, int val); int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); int vm_suspend_cpu(struct vm *vm, int vcpu); int vm_resume_cpu(struct vm *vm, int vcpu); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); +int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); +int vm_restore_time(struct vm *vm); + #ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will * cause the thread to be put to sleep. * * If the rendezvous is being initiated from a vcpu context then the * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. * * The caller cannot hold any locks when initiating the rendezvous. * * The implementation of this API may cause vcpus other than those specified * by 'dest' to be stalled. 
The caller should not rely on any vcpus making * forward progress when the rendezvous is in progress. */ typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); int vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg); cpuset_t vm_active_cpus(struct vm *vm); cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); #endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vm_eventinfo *info) { return (*((uintptr_t *)(info->rptr)) != 0); } static __inline int vcpu_suspended(struct vm_eventinfo *info) { return (*info->sptr); } static __inline int vcpu_reqidle(struct vm_eventinfo *info) { return (*info->iptr); } int vcpu_debugged(struct vm *vm, int vcpuid); /* * Return true if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * * Return false otherwise. */ bool vmm_is_pptdev(int bus, int slot, int func); void *vm_iommu_domain(struct vm *vm); enum vcpu_state { VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING, VCPU_SLEEPING, }; int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); static int __inline vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } #ifdef _SYS_PROC_H_ static int __inline vcpu_should_yield(struct vm *vm, int vcpu) { if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) return (1); else if (curthread->td_owepreempt) return (1); else return (0); } #endif void *vcpu_stats(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); struct vrtc *vm_rtc(struct vm *vm); /* * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling * this function directly because they enforce the trap-like or fault-like * behavior of an exception. * * This function should only be called in the context of the thread that is * executing this vcpu. */ int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or * interrupt delivery through the IDT. The format of 'intinfo' is described * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. * * If a VM-exit handler completes the event delivery successfully then it * should call vm_exit_intinfo() to extinguish the pending event. For e.g., * if the task switch emulation is triggered via a task gate then it should * call this function with 'intinfo=0' to indicate that the external event * is not pending anymore. * * Return value is 0 on success and non-zero on failure. */ int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); /* * This function is called before every VM-entry to retrieve a pending * event that should be injected into the guest. This function combines * nested events into a double or triple fault. * * Returns 0 if there are no events that need to be injected into the guest * and non-zero otherwise. 
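 *
 * (Illustrative usage, not a prescription: a VM-entry path typically calls
 * this function and, when it returns non-zero, programs the hardware's
 * event-injection field with the returned 'info' before resuming the guest.)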
*/ int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); +/* + * Function used to keep track of the guest's TSC offset. The + * offset is used by the virutalization extensions to provide a consistent + * value for the Time Stamp Counter to the guest. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_set_tsc_offset(struct vm *vm, int vcpu_id, uint64_t offset); + enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { uint64_t gpa; size_t len; void *hva; void *cookie; }; /* * Set up 'copyinfo[]' to copy to/from guest linear address space starting * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * * retval is_fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling * 'vm_copy_teardown()' after the copy is done. */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); int vcpu_trace_exceptions(struct vm *vm, int vcpuid); #endif /* KERNEL */ #define VM_MAXCPU 16 /* maximum virtual cpus */ /* * Identifiers for optional vmm capabilities */ enum vm_cap_type { VM_CAP_HALT_EXIT, VM_CAP_MTRAP_EXIT, VM_CAP_PAUSE_EXIT, VM_CAP_UNRESTRICTED_GUEST, VM_CAP_ENABLE_INVPCID, VM_CAP_BPT_EXIT, VM_CAP_MAX }; enum vm_intr_trigger { EDGE_TRIGGER, LEVEL_TRIGGER }; /* * The 'access' field has the format specified in Table 21-2 of the Intel * Architecture Manual vol 3b. * * XXX The contents of the 'access' field are architecturally defined except * bit 16 - Segment Unusable. */ struct seg_desc { uint64_t base; uint32_t limit; uint32_t access; }; #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) #define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) #define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) #define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) enum vm_cpu_mode { CPU_MODE_REAL, CPU_MODE_PROTECTED, CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ }; enum vm_paging_mode { PAGING_MODE_FLAT, PAGING_MODE_32, PAGING_MODE_PAE, PAGING_MODE_64, }; struct vm_guest_paging { uint64_t cr3; int cpl; enum vm_cpu_mode cpu_mode; enum vm_paging_mode paging_mode; }; /* * The data structures 'vie' and 'vie_op' are meant to be opaque to the * consumers of instruction decoding. The only reason why their contents * need to be exposed is because they are part of the 'vm_exit' structure. */ struct vie_op { uint8_t op_byte; /* actual opcode byte */ uint8_t op_type; /* type of operation (e.g. 
MOV) */ uint16_t op_flags; }; _Static_assert(sizeof(struct vie_op) == 4, "ABI"); _Static_assert(_Alignof(struct vie_op) == 2, "ABI"); #define VIE_INST_SIZE 15 struct vie { uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ uint8_t num_valid; /* size of the instruction */ uint8_t num_processed; uint8_t addrsize:4, opsize:4; /* address and operand sizes */ uint8_t rex_w:1, /* REX prefix */ rex_r:1, rex_x:1, rex_b:1, rex_present:1, repz_present:1, /* REP/REPE/REPZ prefix */ repnz_present:1, /* REPNE/REPNZ prefix */ opsize_override:1, /* Operand size override */ addrsize_override:1, /* Address size override */ segment_override:1; /* Segment override */ uint8_t mod:2, /* ModRM byte */ reg:4, rm:4; uint8_t ss:2, /* SIB byte */ vex_present:1, /* VEX prefixed */ vex_l:1, /* L bit */ index:4, /* SIB byte */ base:4; /* SIB byte */ uint8_t disp_bytes; uint8_t imm_bytes; uint8_t scale; uint8_t vex_reg:4, /* vvvv: first source register specifier */ vex_pp:2, /* pp */ _sparebits:2; uint8_t _sparebytes[2]; int base_register; /* VM_REG_GUEST_xyz */ int index_register; /* VM_REG_GUEST_xyz */ int segment_register; /* VM_REG_GUEST_xyz */ int64_t displacement; /* optional addr displacement */ int64_t immediate; /* optional immediate operand */ uint8_t decoded; /* set to 1 if successfully decoded */ uint8_t _sparebyte; struct vie_op op; /* opcode description */ }; _Static_assert(sizeof(struct vie) == 64, "ABI"); _Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI"); _Static_assert(__offsetof(struct vie, scale) == 24, "ABI"); _Static_assert(__offsetof(struct vie, base_register) == 28, "ABI"); enum vm_exitcode { VM_EXITCODE_INOUT, VM_EXITCODE_VMX, VM_EXITCODE_BOGUS, VM_EXITCODE_RDMSR, VM_EXITCODE_WRMSR, VM_EXITCODE_HLT, VM_EXITCODE_MTRAP, VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ VM_EXITCODE_RENDEZVOUS, VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, VM_EXITCODE_TASK_SWITCH, VM_EXITCODE_MONITOR, VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, VM_EXITCODE_BPT, VM_EXITCODE_MAX }; struct vm_inout { uint16_t bytes:3; /* 1 or 2 or 4 */ uint16_t in:1; uint16_t string:1; uint16_t rep:1; uint16_t port; uint32_t eax; /* valid for out */ }; struct vm_inout_str { struct vm_inout inout; /* must be the first element */ struct vm_guest_paging paging; uint64_t rflags; uint64_t cr0; uint64_t index; uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ int addrsize; enum vm_reg_name seg_name; struct seg_desc seg_desc; }; enum task_switch_reason { TSR_CALL, TSR_IRET, TSR_JMP, TSR_IDT_GATE, /* task gate in IDT */ }; struct vm_task_switch { uint16_t tsssel; /* new TSS selector */ int ext; /* task switch due to external event */ uint32_t errcode; int errcode_valid; /* push 'errcode' on the new stack */ enum task_switch_reason reason; struct vm_guest_paging paging; }; struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ uint64_t rip; union { struct vm_inout inout; struct vm_inout_str inout_str; struct { uint64_t gpa; int fault_type; } paging; struct { uint64_t gpa; uint64_t gla; uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; } inst_emul; /* * VMX specific payload. Used when there is no "better" * exitcode to represent the VM-exit. */ struct { int status; /* vmx inst status */ /* * 'exit_reason' and 'exit_qualification' are valid * only if 'status' is zero. 
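As a hedged illustration of how a consumer walks 'struct vm_exit' (userspace can include this header as <machine/vmm.h>), the fragment below only peeks at an I/O port exit and ignores everything else; real dispatch in bhyve(8) is considerably richer.

#include <stdio.h>

static void
example_print_exit(const struct vm_exit *vme)
{
	switch (vme->exitcode) {
	case VM_EXITCODE_INOUT:
		/* 'bytes' is 1, 2 or 4; 'eax' carries the value for an OUT. */
		printf("%s port %#x, %d bytes at rip %#lx\n",
		    vme->u.inout.in ? "in" : "out", vme->u.inout.port,
		    vme->u.inout.bytes, vme->rip);
		break;
	case VM_EXITCODE_BOGUS:
		break;		/* spurious exit: simply resume the vcpu */
	default:
		break;
	}
}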
*/ uint32_t exit_reason; uint64_t exit_qualification; /* * 'inst_error' and 'inst_type' are valid * only if 'status' is non-zero. */ int inst_type; int inst_error; } vmx; /* * SVM specific payload. */ struct { uint64_t exitcode; uint64_t exitinfo1; uint64_t exitinfo2; } svm; struct { int inst_length; } bpt; struct { uint32_t code; /* ecx value */ uint64_t wval; } msr; struct { int vcpu; uint64_t rip; } spinup_ap; struct { uint64_t rflags; uint64_t intr_status; } hlt; struct { int vector; } ioapic_eoi; struct { enum vm_suspend_how how; } suspended; struct vm_task_switch task_switch; } u; }; /* APIs to inject faults into the guest */ void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, int errcode); static __inline void vm_inject_ud(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); } static __inline void vm_inject_gp(void *vm, int vcpuid) { vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); } static __inline void vm_inject_ac(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); } static __inline void vm_inject_ss(void *vm, int vcpuid, int errcode) { vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); } void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); int vm_restart_instruction(void *vm, int vcpuid); #endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index bd806e7678f4..21775b70718e 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -1,425 +1,436 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ +struct vm_snapshot_meta; + #ifdef _KERNEL void vmmdev_init(void); int vmmdev_cleanup(void); #endif struct vm_memmap { vm_paddr_t gpa; int segid; /* memory segment */ vm_ooffset_t segoff; /* offset into memory segment */ size_t len; /* mmap length */ int prot; /* RWX */ int flags; }; #define VM_MEMMAP_F_WIRED 0x01 #define VM_MEMMAP_F_IOMMU 0x02 #define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? 
(m)->name : NULL) struct vm_memseg { int segid; size_t len; char name[VM_MAX_SUFFIXLEN + 1]; }; struct vm_register { int cpuid; int regnum; /* enum vm_reg_name */ uint64_t regval; }; struct vm_seg_desc { /* data or code segment */ int cpuid; int regnum; /* enum vm_reg_name */ struct seg_desc desc; }; struct vm_register_set { int cpuid; unsigned int count; const int *regnums; /* enum vm_reg_name */ uint64_t *regvals; }; struct vm_run { int cpuid; struct vm_exit vm_exit; }; struct vm_exception { int cpuid; int vector; uint32_t error_code; int error_code_valid; int restart_instruction; }; struct vm_lapic_msi { uint64_t msg; uint64_t addr; }; struct vm_lapic_irq { int cpuid; int vector; }; struct vm_ioapic_irq { int irq; }; struct vm_isa_irq { int atpic_irq; int ioapic_irq; }; struct vm_isa_irq_trigger { int atpic_irq; enum vm_intr_trigger trigger; }; struct vm_capability { int cpuid; enum vm_cap_type captype; int capval; int allcpus; }; struct vm_pptdev { int bus; int slot; int func; }; struct vm_pptdev_mmio { int bus; int slot; int func; vm_paddr_t gpa; vm_paddr_t hpa; size_t len; }; struct vm_pptdev_msi { int vcpu; int bus; int slot; int func; int numvec; /* 0 means disabled */ uint64_t msg; uint64_t addr; }; struct vm_pptdev_msix { int vcpu; int bus; int slot; int func; int idx; uint64_t msg; uint32_t vector_control; uint64_t addr; }; struct vm_nmi { int cpuid; }; #define MAX_VM_STATS 64 struct vm_stats { int cpuid; /* in */ int num_entries; /* out */ struct timeval tv; uint64_t statbuf[MAX_VM_STATS]; }; struct vm_stat_desc { int index; /* in */ char desc[128]; /* out */ }; struct vm_x2apic { int cpuid; enum x2apic_state state; }; struct vm_gpa_pte { uint64_t gpa; /* in */ uint64_t pte[4]; /* out */ int ptenum; }; struct vm_hpet_cap { uint32_t capabilities; /* lower 32 bits of HPET capabilities */ }; struct vm_suspend { enum vm_suspend_how how; }; struct vm_gla2gpa { int vcpuid; /* inputs */ int prot; /* PROT_READ or PROT_WRITE */ uint64_t gla; struct vm_guest_paging paging; int fault; /* outputs */ uint64_t gpa; }; struct vm_activate_cpu { int vcpuid; }; struct vm_cpuset { int which; int cpusetsize; cpuset_t *cpus; }; #define VM_ACTIVE_CPUS 0 #define VM_SUSPENDED_CPUS 1 #define VM_DEBUG_CPUS 2 struct vm_intinfo { int vcpuid; uint64_t info1; uint64_t info2; }; struct vm_rtc_time { time_t secs; }; struct vm_rtc_data { int offset; uint8_t value; }; struct vm_cpu_topology { uint16_t sockets; uint16_t cores; uint16_t threads; uint16_t maxcpus; }; enum { /* general routines */ IOCNUM_ABIVERS = 0, IOCNUM_RUN = 1, IOCNUM_SET_CAPABILITY = 2, IOCNUM_GET_CAPABILITY = 3, IOCNUM_SUSPEND = 4, IOCNUM_REINIT = 5, /* memory apis */ IOCNUM_MAP_MEMORY = 10, /* deprecated */ IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */ IOCNUM_GET_GPA_PMAP = 12, IOCNUM_GLA2GPA = 13, IOCNUM_ALLOC_MEMSEG = 14, IOCNUM_GET_MEMSEG = 15, IOCNUM_MMAP_MEMSEG = 16, IOCNUM_MMAP_GETNEXT = 17, IOCNUM_GLA2GPA_NOFAULT = 18, /* register/state accessors */ IOCNUM_SET_REGISTER = 20, IOCNUM_GET_REGISTER = 21, IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, IOCNUM_SET_REGISTER_SET = 24, IOCNUM_GET_REGISTER_SET = 25, /* interrupt injection */ IOCNUM_GET_INTINFO = 28, IOCNUM_SET_INTINFO = 29, IOCNUM_INJECT_EXCEPTION = 30, IOCNUM_LAPIC_IRQ = 31, IOCNUM_INJECT_NMI = 32, IOCNUM_IOAPIC_ASSERT_IRQ = 33, IOCNUM_IOAPIC_DEASSERT_IRQ = 34, IOCNUM_IOAPIC_PULSE_IRQ = 35, IOCNUM_LAPIC_MSI = 36, IOCNUM_LAPIC_LOCAL_IRQ = 37, IOCNUM_IOAPIC_PINCOUNT = 38, IOCNUM_RESTART_INSTRUCTION = 39, /* PCI pass-thru */ IOCNUM_BIND_PPTDEV = 40, 
IOCNUM_UNBIND_PPTDEV = 41, IOCNUM_MAP_PPTDEV_MMIO = 42, IOCNUM_PPTDEV_MSI = 43, IOCNUM_PPTDEV_MSIX = 44, /* statistics */ IOCNUM_VM_STATS = 50, IOCNUM_VM_STAT_DESC = 51, /* kernel device state */ IOCNUM_SET_X2APIC_STATE = 60, IOCNUM_GET_X2APIC_STATE = 61, IOCNUM_GET_HPET_CAPABILITIES = 62, /* CPU Topology */ IOCNUM_SET_TOPOLOGY = 63, IOCNUM_GET_TOPOLOGY = 64, /* legacy interrupt injection */ IOCNUM_ISA_ASSERT_IRQ = 80, IOCNUM_ISA_DEASSERT_IRQ = 81, IOCNUM_ISA_PULSE_IRQ = 82, IOCNUM_ISA_SET_IRQ_TRIGGER = 83, /* vm_cpuset */ IOCNUM_ACTIVATE_CPU = 90, IOCNUM_GET_CPUSET = 91, IOCNUM_SUSPEND_CPU = 92, IOCNUM_RESUME_CPU = 93, /* RTC */ IOCNUM_RTC_READ = 100, IOCNUM_RTC_WRITE = 101, IOCNUM_RTC_SETTIME = 102, IOCNUM_RTC_GETTIME = 103, + + /* checkpoint */ + IOCNUM_SNAPSHOT_REQ = 113, + + IOCNUM_RESTORE_TIME = 115 }; #define VM_RUN \ _IOWR('v', IOCNUM_RUN, struct vm_run) #define VM_SUSPEND \ _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) #define VM_REINIT \ _IO('v', IOCNUM_REINIT) #define VM_ALLOC_MEMSEG \ _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) #define VM_GET_MEMSEG \ _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) #define VM_MMAP_MEMSEG \ _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) #define VM_MMAP_GETNEXT \ _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) #define VM_SET_REGISTER \ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) #define VM_GET_REGISTER \ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) #define VM_SET_SEGMENT_DESCRIPTOR \ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_GET_SEGMENT_DESCRIPTOR \ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_SET_REGISTER_SET \ _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) #define VM_GET_REGISTER_SET \ _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) #define VM_INJECT_EXCEPTION \ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) #define VM_LAPIC_IRQ \ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) #define VM_LAPIC_LOCAL_IRQ \ _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq) #define VM_LAPIC_MSI \ _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi) #define VM_IOAPIC_ASSERT_IRQ \ _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_DEASSERT_IRQ \ _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_PULSE_IRQ \ _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq) #define VM_IOAPIC_PINCOUNT \ _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int) #define VM_ISA_ASSERT_IRQ \ _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq) #define VM_ISA_DEASSERT_IRQ \ _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq) #define VM_ISA_PULSE_IRQ \ _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq) #define VM_ISA_SET_IRQ_TRIGGER \ _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger) #define VM_SET_CAPABILITY \ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) #define VM_GET_CAPABILITY \ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) #define VM_BIND_PPTDEV \ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) #define VM_UNBIND_PPTDEV \ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) #define VM_MAP_PPTDEV_MMIO \ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) #define VM_PPTDEV_MSI \ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) #define VM_PPTDEV_MSIX \ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) #define VM_STAT_DESC \ _IOWR('v', IOCNUM_VM_STAT_DESC, struct 
vm_stat_desc) #define VM_SET_X2APIC_STATE \ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_X2APIC_STATE \ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_HPET_CAPABILITIES \ _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) #define VM_SET_TOPOLOGY \ _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_TOPOLOGY \ _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_GPA_PMAP \ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) #define VM_GLA2GPA \ _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) #define VM_GLA2GPA_NOFAULT \ _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) #define VM_ACTIVATE_CPU \ _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) #define VM_GET_CPUS \ _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) #define VM_SUSPEND_CPU \ _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) #define VM_RESUME_CPU \ _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) #define VM_SET_INTINFO \ _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) #define VM_GET_INTINFO \ _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) #define VM_RTC_WRITE \ _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) #define VM_RTC_READ \ _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data) #define VM_RTC_SETTIME \ _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) #define VM_RTC_GETTIME \ _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) #define VM_RESTART_INSTRUCTION \ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) +#define VM_SNAPSHOT_REQ \ + _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta) +#define VM_RESTORE_TIME \ + _IOWR('v', IOCNUM_RESTORE_TIME, int) #endif diff --git a/sys/amd64/include/vmm_snapshot.h b/sys/amd64/include/vmm_snapshot.h new file mode 100644 index 000000000000..6ba25a5dae2e --- /dev/null +++ b/sys/amd64/include/vmm_snapshot.h @@ -0,0 +1,156 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Flavius Anton + * Copyright (c) 2016 Mihai Tiganus + * Copyright (c) 2016-2019 Mihai Carabas + * Copyright (c) 2017-2019 Darius Mihai + * Copyright (c) 2017-2019 Elena Mihailescu + * Copyright (c) 2018-2019 Sergiu Weisz + * All rights reserved. + * The bhyve-snapshot feature was developed under sponsorships + * from Matthew Grooms. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
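To make the new ioctls concrete, here is a hedged userspace sketch of requesting a save of one kernel-side structure through VM_SNAPSHOT_REQ; 'vmfd' is an already-open /dev/vmm/<name> descriptor, the buffer size is arbitrary, and the request only succeeds on a kernel built with the snapshot option. The fields follow the struct vm_snapshot_meta definition introduced in vmm_snapshot.h below; ctx and dev_name are left unset since they matter only for the userspace device path.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_snapshot.h>
#include <stddef.h>
#include <stdint.h>

static int
example_snapshot_struct(int vmfd, enum snapshot_req what,
    uint8_t *buf, size_t buflen, size_t *used)
{
	struct vm_snapshot_meta meta = {
		.dev_req = what,		/* e.g. STRUCT_VLAPIC */
		.op = VM_SNAPSHOT_SAVE,
		.buffer = {
			.buf_start = buf,
			.buf_size = buflen,
			.buf = buf,
			.buf_rem = buflen,
		},
	};

	if (ioctl(vmfd, VM_SNAPSHOT_REQ, &meta) != 0)
		return (-1);

	/* Per the header's comment, the snapshot length is buf_size - buf_rem. */
	*used = buflen - meta.buffer.buf_rem;
	return (0);
}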
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_SNAPSHOT_ +#define _VMM_SNAPSHOT_ + +#include +#include +#ifndef _KERNEL +#include +#endif + +struct vmctx; + +enum snapshot_req { + STRUCT_VMX, + STRUCT_VIOAPIC, + STRUCT_VM, + STRUCT_VLAPIC, + VM_MEM, + STRUCT_VHPET, + STRUCT_VMCX, + STRUCT_VATPIC, + STRUCT_VATPIT, + STRUCT_VPMTMR, + STRUCT_VRTC, +}; + +struct vm_snapshot_buffer { + /* + * R/O for device-specific functions; + * written by generic snapshot functions. + */ + uint8_t *const buf_start; + const size_t buf_size; + + /* + * R/W for device-specific functions used to keep track of buffer + * current position and remaining size. + */ + uint8_t *buf; + size_t buf_rem; + + /* + * Length of the snapshot is either determined as (buf_size - buf_rem) + * or (buf - buf_start) -- the second variation returns a signed value + * so it may not be appropriate. + * + * Use vm_get_snapshot_size(meta). + */ +}; + +enum vm_snapshot_op { + VM_SNAPSHOT_SAVE, + VM_SNAPSHOT_RESTORE, +}; + +struct vm_snapshot_meta { + struct vmctx *ctx; + void *dev_data; + const char *dev_name; /* identify userspace devices */ + enum snapshot_req dev_req; /* identify kernel structs */ + + struct vm_snapshot_buffer buffer; + + enum vm_snapshot_op op; +}; + + +void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op); +int vm_snapshot_buf(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta); +size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta); +int vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null, + struct vm_snapshot_meta *meta); +int vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta); + +#define SNAPSHOT_BUF_OR_LEAVE(DATA, LEN, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_buf((DATA), (LEN), (META)); \ + if ((RES) != 0) { \ + vm_snapshot_buf_err(#DATA, (META)->op); \ + goto LABEL; \ + } \ +} while (0) + +#define SNAPSHOT_VAR_OR_LEAVE(DATA, META, RES, LABEL) \ + SNAPSHOT_BUF_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL) + +/* + * Address variables are pointers to guest memory. + * + * When RNULL != 0, do not enforce invalid address checks; instead, make the + * pointer NULL at restore time. + */ +#define SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ADDR, LEN, RNULL, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_guest2host_addr((void **)&(ADDR), (LEN), (RNULL), \ + (META)); \ + if ((RES) != 0) { \ + if ((RES) == EFAULT) \ + fprintf(stderr, "%s: invalid address: %s\r\n", \ + __func__, #ADDR); \ + goto LABEL; \ + } \ +} while (0) + +/* compare the value in the meta buffer with the data */ +#define SNAPSHOT_BUF_CMP_OR_LEAVE(DATA, LEN, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_buf_cmp((DATA), (LEN), (META)); \ + if ((RES) != 0) { \ + vm_snapshot_buf_err(#DATA, (META)->op); \ + goto LABEL; \ + } \ +} while (0) + +#define SNAPSHOT_VAR_CMP_OR_LEAVE(DATA, META, RES, LABEL) \ + SNAPSHOT_BUF_CMP_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL) + +#endif diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c index d3ba62b4b19c..f9660024fe0c 100644 --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -1,2305 +1,2662 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_ktr.h" #include "vmm_ioport.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "x86.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" #include "svm_msr.h" #include "npt.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); /* * SVM CPUID function 0x8000_000A, edx bit decoding. */ #define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ #define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ #define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ #define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ #define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ #define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ #define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ #define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ #define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ #define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ #define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ #define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ VMCB_CACHE_IOPM | \ VMCB_CACHE_I | \ VMCB_CACHE_TPR | \ VMCB_CACHE_CR2 | \ VMCB_CACHE_CR | \ VMCB_CACHE_DR | \ VMCB_CACHE_DT | \ VMCB_CACHE_SEG | \ VMCB_CACHE_NP) static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, 0, NULL); static MALLOC_DEFINE(M_SVM, "svm", "svm"); static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); static uint32_t svm_feature = ~0U; /* AMD SVM features. */ SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, "SVM features advertised by CPUID.8000000AH:EDX"); static int disable_npf_assist; SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, &disable_npf_assist, 0, NULL); /* Maximum ASIDs supported by the processor */ static uint32_t nasid; SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, "Number of ASIDs supported by this processor"); /* Current ASID generation for each host cpu */ static struct asid asid[MAXCPU]; /* * SVM host state saved area of size 4KB for each core. 
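Looping back to the SNAPSHOT_*_OR_LEAVE macros introduced in vmm_snapshot.h above, this is a hedged sketch of the call pattern a device model's snapshot callback is expected to follow; the softc layout and field names here are invented purely for illustration.

struct example_softc {
	uint32_t csr;
	uint8_t fifo[64];
};

static int
example_dev_snapshot(struct vm_snapshot_meta *meta)
{
	struct example_softc *sc = meta->dev_data;
	int ret;

	/* The same sequence both saves and restores, keyed off meta->op. */
	SNAPSHOT_VAR_OR_LEAVE(sc->csr, meta, ret, done);
	SNAPSHOT_BUF_OR_LEAVE(sc->fifo, sizeof(sc->fifo), meta, ret, done);
done:
	return (ret);
}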
*/ static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); static __inline int flush_by_asid(void) { return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); } static __inline int decode_assist(void) { return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); } static void svm_disable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer &= ~EFER_SVM; wrmsr(MSR_EFER, efer); } /* * Disable SVM on all CPUs. */ static int svm_cleanup(void) { smp_rendezvous(NULL, svm_disable, NULL, NULL); return (0); } /* * Verify that all the features required by bhyve are available. */ static int check_svm_features(void) { u_int regs[4]; /* CPUID Fn8000_000A is for SVM */ do_cpuid(0x8000000A, regs); svm_feature &= regs[3]; /* * The number of ASIDs can be configured to be less than what is * supported by the hardware but not more. */ if (nasid == 0 || nasid > regs[1]) nasid = regs[1]; KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); /* bhyve requires the Nested Paging feature */ if (!(svm_feature & AMD_CPUID_SVM_NP)) { printf("SVM: Nested Paging feature not available.\n"); return (ENXIO); } /* bhyve requires the NRIP Save feature */ if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { printf("SVM: NRIP Save feature not available.\n"); return (ENXIO); } return (0); } static void svm_enable(void *arg __unused) { uint64_t efer; efer = rdmsr(MSR_EFER); efer |= EFER_SVM; wrmsr(MSR_EFER, efer); wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); } /* * Return 1 if SVM is enabled on this processor and 0 otherwise. */ static int svm_available(void) { uint64_t msr; /* Section 15.4 Enabling SVM from APM2. */ if ((amd_feature2 & AMDID2_SVM) == 0) { printf("SVM: not available.\n"); return (0); } msr = rdmsr(MSR_VM_CR); if ((msr & VM_CR_SVMDIS) != 0) { printf("SVM: disabled by BIOS.\n"); return (0); } return (1); } static int svm_init(int ipinum) { int error, cpu; if (!svm_available()) return (ENXIO); error = check_svm_features(); if (error) return (error); vmcb_clean &= VMCB_CACHE_DEFAULT; for (cpu = 0; cpu < MAXCPU; cpu++) { /* * Initialize the host ASIDs to their "highest" valid values. * * The next ASID allocation will rollover both 'gen' and 'num' * and start off the sequence at {1,1}. */ asid[cpu].gen = ~0UL; asid[cpu].num = nasid - 1; } svm_msr_init(); svm_npt_init(ipinum); /* Enable SVM on all CPUs */ smp_rendezvous(NULL, svm_enable, NULL, NULL); return (0); } static void svm_restore(void) { svm_enable(NULL); } +#ifdef BHYVE_SNAPSHOT +int +svm_set_tsc_offset(struct svm_softc *sc, int vcpu, uint64_t offset) +{ + int error; + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + ctrl->tsc_offset = offset; + + svm_set_dirty(sc, vcpu, VMCB_CACHE_I); + VCPU_CTR1(sc->vm, vcpu, "tsc offset changed to %#lx", offset); + + error = vm_set_tsc_offset(sc->vm, vcpu, offset); + + return (error); +} +#endif + /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 #define MSR_PENTIUM_END 0x1FFF /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000UL #define MSR_AMD6TH_END 0xC0001FFFUL /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000UL #define MSR_AMD7TH_END 0xC0011FFFUL /* * Get the index and bit position for a MSR in permission bitmap. 
* Two bits are used for each MSR: lower bit for read and higher bit for write. */ static int svm_msr_index(uint64_t msr, int *index, int *bit) { uint32_t base, off; *index = -1; *bit = (msr % 4) * 2; base = 0; if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) { *index = msr / 4; return (0); } base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { off = (msr - MSR_AMD6TH_START); *index = (off + base) / 4; return (0); } base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { off = (msr - MSR_AMD7TH_START); *index = (off + base) / 4; return (0); } return (EINVAL); } /* * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. */ static void svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) { int index, bit, error; error = svm_msr_index(msr, &index, &bit); KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, ("%s: invalid index %d for msr %#lx", __func__, index, msr)); KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " "msr %#lx", __func__, bit, msr)); if (read) perm_bitmap[index] &= ~(1UL << bit); if (write) perm_bitmap[index] &= ~(2UL << bit); } static void svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, true); } static void svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) { svm_msr_perm(perm_bitmap, msr, true, false); } static __inline int svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) { struct vmcb_ctrl *ctrl; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); return (ctrl->intercept[idx] & bitmask ? 1 : 0); } static __inline void svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, int enabled) { struct vmcb_ctrl *ctrl; uint32_t oldval; KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intercept[idx]; if (enabled) ctrl->intercept[idx] |= bitmask; else ctrl->intercept[idx] &= ~bitmask; if (ctrl->intercept[idx] != oldval) { svm_set_dirty(sc, vcpu, VMCB_CACHE_I); VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); } } static __inline void svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 0); } static __inline void svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) { svm_set_intercept(sc, vcpu, off, bitmask, 1); } static void vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, uint64_t msrpm_base_pa, uint64_t np_pml4) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; uint32_t mask; int n; ctrl = svm_get_vmcb_ctrl(sc, vcpu); state = svm_get_vmcb_state(sc, vcpu); ctrl->iopm_base_pa = iopm_base_pa; ctrl->msrpm_base_pa = msrpm_base_pa; /* Enable nested paging */ ctrl->np_enable = 1; ctrl->n_cr3 = np_pml4; /* * Intercept accesses to the control registers that are not shadowed * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. */ for (n = 0; n < 16; n++) { mask = (BIT(n) << 16) | BIT(n); if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); else svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); } /* * Intercept everything when tracing guest exceptions otherwise * just intercept machine check exception. 
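A short worked example for the control-register intercept mask computed in vmcb_init() above; the low 16 bits of intercept[VMCB_CR_INTCPT] are the CRx read intercepts and the high 16 bits the writes.

/*
 * n == 0 (CR0): mask = (BIT(0) << 16) | BIT(0) = 0x00010001; CR0 is
 *     shadowed in the VMCB, so both bits are left disabled (no intercept).
 * n == 5 (CR5): mask = (BIT(5) << 16) | BIT(5) = 0x00200020; CR5 is not
 *     shadowed, so its read and write intercepts are both enabled.
 */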
*/ if (vcpu_trace_exceptions(sc->vm, vcpu)) { for (n = 0; n < 32; n++) { /* * Skip unimplemented vectors in the exception bitmap. */ if (n == 2 || n == 9) { continue; } svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); } } else { svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); } /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_FERR_FREEZE); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); /* * From section "Canonicalization and Consistency Checks" in APMv2 * the VMRUN intercept bit must be set to pass the consistency check. */ svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); /* * The ASID will be set to a non-zero value just before VMRUN. */ ctrl->asid = 0; /* * Section 15.21.1, Interrupt Masking in EFLAGS * Section 15.21.2, Virtualizing APIC.TPR * * This must be set for %rflag and %cr8 isolation of guest and host. */ ctrl->v_intr_masking = 1; /* Enable Last Branch Record aka LBR for debugging */ ctrl->lbr_virt_en = 1; state->dbgctl = BIT(0); /* EFER_SVM must always be set when the guest is executing */ state->efer = EFER_SVM; /* Set up the PAT to power-on state */ state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); /* Set up DR6/7 to power-on state */ state->dr6 = DBREG_DR6_RESERVED1; state->dr7 = DBREG_DR7_RESERVED1; } /* * Initialize a virtual machine. */ static void * svm_vminit(struct vm *vm, pmap_t pmap) { struct svm_softc *svm_sc; struct svm_vcpu *vcpu; vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; int i; uint16_t maxcpus; svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO); if (((uintptr_t)svm_sc & PAGE_MASK) != 0) panic("malloc of svm_softc not aligned on page boundary"); svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM, M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); if (svm_sc->msr_bitmap == NULL) panic("contigmalloc of SVM MSR bitmap failed"); svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM, M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); if (svm_sc->iopm_bitmap == NULL) panic("contigmalloc of SVM IO bitmap failed"); svm_sc->vm = vm; svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); /* * Intercept read and write accesses to all MSRs. */ memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); /* * Access to the following MSRs is redirected to the VMCB when the * guest is executing. Therefore it is safe to allow the guest to * read/write these MSRs directly without hypervisor involvement. 
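A companion worked example for the MSR permission-bitmap arithmetic in svm_msr_index() and svm_msr_perm() above, using MSR_EFER (0xC0000080), which the initialization just below leaves read-only pass-through.

/*
 *   base  = MSR_PENTIUM_END - MSR_PENTIUM_START + 1 = 0x2000
 *   off   = 0xC0000080 - MSR_AMD6TH_START           = 0x80
 *   index = (off + base) / 4 = 0x2080 / 4            = 0x820
 *   bit   = (0xC0000080 % 4) * 2                     = 0
 * svm_msr_rd_ok(MSR_EFER) therefore clears only bit 0 (read) of
 * perm_bitmap[0x820]; bit 1 (write) stays set, so a guest RDMSR of EFER
 * is pass-through while WRMSR still causes a #VMEXIT.
 */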
*/ svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); /* * Intercept writes to make sure that the EFER_SVM bit is not cleared. */ svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); /* Intercept access to all I/O ports. */ memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); iopm_pa = vtophys(svm_sc->iopm_bitmap); msrpm_pa = vtophys(svm_sc->msr_bitmap); pml4_pa = svm_sc->nptp; maxcpus = vm_get_maxcpus(svm_sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = svm_get_vcpu(svm_sc, i); vcpu->nextrip = ~0; vcpu->lastcpu = NOCPU; vcpu->vmcb_pa = vtophys(&vcpu->vmcb); vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); svm_msr_guest_init(svm_sc, i); } return (svm_sc); } /* * Collateral for a generic SVM VM-exit. */ static void vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) { vme->exitcode = VM_EXITCODE_SVM; vme->u.svm.exitcode = code; vme->u.svm.exitinfo1 = info1; vme->u.svm.exitinfo2 = info2; } static int svm_cpl(struct vmcb_state *state) { /* * From APMv2: * "Retrieve the CPL from the CPL field in the VMCB, not * from any segment DPL" */ return (state->cpl); } static enum vm_cpu_mode svm_vcpu_mode(struct vmcb *vmcb) { struct vmcb_segment seg; struct vmcb_state *state; int error; state = &vmcb->state; if (state->efer & EFER_LMA) { error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, error)); /* * Section 4.8.1 for APM2, check if Code Segment has * Long attribute set in descriptor. */ if (seg.attrib & VMCB_CS_ATTRIB_L) return (CPU_MODE_64BIT); else return (CPU_MODE_COMPATIBILITY); } else if (state->cr0 & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) { if ((cr0 & CR0_PG) == 0) return (PAGING_MODE_FLAT); if ((cr4 & CR4_PAE) == 0) return (PAGING_MODE_32); if (efer & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } /* * ins/outs utility routines */ static uint64_t svm_inout_str_index(struct svm_regctx *regs, int in) { uint64_t val; val = in ? regs->sctx_rdi : regs->sctx_rsi; return (val); } static uint64_t svm_inout_str_count(struct svm_regctx *regs, int rep) { uint64_t val; val = rep ? 
regs->sctx_rcx : 1; return (val); } static void svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { /* The segment field has standard encoding */ s = (info1 >> 10) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); } static int svm_inout_str_addrsize(uint64_t info1) { uint32_t size; size = (info1 >> 7) & 0x7; switch (size) { case 1: return (2); /* 16 bit */ case 2: return (4); /* 32 bit */ case 4: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) { struct vmcb_state *state; state = &vmcb->state; paging->cr3 = state->cr3; paging->cpl = svm_cpl(state); paging->cpu_mode = svm_vcpu_mode(vmcb); paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, state->efer); } #define UNHANDLED 0 /* * Handle guest I/O intercept. */ static int svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_regctx *regs; struct vm_inout_str *vis; uint64_t info1; int inout_string; state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); regs = svm_get_guest_regctx(svm_sc, vcpu); info1 = ctrl->exitinfo1; inout_string = info1 & BIT(2) ? 1 : 0; /* * The effective segment number in EXITINFO1[12:10] is populated * only if the processor has the DecodeAssist capability. * * XXX this is not specified explicitly in APMv2 but can be verified * empirically. */ if (inout_string && !decode_assist()) return (UNHANDLED); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; vmexit->u.inout.string = inout_string; vmexit->u.inout.rep = (info1 & BIT(3)) ? 
1 : 0; vmexit->u.inout.bytes = (info1 >> 4) & 0x7; vmexit->u.inout.port = (uint16_t)(info1 >> 16); vmexit->u.inout.eax = (uint32_t)(state->rax); if (inout_string) { vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); vis->rflags = state->rflags; vis->cr0 = state->cr0; vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); vis->addrsize = svm_inout_str_addrsize(info1); svm_inout_str_seginfo(svm_sc, vcpu, info1, vmexit->u.inout.in, vis); } return (UNHANDLED); } static int npf_fault_type(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_W) return (VM_PROT_WRITE); else if (exitinfo1 & VMCB_NPF_INFO1_ID) return (VM_PROT_EXECUTE); else return (VM_PROT_READ); } static bool svm_npf_emul_fault(uint64_t exitinfo1) { if (exitinfo1 & VMCB_NPF_INFO1_ID) { return (false); } if (exitinfo1 & VMCB_NPF_INFO1_GPT) { return (false); } if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { return (false); } return (true); } static void svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) { struct vm_guest_paging *paging; struct vmcb_segment seg; struct vmcb_ctrl *ctrl; char *inst_bytes; int error, inst_len; ctrl = &vmcb->ctrl; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = VIE_INVALID_GLA; svm_paging_info(vmcb, paging); error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); switch(paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = seg.base; vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = seg.base; /* * Section 4.8.1 of APM2, Default Operand Size or D bit. */ vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? 1 : 0; break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } /* * Copy the instruction bytes into 'vie' if available. */ if (decode_assist() && !disable_npf_assist) { inst_len = ctrl->inst_len; inst_bytes = ctrl->inst_bytes; } else { inst_len = 0; inst_bytes = NULL; } vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); } #ifdef KTR static const char * intrtype_to_str(int intr_type) { switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: return ("hwintr"); case VMCB_EVENTINJ_TYPE_NMI: return ("nmi"); case VMCB_EVENTINJ_TYPE_INTn: return ("swintr"); case VMCB_EVENTINJ_TYPE_EXCEPTION: return ("exception"); default: panic("%s: unknown intr_type %d", __func__, intr_type); } } #endif /* * Inject an event to vcpu as described in section 15.20, "Event injection". 
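A hedged walk-through of how the EXITINFO1 bits classified by npf_fault_type() and svm_npf_emul_fault() above steer a nested page fault; the actual dispatch lives in svm_vmexit()'s VMCB_EXIT_NPF case further down.

/*
 * Example: the guest writes to an MMIO address that is not backed by RAM.
 *   VMCB_NPF_INFO1_W   set   -> npf_fault_type() == VM_PROT_WRITE
 *   VMCB_NPF_INFO1_ID  clear -> not an instruction fetch
 *   VMCB_NPF_INFO1_GPT clear -> fault not inside the guest page tables
 *   VMCB_NPF_INFO1_GPA set   -> EXITINFO2 carries a valid faulting GPA
 * svm_npf_emul_fault() returns true and the exit becomes
 * VM_EXITCODE_INST_EMUL; had the GPA been covered by a memory segment,
 * vm_mem_allocated() would have routed it to VM_EXITCODE_PAGING instead.
 */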
*/ static void svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, uint32_t error, bool ec_valid) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event already pending %#lx", __func__, ctrl->eventinj)); KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", __func__, vector)); switch (intr_type) { case VMCB_EVENTINJ_TYPE_INTR: case VMCB_EVENTINJ_TYPE_NMI: case VMCB_EVENTINJ_TYPE_INTn: break; case VMCB_EVENTINJ_TYPE_EXCEPTION: if (vector >= 0 && vector <= 31 && vector != 2) break; /* FALLTHROUGH */ default: panic("%s: invalid intr_type/vector: %d/%d", __func__, intr_type, vector); } ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; if (ec_valid) { ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; ctrl->eventinj |= (uint64_t)error << 32; VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", intrtype_to_str(intr_type), vector, error); } else { VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", intrtype_to_str(intr_type), vector); } } static void svm_update_virqinfo(struct svm_softc *sc, int vcpu) { struct vm *vm; struct vlapic *vlapic; struct vmcb_ctrl *ctrl; vm = sc->vm; vlapic = vm_lapic(vm, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); /* Update %cr8 in the emulated vlapic */ vlapic_set_cr8(vlapic, ctrl->v_tpr); /* Virtual interrupt injection is not used. */ KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " "v_intr_vector %d", __func__, ctrl->v_intr_vector)); } static void svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) { struct vmcb_ctrl *ctrl; uint64_t intinfo; ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); intinfo = ctrl->exitintinfo; if (!VMCB_EXITINTINFO_VALID(intinfo)) return; /* * From APMv2, Section "Intercepts during IDT interrupt delivery" * * If a #VMEXIT happened during event delivery then record the event * that was being delivered. 
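For reference, a worked encoding of the EVENTINJ value that svm_eventinject() above builds, taking #GP(0) as the example; macro names are from vmcb.h and the bit positions follow APMv2 section 15.20.

/*
 *   eventinj = IDT_GP                                  (vector, bits 7:0 == 13)
 *            | (VMCB_EVENTINJ_TYPE_EXCEPTION << 8)     (type, bits 10:8)
 *            | VMCB_EVENTINJ_VALID
 *            | VMCB_EVENTINJ_EC_VALID
 *            | ((uint64_t)0 << 32);                    (error code, bits 63:32)
 */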
*/ VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", intinfo, VMCB_EXITINTINFO_VECTOR(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); } #ifdef INVARIANTS static __inline int vintr_intercept_enabled(struct svm_softc *sc, int vcpu) { return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR)); } #endif static __inline void enable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); KASSERT(vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be enabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); ctrl->v_irq = 1; ctrl->v_ign_tpr = 1; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static __inline void disable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); if (!ctrl->v_irq && ctrl->v_intr_vector == 0) { KASSERT(!vintr_intercept_enabled(sc, vcpu), ("%s: vintr intercept should be disabled", __func__)); return; } VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting"); ctrl->v_irq = 0; ctrl->v_intr_vector = 0; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } static int svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val) { struct vmcb_ctrl *ctrl; int oldval, newval; ctrl = svm_get_vmcb_ctrl(sc, vcpu); oldval = ctrl->intr_shadow; newval = val ? 1 : 0; if (newval != oldval) { ctrl->intr_shadow = newval; VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval); } return (0); } static int svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val) { struct vmcb_ctrl *ctrl; ctrl = svm_get_vmcb_ctrl(sc, vcpu); *val = ctrl->intr_shadow; return (0); } /* * Once an NMI is injected it blocks delivery of further NMIs until the handler * executes an IRET. The IRET intercept is enabled when an NMI is injected to * to track when the vcpu is done handling the NMI. */ static int nmi_blocked(struct svm_softc *sc, int vcpu) { int blocked; blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); return (blocked); } static void enable_nmi_blocking(struct svm_softc *sc, int vcpu) { KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled"); svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); } static void clear_nmi_blocking(struct svm_softc *sc, int vcpu) { int error; KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); /* * When the IRET intercept is cleared the vcpu will attempt to execute * the "iret" when it runs next. However, it is possible to inject * another NMI into the vcpu before the "iret" has actually executed. * * For e.g. if the "iret" encounters a #NPF when accessing the stack * it will trap back into the hypervisor. If an NMI is pending for * the vcpu it will be injected into the guest. * * XXX this needs to be fixed */ svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); /* * Set 'intr_shadow' to prevent an NMI from being injected on the * immediate VMRUN. 
*/ error = svm_modify_intr_shadow(sc, vcpu, 1); KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); } #define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL static int svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu) { struct vm_exit *vme; struct vmcb_state *state; uint64_t changed, lma, oldval; int error; state = svm_get_vmcb_state(sc, vcpu); oldval = state->efer; VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ changed = oldval ^ newval; if (newval & EFER_MBZ_BITS) goto gpf; /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ if (changed & EFER_LME) { if (state->cr0 & CR0_PG) goto gpf; } /* EFER.LMA = EFER.LME & CR0.PG */ if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) lma = EFER_LMA; else lma = 0; if ((newval & EFER_LMA) != lma) goto gpf; if (newval & EFER_NXE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) goto gpf; } /* * XXX bhyve does not enforce segment limits in 64-bit mode. Until * this is fixed flag guest attempt to set EFER_LMSLE as an error. */ if (newval & EFER_LMSLE) { vme = vm_exitinfo(sc->vm, vcpu); vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); *retu = true; return (0); } if (newval & EFER_FFXSR) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) goto gpf; } if (newval & EFER_TCE) { if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) goto gpf; } error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); return (0); gpf: vm_inject_gp(sc->vm, vcpu); return (0); } static int emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); else if (num == MSR_EFER) error = svm_write_efer(sc, vcpu, val, retu); else error = svm_wrmsr(sc, vcpu, num, val, retu); return (error); } static int emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) { struct vmcb_state *state; struct svm_regctx *ctx; uint64_t result; int error; if (lapic_msr(num)) error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); else error = svm_rdmsr(sc, vcpu, num, &result, retu); if (error == 0) { state = svm_get_vmcb_state(sc, vcpu); ctx = svm_get_guest_regctx(sc, vcpu); state->rax = result & 0xffffffff; ctx->sctx_rdx = result >> 32; } return (error); } #ifdef KTR static const char * exit_reason_to_str(uint64_t reason) { static char reasonbuf[32]; switch (reason) { case VMCB_EXIT_INVALID: return ("invalvmcb"); case VMCB_EXIT_SHUTDOWN: return ("shutdown"); case VMCB_EXIT_NPF: return ("nptfault"); case VMCB_EXIT_PAUSE: return ("pause"); case VMCB_EXIT_HLT: return ("hlt"); case VMCB_EXIT_CPUID: return ("cpuid"); case VMCB_EXIT_IO: return ("inout"); case VMCB_EXIT_MC: return ("mchk"); case VMCB_EXIT_INTR: return ("extintr"); case VMCB_EXIT_NMI: return ("nmi"); case VMCB_EXIT_VINTR: return ("vintr"); case VMCB_EXIT_MSR: return ("msr"); case VMCB_EXIT_IRET: return ("iret"); case VMCB_EXIT_MONITOR: return ("monitor"); case VMCB_EXIT_MWAIT: return ("mwait"); default: snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); return (reasonbuf); } } #endif /* KTR */ /* * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs * that are due to instruction intercepts as well as MSR and IOIO intercepts * and exceptions caused by INT3, INTO and BOUND instructions. * * Return 1 if the nRIP is valid and 0 otherwise. */ static int nrip_valid(uint64_t exitcode) { switch (exitcode) { case 0x00 ... 
0x0F: /* read of CR0 through CR15 */ case 0x10 ... 0x1F: /* write of CR0 through CR15 */ case 0x20 ... 0x2F: /* read of DR0 through DR15 */ case 0x30 ... 0x3F: /* write of DR0 through DR15 */ case 0x43: /* INT3 */ case 0x44: /* INTO */ case 0x45: /* BOUND */ case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ return (1); default: return (0); } } static int svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct svm_regctx *ctx; uint64_t code, info1, info2, val; uint32_t eax, ecx, edx; int error, errcode_valid, handled, idtvec, reflect; bool retu; ctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb = svm_get_vmcb(svm_sc, vcpu); state = &vmcb->state; ctrl = &vmcb->ctrl; handled = 0; code = ctrl->exitcode; info1 = ctrl->exitinfo1; info2 = ctrl->exitinfo2; vmexit->exitcode = VM_EXITCODE_BOGUS; vmexit->rip = state->rip; vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); /* * #VMEXIT(INVALID) needs to be handled early because the VMCB is * in an inconsistent state and can trigger assertions that would * never happen otherwise. */ if (code == VMCB_EXIT_INVALID) { vm_exit_svm(vmexit, code, info1, info2); return (0); } KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " "injection valid bit is set %#lx", __func__, ctrl->eventinj)); KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", vmexit->inst_length, code, info1, info2)); svm_update_virqinfo(svm_sc, vcpu); svm_save_intinfo(svm_sc, vcpu); switch (code) { case VMCB_EXIT_IRET: /* * Restart execution at "iret" but with the intercept cleared. */ vmexit->inst_length = 0; clear_nmi_blocking(svm_sc, vcpu); handled = 1; break; case VMCB_EXIT_VINTR: /* interrupt window exiting */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); handled = 1; break; case VMCB_EXIT_INTR: /* external interrupt */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); handled = 1; break; case VMCB_EXIT_NMI: /* external NMI */ handled = 1; break; case 0x40 ... 0x5F: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); reflect = 1; idtvec = code - 0x40; switch (idtvec) { case IDT_MC: /* * Call the machine check handler by hand. Also don't * reflect the machine check back into the guest. */ reflect = 0; VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); break; case IDT_PF: error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, info2); KASSERT(error == 0, ("%s: error %d updating cr2", __func__, error)); /* fallthru */ case IDT_NP: case IDT_SS: case IDT_GP: case IDT_AC: case IDT_TS: errcode_valid = 1; break; case IDT_DF: errcode_valid = 1; info1 = 0; break; case IDT_BP: case IDT_OF: case IDT_BR: /* * The 'nrip' field is populated for INT3, INTO and * BOUND exceptions and this also implies that * 'inst_length' is non-zero. * * Reset 'inst_length' to zero so the guest %rip at * event injection is identical to what it was when * the exception originally happened. 
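A small worked example for the inst_length bookkeeping discussed above, which svm_vmexit() computes as ctrl->nrip - state->rip whenever nrip_valid() accepts the exit code.

/*
 * A CPUID intercept falls in the 0x65 ... 0x7C range accepted by
 * nrip_valid(), so with state->rip == 0x1000 and ctrl->nrip == 0x1002 the
 * exit carries inst_length == 2 and, once handled, %rip is advanced past
 * the 2-byte instruction. For #BP/#OF/#BR reflection the code above
 * deliberately resets inst_length to 0 so the exception is injected at
 * the %rip where it originally occurred.
 */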
*/ VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " "to zero before injecting exception %d", vmexit->inst_length, idtvec); vmexit->inst_length = 0; /* fallthru */ default: errcode_valid = 0; info1 = 0; break; } KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " "when reflecting exception %d into guest", vmexit->inst_length, idtvec)); if (reflect) { /* Reflect the exception back into the guest */ VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " "%d/%#x into the guest", idtvec, (int)info1); error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, errcode_valid, info1, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); } handled = 1; break; case VMCB_EXIT_MSR: /* MSR access. */ eax = state->rax; ecx = ctx->sctx_rcx; edx = ctx->sctx_rdx; retu = false; if (info1) { vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); val = (uint64_t)edx << 32 | eax; VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", ecx, val); if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = val; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } } else { VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = 1; } else { KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } } break; case VMCB_EXIT_IO: handled = svm_handle_io(svm_sc, vcpu, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); break; case VMCB_EXIT_CPUID: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); handled = x86_emulate_cpuid(svm_sc->vm, vcpu, (uint32_t *)&state->rax, (uint32_t *)&ctx->sctx_rbx, (uint32_t *)&ctx->sctx_rcx, (uint32_t *)&ctx->sctx_rdx); break; case VMCB_EXIT_HLT: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = state->rflags; break; case VMCB_EXIT_PAUSE: vmexit->exitcode = VM_EXITCODE_PAUSE; vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); break; case VMCB_EXIT_NPF: /* EXITINFO2 contains the faulting guest physical address */ if (info1 & VMCB_NPF_INFO1_RSV) { VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " "reserved bits set: info1(%#lx) info2(%#lx)", info1, info2); } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = info2; vmexit->u.paging.fault_type = npf_fault_type(info1); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " "on gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } else if (svm_npf_emul_fault(info1)) { svm_handle_inst_emul(vmcb, info2, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " "for gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } break; case VMCB_EXIT_MONITOR: vmexit->exitcode = VM_EXITCODE_MONITOR; break; case VMCB_EXIT_MWAIT: vmexit->exitcode = VM_EXITCODE_MWAIT; break; default: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", handled ? 
"handled" : "unhandled", exit_reason_to_str(code), vmexit->rip, vmexit->inst_length); if (handled) { vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; state->rip = vmexit->rip; } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic SVM exit. */ vm_exit_svm(vmexit, code, info1, info2); } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } return (handled); } static void svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) { uint64_t intinfo; if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) return; KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " "valid: %#lx", __func__, intinfo)); svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), VMCB_EXITINTINFO_VECTOR(intinfo), VMCB_EXITINTINFO_EC(intinfo), VMCB_EXITINTINFO_EC_VALID(intinfo)); vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); } /* * Inject event to virtual cpu. */ static void svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_vcpu *vcpustate; uint8_t v_tpr; int vector, need_intr_window; int extint_pending; state = svm_get_vmcb_state(sc, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); vcpustate = svm_get_vcpu(sc, vcpu); need_intr_window = 0; if (vcpustate->nextrip != state->rip) { ctrl->intr_shadow = 0; VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vcpustate->nextrip, state->rip); } /* * Inject pending events or exceptions for this vcpu. * * An event might be pending because the previous #VMEXIT happened * during event delivery (i.e. ctrl->exitintinfo). * * An event might also be pending because an exception was injected * by the hypervisor (e.g. #PF during instruction emulation). */ svm_inj_intinfo(sc, vcpu); /* NMI event has priority over interrupts. */ if (vm_nmi_pending(sc->vm, vcpu)) { if (nmi_blocked(sc, vcpu)) { /* * Can't inject another NMI if the guest has not * yet executed an "iret" after the last NMI. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " "to NMI-blocking"); } else if (ctrl->intr_shadow) { /* * Can't inject an NMI if the vcpu is in an intr_shadow. */ VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " "interrupt shadow"); need_intr_window = 1; goto done; } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { /* * If there is already an exception/interrupt pending * then defer the NMI until after that. */ VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " "eventinj %#lx", ctrl->eventinj); /* * Use self-IPI to trigger a VM-exit as soon as * possible after the event injection is completed. * * This works only if the external interrupt exiting * is at a lower priority than the event injection. * * Although not explicitly specified in APMv2 the * relative priorities were verified empirically. */ ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? 
*/ } else { vm_nmi_clear(sc->vm, vcpu); /* Inject NMI, vector number is not used */ svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, IDT_NMI, 0, false); /* virtual NMI blocking is now in effect */ enable_nmi_blocking(sc, vcpu); VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); } } extint_pending = vm_extint_pending(sc->vm, vcpu); if (!extint_pending) { if (!vlapic_pending_intr(vlapic, &vector)) goto done; KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(sc->vm, &vector); KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* * If the guest has disabled interrupts or is in an interrupt shadow * then we cannot inject the pending interrupt. */ if ((state->rflags & PSL_I) == 0) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, state->rflags); need_intr_window = 1; goto done; } if (ctrl->intr_shadow) { VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " "interrupt shadow", vector); need_intr_window = 1; goto done; } if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " "eventinj %#lx", vector, ctrl->eventinj); need_intr_window = 1; goto done; } svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); if (!extint_pending) { vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(sc->vm, vcpu); vatpic_intr_accepted(sc->vm, vector); } /* * Force a VM-exit as soon as the vcpu is ready to accept another * interrupt. This is done because the PIC might have another vector * that it wants to inject. Also, if the APIC has a pending interrupt * that was preempted by the ExtInt then it allows us to inject the * APIC vector as soon as possible. */ need_intr_window = 1; done: /* * The guest can modify the TPR by writing to %CR8. In guest mode * the processor reflects this write to V_TPR without hypervisor * intervention. * * The guest can also modify the TPR by writing to it via the memory * mapped APIC page. In this case, the write will be emulated by the * hypervisor. For this reason V_TPR must be updated before every * VMRUN. */ v_tpr = vlapic_get_cr8(vlapic); KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); if (ctrl->v_tpr != v_tpr) { VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", ctrl->v_tpr, v_tpr); ctrl->v_tpr = v_tpr; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } if (need_intr_window) { /* * We use V_IRQ in conjunction with the VINTR intercept to * trap into the hypervisor as soon as a virtual interrupt * can be delivered. * * Since injected events are not subject to intercept checks * we need to ensure that the V_IRQ is not actually going to * be delivered on VM entry. The KASSERT below enforces this. */ KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, ("Bogus intr_window_exiting: eventinj (%#lx), " "intr_shadow (%u), rflags (%#lx)", ctrl->eventinj, ctrl->intr_shadow, state->rflags)); enable_intr_window_exiting(sc, vcpu); } else { disable_intr_window_exiting(sc, vcpu); } } static __inline void restore_host_tss(void) { struct system_segment_descriptor *tss_sd; /* * The TSS descriptor was in use prior to launching the guest so it * has been marked busy. * * 'ltr' requires the descriptor to be marked available so change the * type to "64-bit available TSS". 
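 * (SDT_SYSTSS is the "available" 64-bit TSS type; the ltr instruction
 * below marks the descriptor busy again.)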
*/ tss_sd = PCPU_GET(tss); tss_sd->sd_type = SDT_SYSTSS; ltr(GSEL(GPROC0_SEL, SEL_KPL)); } static void check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) { struct svm_vcpu *vcpustate; struct vmcb_ctrl *ctrl; long eptgen; bool alloc_asid; KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " "active on cpu %u", __func__, thiscpu)); vcpustate = svm_get_vcpu(sc, vcpuid); ctrl = svm_get_vmcb_ctrl(sc, vcpuid); /* * The TLB entries associated with the vcpu's ASID are not valid * if either of the following conditions is true: * * 1. The vcpu's ASID generation is different than the host cpu's * ASID generation. This happens when the vcpu migrates to a new * host cpu. It can also happen when the number of vcpus executing * on a host cpu is greater than the number of ASIDs available. * * 2. The pmap generation number is different than the value cached in * the 'vcpustate'. This happens when the host invalidates pages * belonging to the guest. * * asidgen eptgen Action * mismatch mismatch * 0 0 (a) * 0 1 (b1) or (b2) * 1 0 (c) * 1 1 (d) * * (a) There is no mismatch in eptgen or ASID generation and therefore * no further action is needed. * * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is * retained and the TLB entries associated with this ASID * are flushed by VMRUN. * * (b2) If the cpu does not support FlushByAsid then a new ASID is * allocated. * * (c) A new ASID is allocated. * * (d) A new ASID is allocated. */ alloc_asid = false; eptgen = pmap->pm_eptgen; ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; if (vcpustate->asid.gen != asid[thiscpu].gen) { alloc_asid = true; /* (c) and (d) */ } else if (vcpustate->eptgen != eptgen) { if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ else alloc_asid = true; /* (b2) */ } else { /* * This is the common case (a). */ KASSERT(!alloc_asid, ("ASID allocation not necessary")); KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); } if (alloc_asid) { if (++asid[thiscpu].num >= nasid) { asid[thiscpu].num = 1; if (++asid[thiscpu].gen == 0) asid[thiscpu].gen = 1; /* * If this cpu does not support "flush-by-asid" * then flush the entire TLB on a generation * bump. Subsequent ASID allocation in this * generation can be done without a TLB flush. */ if (!flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; } vcpustate->asid.gen = asid[thiscpu].gen; vcpustate->asid.num = asid[thiscpu].num; ctrl->asid = vcpustate->asid.num; svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); /* * If this cpu supports "flush-by-asid" then the TLB * was not flushed after the generation bump. The TLB * is flushed selectively after every new ASID allocation. */ if (flush_by_asid()) ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; } vcpustate->eptgen = eptgen; KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); KASSERT(ctrl->asid == vcpustate->asid.num, ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); } static __inline void disable_gintr(void) { __asm __volatile("clgi"); } static __inline void enable_gintr(void) { __asm __volatile("stgi"); } static __inline void svm_dr_enter_guest(struct svm_regctx *gctx) { /* Save host control debug registers. */ gctx->host_dr7 = rdr7(); gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); /* * Disable debugging in DR7 and DEBUGCTL to avoid triggering * exceptions in the host based on the guest DRx values. The * guest DR6, DR7, and DEBUGCTL are saved/restored in the * VMCB. */ load_dr7(0); wrmsr(MSR_DEBUGCTLMSR, 0); /* Save host debug registers. 
*/ gctx->host_dr0 = rdr0(); gctx->host_dr1 = rdr1(); gctx->host_dr2 = rdr2(); gctx->host_dr3 = rdr3(); gctx->host_dr6 = rdr6(); /* Restore guest debug registers. */ load_dr0(gctx->sctx_dr0); load_dr1(gctx->sctx_dr1); load_dr2(gctx->sctx_dr2); load_dr3(gctx->sctx_dr3); } static __inline void svm_dr_leave_guest(struct svm_regctx *gctx) { /* Save guest debug registers. */ gctx->sctx_dr0 = rdr0(); gctx->sctx_dr1 = rdr1(); gctx->sctx_dr2 = rdr2(); gctx->sctx_dr3 = rdr3(); /* * Restore host debug registers. Restore DR7 and DEBUGCTL * last. */ load_dr0(gctx->host_dr0); load_dr1(gctx->host_dr1); load_dr2(gctx->host_dr2); load_dr3(gctx->host_dr3); load_dr6(gctx->host_dr6); wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); load_dr7(gctx->host_dr7); } /* * Start vcpu with specified RIP. */ static int svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { struct svm_regctx *gctx; struct svm_softc *svm_sc; struct svm_vcpu *vcpustate; struct vmcb_state *state; struct vmcb_ctrl *ctrl; struct vm_exit *vmexit; struct vlapic *vlapic; struct vm *vm; uint64_t vmcb_pa; int handled; uint16_t ldt_sel; svm_sc = arg; vm = svm_sc->vm; vcpustate = svm_get_vcpu(svm_sc, vcpu); state = svm_get_vmcb_state(svm_sc, vcpu); ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); vmexit = vm_exitinfo(vm, vcpu); vlapic = vm_lapic(vm, vcpu); gctx = svm_get_guest_regctx(svm_sc, vcpu); vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; if (vcpustate->lastcpu != curcpu) { /* * Force new ASID allocation by invalidating the generation. */ vcpustate->asid.gen = 0; /* * Invalidate the VMCB state cache by marking all fields dirty. */ svm_set_dirty(svm_sc, vcpu, 0xffffffff); /* * XXX * Setting 'vcpustate->lastcpu' here is bit premature because * we may return from this function without actually executing * the VMRUN instruction. This could happen if a rendezvous * or an AST is pending on the first time through the loop. * * This works for now but any new side-effects of vcpu * migration should take this case into account. */ vcpustate->lastcpu = curcpu; vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); } svm_msr_guest_enter(svm_sc, vcpu); /* Update Guest RIP */ state->rip = rip; do { /* * Disable global interrupts to guarantee atomicity during * loading of guest state. This includes not only the state * loaded by the "vmrun" instruction but also software state * maintained by the hypervisor: suspended and rendezvous * state, NPT generation number, vlapic interrupts etc. */ disable_gintr(); if (vcpu_suspended(evinfo)) { enable_gintr(); vm_exit_suspended(vm, vcpu, state->rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_gintr(); vm_exit_rendezvous(vm, vcpu, state->rip); break; } if (vcpu_reqidle(evinfo)) { enable_gintr(); vm_exit_reqidle(vm, vcpu, state->rip); break; } /* We are asked to give the cpu by scheduler. */ if (vcpu_should_yield(vm, vcpu)) { enable_gintr(); vm_exit_astpending(vm, vcpu, state->rip); break; } if (vcpu_debugged(vm, vcpu)) { enable_gintr(); vm_exit_debug(vm, vcpu, state->rip); break; } /* * #VMEXIT resumes the host with the guest LDTR, so * save the current LDT selector so it can be restored * after an exit. The userspace hypervisor probably * doesn't use a LDT, but save and restore it to be * safe. */ ldt_sel = sldt(); svm_inj_interrupts(svm_sc, vcpu, vlapic); /* Activate the nested pmap on 'curcpu' */ CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); /* * Check the pmap generation and the ASID generation to * ensure that the vcpu does not use stale TLB mappings. 
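 * check_asid() either keeps the current ASID, schedules a flush of the
 * guest's TLB entries via ctrl->tlb_ctrl, or allocates a fresh ASID, as
 * described in the table above that function.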
*/ check_asid(svm_sc, vcpu, pmap, curcpu); ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; vcpustate->dirty = 0; VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); /* Launch Virtual Machine. */ VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); svm_dr_enter_guest(gctx); svm_launch(vmcb_pa, gctx, get_pcpu()); svm_dr_leave_guest(gctx); CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); /* * The host GDTR and IDTR is saved by VMRUN and restored * automatically on #VMEXIT. However, the host TSS needs * to be restored explicitly. */ restore_host_tss(); /* Restore host LDTR. */ lldt(ldt_sel); /* #VMEXIT disables interrupts so re-enable them here. */ enable_gintr(); /* Update 'nextrip' */ vcpustate->nextrip = state->rip; /* Handle #VMEXIT and if required return to user space. */ handled = svm_vmexit(svm_sc, vcpu, vmexit); } while (handled); svm_msr_guest_exit(svm_sc, vcpu); return (0); } static void svm_vmcleanup(void *arg) { struct svm_softc *sc = arg; contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM); contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM); free(sc, M_SVM); } static register_t * swctx_regptr(struct svm_regctx *regctx, int reg) { switch (reg) { case VM_REG_GUEST_RBX: return (®ctx->sctx_rbx); case VM_REG_GUEST_RCX: return (®ctx->sctx_rcx); case VM_REG_GUEST_RDX: return (®ctx->sctx_rdx); case VM_REG_GUEST_RDI: return (®ctx->sctx_rdi); case VM_REG_GUEST_RSI: return (®ctx->sctx_rsi); case VM_REG_GUEST_RBP: return (®ctx->sctx_rbp); case VM_REG_GUEST_R8: return (®ctx->sctx_r8); case VM_REG_GUEST_R9: return (®ctx->sctx_r9); case VM_REG_GUEST_R10: return (®ctx->sctx_r10); case VM_REG_GUEST_R11: return (®ctx->sctx_r11); case VM_REG_GUEST_R12: return (®ctx->sctx_r12); case VM_REG_GUEST_R13: return (®ctx->sctx_r13); case VM_REG_GUEST_R14: return (®ctx->sctx_r14); case VM_REG_GUEST_R15: return (®ctx->sctx_r15); case VM_REG_GUEST_DR0: return (®ctx->sctx_dr0); case VM_REG_GUEST_DR1: return (®ctx->sctx_dr1); case VM_REG_GUEST_DR2: return (®ctx->sctx_dr2); case VM_REG_GUEST_DR3: return (®ctx->sctx_dr3); default: return (NULL); } } static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_get_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *val = *reg; return (0); } VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); return (EINVAL); } static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val) { struct svm_softc *svm_sc; register_t *reg; svm_sc = arg; if (ident == VM_REG_GUEST_INTR_SHADOW) { return (svm_modify_intr_shadow(svm_sc, vcpu, val)); } if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { return (0); } reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); if (reg != NULL) { *reg = val; return (0); } if (ident == VM_REG_GUEST_ENTRY_INST_LENGTH) { /* Ignore. */ return (0); } /* * XXX deal with CR3 and invalidate TLB entries tagged with the * vcpu's ASID. This needs to be treated differently depending on * whether 'running' is true/false. 
*/ VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); return (EINVAL); } +#ifdef BHYVE_SNAPSHOT +static int +svm_snapshot_reg(void *arg, int vcpu, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = svm_getreg(arg, vcpu, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = svm_setreg(arg, vcpu, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif + static int svm_setcap(void *arg, int vcpu, int type, int val) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT, val); break; case VM_CAP_PAUSE_EXIT: svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE, val); break; case VM_CAP_UNRESTRICTED_GUEST: /* Unrestricted guest execution cannot be disabled in SVM */ if (val == 0) error = EINVAL; break; default: error = ENOENT; break; } return (error); } static int svm_getcap(void *arg, int vcpu, int type, int *retval) { struct svm_softc *sc; int error; sc = arg; error = 0; switch (type) { case VM_CAP_HALT_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT); break; case VM_CAP_PAUSE_EXIT: *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PAUSE); break; case VM_CAP_UNRESTRICTED_GUEST: *retval = 1; /* unrestricted guest is always enabled */ break; default: error = ENOENT; break; } return (error); } static struct vlapic * svm_vlapic_init(void *arg, int vcpuid) { struct svm_softc *svm_sc; struct vlapic *vlapic; svm_sc = arg; vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = svm_sc->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; vlapic_init(vlapic); return (vlapic); } static void svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_SVM_VLAPIC); } +#ifdef BHYVE_SNAPSHOT +static int +svm_snapshot_vmi(void *arg, struct vm_snapshot_meta *meta) +{ + /* struct svm_softc is AMD's representation for SVM softc */ + struct svm_softc *sc; + struct svm_vcpu *vcpu; + struct vmcb *vmcb; + uint64_t val; + int i; + int ret; + + sc = arg; + + KASSERT(sc != NULL, ("%s: arg was NULL", __func__)); + + SNAPSHOT_VAR_OR_LEAVE(sc->nptp, meta, ret, done); + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu = &sc->vcpu[i]; + vmcb = &vcpu->vmcb; + + /* VMCB fields for virtual cpu i */ + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.v_tpr, meta, ret, done); + val = vmcb->ctrl.v_tpr; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.v_tpr = val; + + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.asid, meta, ret, done); + val = vmcb->ctrl.np_enable; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.np_enable = val; + + val = vmcb->ctrl.intr_shadow; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.intr_shadow = val; + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.tlb_ctrl, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad1, + sizeof(vmcb->state.pad1), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cpl, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad2, + sizeof(vmcb->state.pad2), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.efer, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad3, + sizeof(vmcb->state.pad3), + meta, ret, 
done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr4, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dr7, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dr6, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rflags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rip, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad4, + sizeof(vmcb->state.pad4), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rsp, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad5, + sizeof(vmcb->state.pad5), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rax, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.star, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.lstar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cstar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sfmask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.kernelgsbase, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_cs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_esp, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_eip, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr2, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad6, + sizeof(vmcb->state.pad6), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.g_pat, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dbgctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.br_from, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.br_to, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.int_from, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.int_to, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad7, + sizeof(vmcb->state.pad7), + meta, ret, done); + + /* Snapshot swctx for virtual cpu i */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbp, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rcx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rsi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r8, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r9, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r10, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r11, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r12, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r13, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r14, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r15, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr3, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr6, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr7, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_debugctl, meta, ret, + done); + + /* Restore other svm_vcpu struct fields */ + + 
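+ /*
+  * Note: SNAPSHOT_VAR_OR_LEAVE() copies a field out of the kernel when
+  * meta->op is VM_SNAPSHOT_SAVE and back in when it is
+  * VM_SNAPSHOT_RESTORE, so the same call sequence serves both
+  * directions.  The fields below are per-vcpu software state that
+  * lives outside the VMCB.
+  */
+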
/* Restore NEXTRIP field */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); + + /* Restore lastcpu field */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->lastcpu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->dirty, meta, ret, done); + + /* Restore EPTGEN field - EPT is Extended Page Tabel */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->eptgen, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.gen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.num, meta, ret, done); + + /* Set all caches dirty */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + svm_set_dirty(sc, i, VMCB_CACHE_ASID); + svm_set_dirty(sc, i, VMCB_CACHE_IOPM); + svm_set_dirty(sc, i, VMCB_CACHE_I); + svm_set_dirty(sc, i, VMCB_CACHE_TPR); + svm_set_dirty(sc, i, VMCB_CACHE_CR2); + svm_set_dirty(sc, i, VMCB_CACHE_CR); + svm_set_dirty(sc, i, VMCB_CACHE_DT); + svm_set_dirty(sc, i, VMCB_CACHE_SEG); + svm_set_dirty(sc, i, VMCB_CACHE_NP); + } + } + + if (meta->op == VM_SNAPSHOT_RESTORE) + flush_by_asid(); + +done: + return (ret); +} + +static int +svm_snapshot_vmcx(void *arg, struct vm_snapshot_meta *meta, int vcpu) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + int err, running, hostcpu; + + sc = (struct svm_softc *)arg; + err = 0; + + KASSERT(arg != NULL, ("%s: arg was NULL", __func__)); + vmcb = svm_get_vmcb(sc, vcpu); + + running = vcpu_is_running(sc->vm, vcpu, &hostcpu); + if (running && hostcpu !=curcpu) { + printf("%s: %s%d is running", __func__, vm_name(sc->vm), vcpu); + return (EINVAL); + } + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR0, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR2, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR3, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR4, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_DR7, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RAX, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RSP, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RIP, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RFLAGS, meta); + + /* Guest segments */ + /* ES */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_ES, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_ES, meta); + + /* CS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_CS, meta); + + /* SS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_SS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_SS, meta); + + /* DS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_DS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_DS, meta); + + /* FS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_FS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_FS, meta); + + /* GS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_GS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_GS, meta); + + /* TR */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_TR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_TR, meta); + + /* LDTR */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_LDTR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_LDTR, meta); + + /* EFER */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_EFER, meta); + + /* IDTR and GDTR */ + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_IDTR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_GDTR, meta); + + /* Specific AMD registers */ + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_CS, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + 
VMCB_ACCESS(VMCB_OFF_SYSENTER_ESP, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_EIP, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_NPT_BASE, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_CR_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_DR_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXC_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_INST1_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_INST2_INTERCEPT, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_TLB_CTRL, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINFO1, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINFO2, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINTINFO, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_VIRQ, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_GUEST_PAT, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_BAR, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_PAGE, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_LT, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_PT, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_IO_PERM, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_MSR_PERM, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_ASID, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXIT_REASON, 8), meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_INTR_SHADOW, meta); + + return (err); +} + +static int +svm_restore_tsc(void *arg, int vcpu, uint64_t offset) +{ + int err; + + err = svm_set_tsc_offset(arg, vcpu, offset); + + return (err); +} +#endif + struct vmm_ops vmm_ops_amd = { .init = svm_init, .cleanup = svm_cleanup, .resume = svm_restore, .vminit = svm_vminit, .vmrun = svm_vmrun, .vmcleanup = svm_vmcleanup, .vmgetreg = svm_getreg, .vmsetreg = svm_setreg, .vmgetdesc = vmcb_getdesc, .vmsetdesc = vmcb_setdesc, .vmgetcap = svm_getcap, .vmsetcap = svm_setcap, .vmspace_alloc = svm_npt_alloc, .vmspace_free = svm_npt_free, .vlapic_init = svm_vlapic_init, .vlapic_cleanup = svm_vlapic_cleanup, +#ifdef BHYVE_SNAPSHOT + .vmsnapshot = svm_snapshot_vmi, + .vmcx_snapshot = svm_snapshot_vmcx, + .vm_restore_tsc = svm_restore_tsc, +#endif }; diff --git a/sys/amd64/vmm/amd/svm.h b/sys/amd64/vmm/amd/svm.h index 66b584fc95b1..30e58b9e130f 100644 --- a/sys/amd64/vmm/amd/svm.h +++ b/sys/amd64/vmm/amd/svm.h @@ -1,70 +1,74 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SVM_H_ #define _SVM_H_ struct pcpu; +struct svm_softc; /* * Guest register state that is saved outside the VMCB. */ struct svm_regctx { register_t sctx_rbp; register_t sctx_rbx; register_t sctx_rcx; register_t sctx_rdx; register_t sctx_rdi; register_t sctx_rsi; register_t sctx_r8; register_t sctx_r9; register_t sctx_r10; register_t sctx_r11; register_t sctx_r12; register_t sctx_r13; register_t sctx_r14; register_t sctx_r15; register_t sctx_dr0; register_t sctx_dr1; register_t sctx_dr2; register_t sctx_dr3; register_t host_dr0; register_t host_dr1; register_t host_dr2; register_t host_dr3; register_t host_dr6; register_t host_dr7; uint64_t host_debugctl; }; void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu); +#ifdef BHYVE_SNAPSHOT +int svm_set_tsc_offset(struct svm_softc *sc, int vcpu, uint64_t offset); +#endif #endif /* _SVM_H_ */ diff --git a/sys/amd64/vmm/amd/svm_msr.c b/sys/amd64/vmm/amd/svm_msr.c index 67c43100f168..12046de4dbb9 100644 --- a/sys/amd64/vmm/amd/svm_msr.c +++ b/sys/amd64/vmm/amd/svm_msr.c @@ -1,173 +1,180 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include "svm.h" #include "vmcb.h" #include "svm_softc.h" #include "svm_msr.h" #ifndef MSR_AMDK8_IPM #define MSR_AMDK8_IPM 0xc0010055 #endif enum { IDX_MSR_LSTAR, IDX_MSR_CSTAR, IDX_MSR_STAR, IDX_MSR_SF_MASK, HOST_MSR_NUM /* must be the last enumeration */ }; static uint64_t host_msrs[HOST_MSR_NUM]; void svm_msr_init(void) { /* * It is safe to cache the values of the following MSRs because they * don't change based on curcpu, curproc or curthread. */ host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); } void svm_msr_guest_init(struct svm_softc *sc, int vcpu) { /* * All the MSRs accessible to the guest are either saved/restored by * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored * by VMSAVE/VMLOAD (e.g., MSR_GSBASE). * * There are no guest MSRs that are saved/restored "by hand" so nothing * more to do here. */ return; } void svm_msr_guest_enter(struct svm_softc *sc, int vcpu) { /* * Save host MSRs (if any) and restore guest MSRs (if any). */ } void svm_msr_guest_exit(struct svm_softc *sc, int vcpu) { /* * Save guest MSRs (if any) and restore host MSRs. */ wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); /* MSR_KGSBASE will be restored on the way back to userspace */ } int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, bool *retu) { int error = 0; switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: *result = 0; break; case MSR_MTRRcap: case MSR_MTRRdefType: case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: case MSR_MTRR64kBase: case MSR_SYSCFG: case MSR_AMDK8_IPM: case MSR_EXTFEATURES: *result = 0; break; default: error = EINVAL; break; } return (error); } int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) { int error = 0; switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: break; /* ignore writes */ case MSR_MTRRcap: vm_inject_gp(sc->vm, vcpu); break; case MSR_MTRRdefType: case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: case MSR_MTRR64kBase: case MSR_SYSCFG: break; /* Ignore writes */ case MSR_AMDK8_IPM: /* * Ignore writes to the "Interrupt Pending Message" MSR. */ break; case MSR_K8_UCODE_UPDATE: /* * Ignore writes to microcode update register. */ break; +#ifdef BHYVE_SNAPSHOT + case MSR_TSC: + error = svm_set_tsc_offset(sc, vcpu, val - rdtsc()); + break; +#endif case MSR_EXTFEATURES: break; default: error = EINVAL; break; } return (error); } diff --git a/sys/amd64/vmm/amd/vmcb.c b/sys/amd64/vmm/amd/vmcb.c index 5075b6986730..59baa06112f2 100644 --- a/sys/amd64/vmm/amd/vmcb.c +++ b/sys/amd64/vmm/amd/vmcb.c @@ -1,454 +1,560 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include +#include #include "vmm_ktr.h" #include "vmcb.h" #include "svm.h" #include "svm_softc.h" /* * The VMCB aka Virtual Machine Control Block is a 4KB aligned page * in memory that describes the virtual machine. * * The VMCB contains: * - instructions or events in the guest to intercept * - control bits that modify execution environment of the guest * - guest processor state (e.g. general purpose registers) */ /* * Return VMCB segment area. */ static struct vmcb_segment * vmcb_segptr(struct vmcb *vmcb, int type) { struct vmcb_state *state; struct vmcb_segment *seg; state = &vmcb->state; switch (type) { case VM_REG_GUEST_CS: seg = &state->cs; break; case VM_REG_GUEST_DS: seg = &state->ds; break; case VM_REG_GUEST_ES: seg = &state->es; break; case VM_REG_GUEST_FS: seg = &state->fs; break; case VM_REG_GUEST_GS: seg = &state->gs; break; case VM_REG_GUEST_SS: seg = &state->ss; break; case VM_REG_GUEST_GDTR: seg = &state->gdt; break; case VM_REG_GUEST_IDTR: seg = &state->idt; break; case VM_REG_GUEST_LDTR: seg = &state->ldt; break; case VM_REG_GUEST_TR: seg = &state->tr; break; default: seg = NULL; break; } return (seg); } static int vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident, uint64_t *val) { struct vmcb *vmcb; int off, bytes; char *ptr; vmcb = svm_get_vmcb(softc, vcpu); off = VMCB_ACCESS_OFFSET(ident); bytes = VMCB_ACCESS_BYTES(ident); if ((off + bytes) >= sizeof (struct vmcb)) return (EINVAL); ptr = (char *)vmcb; if (!write) *val = 0; switch (bytes) { case 8: case 4: case 2: if (write) memcpy(ptr + off, val, bytes); else memcpy(val, ptr + off, bytes); break; default: VCPU_CTR1(softc->vm, vcpu, "Invalid size %d for VMCB access: %d", bytes); return (EINVAL); } /* Invalidate all VMCB state cached by h/w. */ if (write) svm_set_dirty(softc, vcpu, 0xffffffff); return (0); } /* * Read from segment selector, control and general purpose register of VMCB. 
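 * The 'ident' argument is either one of the VM_REG_* identifiers handled
 * below or a raw offset/width pair encoded with VMCB_ACCESS(), in which
 * case the access is forwarded to vmcb_access().  vmcb_write() follows
 * the same convention.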
*/ int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_segment *seg; int err; vmcb = svm_get_vmcb(sc, vcpu); state = &vmcb->state; err = 0; if (VMCB_ACCESS_OK(ident)) return (vmcb_access(sc, vcpu, 0, ident, retval)); switch (ident) { case VM_REG_GUEST_CR0: *retval = state->cr0; break; case VM_REG_GUEST_CR2: *retval = state->cr2; break; case VM_REG_GUEST_CR3: *retval = state->cr3; break; case VM_REG_GUEST_CR4: *retval = state->cr4; break; case VM_REG_GUEST_DR6: *retval = state->dr6; break; case VM_REG_GUEST_DR7: *retval = state->dr7; break; case VM_REG_GUEST_EFER: *retval = state->efer; break; case VM_REG_GUEST_RAX: *retval = state->rax; break; case VM_REG_GUEST_RFLAGS: *retval = state->rflags; break; case VM_REG_GUEST_RIP: *retval = state->rip; break; case VM_REG_GUEST_RSP: *retval = state->rsp; break; case VM_REG_GUEST_CS: case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_SS: case VM_REG_GUEST_LDTR: case VM_REG_GUEST_TR: seg = vmcb_segptr(vmcb, ident); KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", __func__, ident)); *retval = seg->selector; break; case VM_REG_GUEST_GDTR: case VM_REG_GUEST_IDTR: /* GDTR and IDTR don't have segment selectors */ err = EINVAL; break; default: err = EINVAL; break; } return (err); } /* * Write to segment selector, control and general purpose register of VMCB. */ int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val) { struct vmcb *vmcb; struct vmcb_state *state; struct vmcb_segment *seg; int err, dirtyseg; vmcb = svm_get_vmcb(sc, vcpu); state = &vmcb->state; dirtyseg = 0; err = 0; if (VMCB_ACCESS_OK(ident)) return (vmcb_access(sc, vcpu, 1, ident, &val)); switch (ident) { case VM_REG_GUEST_CR0: state->cr0 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_CR2: state->cr2 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2); break; case VM_REG_GUEST_CR3: state->cr3 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_CR4: state->cr4 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_DR6: state->dr6 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); break; case VM_REG_GUEST_DR7: state->dr7 = val; svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); break; case VM_REG_GUEST_EFER: /* EFER_SVM must always be set when the guest is executing */ state->efer = val | EFER_SVM; svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); break; case VM_REG_GUEST_RAX: state->rax = val; break; case VM_REG_GUEST_RFLAGS: state->rflags = val; break; case VM_REG_GUEST_RIP: state->rip = val; break; case VM_REG_GUEST_RSP: state->rsp = val; break; case VM_REG_GUEST_CS: case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_SS: dirtyseg = 1; /* FALLTHROUGH */ case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_LDTR: case VM_REG_GUEST_TR: seg = vmcb_segptr(vmcb, ident); KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", __func__, ident)); seg->selector = val; if (dirtyseg) svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); break; case VM_REG_GUEST_GDTR: case VM_REG_GUEST_IDTR: /* GDTR and IDTR don't have segment selectors */ err = EINVAL; break; default: err = EINVAL; break; } return (err); } int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2) { struct vmcb_segment *seg; seg = vmcb_segptr(vmcb, ident); if (seg != NULL) { bcopy(seg, seg2, sizeof(struct vmcb_segment)); return (0); } else { return (EINVAL); } } int vmcb_setdesc(void *arg, int vcpu, int reg, 
struct seg_desc *desc) { struct vmcb *vmcb; struct svm_softc *sc; struct vmcb_segment *seg; uint16_t attrib; sc = arg; vmcb = svm_get_vmcb(sc, vcpu); seg = vmcb_segptr(vmcb, reg); KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", __func__, reg)); seg->base = desc->base; seg->limit = desc->limit; if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { /* * Map seg_desc access to VMCB attribute format. * * SVM uses the 'P' bit in the segment attributes to indicate a * NULL segment so clear it if the segment is marked unusable. */ attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF); if (SEG_DESC_UNUSABLE(desc->access)) { attrib &= ~0x80; } seg->attrib = attrib; } VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), " "attrib (%#x)", reg, seg->base, seg->limit, seg->attrib); switch (reg) { case VM_REG_GUEST_CS: case VM_REG_GUEST_DS: case VM_REG_GUEST_ES: case VM_REG_GUEST_SS: svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); break; case VM_REG_GUEST_GDTR: case VM_REG_GUEST_IDTR: svm_set_dirty(sc, vcpu, VMCB_CACHE_DT); break; default: break; } return (0); } int vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { struct vmcb *vmcb; struct svm_softc *sc; struct vmcb_segment *seg; sc = arg; vmcb = svm_get_vmcb(sc, vcpu); seg = vmcb_segptr(vmcb, reg); KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", __func__, reg)); desc->base = seg->base; desc->limit = seg->limit; desc->access = 0; if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { /* Map seg_desc access to VMCB attribute format */ desc->access = ((seg->attrib & 0xF00) << 4) | (seg->attrib & 0xFF); /* * VT-x uses bit 16 to indicate a segment that has been loaded * with a NULL selector (aka unusable). The 'desc->access' * field is interpreted in the VT-x format by the * processor-independent code. * * SVM uses the 'P' bit to convey the same information so * convert it into the VT-x format. For more details refer to * section "Segment State in the VMCB" in APMv2. 
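 *
 * Concretely, the 12-bit VMCB attrib field keeps type/S/DPL/P in bits
 * 7:0 and AVL/L/D/G in bits 11:8, whereas the VT-x access-rights word
 * keeps AVL/L/D/G in bits 15:12; the 4-bit shift above accounts for
 * that difference, and a clear P bit is translated into the VT-x
 * "unusable" bit (bit 16) below.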
*/ if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) { if ((desc->access & 0x80) == 0) desc->access |= 0x10000; /* Unusable segment */ } } return (0); } + +#ifdef BHYVE_SNAPSHOT +int +vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val) +{ + int error = 0; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto err; + } + + if (ident >= VM_REG_LAST) { + error = EINVAL; + goto err; + } + + error = vm_get_register(sc->vm, vcpu, ident, val); + +err: + return (error); +} + +int +vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val) +{ + int error = 0; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto err; + } + + if (ident >= VM_REG_LAST) { + error = EINVAL; + goto err; + } + + error = vm_set_register(sc->vm, vcpu, ident, val); + +err: + return (error); +} + +int +vmcb_snapshot_desc(void *arg, int vcpu, int reg, struct vm_snapshot_meta *meta) +{ + int ret; + struct seg_desc desc; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcb_getdesc(arg, vcpu, reg, &desc); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + + ret = vmcb_setdesc(arg, vcpu, reg, &desc); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcb_getany(sc, vcpu, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcb_setany(sc, vcpu, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/amd/vmcb.h b/sys/amd64/vmm/amd/vmcb.h index ec7caa91f95e..dd2c90cf25ea 100644 --- a/sys/amd64/vmm/amd/vmcb.h +++ b/sys/amd64/vmm/amd/vmcb.h @@ -1,336 +1,346 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMCB_H_ #define _VMCB_H_ -struct svm_softc; - #define BIT(n) (1ULL << n) /* * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15 * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B */ /* vmcb_ctrl->intercept[] array indices */ #define VMCB_CR_INTCPT 0 #define VMCB_DR_INTCPT 1 #define VMCB_EXC_INTCPT 2 #define VMCB_CTRL1_INTCPT 3 #define VMCB_CTRL2_INTCPT 4 /* intercept[VMCB_CTRL1_INTCPT] fields */ #define VMCB_INTCPT_INTR BIT(0) #define VMCB_INTCPT_NMI BIT(1) #define VMCB_INTCPT_SMI BIT(2) #define VMCB_INTCPT_INIT BIT(3) #define VMCB_INTCPT_VINTR BIT(4) #define VMCB_INTCPT_CR0_WRITE BIT(5) #define VMCB_INTCPT_IDTR_READ BIT(6) #define VMCB_INTCPT_GDTR_READ BIT(7) #define VMCB_INTCPT_LDTR_READ BIT(8) #define VMCB_INTCPT_TR_READ BIT(9) #define VMCB_INTCPT_IDTR_WRITE BIT(10) #define VMCB_INTCPT_GDTR_WRITE BIT(11) #define VMCB_INTCPT_LDTR_WRITE BIT(12) #define VMCB_INTCPT_TR_WRITE BIT(13) #define VMCB_INTCPT_RDTSC BIT(14) #define VMCB_INTCPT_RDPMC BIT(15) #define VMCB_INTCPT_PUSHF BIT(16) #define VMCB_INTCPT_POPF BIT(17) #define VMCB_INTCPT_CPUID BIT(18) #define VMCB_INTCPT_RSM BIT(19) #define VMCB_INTCPT_IRET BIT(20) #define VMCB_INTCPT_INTn BIT(21) #define VMCB_INTCPT_INVD BIT(22) #define VMCB_INTCPT_PAUSE BIT(23) #define VMCB_INTCPT_HLT BIT(24) #define VMCB_INTCPT_INVPG BIT(25) #define VMCB_INTCPT_INVPGA BIT(26) #define VMCB_INTCPT_IO BIT(27) #define VMCB_INTCPT_MSR BIT(28) #define VMCB_INTCPT_TASK_SWITCH BIT(29) #define VMCB_INTCPT_FERR_FREEZE BIT(30) #define VMCB_INTCPT_SHUTDOWN BIT(31) /* intercept[VMCB_CTRL2_INTCPT] fields */ #define VMCB_INTCPT_VMRUN BIT(0) #define VMCB_INTCPT_VMMCALL BIT(1) #define VMCB_INTCPT_VMLOAD BIT(2) #define VMCB_INTCPT_VMSAVE BIT(3) #define VMCB_INTCPT_STGI BIT(4) #define VMCB_INTCPT_CLGI BIT(5) #define VMCB_INTCPT_SKINIT BIT(6) #define VMCB_INTCPT_RDTSCP BIT(7) #define VMCB_INTCPT_ICEBP BIT(8) #define VMCB_INTCPT_WBINVD BIT(9) #define VMCB_INTCPT_MONITOR BIT(10) #define VMCB_INTCPT_MWAIT BIT(11) #define VMCB_INTCPT_MWAIT_ARMED BIT(12) #define VMCB_INTCPT_XSETBV BIT(13) /* VMCB TLB control */ #define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */ #define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */ #define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */ #define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */ /* VMCB state caching */ #define VMCB_CACHE_NONE 0 /* No caching */ #define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, Pause filter */ #define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */ #define VMCB_CACHE_ASID BIT(2) /* ASID */ #define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */ #define VMCB_CACHE_NP BIT(4) /* Nested Paging */ #define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */ #define VMCB_CACHE_DR BIT(6) /* Debug registers */ #define VMCB_CACHE_DT BIT(7) /* GDT/IDT */ #define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */ #define VMCB_CACHE_CR2 BIT(9) /* page fault address */ #define VMCB_CACHE_LBR BIT(10) /* Last branch */ /* VMCB control event injection */ #define 
VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */ #define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */ /* Event types that can be injected */ #define VMCB_EVENTINJ_TYPE_INTR 0 #define VMCB_EVENTINJ_TYPE_NMI 2 #define VMCB_EVENTINJ_TYPE_EXCEPTION 3 #define VMCB_EVENTINJ_TYPE_INTn 4 /* VMCB exit code, APM vol2 Appendix C */ #define VMCB_EXIT_MC 0x52 #define VMCB_EXIT_INTR 0x60 #define VMCB_EXIT_NMI 0x61 #define VMCB_EXIT_VINTR 0x64 #define VMCB_EXIT_PUSHF 0x70 #define VMCB_EXIT_POPF 0x71 #define VMCB_EXIT_CPUID 0x72 #define VMCB_EXIT_IRET 0x74 #define VMCB_EXIT_PAUSE 0x77 #define VMCB_EXIT_HLT 0x78 #define VMCB_EXIT_IO 0x7B #define VMCB_EXIT_MSR 0x7C #define VMCB_EXIT_SHUTDOWN 0x7F #define VMCB_EXIT_VMSAVE 0x83 #define VMCB_EXIT_MONITOR 0x8A #define VMCB_EXIT_MWAIT 0x8B #define VMCB_EXIT_NPF 0x400 #define VMCB_EXIT_INVALID -1 /* * Nested page fault. * Bit definitions to decode EXITINFO1. */ #define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */ #define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */ #define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */ #define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */ #define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */ #define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */ #define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */ /* * EXITINTINFO, Interrupt exit info for all intrecepts. * Section 15.7.2, Intercepts during IDT Interrupt Delivery. */ #define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF) #define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7) #define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0) #define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0) #define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF) /* Offset of various VMCB fields. */ #define VMCB_OFF_CTRL(x) (x) #define VMCB_OFF_STATE(x) ((x) + 0x400) #define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0) #define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4) #define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8) #define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC) #define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10) #define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40) #define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48) #define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50) #define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58) #define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C) #define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60) #define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70) #define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78) #define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80) #define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88) #define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98) #define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0) #define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0) #define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0) #define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8) #define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228) #define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230) #define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238) #define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268) /* * Encode the VMCB offset and bytes that we want to read from VMCB. 
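 *
 * For example, VMCB_ACCESS(VMCB_OFF_ASID, 4) evaluates to 0x80040058:
 * bit 31 marks the identifier as a raw VMCB access, bits 19:16 carry
 * the width (4 bytes) and bits 11:0 the offset (0x58).  The
 * VMCB_ACCESS_OK/BYTES/OFFSET macros decode it again in vmcb_read(),
 * vmcb_write() and vmcb_access().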
*/ #define VMCB_ACCESS(o, w) (0x80000000 | (((w) & 0xF) << 16) | \ ((o) & 0xFFF)) #define VMCB_ACCESS_OK(v) ((v) & 0x80000000 ) #define VMCB_ACCESS_BYTES(v) (((v) >> 16) & 0xF) #define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF) #ifdef _KERNEL + +struct svm_softc; +struct vm_snapshot_meta; + /* VMCB save state area segment format */ struct vmcb_segment { uint16_t selector; uint16_t attrib; uint32_t limit; uint64_t base; } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb_segment) == 16); /* Code segment descriptor attribute in 12 bit format as saved by VMCB. */ #define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */ #define VMCB_CS_ATTRIB_D BIT(10) /* OPerand size bit. */ /* * The VMCB is divided into two areas - the first one contains various * control bits including the intercept vector and the second one contains * the guest state. */ /* VMCB control area - padded up to 1024 bytes */ struct vmcb_ctrl { uint32_t intercept[5]; /* all intercepts */ uint8_t pad1[0x28]; /* Offsets 0x14-0x3B are reserved. */ uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */ uint16_t pause_filcnt; /* Offset 0x3E, PAUSE filter count */ uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */ uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */ uint64_t tsc_offset; /* 0x50: TSC_OFFSET */ uint32_t asid; /* 0x58: Guest ASID */ uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */ uint8_t pad2[3]; /* 0x5D-0x5F: Reserved. */ uint8_t v_tpr; /* 0x60: V_TPR, guest CR8 */ uint8_t v_irq:1; /* Is virtual interrupt pending? */ uint8_t :7; /* Padding */ uint8_t v_intr_prio:4; /* 0x62: Priority for virtual interrupt. */ uint8_t v_ign_tpr:1; uint8_t :3; uint8_t v_intr_masking:1; /* Guest and host sharing of RFLAGS. */ uint8_t :7; uint8_t v_intr_vector; /* 0x64: Vector for virtual interrupt. */ uint8_t pad3[3]; /* 0x65-0x67 Reserved. */ uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */ uint64_t :63; uint64_t exitcode; /* 0x70, Exitcode */ uint64_t exitinfo1; /* 0x78, EXITINFO1 */ uint64_t exitinfo2; /* 0x80, EXITINFO2 */ uint64_t exitintinfo; /* 0x88, Interrupt exit value. */ uint64_t np_enable:1; /* 0x90, Nested paging enable. */ uint64_t :63; uint8_t pad4[0x10]; /* 0x98-0xA7 reserved. */ uint64_t eventinj; /* 0xA8, Event injection. */ uint64_t n_cr3; /* B0, Nested page table. */ uint64_t lbr_virt_en:1; /* Enable LBR virtualization. */ uint64_t :63; uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */ uint32_t :32; /* 0xC4: Reserved */ uint64_t nrip; /* 0xC8: Guest next nRIP. 
*/ uint8_t inst_len; /* 0xD0: #NPF decode assist */ uint8_t inst_bytes[15]; uint8_t padd6[0x320]; } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb_ctrl) == 1024); struct vmcb_state { struct vmcb_segment es; struct vmcb_segment cs; struct vmcb_segment ss; struct vmcb_segment ds; struct vmcb_segment fs; struct vmcb_segment gs; struct vmcb_segment gdt; struct vmcb_segment ldt; struct vmcb_segment idt; struct vmcb_segment tr; uint8_t pad1[0x2b]; /* Reserved: 0xA0-0xCA */ uint8_t cpl; uint8_t pad2[4]; uint64_t efer; uint8_t pad3[0x70]; /* Reserved: 0xd8-0x147 */ uint64_t cr4; uint64_t cr3; /* Guest CR3 */ uint64_t cr0; uint64_t dr7; uint64_t dr6; uint64_t rflags; uint64_t rip; uint8_t pad4[0x58]; /* Reserved: 0x180-0x1D7 */ uint64_t rsp; uint8_t pad5[0x18]; /* Reserved 0x1E0-0x1F7 */ uint64_t rax; uint64_t star; uint64_t lstar; uint64_t cstar; uint64_t sfmask; uint64_t kernelgsbase; uint64_t sysenter_cs; uint64_t sysenter_esp; uint64_t sysenter_eip; uint64_t cr2; uint8_t pad6[0x20]; uint64_t g_pat; uint64_t dbgctl; uint64_t br_from; uint64_t br_to; uint64_t int_from; uint64_t int_to; uint8_t pad7[0x968]; /* Reserved up to end of VMCB */ } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb_state) == 0xC00); struct vmcb { struct vmcb_ctrl ctrl; struct vmcb_state state; } __attribute__ ((__packed__)); CTASSERT(sizeof(struct vmcb) == PAGE_SIZE); CTASSERT(offsetof(struct vmcb, state) == 0x400); int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval); int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val); int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); +#ifdef BHYVE_SNAPSHOT +int vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val); +int vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val); +int vmcb_snapshot_desc(void *arg, int vcpu, int reg, + struct vm_snapshot_meta *meta); +int vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident, + struct vm_snapshot_meta *meta); +#endif #endif /* _KERNEL */ #endif /* _VMCB_H_ */ diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index 7632ba930f37..4ccdc1f61f34 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -1,521 +1,645 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ +#include "opt_bhyve_snapshot.h" #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include +#include #include "vmm_host.h" #include "vmx_cpufunc.h" #include "vmcs.h" #include "ept.h" #include "vmx.h" #ifdef DDB #include #endif SYSCTL_DECL(_hw_vmm_vmx); static int no_flush_rsb; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, no_flush_rsb, CTLFLAG_RW, &no_flush_rsb, 0, "Do not flush RSB upon vmexit"); static uint64_t vmcs_fix_regval(uint32_t encoding, uint64_t val) { switch (encoding) { case VMCS_GUEST_CR0: val = vmx_fix_cr0(val); break; case VMCS_GUEST_CR4: val = vmx_fix_cr4(val); break; default: break; } return (val); } static uint32_t vmcs_field_encoding(int ident) { switch (ident) { case VM_REG_GUEST_CR0: return (VMCS_GUEST_CR0); case VM_REG_GUEST_CR3: return (VMCS_GUEST_CR3); case VM_REG_GUEST_CR4: return (VMCS_GUEST_CR4); case VM_REG_GUEST_DR7: return (VMCS_GUEST_DR7); case VM_REG_GUEST_RSP: return (VMCS_GUEST_RSP); case VM_REG_GUEST_RIP: return (VMCS_GUEST_RIP); case VM_REG_GUEST_RFLAGS: return (VMCS_GUEST_RFLAGS); case VM_REG_GUEST_ES: return (VMCS_GUEST_ES_SELECTOR); case VM_REG_GUEST_CS: return (VMCS_GUEST_CS_SELECTOR); case VM_REG_GUEST_SS: return (VMCS_GUEST_SS_SELECTOR); case VM_REG_GUEST_DS: return (VMCS_GUEST_DS_SELECTOR); case VM_REG_GUEST_FS: return (VMCS_GUEST_FS_SELECTOR); case VM_REG_GUEST_GS: return (VMCS_GUEST_GS_SELECTOR); case VM_REG_GUEST_TR: return (VMCS_GUEST_TR_SELECTOR); case VM_REG_GUEST_LDTR: return (VMCS_GUEST_LDTR_SELECTOR); case VM_REG_GUEST_EFER: return (VMCS_GUEST_IA32_EFER); case VM_REG_GUEST_PDPTE0: return (VMCS_GUEST_PDPTE0); case VM_REG_GUEST_PDPTE1: return (VMCS_GUEST_PDPTE1); case VM_REG_GUEST_PDPTE2: return (VMCS_GUEST_PDPTE2); case VM_REG_GUEST_PDPTE3: return (VMCS_GUEST_PDPTE3); case VM_REG_GUEST_ENTRY_INST_LENGTH: return (VMCS_ENTRY_INST_LENGTH); default: return (-1); } } static int vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) { switch (seg) { case VM_REG_GUEST_ES: *base = VMCS_GUEST_ES_BASE; *lim = VMCS_GUEST_ES_LIMIT; *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; break; case VM_REG_GUEST_CS: *base = VMCS_GUEST_CS_BASE; *lim = VMCS_GUEST_CS_LIMIT; *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; break; case VM_REG_GUEST_SS: *base = VMCS_GUEST_SS_BASE; *lim = VMCS_GUEST_SS_LIMIT; *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; break; case VM_REG_GUEST_DS: *base = VMCS_GUEST_DS_BASE; *lim = VMCS_GUEST_DS_LIMIT; *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; break; case VM_REG_GUEST_FS: *base = VMCS_GUEST_FS_BASE; *lim = VMCS_GUEST_FS_LIMIT; *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; break; case VM_REG_GUEST_GS: *base = VMCS_GUEST_GS_BASE; *lim = VMCS_GUEST_GS_LIMIT; *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; break; case VM_REG_GUEST_TR: *base = VMCS_GUEST_TR_BASE; *lim = VMCS_GUEST_TR_LIMIT; *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; break; case VM_REG_GUEST_LDTR: *base = VMCS_GUEST_LDTR_BASE; *lim = VMCS_GUEST_LDTR_LIMIT; *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; break; case VM_REG_GUEST_IDTR: *base = 
VMCS_GUEST_IDTR_BASE; *lim = VMCS_GUEST_IDTR_LIMIT; *acc = VMCS_INVALID_ENCODING; break; case VM_REG_GUEST_GDTR: *base = VMCS_GUEST_GDTR_BASE; *lim = VMCS_GUEST_GDTR_LIMIT; *acc = VMCS_INVALID_ENCODING; break; default: return (EINVAL); } return (0); } int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval) { int error; uint32_t encoding; /* * If we need to get at vmx-specific state in the VMCS we can bypass * the translation of 'ident' to 'encoding' by simply setting the * sign bit. As it so happens the upper 16 bits are reserved (i.e * set to 0) in the encodings for the VMCS so we are free to use the * sign bit. */ if (ident < 0) encoding = ident & 0x7fffffff; else encoding = vmcs_field_encoding(ident); if (encoding == (uint32_t)-1) return (EINVAL); if (!running) VMPTRLD(vmcs); error = vmread(encoding, retval); if (!running) VMCLEAR(vmcs); return (error); } int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val) { int error; uint32_t encoding; if (ident < 0) encoding = ident & 0x7fffffff; else encoding = vmcs_field_encoding(ident); if (encoding == (uint32_t)-1) return (EINVAL); val = vmcs_fix_regval(encoding, val); if (!running) VMPTRLD(vmcs); error = vmwrite(encoding, val); if (!running) VMCLEAR(vmcs); return (error); } int vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) { int error; uint32_t base, limit, access; error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); if (error != 0) panic("vmcs_setdesc: invalid segment register %d", seg); if (!running) VMPTRLD(vmcs); if ((error = vmwrite(base, desc->base)) != 0) goto done; if ((error = vmwrite(limit, desc->limit)) != 0) goto done; if (access != VMCS_INVALID_ENCODING) { if ((error = vmwrite(access, desc->access)) != 0) goto done; } done: if (!running) VMCLEAR(vmcs); return (error); } int vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) { int error; uint32_t base, limit, access; uint64_t u64; error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); if (error != 0) panic("vmcs_getdesc: invalid segment register %d", seg); if (!running) VMPTRLD(vmcs); if ((error = vmread(base, &u64)) != 0) goto done; desc->base = u64; if ((error = vmread(limit, &u64)) != 0) goto done; desc->limit = u64; if (access != VMCS_INVALID_ENCODING) { if ((error = vmread(access, &u64)) != 0) goto done; desc->access = u64; } done: if (!running) VMCLEAR(vmcs); return (error); } int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) { int error; VMPTRLD(vmcs); /* * Guest MSRs are saved in the VM-exit MSR-store area. * Guest MSRs are loaded from the VM-entry MSR-load area. * Both areas point to the same location in memory. */ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) goto done; if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) goto done; if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) goto done; if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) goto done; error = 0; done: VMCLEAR(vmcs); return (error); } int vmcs_init(struct vmcs *vmcs) { int error, codesel, datasel, tsssel; u_long cr0, cr4, efer; uint64_t pat, fsbase, idtrbase; codesel = vmm_get_host_codesel(); datasel = vmm_get_host_datasel(); tsssel = vmm_get_host_tsssel(); /* * Make sure we have a "current" VMCS to work with. 
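 *
 * VMPTRLD() makes this VMCS the current one on this cpu so that the
 * vmwrite()s that follow operate on it; the VMCS is VMCLEAR()ed again
 * before returning, mirroring how vmcs_getreg()/vmcs_setreg() above
 * bracket their accesses when the vcpu is not running.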
*/ VMPTRLD(vmcs); /* Host state */ /* Initialize host IA32_PAT MSR */ pat = vmm_get_host_pat(); if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) goto done; /* Load the IA32_EFER MSR */ efer = vmm_get_host_efer(); if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) goto done; /* Load the control registers */ cr0 = vmm_get_host_cr0(); if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) goto done; cr4 = vmm_get_host_cr4() | CR4_VMXE; if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) goto done; /* Load the segment selectors */ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) goto done; if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) goto done; /* * Load the Base-Address for %fs and idtr. * * Note that we exclude %gs, tss and gdtr here because their base * address is pcpu specific. */ fsbase = vmm_get_host_fsbase(); if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) goto done; idtrbase = vmm_get_host_idtrbase(); if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) goto done; /* instruction pointer */ if (no_flush_rsb) { if ((error = vmwrite(VMCS_HOST_RIP, (u_long)vmx_exit_guest)) != 0) goto done; } else { if ((error = vmwrite(VMCS_HOST_RIP, (u_long)vmx_exit_guest_flush_rsb)) != 0) goto done; } /* link pointer */ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) goto done; done: VMCLEAR(vmcs); return (error); } +#ifdef BHYVE_SNAPSHOT +int +vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val) +{ + int error; + + if (!running) + VMPTRLD(vmcs); + + error = vmread(ident, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val) +{ + int error; + + if (!running) + VMPTRLD(vmcs); + + error = vmwrite(ident, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getreg(vmcs, running, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcs_setreg(vmcs, running, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg, + struct vm_snapshot_meta *meta) +{ + int ret; + struct seg_desc desc; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getdesc(vmcs, running, seg, &desc); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + + ret = vmcs_setdesc(vmcs, running, seg, &desc); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcs_snapshot_any(struct vmcs *vmcs, int 
running, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getany(vmcs, running, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcs_setany(vmcs, running, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif + #ifdef DDB extern int vmxon_enabled[]; DB_SHOW_COMMAND(vmcs, db_show_vmcs) { uint64_t cur_vmcs, val; uint32_t exit; if (!vmxon_enabled[curcpu]) { db_printf("VMX not enabled\n"); return; } if (have_addr) { db_printf("Only current VMCS supported\n"); return; } vmptrst(&cur_vmcs); if (cur_vmcs == VMCS_INITIAL) { db_printf("No current VM context\n"); return; } db_printf("VMCS: %jx\n", cur_vmcs); db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID)); db_printf("Activity: "); val = vmcs_read(VMCS_GUEST_ACTIVITY); switch (val) { case 0: db_printf("Active"); break; case 1: db_printf("HLT"); break; case 2: db_printf("Shutdown"); break; case 3: db_printf("Wait for SIPI"); break; default: db_printf("Unknown: %#lx", val); } db_printf("\n"); exit = vmcs_read(VMCS_EXIT_REASON); if (exit & 0x80000000) db_printf("Entry Failure Reason: %u\n", exit & 0xffff); else db_printf("Exit Reason: %u\n", exit & 0xffff); db_printf("Qualification: %#lx\n", vmcs_exit_qualification()); db_printf("Guest Linear Address: %#lx\n", vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)); switch (exit & 0x8000ffff) { case EXIT_REASON_EXCEPTION: case EXIT_REASON_EXT_INTR: val = vmcs_read(VMCS_EXIT_INTR_INFO); db_printf("Interrupt Type: "); switch (val >> 8 & 0x7) { case 0: db_printf("external"); break; case 2: db_printf("NMI"); break; case 3: db_printf("HW exception"); break; case 4: db_printf("SW exception"); break; default: db_printf("?? %lu", val >> 8 & 0x7); break; } db_printf(" Vector: %lu", val & 0xff); if (val & 0x800) db_printf(" Error Code: %lx", vmcs_read(VMCS_EXIT_INTR_ERRCODE)); db_printf("\n"); break; case EXIT_REASON_EPT_FAULT: case EXIT_REASON_EPT_MISCONFIG: db_printf("Guest Physical Address: %#lx\n", vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)); break; } db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error()); } #endif diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index 29e0263fb9f2..8aa7b1e8fc08 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -1,411 +1,424 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMCS_H_ #define _VMCS_H_ #ifdef _KERNEL + +struct vm_snapshot_meta; + struct vmcs { uint32_t identifier; uint32_t abort_code; char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; }; CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); /* MSR save region is composed of an array of 'struct msr_entry' */ struct msr_entry { uint32_t index; uint32_t reserved; uint64_t val; }; int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); int vmcs_init(struct vmcs *vmcs); int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv); int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val); int vmcs_getdesc(struct vmcs *vmcs, int running, int ident, struct seg_desc *desc); int vmcs_setdesc(struct vmcs *vmcs, int running, int ident, struct seg_desc *desc); +#ifdef BHYVE_SNAPSHOT +int vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val); +int vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val); +int vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta); +int vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg, + struct vm_snapshot_meta *meta); +int vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta); +#endif /* * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h */ #ifdef _VMX_CPUFUNC_H_ static __inline uint64_t vmcs_read(uint32_t encoding) { int error; uint64_t val; error = vmread(encoding, &val); KASSERT(error == 0, ("vmcs_read(%u) error %d", encoding, error)); return (val); } static __inline void vmcs_write(uint32_t encoding, uint64_t val) { int error; error = vmwrite(encoding, val); KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error)); } #endif /* _VMX_CPUFUNC_H_ */ #define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) #define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) #define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) #define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) #define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) #define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) #define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) #define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) #define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) #define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) #endif /* _KERNEL */ #define VMCS_INITIAL 0xffffffffffffffff #define VMCS_IDENT(encoding) ((encoding) | 0x80000000) /* * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
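 *
 * The groups below follow the layout of the encoding itself: bits 14:13
 * select the field width (0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural
 * width) and bits 11:10 the field type (0 = control, 1 = read-only data,
 * 2 = guest state, 3 = host state).  For example, VMCS_HOST_RIP (0x6c16)
 * is a natural-width host-state field with index 11.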
*/ #define VMCS_INVALID_ENCODING 0xffffffff /* 16-bit control fields */ #define VMCS_VPID 0x00000000 #define VMCS_PIR_VECTOR 0x00000002 /* 16-bit guest-state fields */ #define VMCS_GUEST_ES_SELECTOR 0x00000800 #define VMCS_GUEST_CS_SELECTOR 0x00000802 #define VMCS_GUEST_SS_SELECTOR 0x00000804 #define VMCS_GUEST_DS_SELECTOR 0x00000806 #define VMCS_GUEST_FS_SELECTOR 0x00000808 #define VMCS_GUEST_GS_SELECTOR 0x0000080A #define VMCS_GUEST_LDTR_SELECTOR 0x0000080C #define VMCS_GUEST_TR_SELECTOR 0x0000080E #define VMCS_GUEST_INTR_STATUS 0x00000810 /* 16-bit host-state fields */ #define VMCS_HOST_ES_SELECTOR 0x00000C00 #define VMCS_HOST_CS_SELECTOR 0x00000C02 #define VMCS_HOST_SS_SELECTOR 0x00000C04 #define VMCS_HOST_DS_SELECTOR 0x00000C06 #define VMCS_HOST_FS_SELECTOR 0x00000C08 #define VMCS_HOST_GS_SELECTOR 0x00000C0A #define VMCS_HOST_TR_SELECTOR 0x00000C0C /* 64-bit control fields */ #define VMCS_IO_BITMAP_A 0x00002000 #define VMCS_IO_BITMAP_B 0x00002002 #define VMCS_MSR_BITMAP 0x00002004 #define VMCS_EXIT_MSR_STORE 0x00002006 #define VMCS_EXIT_MSR_LOAD 0x00002008 #define VMCS_ENTRY_MSR_LOAD 0x0000200A #define VMCS_EXECUTIVE_VMCS 0x0000200C #define VMCS_TSC_OFFSET 0x00002010 #define VMCS_VIRTUAL_APIC 0x00002012 #define VMCS_APIC_ACCESS 0x00002014 #define VMCS_PIR_DESC 0x00002016 #define VMCS_EPTP 0x0000201A #define VMCS_EOI_EXIT0 0x0000201C #define VMCS_EOI_EXIT1 0x0000201E #define VMCS_EOI_EXIT2 0x00002020 #define VMCS_EOI_EXIT3 0x00002022 #define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2) /* 64-bit read-only fields */ #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 /* 64-bit guest-state fields */ #define VMCS_LINK_POINTER 0x00002800 #define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 #define VMCS_GUEST_IA32_PAT 0x00002804 #define VMCS_GUEST_IA32_EFER 0x00002806 #define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 #define VMCS_GUEST_PDPTE0 0x0000280A #define VMCS_GUEST_PDPTE1 0x0000280C #define VMCS_GUEST_PDPTE2 0x0000280E #define VMCS_GUEST_PDPTE3 0x00002810 /* 64-bit host-state fields */ #define VMCS_HOST_IA32_PAT 0x00002C00 #define VMCS_HOST_IA32_EFER 0x00002C02 #define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 /* 32-bit control fields */ #define VMCS_PIN_BASED_CTLS 0x00004000 #define VMCS_PRI_PROC_BASED_CTLS 0x00004002 #define VMCS_EXCEPTION_BITMAP 0x00004004 #define VMCS_PF_ERROR_MASK 0x00004006 #define VMCS_PF_ERROR_MATCH 0x00004008 #define VMCS_CR3_TARGET_COUNT 0x0000400A #define VMCS_EXIT_CTLS 0x0000400C #define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E #define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 #define VMCS_ENTRY_CTLS 0x00004012 #define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 #define VMCS_ENTRY_INTR_INFO 0x00004016 #define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 #define VMCS_ENTRY_INST_LENGTH 0x0000401A #define VMCS_TPR_THRESHOLD 0x0000401C #define VMCS_SEC_PROC_BASED_CTLS 0x0000401E #define VMCS_PLE_GAP 0x00004020 #define VMCS_PLE_WINDOW 0x00004022 /* 32-bit read-only data fields */ #define VMCS_INSTRUCTION_ERROR 0x00004400 #define VMCS_EXIT_REASON 0x00004402 #define VMCS_EXIT_INTR_INFO 0x00004404 #define VMCS_EXIT_INTR_ERRCODE 0x00004406 #define VMCS_IDT_VECTORING_INFO 0x00004408 #define VMCS_IDT_VECTORING_ERROR 0x0000440A #define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C #define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E /* 32-bit guest-state fields */ #define VMCS_GUEST_ES_LIMIT 0x00004800 #define VMCS_GUEST_CS_LIMIT 0x00004802 #define VMCS_GUEST_SS_LIMIT 0x00004804 #define VMCS_GUEST_DS_LIMIT 0x00004806 #define VMCS_GUEST_FS_LIMIT 0x00004808 #define VMCS_GUEST_GS_LIMIT 0x0000480A 
#define VMCS_GUEST_LDTR_LIMIT 0x0000480C #define VMCS_GUEST_TR_LIMIT 0x0000480E #define VMCS_GUEST_GDTR_LIMIT 0x00004810 #define VMCS_GUEST_IDTR_LIMIT 0x00004812 #define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 #define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 #define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 #define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A #define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C #define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E #define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 #define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 #define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 #define VMCS_GUEST_ACTIVITY 0x00004826 #define VMCS_GUEST_SMBASE 0x00004828 #define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A #define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E /* 32-bit host state fields */ #define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 /* Natural Width control fields */ #define VMCS_CR0_MASK 0x00006000 #define VMCS_CR4_MASK 0x00006002 #define VMCS_CR0_SHADOW 0x00006004 #define VMCS_CR4_SHADOW 0x00006006 #define VMCS_CR3_TARGET0 0x00006008 #define VMCS_CR3_TARGET1 0x0000600A #define VMCS_CR3_TARGET2 0x0000600C #define VMCS_CR3_TARGET3 0x0000600E /* Natural Width read-only fields */ #define VMCS_EXIT_QUALIFICATION 0x00006400 #define VMCS_IO_RCX 0x00006402 #define VMCS_IO_RSI 0x00006404 #define VMCS_IO_RDI 0x00006406 #define VMCS_IO_RIP 0x00006408 #define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A /* Natural Width guest-state fields */ #define VMCS_GUEST_CR0 0x00006800 #define VMCS_GUEST_CR3 0x00006802 #define VMCS_GUEST_CR4 0x00006804 #define VMCS_GUEST_ES_BASE 0x00006806 #define VMCS_GUEST_CS_BASE 0x00006808 #define VMCS_GUEST_SS_BASE 0x0000680A #define VMCS_GUEST_DS_BASE 0x0000680C #define VMCS_GUEST_FS_BASE 0x0000680E #define VMCS_GUEST_GS_BASE 0x00006810 #define VMCS_GUEST_LDTR_BASE 0x00006812 #define VMCS_GUEST_TR_BASE 0x00006814 #define VMCS_GUEST_GDTR_BASE 0x00006816 #define VMCS_GUEST_IDTR_BASE 0x00006818 #define VMCS_GUEST_DR7 0x0000681A #define VMCS_GUEST_RSP 0x0000681C #define VMCS_GUEST_RIP 0x0000681E #define VMCS_GUEST_RFLAGS 0x00006820 #define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 #define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 #define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 /* Natural Width host-state fields */ #define VMCS_HOST_CR0 0x00006C00 #define VMCS_HOST_CR3 0x00006C02 #define VMCS_HOST_CR4 0x00006C04 #define VMCS_HOST_FS_BASE 0x00006C06 #define VMCS_HOST_GS_BASE 0x00006C08 #define VMCS_HOST_TR_BASE 0x00006C0A #define VMCS_HOST_GDTR_BASE 0x00006C0C #define VMCS_HOST_IDTR_BASE 0x00006C0E #define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 #define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 #define VMCS_HOST_RSP 0x00006C14 #define VMCS_HOST_RIP 0x00006c16 /* * VM instruction error numbers */ #define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 /* * VMCS exit reasons */ #define EXIT_REASON_EXCEPTION 0 #define EXIT_REASON_EXT_INTR 1 #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT 3 #define EXIT_REASON_SIPI 4 #define EXIT_REASON_IO_SMI 5 #define EXIT_REASON_SMI 6 #define EXIT_REASON_INTR_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 #define EXIT_REASON_GETSEC 11 #define EXIT_REASON_HLT 12 #define EXIT_REASON_INVD 13 #define EXIT_REASON_INVLPG 14 #define EXIT_REASON_RDPMC 15 #define EXIT_REASON_RDTSC 16 #define EXIT_REASON_RSM 17 #define EXIT_REASON_VMCALL 18 #define EXIT_REASON_VMCLEAR 19 #define EXIT_REASON_VMLAUNCH 20 #define EXIT_REASON_VMPTRLD 21 #define EXIT_REASON_VMPTRST 22 #define EXIT_REASON_VMREAD 23 #define EXIT_REASON_VMRESUME 
24 #define EXIT_REASON_VMWRITE 25 #define EXIT_REASON_VMXOFF 26 #define EXIT_REASON_VMXON 27 #define EXIT_REASON_CR_ACCESS 28 #define EXIT_REASON_DR_ACCESS 29 #define EXIT_REASON_INOUT 30 #define EXIT_REASON_RDMSR 31 #define EXIT_REASON_WRMSR 32 #define EXIT_REASON_INVAL_VMCS 33 #define EXIT_REASON_INVAL_MSR 34 #define EXIT_REASON_MWAIT 36 #define EXIT_REASON_MTF 37 #define EXIT_REASON_MONITOR 39 #define EXIT_REASON_PAUSE 40 #define EXIT_REASON_MCE_DURING_ENTRY 41 #define EXIT_REASON_TPR 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_VIRTUALIZED_EOI 45 #define EXIT_REASON_GDTR_IDTR 46 #define EXIT_REASON_LDTR_TR 47 #define EXIT_REASON_EPT_FAULT 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 #define EXIT_REASON_RDTSCP 51 #define EXIT_REASON_VMX_PREEMPT 52 #define EXIT_REASON_INVVPID 53 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_RDRAND 57 #define EXIT_REASON_INVPCID 58 #define EXIT_REASON_VMFUNC 59 #define EXIT_REASON_ENCLS 60 #define EXIT_REASON_RDSEED 61 #define EXIT_REASON_PM_LOG_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 /* * NMI unblocking due to IRET. * * Applies to VM-exits due to hardware exception or EPT fault. */ #define EXIT_QUAL_NMIUDTI (1 << 12) /* * VMCS interrupt information fields */ #define VMCS_INTR_VALID (1U << 31) #define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */ #define VMCS_INTR_T_HWINTR (0 << 8) #define VMCS_INTR_T_NMI (2 << 8) #define VMCS_INTR_T_HWEXCEPTION (3 << 8) #define VMCS_INTR_T_SWINTR (4 << 8) #define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) #define VMCS_INTR_T_SWEXCEPTION (6 << 8) #define VMCS_INTR_DEL_ERRCODE (1 << 11) /* * VMCS IDT-Vectoring information fields */ #define VMCS_IDT_VEC_VALID (1U << 31) #define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11) /* * VMCS Guest interruptibility field */ #define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) #define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) #define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) #define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) /* * Exit qualification for EXIT_REASON_INVAL_VMCS */ #define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 /* * Exit qualification for EPT violation */ #define EPT_VIOLATION_DATA_READ (1UL << 0) #define EPT_VIOLATION_DATA_WRITE (1UL << 1) #define EPT_VIOLATION_INST_FETCH (1UL << 2) #define EPT_VIOLATION_GPA_READABLE (1UL << 3) #define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4) #define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5) #define EPT_VIOLATION_GLA_VALID (1UL << 7) #define EPT_VIOLATION_XLAT_VALID (1UL << 8) /* * Exit qualification for APIC-access VM exit */ #define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF) #define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF) /* * Exit qualification for APIC-write VM exit */ #define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF) #endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 9f610ea50852..21a1b9fdefc4 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1,3896 +1,4058 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include + #include "vmm_lapic.h" #include "vmm_host.h" #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "ept.h" #include "vmx_cpufunc.h" #include "vmx.h" #include "vmx_msr.h" #include "x86.h" #include "vmx_controls.h" #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ PINBASED_NMI_EXITING | \ PINBASED_VIRTUAL_NMI) #define PINBASED_CTLS_ZERO_SETTING 0 #define PROCBASED_CTLS_WINDOW_SETTING \ (PROCBASED_INT_WINDOW_EXITING | \ PROCBASED_NMI_WINDOW_EXITING) #define PROCBASED_CTLS_ONE_SETTING \ (PROCBASED_SECONDARY_CONTROLS | \ PROCBASED_MWAIT_EXITING | \ PROCBASED_MONITOR_EXITING | \ PROCBASED_IO_EXITING | \ PROCBASED_MSR_BITMAPS | \ PROCBASED_CTLS_WINDOW_SETTING | \ PROCBASED_CR8_LOAD_EXITING | \ PROCBASED_CR8_STORE_EXITING) #define PROCBASED_CTLS_ZERO_SETTING \ (PROCBASED_CR3_LOAD_EXITING | \ PROCBASED_CR3_STORE_EXITING | \ PROCBASED_IO_BITMAPS) #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING \ (VM_EXIT_SAVE_DEBUG_CONTROLS | \ VM_EXIT_HOST_LMA | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER | \ VM_EXIT_ACKNOWLEDGE_INTERRUPT) #define VM_EXIT_CTLS_ZERO_SETTING 0 #define VM_ENTRY_CTLS_ONE_SETTING \ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ VM_ENTRY_LOAD_EFER) #define VM_ENTRY_CTLS_ZERO_SETTING \ (VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) #define HANDLED 1 #define UNHANDLED 0 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; static uint32_t exit_ctls, entry_ctls; static uint64_t cr0_ones_mask, cr0_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, &cr0_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, &cr0_zeros_mask, 0, NULL); static uint64_t cr4_ones_mask, cr4_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, &cr4_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, &cr4_zeros_mask, 0, NULL); 
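/*
 * For reference: the ones/zeros masks above are derived in vmx_init() from
 * the MSR_VMX_CR0_FIXED{0,1} and MSR_VMX_CR4_FIXED{0,1} MSRs (and then
 * adjusted for unrestricted guests and CR0_NW/CR0_CD).  A bit that reads as
 * 1 in both FIXED0 and FIXED1 must be 1 while in VMX operation and a bit
 * that reads as 0 in both must be 0:
 *
 *	cr0_ones_mask  = fixed0 & fixed1;
 *	cr0_zeros_mask = ~fixed0 & ~fixed1;
 *
 * vmx_fix_cr0()/vmx_fix_cr4() then apply them as
 * (val | ones_mask) & ~zeros_mask whenever a guest CR0/CR4 value is written.
 */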
static int vmx_initialized; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, &vmx_initialized, 0, "Intel VMX initialized"); /* * Optional capabilities */ static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); static int cap_halt_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, "HLT triggers a VM-exit"); static int cap_pause_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 0, "PAUSE triggers a VM-exit"); static int cap_unrestricted_guest; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, &cap_unrestricted_guest, 0, "Unrestricted guests"); static int cap_monitor_trap; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, &cap_monitor_trap, 0, "Monitor trap flag"); static int cap_invpcid; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 0, "Guests are allowed to use INVPCID"); static int tpr_shadowing; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, CTLFLAG_RD, &tpr_shadowing, 0, "TPR shadowing support"); static int virtual_interrupt_delivery; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); static int pirvec = -1; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, &pirvec, 0, "APICv posted interrupt vector"); static struct unrhdr *vpid_unr; static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); int guest_l1d_flush; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, &guest_l1d_flush, 0, NULL); int guest_l1d_flush_sw; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, &guest_l1d_flush_sw, 0, NULL); static struct msr_entry msr_load_list[1] __aligned(16); /* * The definitions of SDT probes for VMX. 
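 *
 * Each SDT_PROBE_DEFINEn(vmm, vmx, exit, <name>, ...) below is visible to
 * DTrace as vmm:vmx:exit:<name>; the first three arguments are the struct
 * vmx, the vcpu index and the struct vm_exit describing the exit.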
*/ SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, "struct vmx *", "int", "struct vm_exit *", "uint64_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, "struct vmx *", "int", "struct vm_exit *", "uint64_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, return, "struct vmx *", "int", "struct vm_exit *", "int"); /* * Use the last page below 4GB as the APIC access address. This address is * occupied by the boot firmware so it is guaranteed that it will not conflict * with a page in system memory. 
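 * (0xFFFFF000 is the base of the last 4KB page below 4GB, i.e. 4GB minus
 * PAGE_SIZE.)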
*/ #define APIC_ACCESS_ADDRESS 0xFFFFF000 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); static void vmx_inject_pir(struct vlapic *vlapic); +#ifdef BHYVE_SNAPSHOT +static int vmx_restore_tsc(void *arg, int vcpu, uint64_t now); +#endif #ifdef KTR static const char * exit_reason_to_str(int reason) { static char reasonbuf[32]; switch (reason) { case EXIT_REASON_EXCEPTION: return "exception"; case EXIT_REASON_EXT_INTR: return "extint"; case EXIT_REASON_TRIPLE_FAULT: return "triplefault"; case EXIT_REASON_INIT: return "init"; case EXIT_REASON_SIPI: return "sipi"; case EXIT_REASON_IO_SMI: return "iosmi"; case EXIT_REASON_SMI: return "smi"; case EXIT_REASON_INTR_WINDOW: return "intrwindow"; case EXIT_REASON_NMI_WINDOW: return "nmiwindow"; case EXIT_REASON_TASK_SWITCH: return "taskswitch"; case EXIT_REASON_CPUID: return "cpuid"; case EXIT_REASON_GETSEC: return "getsec"; case EXIT_REASON_HLT: return "hlt"; case EXIT_REASON_INVD: return "invd"; case EXIT_REASON_INVLPG: return "invlpg"; case EXIT_REASON_RDPMC: return "rdpmc"; case EXIT_REASON_RDTSC: return "rdtsc"; case EXIT_REASON_RSM: return "rsm"; case EXIT_REASON_VMCALL: return "vmcall"; case EXIT_REASON_VMCLEAR: return "vmclear"; case EXIT_REASON_VMLAUNCH: return "vmlaunch"; case EXIT_REASON_VMPTRLD: return "vmptrld"; case EXIT_REASON_VMPTRST: return "vmptrst"; case EXIT_REASON_VMREAD: return "vmread"; case EXIT_REASON_VMRESUME: return "vmresume"; case EXIT_REASON_VMWRITE: return "vmwrite"; case EXIT_REASON_VMXOFF: return "vmxoff"; case EXIT_REASON_VMXON: return "vmxon"; case EXIT_REASON_CR_ACCESS: return "craccess"; case EXIT_REASON_DR_ACCESS: return "draccess"; case EXIT_REASON_INOUT: return "inout"; case EXIT_REASON_RDMSR: return "rdmsr"; case EXIT_REASON_WRMSR: return "wrmsr"; case EXIT_REASON_INVAL_VMCS: return "invalvmcs"; case EXIT_REASON_INVAL_MSR: return "invalmsr"; case EXIT_REASON_MWAIT: return "mwait"; case EXIT_REASON_MTF: return "mtf"; case EXIT_REASON_MONITOR: return "monitor"; case EXIT_REASON_PAUSE: return "pause"; case EXIT_REASON_MCE_DURING_ENTRY: return "mce-during-entry"; case EXIT_REASON_TPR: return "tpr"; case EXIT_REASON_APIC_ACCESS: return "apic-access"; case EXIT_REASON_GDTR_IDTR: return "gdtridtr"; case EXIT_REASON_LDTR_TR: return "ldtrtr"; case EXIT_REASON_EPT_FAULT: return "eptfault"; case EXIT_REASON_EPT_MISCONFIG: return "eptmisconfig"; case EXIT_REASON_INVEPT: return "invept"; case EXIT_REASON_RDTSCP: return "rdtscp"; case EXIT_REASON_VMX_PREEMPT: return "vmxpreempt"; case EXIT_REASON_INVVPID: return "invvpid"; case EXIT_REASON_WBINVD: return "wbinvd"; case EXIT_REASON_XSETBV: return "xsetbv"; case EXIT_REASON_APIC_WRITE: return "apic-write"; default: snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); return (reasonbuf); } } #endif /* KTR */ static int vmx_allow_x2apic_msrs(struct vmx *vmx) { int i, error; error = 0; /* * Allow readonly access to the following x2APIC MSRs from the guest. 
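 * Write accesses to these MSRs are still intercepted; only the TPR, EOI
 * and SELF_IPI MSRs are opened up read/write via guest_msr_rw() below.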
*/ error += guest_msr_ro(vmx, MSR_APIC_ID); error += guest_msr_ro(vmx, MSR_APIC_VERSION); error += guest_msr_ro(vmx, MSR_APIC_LDR); error += guest_msr_ro(vmx, MSR_APIC_SVR); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); error += guest_msr_ro(vmx, MSR_APIC_ESR); error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_ICR); /* * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. * * These registers get special treatment described in the section * "Virtualizing MSR-Based APIC Accesses". */ error += guest_msr_rw(vmx, MSR_APIC_TPR); error += guest_msr_rw(vmx, MSR_APIC_EOI); error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); return (error); } u_long vmx_fix_cr0(u_long cr0) { return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); } u_long vmx_fix_cr4(u_long cr4) { return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); } static void vpid_free(int vpid) { if (vpid < 0 || vpid > 0xffff) panic("vpid_free: invalid vpid %d", vpid); /* * VPIDs [0,VM_MAXCPU] are special and are not allocated from * the unit number allocator. */ if (vpid > VM_MAXCPU) free_unr(vpid_unr, vpid); } static void vpid_alloc(uint16_t *vpid, int num) { int i, x; if (num <= 0 || num > VM_MAXCPU) panic("invalid number of vpids requested: %d", num); /* * If the "enable vpid" execution control is not enabled then the * VPID is required to be 0 for all vcpus. */ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { for (i = 0; i < num; i++) vpid[i] = 0; return; } /* * Allocate a unique VPID for each vcpu from the unit number allocator. */ for (i = 0; i < num; i++) { x = alloc_unr(vpid_unr); if (x == -1) break; else vpid[i] = x; } if (i < num) { atomic_add_int(&vpid_alloc_failed, 1); /* * If the unit number allocator does not have enough unique * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. * * These VPIDs are not be unique across VMs but this does not * affect correctness because the combined mappings are also * tagged with the EP4TA which is unique for each VM. * * It is still sub-optimal because the invvpid will invalidate * combined mappings for a particular VPID across all EP4TAs. */ while (i-- > 0) vpid_free(vpid[i]); for (i = 0; i < num; i++) vpid[i] = i + 1; } } static void vpid_init(void) { /* * VPID 0 is required when the "enable VPID" execution control is * disabled. * * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the * unit number allocator does not have sufficient unique VPIDs to * satisfy the allocation. * * The remaining VPIDs are managed by the unit number allocator. */ vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); } static void vmx_disable(void *arg __unused) { struct invvpid_desc invvpid_desc = { 0 }; struct invept_desc invept_desc = { 0 }; if (vmxon_enabled[curcpu]) { /* * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. * * VMXON or VMXOFF are not required to invalidate any TLB * caching structures. This prevents potential retention of * cached information in the TLB between distinct VMX episodes. 
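 * This is why the all-context INVVPID and INVEPT below are issued
 * explicitly before executing VMXOFF.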
*/ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); vmxoff(); } load_cr4(rcr4() & ~CR4_VMXE); } static int vmx_cleanup(void) { if (pirvec >= 0) lapic_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); vpid_unr = NULL; } if (nmi_flush_l1d_sw == 1) nmi_flush_l1d_sw = 0; smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); } static void vmx_enable(void *arg __unused) { int error; uint64_t feature_control; feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | IA32_FEATURE_CONTROL_VMX_EN | IA32_FEATURE_CONTROL_LOCK); } load_cr4(rcr4() | CR4_VMXE); *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); error = vmxon(vmxon_region[curcpu]); if (error == 0) vmxon_enabled[curcpu] = 1; } static void vmx_restore(void) { if (vmxon_enabled[curcpu]) vmxon(vmxon_region[curcpu]); } static int vmx_init(int ipinum) { int error; uint64_t basic, fixed0, fixed1, feature_control; uint32_t tmp, procbased2_vid_bits; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } /* * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits * are set (bits 0 and 2 respectively). */ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { printf("vmx_init: VMX operation disabled by BIOS\n"); return (ENXIO); } /* * Verify capabilities MSR_VMX_BASIC: * - bit 54 indicates support for INS/OUTS decoding */ basic = rdmsr(MSR_VMX_BASIC); if ((basic & (1UL << 54)) == 0) { printf("vmx_init: processor does not support desired basic " "capabilities\n"); return (EINVAL); } /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); if (error) { printf("vmx_init: processor does not support desired primary " "processor-based controls\n"); return (error); } /* Clear the processor-based ctl bits that are set on demand */ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; /* Check support for secondary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); if (error) { printf("vmx_init: processor does not support desired secondary " "processor-based controls\n"); return (error); } /* Check support for VPID */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_VPID, 0, &tmp); if (error == 0) procbased_ctls2 |= PROCBASED2_ENABLE_VPID; /* Check support for pin-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); if (error) { printf("vmx_init: processor does not support desired " "pin-based controls\n"); return (error); } /* Check support for VM-exit controls */ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &exit_ctls); if (error) { printf("vmx_init: processor does not support desired " "exit controls\n"); return (error); } /* Check support for VM-entry 
controls */ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &entry_ctls); if (error) { printf("vmx_init: processor does not support desired " "entry controls\n"); return (error); } /* * Check support for optional features by testing them * as individual bits */ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_HLT_EXITING, 0, &tmp) == 0); cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_PROCBASED_CTLS, PROCBASED_MTF, 0, &tmp) == 0); cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_PAUSE_EXITING, 0, &tmp) == 0); cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp) == 0); cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, &tmp) == 0); /* * Check support for TPR shadow. */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, &tmp); if (error == 0) { tpr_shadowing = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing", &tpr_shadowing); } if (tpr_shadowing) { procbased_ctls |= PROCBASED_USE_TPR_SHADOW; procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; } /* * Check support for virtual interrupt delivery. */ procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | PROCBASED2_VIRTUALIZE_X2APIC_MODE | PROCBASED2_APIC_REGISTER_VIRTUALIZATION | PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, procbased2_vid_bits, 0, &tmp); if (error == 0 && tpr_shadowing) { virtual_interrupt_delivery = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", &virtual_interrupt_delivery); } if (virtual_interrupt_delivery) { procbased_ctls |= PROCBASED_USE_TPR_SHADOW; procbased_ctls2 |= procbased2_vid_bits; procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; /* * Check for Posted Interrupts only if Virtual Interrupt * Delivery is enabled. */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, &tmp); if (error == 0) { pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (pirvec < 0) { if (bootverbose) { printf("vmx_init: unable to allocate " "posted interrupt vector\n"); } } else { posted_interrupts = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", &posted_interrupts); } } } if (posted_interrupts) pinbased_ctls |= PINBASED_POSTED_INTERRUPT; /* Initialize EPT */ error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); } guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); /* * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when * available. Otherwise fall back to the software flush * method which loads enough data from the kernel text to * flush existing L1D content, both on VMX entry and on NMI * return. 
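 *
 * Both knobs are loader tunables: hw.vmm.l1d_flush (fetched above) and
 * hw.vmm.l1d_flush_sw (fetched below).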
*/ if (guest_l1d_flush) { if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { guest_l1d_flush_sw = 1; TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", &guest_l1d_flush_sw); } if (guest_l1d_flush_sw) { if (nmi_flush_l1d_sw <= 1) nmi_flush_l1d_sw = 1; } else { msr_load_list[0].index = MSR_IA32_FLUSH_CMD; msr_load_list[0].val = IA32_FLUSH_CMD_L1D; } } /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); cr0_ones_mask = fixed0 & fixed1; cr0_zeros_mask = ~fixed0 & ~fixed1; /* * CR0_PE and CR0_PG can be set to zero in VMX non-root operation * if unrestricted guest execution is allowed. */ if (cap_unrestricted_guest) cr0_ones_mask &= ~(CR0_PG | CR0_PE); /* * Do not allow the guest to set CR0_NW or CR0_CD. */ cr0_zeros_mask |= (CR0_NW | CR0_CD); fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; vpid_init(); vmx_msr_init(); /* enable VMX operation */ smp_rendezvous(NULL, vmx_enable, NULL, NULL); vmx_initialized = 1; return (0); } static void vmx_trigger_hostintr(int vector) { uintptr_t func; struct gate_descriptor *gd; gd = &idt[vector]; KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " "invalid vector %d", vector)); KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", vector)); KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " "has invalid type %d", vector, gd->gd_type)); KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " "has invalid dpl %d", vector, gd->gd_dpl)); KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " "for vector %d has invalid selector %d", vector, gd->gd_selector)); KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " "IST %d", vector, gd->gd_ist)); func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); vmx_call_isr(func); } static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { int error, mask_ident, shadow_ident; uint64_t mask_value; if (which != 0 && which != 4) panic("vmx_setup_cr_shadow: unknown cr%d", which); if (which == 0) { mask_ident = VMCS_CR0_MASK; mask_value = cr0_ones_mask | cr0_zeros_mask; shadow_ident = VMCS_CR0_SHADOW; } else { mask_ident = VMCS_CR4_MASK; mask_value = cr4_ones_mask | cr4_zeros_mask; shadow_ident = VMCS_CR4_SHADOW; } error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); if (error) return (error); error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); if (error) return (error); return (0); } #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; int i, error; struct vmx *vmx; struct vmcs *vmcs; uint32_t exc_bitmap; uint16_t maxcpus; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { panic("malloc of struct vmx not aligned on %d byte boundary", PAGE_SIZE); } vmx->vm = vm; vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); /* * Clean up EPTP-tagged guest physical and combined mappings * * VMX transitions are not required to invalidate any guest physical * mappings. So, it may be possible for stale guest physical mappings * to be present in the processor TLBs. * * Combined mappings for this EP4TA are also invalidated for all VPIDs. 
*/ ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); /* * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. * The guest FSBASE and GSBASE are saved and restored during * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are * always restored from the vmcs host state area on vm-exit. * * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in * how they are saved/restored so can be directly accessed by the * guest. * * MSR_EFER is saved and restored in the guest VMCS area on a * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * * The TSC MSR is exposed read-only. Writes are disallowed as * that will impact the host TSC. If the guest does a write * the "use TSC offsetting" execution control is enabled and the * difference between the host TSC and the guest TSC is written * into the TSC offset in the VMCS. */ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_EFER) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); vpid_alloc(vpid, VM_MAXCPU); if (virtual_interrupt_delivery) { error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, APIC_ACCESS_ADDRESS); /* XXX this should really return an error to the caller */ KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); } maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", error, i); } vmx_msr_guest_init(vmx, i); error = vmcs_init(vmcs); KASSERT(error == 0, ("vmcs_init error %d", error)); VMPTRLD(vmcs); error = 0; error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); error += vmwrite(VMCS_EPTP, vmx->eptp); error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); error += vmwrite(VMCS_VPID, vpid[i]); if (guest_l1d_flush && !guest_l1d_flush_sw) { vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( (vm_offset_t)&msr_load_list[0])); vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, nitems(msr_load_list)); vmcs_write(VMCS_EXIT_MSR_STORE, 0); vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); } /* exception bitmap */ if (vcpu_trace_exceptions(vm, i)) exc_bitmap = 0xffffffff; else exc_bitmap = 1 << IDT_MC; error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); if (tpr_shadowing) { error += vmwrite(VMCS_VIRTUAL_APIC, vtophys(&vmx->apic_page[i])); } if (virtual_interrupt_delivery) { error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); error += vmwrite(VMCS_EOI_EXIT0, 0); error += vmwrite(VMCS_EOI_EXIT1, 0); error += vmwrite(VMCS_EOI_EXIT2, 0); error += vmwrite(VMCS_EOI_EXIT3, 0); } if (posted_interrupts) { error += vmwrite(VMCS_PIR_VECTOR, pirvec); error += vmwrite(VMCS_PIR_DESC, vtophys(&vmx->pir_desc[i])); } VMCLEAR(vmcs); KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); vmx->cap[i].set = 0; vmx->cap[i].proc_ctls = procbased_ctls; vmx->cap[i].proc_ctls2 = procbased_ctls2; vmx->cap[i].exc_bitmap = exc_bitmap; vmx->state[i].nextrip = ~0; 
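/*
 * Starting 'lastcpu' at NOCPU makes the vcpu's first run look like a
 * cross-cpu migration to vmx_set_pcpu_defaults(), which then reloads the
 * per-cpu host TR/GDTR/GS bases into the VMCS and invalidates any stale
 * VPID-tagged TLB entries via vmx_invvpid().
 */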
vmx->state[i].lastcpu = NOCPU; vmx->state[i].vpid = vpid[i]; /* * Set up the CR0/4 shadows, and init the read shadow * to the power-on register value from the Intel Sys Arch. * CR0 - 0x60000010 * CR4 - 0 */ error = vmx_setup_cr0_shadow(vmcs, 0x60000010); if (error != 0) panic("vmx_setup_cr0_shadow %d", error); error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); vmx->ctx[i].pmap = pmap; } return (vmx); } static int vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { int handled, func; func = vmxctx->guest_rax; handled = x86_emulate_cpuid(vm, vcpu, (uint32_t*)(&vmxctx->guest_rax), (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), (uint32_t*)(&vmxctx->guest_rdx)); return (handled); } static __inline void vmx_run_trace(struct vmx *vmx, int vcpu) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); #endif } static __inline void vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, int handled) { #ifdef KTR VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", handled ? "handled" : "unhandled", exit_reason_to_str(exit_reason), rip); #endif } static __inline void vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); #endif } static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); /* * Invalidate guest mappings identified by its vpid from the TLB. */ static __inline void vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; if (vmxstate->vpid == 0) return; if (!running) { /* * Set the 'lastcpu' to an invalid host cpu. * * This will invalidate TLB entries tagged with the vcpu's * vpid the next time it runs via vmx_set_pcpu_defaults(). */ vmxstate->lastcpu = NOCPU; return; } KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " "critical section", __func__, vcpu)); /* * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated * mappings associated with its 'vpid' during that run. So we must * assume that the mappings associated with 'vpid' on 'curcpu' are * stale and invalidate them. * * Note that we incur this penalty only when the scheduler chooses to * move the thread associated with this vcpu between host cpus. * * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { invvpid_desc._res1 = 0; invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; invvpid_desc.linear_addr = 0; invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); } else { /* * The invvpid can be skipped if an invept is going to * be performed before entering the guest. The invept * will invalidate combined mappings tagged with * 'vmx->eptp' for all vpids. 
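 *
 * That is the case here: 'pmap->pm_eptgen' no longer matches the
 * generation cached in 'vmx->eptgen[curcpu]', so the EPT mappings have
 * changed since this cpu last entered the guest and an invept will be
 * performed on the way back in.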
*/ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } static void vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { struct vmxstate *vmxstate; vmxstate = &vmx->state[vcpu]; if (vmxstate->lastcpu == curcpu) return; vmxstate->lastcpu = curcpu; vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); vmx_invvpid(vmx, vcpu, pmap, 1); } /* * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. */ CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); static void __inline vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); } } static void __inline vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } static void __inline vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); } } static void __inline vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) { int error; if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); } error = vmwrite(VMCS_TSC_OFFSET, offset); - +#ifdef BHYVE_SNAPSHOT + if (error == 0) + error = vm_set_tsc_offset(vmx->vm, vcpu, offset); +#endif return (error); } #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) static void vmx_inject_nmi(struct vmx *vmx, int vcpu) { uint32_t gi, info; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " "interruptibility-state %#x", gi)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " "VM-entry interruption information %#x", info)); /* * Inject the virtual NMI. The vector must be the NMI IDT entry * or the VMCS entry check will fail. 
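 *
 * The interruption type must be NMI as well so that the processor
 * establishes virtual-NMI blocking once the injection is delivered.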
*/ info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; vmcs_write(VMCS_ENTRY_INTR_INFO, info); VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); } static void vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, uint64_t guestrip) { int vector, need_nmi_exiting, extint_pending; uint64_t rflags, entryinfo; uint32_t gi, info; if (vmx->state[vcpu].nextrip != guestrip) { gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vmx->state[vcpu].nextrip, guestrip); gi &= ~HWINTR_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } } if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " "pending exception: %#lx/%#x", __func__, entryinfo, info)); info = entryinfo; vector = info & 0xff; if (vector == IDT_BP || vector == IDT_OF) { /* * VT-x requires #BP and #OF to be injected as software * exceptions. */ info &= ~VMCS_INTR_T_MASK; info |= VMCS_INTR_T_SWEXCEPTION; } if (info & VMCS_INTR_DEL_ERRCODE) vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); vmcs_write(VMCS_ENTRY_INTR_INFO, info); } if (vm_nmi_pending(vmx->vm, vcpu)) { /* * If there are no conditions blocking NMI injection then * inject it directly here otherwise enable "NMI window * exiting" to inject it as soon as we can. * * We also check for STI_BLOCKING because some implementations * don't allow NMI injection in this case. If we are running * on a processor that doesn't have this restriction it will * immediately exit and the NMI will be injected in the * "NMI window exiting" handler. */ need_nmi_exiting = 1; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { info = vmcs_read(VMCS_ENTRY_INTR_INFO); if ((info & VMCS_INTR_VALID) == 0) { vmx_inject_nmi(vmx, vcpu); need_nmi_exiting = 0; } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " "due to VM-entry intr info %#x", info); } } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " "Guest Interruptibility-state %#x", gi); } if (need_nmi_exiting) vmx_set_nmi_window_exiting(vmx, vcpu); } extint_pending = vm_extint_pending(vmx->vm, vcpu); if (!extint_pending && virtual_interrupt_delivery) { vmx_inject_pir(vlapic); return; } /* * If interrupt-window exiting is already in effect then don't bother * checking for pending interrupts. This is just an optimization and * not needed for correctness. */ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " "pending int_window_exiting"); return; } if (!extint_pending) { /* Ask the local apic for a vector to inject */ if (!vlapic_pending_intr(vlapic, &vector)) return; /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [16,255] can be delivered * through the local APIC. */ KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(vmx->vm, &vector); /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [0,255] can be delivered * through the INTR pin. 
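 *
 * The emulated 8259 returns the vector base programmed by the guest
 * plus the IRQ number, so any value in [0,255] can show up here.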
*/ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* Check RFLAGS.IF and the interruptibility state of the guest */ rflags = vmcs_read(VMCS_GUEST_RFLAGS); if ((rflags & PSL_I) == 0) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, rflags); goto cantinject; } gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "Guest Interruptibility-state %#x", vector, gi); goto cantinject; } info = vmcs_read(VMCS_ENTRY_INTR_INFO); if (info & VMCS_INTR_VALID) { /* * This is expected and could happen for multiple reasons: * - A vectoring VM-entry was aborted due to astpending * - A VM-exit happened during event injection. * - An exception was injected above. * - An NMI was injected above or after "NMI window exiting" */ VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "VM-entry intr info %#x", vector, info); goto cantinject; } /* Inject the interrupt */ info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; info |= vector; vmcs_write(VMCS_ENTRY_INTR_INFO, info); if (!extint_pending) { /* Update the Local APIC ISR */ vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(vmx->vm, vcpu); vatpic_intr_accepted(vmx->vm, vector); /* * After we accepted the current ExtINT the PIC may * have posted another one. If that is the case, set * the Interrupt Window Exiting execution control so * we can inject that one too. * * Also, interrupt window exiting allows us to inject any * pending APIC vector that was preempted by the ExtINT * as soon as possible. This applies both for the software * emulated vlapic and the hardware assisted virtual APIC. */ vmx_set_int_window_exiting(vmx, vcpu); } VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); return; cantinject: /* * Set the Interrupt Window Exiting execution control so we can inject * the interrupt as soon as blocking condition goes away. */ vmx_set_int_window_exiting(vmx, vcpu); } /* * If the Virtual NMIs execution control is '1' then the logical processor * tracks virtual-NMI blocking in the Guest Interruptibility-state field of * the VMCS. An IRET instruction in VMX non-root operation will remove any * virtual-NMI blocking. * * This unblocking occurs even if the IRET causes a fault. In this case the * hypervisor needs to restore virtual-NMI blocking before resuming the guest. */ static void vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, ("NMI blocking is not in effect %#x", gi)); } static int vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { struct vmxctx *vmxctx; uint64_t xcrval; const struct xsave_limits *limits; vmxctx = &vmx->ctx[vcpu]; limits = vmm_get_xsave_limits(); /* * Note that the processor raises a GP# fault on its own if * xsetbv is executed for CPL != 0, so we do not have to * emulate that fault here. 
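 *
 * XSETBV causes an unconditional VM-exit from VMX non-root operation,
 * so every guest execution of it that passes the CPL check lands here.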
*/ /* Only xcr0 is supported. */ if (vmxctx->guest_rcx != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { vm_inject_ud(vmx->vm, vcpu); return (HANDLED); } xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); if ((xcrval & ~limits->xcr0_allowed) != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } if (!(xcrval & XFEATURE_ENABLED_X87)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* AVX (YMM_Hi128) requires SSE. */ if (xcrval & XFEATURE_ENABLED_AVX && (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, * ZMM_Hi256, and Hi16_ZMM. */ if (xcrval & XFEATURE_AVX512 && (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != (XFEATURE_AVX512 | XFEATURE_AVX)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * Intel MPX requires both bound register state flags to be * set. */ if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * This runs "inside" vmrun() with the guest's FPU state, so * modifying xcr0 directly modifies the guest's xcr0, not the * host's. */ load_xcr(0, xcrval); return (HANDLED); } static uint64_t vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) { const struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: return (vmxctx->guest_rax); case 1: return (vmxctx->guest_rcx); case 2: return (vmxctx->guest_rdx); case 3: return (vmxctx->guest_rbx); case 4: return (vmcs_read(VMCS_GUEST_RSP)); case 5: return (vmxctx->guest_rbp); case 6: return (vmxctx->guest_rsi); case 7: return (vmxctx->guest_rdi); case 8: return (vmxctx->guest_r8); case 9: return (vmxctx->guest_r9); case 10: return (vmxctx->guest_r10); case 11: return (vmxctx->guest_r11); case 12: return (vmxctx->guest_r12); case 13: return (vmxctx->guest_r13); case 14: return (vmxctx->guest_r14); case 15: return (vmxctx->guest_r15); default: panic("invalid vmx register %d", ident); } } static void vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) { struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: vmxctx->guest_rax = regval; break; case 1: vmxctx->guest_rcx = regval; break; case 2: vmxctx->guest_rdx = regval; break; case 3: vmxctx->guest_rbx = regval; break; case 4: vmcs_write(VMCS_GUEST_RSP, regval); break; case 5: vmxctx->guest_rbp = regval; break; case 6: vmxctx->guest_rsi = regval; break; case 7: vmxctx->guest_rdi = regval; break; case 8: vmxctx->guest_r8 = regval; break; case 9: vmxctx->guest_r9 = regval; break; case 10: vmxctx->guest_r10 = regval; break; case 11: vmxctx->guest_r11 = regval; break; case 12: vmxctx->guest_r12 = regval; break; case 13: vmxctx->guest_r13 = regval; break; case 14: vmxctx->guest_r14 = regval; break; case 15: vmxctx->guest_r15 = regval; break; default: panic("invalid vmx register %d", ident); } } static int vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr0 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR0_SHADOW, regval); crval = regval | cr0_ones_mask; crval &= ~cr0_zeros_mask; vmcs_write(VMCS_GUEST_CR0, crval); if (regval & CR0_PG) { uint64_t efer, entry_ctls; /* * If CR0.PG is 1 and EFER.LME is 1 
then EFER.LMA and * the "IA-32e mode guest" bit in VM-entry control must be * equal. */ efer = vmcs_read(VMCS_GUEST_IA32_EFER); if (efer & EFER_LME) { efer |= EFER_LMA; vmcs_write(VMCS_GUEST_IA32_EFER, efer); entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); entry_ctls |= VM_ENTRY_GUEST_LMA; vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); } } return (HANDLED); } static int vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr4 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR4_SHADOW, regval); crval = regval | cr4_ones_mask; crval &= ~cr4_zeros_mask; vmcs_write(VMCS_GUEST_CR4, crval); return (HANDLED); } static int vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { struct vlapic *vlapic; uint64_t cr8; int regnum; /* We only handle mov %cr8 to/from a register at this time. */ if ((exitqual & 0xe0) != 0x00) { return (UNHANDLED); } vlapic = vm_lapic(vmx->vm, vcpu); regnum = (exitqual >> 8) & 0xf; if (exitqual & 0x10) { cr8 = vlapic_get_cr8(vlapic); vmx_set_guest_reg(vmx, vcpu, regnum, cr8); } else { cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); vlapic_set_cr8(vlapic, cr8); } return (HANDLED); } /* * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL */ static int vmx_cpl(void) { uint32_t ssar; ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); return ((ssar >> 5) & 0x3); } static enum vm_cpu_mode vmx_cpu_mode(void) { uint32_t csar; if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); if (csar & 0x2000) return (CPU_MODE_64BIT); /* CS.L = 1 */ else return (CPU_MODE_COMPATIBILITY); } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode vmx_paging_mode(void) { if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) return (PAGING_MODE_FLAT); if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) return (PAGING_MODE_32); if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } static uint64_t inout_str_index(struct vmx *vmx, int vcpuid, int in) { uint64_t val; int error; enum vm_reg_name reg; reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; error = vmx_getreg(vmx, vcpuid, reg, &val); KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); return (val); } static uint64_t inout_str_count(struct vmx *vmx, int vcpuid, int rep) { uint64_t val; int error; if (rep) { error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); } else { val = 1; } return (val); } static int inout_str_addrsize(uint32_t inst_info) { uint32_t size; size = (inst_info >> 7) & 0x7; switch (size) { case 0: return (2); /* 16 bit */ case 1: return (4); /* 32 bit */ case 2: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { s = (inst_info >> 15) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); } static void vmx_paging_info(struct vm_guest_paging *paging) { paging->cr3 = vmcs_guest_cr3(); paging->cpl = vmx_cpl(); paging->cpu_mode = vmx_cpu_mode(); paging->paging_mode = vmx_paging_mode(); } static void vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) { struct vm_guest_paging *paging; uint32_t csar; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->inst_length = 0; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); switch (paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } vie_init(&vmexit->u.inst_emul.vie, NULL, 0); } static int ept_fault_type(uint64_t ept_qual) { int fault_type; if (ept_qual & EPT_VIOLATION_DATA_WRITE) fault_type = VM_PROT_WRITE; else if (ept_qual & EPT_VIOLATION_INST_FETCH) fault_type = VM_PROT_EXECUTE; else fault_type= VM_PROT_READ; return (fault_type); } static bool ept_emulation_fault(uint64_t ept_qual) { int read, write; /* EPT fault on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) return (false); /* EPT fault must be a read fault or a write fault */ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; if ((read | write) == 0) return (false); /* * The EPT violation must have been caused by accessing a * guest-physical address that is a translation of a guest-linear * address. */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { return (false); } return (true); } static __inline int apic_access_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); } static __inline int x2apic_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 
1 : 0); } static int vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, uint64_t qual) { int error, handled, offset; uint32_t *apic_regs, vector; bool retu; handled = HANDLED; offset = APIC_WRITE_OFFSET(qual); if (!apic_access_virtualization(vmx, vcpuid)) { /* * In general there should not be any APIC write VM-exits * unless APIC-access virtualization is enabled. * * However self-IPI virtualization can legitimately trigger * an APIC-write VM-exit so treat it specially. */ if (x2apic_virtualization(vmx, vcpuid) && offset == APIC_OFFSET_SELF_IPI) { apic_regs = (uint32_t *)(vlapic->apic_page); vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; vlapic_self_ipi_handler(vlapic, vector); return (HANDLED); } else return (UNHANDLED); } switch (offset) { case APIC_OFFSET_ID: vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_LDR: vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: retu = false; error = vlapic_icrlo_write_handler(vlapic, &retu); if (error != 0 || retu) handled = UNHANDLED; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: vlapic_dcr_write_handler(vlapic); break; default: handled = UNHANDLED; break; } return (handled); } static bool apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) { if (apic_access_virtualization(vmx, vcpuid) && (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) return (true); else return (false); } static int vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint64_t qual; int access_type, offset, allowed; if (!apic_access_virtualization(vmx, vcpuid)) return (UNHANDLED); qual = vmexit->u.vmx.exit_qualification; access_type = APIC_ACCESS_TYPE(qual); offset = APIC_ACCESS_OFFSET(qual); allowed = 0; if (access_type == 0) { /* * Read data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } else if (access_type == 1) { /* * Write data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } if (allowed) { vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, VIE_INVALID_GLA); } /* * Regardless of whether the APIC-access is allowed this handler * always returns UNHANDLED: * - if the access is allowed then it is handled by emulating the * instruction that caused the VM-exit (outside the critical section) * - if the access is not allowed then it will be converted to an * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
*/ return (UNHANDLED); } static enum task_switch_reason vmx_task_switch_reason(uint64_t qual) { int reason; reason = (qual >> 30) & 0x3; switch (reason) { case 0: return (TSR_CALL); case 1: return (TSR_IRET); case 2: return (TSR_JMP); case 3: return (TSR_IDT_GATE); default: panic("%s: invalid reason %d", __func__, reason); } } static int emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); else error = vmx_wrmsr(vmx, vcpuid, num, val, retu); return (error); } static int emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) { struct vmxctx *vmxctx; uint64_t result; uint32_t eax, edx; int error; if (lapic_msr(num)) error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); else error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); if (error == 0) { eax = result; vmxctx = &vmx->ctx[vcpuid]; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); edx = result >> 32; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); } return (error); } static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { int error, errcode, errcode_valid, handled, in; struct vmxctx *vmxctx; struct vlapic *vlapic; struct vm_inout_str *vis; struct vm_task_switch *ts; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; uint32_t intr_type, intr_vec, reason; uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); handled = UNHANDLED; vmxctx = &vmx->ctx[vcpu]; qual = vmexit->u.vmx.exit_qualification; reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); /* * VM-entry failures during or after loading guest state. * * These VM-exits are uncommon but must be handled specially * as most VM-exit fields are not populated as usual. */ if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); __asm __volatile("int $18"); return (1); } /* * VM exits that can be triggered during event delivery need to * be handled specially by re-injecting the event if the IDT * vectoring information field's valid bit is set. * * See "Information for VM Exits During Event Delivery" in Intel SDM * for details. */ idtvec_info = vmcs_idt_vectoring_info(); if (idtvec_info & VMCS_IDT_VEC_VALID) { idtvec_info &= ~(1 << 12); /* clear undefined bit */ exitintinfo = idtvec_info; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { idtvec_err = vmcs_idt_vectoring_err(); exitintinfo |= (uint64_t)idtvec_err << 32; } error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); KASSERT(error == 0, ("%s: vm_set_intinfo error %d", __func__, error)); /* * If 'virtual NMIs' are being used and the VM-exit * happened while injecting an NMI during the previous * VM-entry, then clear "blocking by NMI" in the * Guest Interruptibility-State so the NMI can be * reinjected on the subsequent VM-entry. * * However, if the NMI was being delivered through a task * gate, then the new task must start execution with NMIs * blocked so don't clear NMI blocking in this case. 
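 *
 * (vmx_assert_nmi_blocking() below verifies that invariant for the
 * task switch case.)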
*/ intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type == VMCS_INTR_T_NMI) { if (reason != EXIT_REASON_TASK_SWITCH) vmx_clear_nmi_blocking(vmx, vcpu); else vmx_assert_nmi_blocking(vmx, vcpu); } /* * Update VM-entry instruction length if the event being * delivered was a software interrupt or software exception. */ if (intr_type == VMCS_INTR_T_SWINTR || intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || intr_type == VMCS_INTR_T_SWEXCEPTION) { vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); } } switch (reason) { case EXIT_REASON_TASK_SWITCH: ts = &vmexit->u.task_switch; ts->tsssel = qual & 0xffff; ts->reason = vmx_task_switch_reason(qual); ts->ext = 0; ts->errcode_valid = 0; vmx_paging_info(&ts->paging); /* * If the task switch was due to a CALL, JMP, IRET, software * interrupt (INT n) or software exception (INT3, INTO), * then the saved %rip references the instruction that caused * the task switch. The instruction length field in the VMCS * is valid in this case. * * In all other cases (e.g., NMI, hardware exception) the * saved %rip is one that would have been saved in the old TSS * had the task switch completed normally so the instruction * length field is not needed in this case and is explicitly * set to 0. */ if (ts->reason == TSR_IDT_GATE) { KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, ("invalid idtvec_info %#x for IDT task switch", idtvec_info)); intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type != VMCS_INTR_T_SWINTR && intr_type != VMCS_INTR_T_SWEXCEPTION && intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { /* Task switch triggered by external event */ ts->ext = 1; vmexit->inst_length = 0; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { ts->errcode_valid = 1; ts->errcode = vmcs_idt_vectoring_err(); } } } vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " "%s errcode 0x%016lx", ts->reason, ts->tsssel, ts->ext ? 
"external" : "internal", ((uint64_t)ts->errcode << 32) | ts->errcode_valid); break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); switch (qual & 0xf) { case 0: handled = vmx_emulate_cr0_access(vmx, vcpu, qual); break; case 4: handled = vmx_emulate_cr4_access(vmx, vcpu, qual); break; case 8: handled = vmx_emulate_cr8_access(vmx, vcpu, qual); break; } break; case EXIT_REASON_RDMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); retu = false; ecx = vmxctx->guest_rcx; VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); error = emulate_rdmsr(vmx, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } break; case EXIT_REASON_WRMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); retu = false; eax = vmxctx->guest_rax; ecx = vmxctx->guest_rcx; edx = vmxctx->guest_rdx; VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", ecx, (uint64_t)edx << 32 | eax); SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, (uint64_t)edx << 32 | eax); error = emulate_wrmsr(vmx, vcpu, ecx, (uint64_t)edx << 32 | eax, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); if (virtual_interrupt_delivery) vmexit->u.hlt.intr_status = vmcs_read(VMCS_GUEST_INTR_STATUS); else vmexit->u.hlt.intr_status = 0; break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MTRAP; vmexit->inst_length = 0; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_PAUSE; break; case EXIT_REASON_INTR_WINDOW: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); vmx_clear_int_window_exiting(vmx, vcpu); return (1); case EXIT_REASON_EXT_INTR: /* * External interrupts serve only to cause VM exits and allow * the host interrupt handler to run. * * If this external interrupt triggers a virtual interrupt * to a VM, then that state will be recorded by the * host interrupt handler in the VM's softc. We will inject * this virtual interrupt during the subsequent VM enter. */ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); SDT_PROBE4(vmm, vmx, exit, interrupt, vmx, vcpu, vmexit, intr_info); /* * XXX: Ignore this exit if VMCS_INTR_VALID is not set. * This appears to be a bug in VMware Fusion? */ if (!(intr_info & VMCS_INTR_VALID)) return (1); KASSERT((intr_info & VMCS_INTR_VALID) != 0 && (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, ("VM exit interruption info invalid: %#x", intr_info)); vmx_trigger_hostintr(intr_info & 0xff); /* * This is special. 
We want to treat this as an 'handled' * VM-exit but not increment the instruction pointer. */ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); return (1); case EXIT_REASON_NMI_WINDOW: SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); /* Exit to allow the pending virtual NMI to be injected */ if (vm_nmi_pending(vmx->vm, vcpu)) vmx_inject_nmi(vmx, vcpu); vmx_clear_nmi_window_exiting(vmx, vcpu); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); return (1); case EXIT_REASON_INOUT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.bytes = (qual & 0x7) + 1; vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; vmexit->u.inout.port = (uint16_t)(qual >> 16); vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); if (vmexit->u.inout.string) { inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; vmx_paging_info(&vis->paging); vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); vis->cr0 = vmcs_read(VMCS_GUEST_CR0); vis->index = inout_str_index(vmx, vcpu, in); vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); vis->addrsize = inout_str_addrsize(inst_info); inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); break; case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); intr_vec = intr_info & 0xff; intr_type = intr_info & VMCS_INTR_T_MASK; /* * If Virtual NMIs control is 1 and the VM-exit is due to a * fault encountered during the execution of IRET then we must * restore the state of "virtual-NMI blocking" before resuming * the guest. * * See "Resuming Guest Software after Handling an Exception". * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (intr_vec != IDT_DF) && (intr_info & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); /* * The NMI has already been handled in vmx_exit_handle_nmi(). */ if (intr_type == VMCS_INTR_T_NMI) return (1); /* * Call the machine check handler by hand. Also don't reflect * the machine check back into the guest. */ if (intr_vec == IDT_MC) { VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); return (1); } /* * If the hypervisor has requested user exits for * debug exceptions, bounce them out to userland. */ if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP && (vmx->cap[vcpu].set & (1 << VM_CAP_BPT_EXIT))) { vmexit->exitcode = VM_EXITCODE_BPT; vmexit->u.bpt.inst_length = vmexit->inst_length; vmexit->inst_length = 0; break; } if (intr_vec == IDT_PF) { error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", __func__, error)); } /* * Software exceptions exhibit trap-like behavior. This in * turn requires populating the VM-entry instruction length * so that the %rip in the trap frame is past the INT3/INTO * instruction. 
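 *
 * 'inst_length' here is the value captured from the VM-exit
 * instruction-length field in vmx_run().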
*/ if (intr_type == VMCS_INTR_T_SWEXCEPTION) vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); /* Reflect all other exceptions back into the guest */ errcode_valid = errcode = 0; if (intr_info & VMCS_INTR_DEL_ERRCODE) { errcode_valid = 1; errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); } VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " "the guest", intr_vec, errcode); SDT_PROBE5(vmm, vmx, exit, exception, vmx, vcpu, vmexit, intr_vec, errcode); error = vm_inject_exception(vmx->vm, vcpu, intr_vec, errcode_valid, errcode, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); return (1); case EXIT_REASON_EPT_FAULT: /* * If 'gpa' lies within the address space allocated to * memory then this must be a nested page fault otherwise * this must be an instruction that accesses MMIO space. */ gpa = vmcs_gpa(); if (vm_mem_allocated(vmx->vm, vcpu, gpa) || apic_access_fault(vmx, vcpu, gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->inst_length = 0; vmexit->u.paging.gpa = gpa; vmexit->u.paging.fault_type = ept_fault_type(qual); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); SDT_PROBE5(vmm, vmx, exit, nestedfault, vmx, vcpu, vmexit, gpa, qual); } else if (ept_emulation_fault(qual)) { vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); SDT_PROBE4(vmm, vmx, exit, mmiofault, vmx, vcpu, vmexit, gpa); } /* * If Virtual NMIs control is 1 and the VM-exit is due to an * EPT fault during the execution of IRET then we must restore * the state of "virtual-NMI blocking" before resuming. * * See description of "NMI unblocking due to IRET" in * "Exit Qualification for EPT Violations". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (qual & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); break; case EXIT_REASON_VIRTUALIZED_EOI: vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; vmexit->u.ioapic_eoi.vector = qual & 0xFF; SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); vmexit->inst_length = 0; /* trap-like */ break; case EXIT_REASON_APIC_ACCESS: SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); handled = vmx_handle_apic_access(vmx, vcpu, vmexit); break; case EXIT_REASON_APIC_WRITE: /* * APIC-write VM exit is trap-like so the %rip is already * pointing to the next instruction. 
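 *
 * Zeroing 'inst_length' below keeps the common "handled" path from
 * advancing %rip a second time.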
*/ vmexit->inst_length = 0; vlapic = vm_lapic(vmx->vm, vcpu); SDT_PROBE4(vmm, vmx, exit, apicwrite, vmx, vcpu, vmexit, vlapic); handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); break; case EXIT_REASON_XSETBV: SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); break; case EXIT_REASON_MONITOR: SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MONITOR; break; case EXIT_REASON_MWAIT: SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MWAIT; break; case EXIT_REASON_TPR: vlapic = vm_lapic(vmx->vm, vcpu); vlapic_sync_tpr(vlapic); vmexit->inst_length = 0; handled = HANDLED; break; case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMXOFF: case EXIT_REASON_VMXON: SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_VMINSN; break; default: SDT_PROBE4(vmm, vmx, exit, unknown, vmx, vcpu, vmexit, reason); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } if (handled) { /* * It is possible that control is returned to userland * even though we were able to handle the VM exit in the * kernel. * * In such a case we want to make sure that the userland * restarts guest execution at the instruction *after* * the one we just processed. Therefore we update the * guest rip in the VMCS and in 'vmexit'. */ vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; vmcs_write(VMCS_GUEST_RIP, vmexit->rip); } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic VMX exit. */ vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = VM_SUCCESS; vmexit->u.vmx.inst_type = 0; vmexit->u.vmx.inst_error = 0; } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } SDT_PROBE4(vmm, vmx, exit, return, vmx, vcpu, vmexit, handled); return (handled); } static __inline void vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) { KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, ("vmx_exit_inst_error: invalid inst_fail_status %d", vmxctx->inst_fail_status)); vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = vmxctx->inst_fail_status; vmexit->u.vmx.inst_error = vmcs_instruction_error(); vmexit->u.vmx.exit_reason = ~0; vmexit->u.vmx.exit_qualification = ~0; switch (rc) { case VMX_VMRESUME_ERROR: case VMX_VMLAUNCH_ERROR: case VMX_INVEPT_ERROR: vmexit->u.vmx.inst_type = rc; break; default: panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); } } /* * If the NMI-exiting VM execution control is set to '1' then an NMI in * non-root operation causes a VM-exit. NMI blocking is in effect so it is * sufficient to simply vector to the NMI handler via a software interrupt. * However, this must be done before maskable interrupts are enabled * otherwise the "iret" issued by an interrupt handler will incorrectly * clear NMI blocking. 
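 *
 * Vectoring with "int $2" runs the host NMI handler directly; the
 * handler's own "iret" then removes the NMI blocking that the VM-exit
 * established.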
*/ static __inline void vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint32_t intr_info; KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) return; intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " "to NMI has invalid vector: %#x", intr_info)); VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); __asm __volatile("int $2"); } } static __inline void vmx_dr_enter_guest(struct vmxctx *vmxctx) { register_t rflags; /* Save host control debug registers. */ vmxctx->host_dr7 = rdr7(); vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); /* * Disable debugging in DR7 and DEBUGCTL to avoid triggering * exceptions in the host based on the guest DRx values. The * guest DR7 and DEBUGCTL are saved/restored in the VMCS. */ load_dr7(0); wrmsr(MSR_DEBUGCTLMSR, 0); /* * Disable single stepping the kernel to avoid corrupting the * guest DR6. A debugger might still be able to corrupt the * guest DR6 by setting a breakpoint after this point and then * single stepping. */ rflags = read_rflags(); vmxctx->host_tf = rflags & PSL_T; write_rflags(rflags & ~PSL_T); /* Save host debug registers. */ vmxctx->host_dr0 = rdr0(); vmxctx->host_dr1 = rdr1(); vmxctx->host_dr2 = rdr2(); vmxctx->host_dr3 = rdr3(); vmxctx->host_dr6 = rdr6(); /* Restore guest debug registers. */ load_dr0(vmxctx->guest_dr0); load_dr1(vmxctx->guest_dr1); load_dr2(vmxctx->guest_dr2); load_dr3(vmxctx->guest_dr3); load_dr6(vmxctx->guest_dr6); } static __inline void vmx_dr_leave_guest(struct vmxctx *vmxctx) { /* Save guest debug registers. */ vmxctx->guest_dr0 = rdr0(); vmxctx->guest_dr1 = rdr1(); vmxctx->guest_dr2 = rdr2(); vmxctx->guest_dr3 = rdr3(); vmxctx->guest_dr6 = rdr6(); /* * Restore host debug registers. Restore DR7, DEBUGCTL, and * PSL_T last. */ load_dr0(vmxctx->host_dr0); load_dr1(vmxctx->host_dr1); load_dr2(vmxctx->host_dr2); load_dr3(vmxctx->host_dr3); load_dr6(vmxctx->host_dr6); wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); load_dr7(vmxctx->host_dr7); write_rflags(read_rflags() | vmxctx->host_tf); } static int vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { int rc, handled, launched; struct vmx *vmx; struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; uint32_t exit_reason; struct region_descriptor gdtr, idtr; uint16_t ldt_sel; vmx = arg; vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); vmexit = vm_exitinfo(vm, vcpu); launched = 0; KASSERT(vmxctx->pmap == pmap, ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); vmx_msr_guest_enter(vmx, vcpu); VMPTRLD(vmcs); /* * XXX * We do this every time because we may setup the virtual machine * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, rip); vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); handled = UNHANDLED; /* * Interrupts are disabled from this point on until the * guest starts executing. 
This is done for the following * reasons: * * If an AST is asserted on this thread after the check below, * then the IPI_AST notification will not be lost, because it * will cause a VM exit due to external interrupt as soon as * the guest state is loaded. * * A posted interrupt after 'vmx_inject_interrupts()' will * not be "lost" because it will be held pending in the host * APIC because interrupts are disabled. The pending interrupt * will be recognized as soon as the guest state is loaded. * * The same reasoning applies to the IPI generated by * pmap_invalidate_ept(). */ disable_intr(); vmx_inject_interrupts(vmx, vcpu, vlapic, rip); /* * Check for vcpu suspension after injecting events because * vmx_inject_interrupts() can suspend the vcpu due to a * triple fault. */ if (vcpu_suspended(evinfo)) { enable_intr(); vm_exit_suspended(vmx->vm, vcpu, rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_intr(); vm_exit_rendezvous(vmx->vm, vcpu, rip); break; } if (vcpu_reqidle(evinfo)) { enable_intr(); vm_exit_reqidle(vmx->vm, vcpu, rip); break; } if (vcpu_should_yield(vm, vcpu)) { enable_intr(); vm_exit_astpending(vmx->vm, vcpu, rip); vmx_astpending_trace(vmx, vcpu, rip); handled = HANDLED; break; } if (vcpu_debugged(vm, vcpu)) { enable_intr(); vm_exit_debug(vmx->vm, vcpu, rip); break; } /* * If TPR Shadowing is enabled, the TPR Threshold * must be updated right before entering the guest. */ if (tpr_shadowing && !virtual_interrupt_delivery) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) { vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic)); } } /* * VM exits restore the base address but not the * limits of GDTR and IDTR. The VMCS only stores the * base address, so VM exits set the limits to 0xffff. * Save and restore the full GDTR and IDTR to restore * the limits. * * The VMCS does not save the LDTR at all, and VM * exits clear LDTR as if a NULL selector were loaded. * The userspace hypervisor probably doesn't use a * LDT, but save and restore it to be safe. 
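 *
 * All three are restored immediately after vmx_enter_guest() returns,
 * while interrupts are still disabled.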
*/ sgdt(&gdtr); sidt(&idtr); ldt_sel = sldt(); vmx_run_trace(vmx, vcpu); vmx_dr_enter_guest(vmxctx); rc = vmx_enter_guest(vmxctx, vmx, launched); vmx_dr_leave_guest(vmxctx); bare_lgdt(&gdtr); lidt(&idtr); lldt(ldt_sel); /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); /* Update 'nextrip' */ vmx->state[vcpu].nextrip = rip; if (rc == VMX_GUEST_VMEXIT) { vmx_exit_handle_nmi(vmx, vcpu, vmexit); enable_intr(); handled = vmx_exit_process(vmx, vcpu, vmexit); } else { enable_intr(); vmx_exit_inst_error(vmxctx, rc, vmexit); } launched = 1; vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); rip = vmexit->rip; } while (handled); /* * If a VM exit has been handled then the exitcode must be BOGUS * If a VM exit is not handled then the exitcode must not be BOGUS */ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { panic("Mismatch between handled (%d) and exitcode (%d)", handled, vmexit->exitcode); } if (!handled) vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); vmx_msr_guest_exit(vmx, vcpu); return (0); } static void vmx_vmcleanup(void *arg) { int i; struct vmx *vmx = arg; uint16_t maxcpus; if (apic_access_virtualization(vmx, 0)) vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); maxcpus = vm_get_maxcpus(vmx->vm); for (i = 0; i < maxcpus; i++) vpid_free(vmx->state[i].vpid); free(vmx, M_VMX); return; } static register_t * vmxctx_regptr(struct vmxctx *vmxctx, int reg) { switch (reg) { case VM_REG_GUEST_RAX: return (&vmxctx->guest_rax); case VM_REG_GUEST_RBX: return (&vmxctx->guest_rbx); case VM_REG_GUEST_RCX: return (&vmxctx->guest_rcx); case VM_REG_GUEST_RDX: return (&vmxctx->guest_rdx); case VM_REG_GUEST_RSI: return (&vmxctx->guest_rsi); case VM_REG_GUEST_RDI: return (&vmxctx->guest_rdi); case VM_REG_GUEST_RBP: return (&vmxctx->guest_rbp); case VM_REG_GUEST_R8: return (&vmxctx->guest_r8); case VM_REG_GUEST_R9: return (&vmxctx->guest_r9); case VM_REG_GUEST_R10: return (&vmxctx->guest_r10); case VM_REG_GUEST_R11: return (&vmxctx->guest_r11); case VM_REG_GUEST_R12: return (&vmxctx->guest_r12); case VM_REG_GUEST_R13: return (&vmxctx->guest_r13); case VM_REG_GUEST_R14: return (&vmxctx->guest_r14); case VM_REG_GUEST_R15: return (&vmxctx->guest_r15); case VM_REG_GUEST_CR2: return (&vmxctx->guest_cr2); case VM_REG_GUEST_DR0: return (&vmxctx->guest_dr0); case VM_REG_GUEST_DR1: return (&vmxctx->guest_dr1); case VM_REG_GUEST_DR2: return (&vmxctx->guest_dr2); case VM_REG_GUEST_DR3: return (&vmxctx->guest_dr3); case VM_REG_GUEST_DR6: return (&vmxctx->guest_dr6); default: break; } return (NULL); } static int vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *retval = *regp; return (0); } else return (EINVAL); } static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *regp = val; return (0); } else return (EINVAL); } static int vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) { uint64_t gi; int error; error = vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); *retval = (gi & HWINTR_BLOCKING) ? 
1 : 0; return (error); } static int vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) { struct vmcs *vmcs; uint64_t gi; int error, ident; /* * Forcing the vcpu into an interrupt shadow is not supported. */ if (val) { error = EINVAL; goto done; } vmcs = &vmx->vmcs[vcpu]; ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); error = vmcs_getreg(vmcs, running, ident, &gi); if (error == 0) { gi &= ~HWINTR_BLOCKING; error = vmcs_setreg(vmcs, running, ident, gi); } done: VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, error ? "failed" : "succeeded"); return (error); } static int vmx_shadow_reg(int reg) { int shreg; shreg = -1; switch (reg) { case VM_REG_GUEST_CR0: shreg = VMCS_CR0_SHADOW; break; case VM_REG_GUEST_CR4: shreg = VMCS_CR4_SHADOW; break; default: break; } return (shreg); } static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) { int running, hostcpu; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) return (0); return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); } static int vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) return (0); error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); if (error == 0) { /* * If the "load EFER" VM-entry control is 1 then the * value of EFER.LMA must be identical to "IA-32e mode guest" * bit in the VM-entry control. */ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && (reg == VM_REG_GUEST_EFER)) { vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); if (val & EFER_LMA) ctls |= VM_ENTRY_GUEST_LMA; else ctls &= ~VM_ENTRY_GUEST_LMA; vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); } shadow = vmx_shadow_reg(reg); if (shadow > 0) { /* * Store the unmodified value in the shadow */ error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } if (reg == VM_REG_GUEST_CR3) { /* * Invalidate the guest vcpu's TLB mappings to emulate * the behavior of updating %cr3. * * XXX the processor retains global mappings when %cr3 * is updated but vmx_invvpid() does not. 
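 *
 * (vmx_invvpid() uses the single-context INVVPID type, which flushes
 * global translations too; the "retain globals" variant would be needed
 * to match bare-metal %cr3 behavior.)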
*/ pmap = vmx->ctx[vcpu].pmap; vmx_invvpid(vmx, vcpu, pmap, running); } } return (error); } static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_getcap(void *arg, int vcpu, int type, int *retval) { struct vmx *vmx = arg; int vcap; int ret; ret = ENOENT; vcap = vmx->cap[vcpu].set; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) ret = 0; break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) ret = 0; break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) ret = 0; break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) ret = 0; break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) ret = 0; break; case VM_CAP_BPT_EXIT: ret = 0; break; default: break; } if (ret == 0) *retval = (vcap & (1 << type)) ? 1 : 0; return (ret); } static int vmx_setcap(void *arg, int vcpu, int type, int val) { struct vmx *vmx = arg; struct vmcs *vmcs = &vmx->vmcs[vcpu]; uint32_t baseval; uint32_t *pptr; int error; int flag; int reg; int retval; retval = ENOENT; pptr = NULL; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_HLT_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_MTF; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_PAUSE_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_UNRESTRICTED_GUEST; reg = VMCS_SEC_PROC_BASED_CTLS; } break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_ENABLE_INVPCID; reg = VMCS_SEC_PROC_BASED_CTLS; } break; case VM_CAP_BPT_EXIT: retval = 0; /* Don't change the bitmap if we are tracing all exceptions. */ if (vmx->cap[vcpu].exc_bitmap != 0xffffffff) { pptr = &vmx->cap[vcpu].exc_bitmap; baseval = *pptr; flag = (1 << IDT_BP); reg = VMCS_EXCEPTION_BITMAP; } break; default: break; } if (retval) return (retval); if (pptr != NULL) { if (val) { baseval |= flag; } else { baseval &= ~flag; } VMPTRLD(vmcs); error = vmwrite(reg, baseval); VMCLEAR(vmcs); if (error) return (error); /* * Update optional stored flags, and record * setting */ *pptr = baseval; } if (val) { vmx->cap[vcpu].set |= (1 << type); } else { vmx->cap[vcpu].set &= ~(1 << type); } return (0); } struct vlapic_vtx { struct vlapic vlapic; struct pir_desc *pir_desc; struct vmx *vmx; u_int pending_prio; }; #define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ do { \ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ level ? 
"level" : "edge", vector); \ VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ } while (0) /* * vlapic->ops handlers that utilize the APICv hardware assist described in * Chapter 29 of the Intel SDM. */ static int vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; uint64_t mask; int idx, notify = 0; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; /* * Keep track of interrupt requests in the PIR descriptor. This is * because the virtual APIC page pointed to by the VMCS cannot be * modified if the vcpu is running. */ idx = vector / 64; mask = 1UL << (vector % 64); atomic_set_long(&pir_desc->pir[idx], mask); /* * A notification is required whenever the 'pending' bit makes a * transition from 0->1. * * Even if the 'pending' bit is already asserted, notification about * the incoming interrupt may still be necessary. For example, if a * vCPU is HLTed with a high PPR, a low priority interrupt would cause * the 0->1 'pending' transition with a notification, but the vCPU * would ignore the interrupt for the time being. The same vCPU would * need to then be notified if a high-priority interrupt arrived which * satisfied the PPR. * * The priorities of interrupts injected while 'pending' is asserted * are tracked in a custom bitfield 'pending_prio'. Should the * to-be-injected interrupt exceed the priorities already present, the * notification is sent. The priorities recorded in 'pending_prio' are * cleared whenever the 'pending' bit makes another 0->1 transition. */ if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { notify = 1; vlapic_vtx->pending_prio = 0; } else { const u_int old_prio = vlapic_vtx->pending_prio; const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); notify = 1; } } VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, level, "vmx_set_intr_ready"); return (notify); } static int vmx_pending_intr(struct vlapic *vlapic, int *vecptr) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t pending, pirval; uint32_t ppr, vpr; int i; /* * This function is only expected to be called from the 'HLT' exit * handler which does not care about the vector that is pending. */ KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; pending = atomic_load_acq_long(&pir_desc->pending); if (!pending) { /* * While a virtual interrupt may have already been * processed the actual delivery maybe pending the * interruptibility of the guest. Recognize a pending * interrupt by reevaluating virtual interrupts * following Section 29.2.1 in the Intel SDM Volume 3. 
*/ struct vm_exit *vmexit; uint8_t rvi, ppr; vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, ("vmx_pending_intr: exitcode not 'HLT'")); rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; lapic = vlapic->apic_page; ppr = lapic->ppr & APIC_TPR_INT; if (rvi > ppr) { return (1); } return (0); } /* * If there is an interrupt pending then it will be recognized only * if its priority is greater than the processor priority. * * Special case: if the processor priority is zero then any pending * interrupt will be recognized. */ lapic = vlapic->apic_page; ppr = lapic->ppr & APIC_TPR_INT; if (ppr == 0) return (1); VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", lapic->ppr); vpr = 0; for (i = 3; i >= 0; i--) { pirval = pir_desc->pir[i]; if (pirval != 0) { vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; break; } } /* * If the highest-priority pending interrupt falls short of the * processor priority of this vCPU, ensure that 'pending_prio' does not * have any stale bits which would preclude a higher-priority interrupt * from incurring a notification later. */ if (vpr <= ppr) { const u_int prio_bit = VPR_PRIO_BIT(vpr); const u_int old = vlapic_vtx->pending_prio; if (old > prio_bit && (old & prio_bit) == 0) { vlapic_vtx->pending_prio = prio_bit; } return (0); } return (1); } static void vmx_intr_accepted(struct vlapic *vlapic, int vector) { panic("vmx_intr_accepted: not expected to be called"); } static void vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct vmx *vmx; struct vmcs *vmcs; uint64_t mask, val; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), ("vmx_set_tmr: vcpu cannot be running")); vlapic_vtx = (struct vlapic_vtx *)vlapic; vmx = vlapic_vtx->vmx; vmcs = &vmx->vmcs[vlapic->vcpuid]; mask = 1UL << (vector % 64); VMPTRLD(vmcs); val = vmcs_read(VMCS_EOI_EXIT(vector)); if (level) val |= mask; else val &= ~mask; vmcs_write(VMCS_EOI_EXIT(vector), val); VMCLEAR(vmcs); } static void vmx_enable_x2apic_mode_ts(struct vlapic *vlapic) { struct vmx *vmx; struct vmcs *vmcs; uint32_t proc_ctls; int vcpuid; vcpuid = vlapic->vcpuid; vmx = ((struct vlapic_vtx *)vlapic)->vmx; vmcs = &vmx->vmcs[vcpuid]; proc_ctls = vmx->cap[vcpuid].proc_ctls; proc_ctls &= ~PROCBASED_USE_TPR_SHADOW; proc_ctls |= PROCBASED_CR8_LOAD_EXITING; proc_ctls |= PROCBASED_CR8_STORE_EXITING; vmx->cap[vcpuid].proc_ctls = proc_ctls; VMPTRLD(vmcs); vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls); VMCLEAR(vmcs); } static void vmx_enable_x2apic_mode_vid(struct vlapic *vlapic) { struct vmx *vmx; struct vmcs *vmcs; uint32_t proc_ctls2; int vcpuid, error; vcpuid = vlapic->vcpuid; vmx = ((struct vlapic_vtx *)vlapic)->vmx; vmcs = &vmx->vmcs[vcpuid]; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; VMPTRLD(vmcs); vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); VMCLEAR(vmcs); if (vlapic->vcpuid == 0) { /* * The nested page table mappings are shared by all vcpus * so unmap the APIC access page just once. 
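 *
 * (The access page is the single page at DEFAULT_APIC_BASE, so the
 * one PAGE_SIZE unmap below covers every vcpu's mappings.)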
*/ error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", __func__, error)); /* * The MSR bitmap is shared by all vcpus so modify it only * once in the context of vcpu 0. */ error = vmx_allow_x2apic_msrs(vmx); KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", __func__, error)); } } static void vmx_post_intr(struct vlapic *vlapic, int hostcpu) { ipi_cpu(hostcpu, pirvec); } /* * Transfer the pending interrupts in the PIR descriptor to the IRR * in the virtual APIC page. */ static void vmx_inject_pir(struct vlapic *vlapic) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t val, pirval; int rvi, pirbase = -1; uint16_t intr_status_old, intr_status_new; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "no posted interrupt pending"); return; } pirval = 0; pirbase = -1; lapic = vlapic->apic_page; val = atomic_readandclear_long(&pir_desc->pir[0]); if (val != 0) { lapic->irr0 |= val; lapic->irr1 |= val >> 32; pirbase = 0; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[1]); if (val != 0) { lapic->irr2 |= val; lapic->irr3 |= val >> 32; pirbase = 64; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[2]); if (val != 0) { lapic->irr4 |= val; lapic->irr5 |= val >> 32; pirbase = 128; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[3]); if (val != 0) { lapic->irr6 |= val; lapic->irr7 |= val >> 32; pirbase = 192; pirval = val; } VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); /* * Update RVI so the processor can evaluate pending virtual * interrupts on VM-entry. * * It is possible for pirval to be 0 here, even though the * pending bit has been set. The scenario is: * CPU-Y is sending a posted interrupt to CPU-X, which * is running a guest and processing posted interrupts in h/w. * CPU-X will eventually exit and the state seen in s/w is * the pending bit set, but no PIR bits set. 
* * CPU-X CPU-Y * (vm running) (host running) * rx posted interrupt * CLEAR pending bit * SET PIR bit * READ/CLEAR PIR bits * SET pending bit * (vm exit) * pending bit set, PIR 0 */ if (pirval != 0) { rvi = pirbase + flsl(pirval) - 1; intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); intr_status_new = (intr_status_old & 0xFF00) | rvi; if (intr_status_new > intr_status_old) { vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "guest_intr_status changed from 0x%04x to 0x%04x", intr_status_old, intr_status_new); } } } static struct vlapic * vmx_vlapic_init(void *arg, int vcpuid) { struct vmx *vmx; struct vlapic *vlapic; struct vlapic_vtx *vlapic_vtx; vmx = arg; vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vmx->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; vlapic_vtx = (struct vlapic_vtx *)vlapic; vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; vlapic_vtx->vmx = vmx; if (tpr_shadowing) { vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts; } if (virtual_interrupt_delivery) { vlapic->ops.set_intr_ready = vmx_set_intr_ready; vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid; } if (posted_interrupts) vlapic->ops.post_intr = vmx_post_intr; vlapic_init(vlapic); return (vlapic); } static void vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_VLAPIC); } +#ifdef BHYVE_SNAPSHOT +static int +vmx_snapshot_vmi(void *arg, struct vm_snapshot_meta *meta) +{ + struct vmx *vmx; + struct vmxctx *vmxctx; + int i; + int ret; + + vmx = arg; + + KASSERT(vmx != NULL, ("%s: arg was NULL", __func__)); + + for (i = 0; i < VM_MAXCPU; i++) { + SNAPSHOT_BUF_OR_LEAVE(vmx->guest_msrs[i], + sizeof(vmx->guest_msrs[i]), meta, ret, done); + + vmxctx = &vmx->ctx[i]; + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rsi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rcx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r8, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r9, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rax, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbp, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r10, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r11, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r12, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r13, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r14, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r15, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_cr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr6, meta, ret, done); + } + +done: + return (ret); +} + +static int +vmx_snapshot_vmcx(void *arg, struct vm_snapshot_meta *meta, int vcpu) +{ + struct vmcs *vmcs; + struct vmx *vmx; + int err, run, hostcpu; + + vmx = (struct vmx *)arg; + err = 0; + + KASSERT(arg != NULL, ("%s: arg was NULL", __func__)); + 
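+	/*
+	 * The snapshot is taken against this vcpu's VMCS; bail out below
+	 * if the vcpu is currently live on another host cpu.
+	 */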
vmcs = &vmx->vmcs[vcpu]; + + run = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (run && hostcpu != curcpu) { + printf("%s: %s%d is running", __func__, vm_name(vmx->vm), vcpu); + return (EINVAL); + } + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR0, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR3, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR4, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DR7, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RSP, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RIP, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RFLAGS, meta); + + /* Guest segments */ + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_ES, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_ES, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_CS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_SS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_SS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_DS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_FS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_FS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_GS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_TR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_TR, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_LDTR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_LDTR, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_EFER, meta); + + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_IDTR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GDTR, meta); + + /* Guest page tables */ + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE0, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE1, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE2, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE3, meta); + + /* Other guest state */ + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_CS, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_ESP, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_EIP, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_INTERRUPTIBILITY, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_ACTIVITY, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_ENTRY_CTLS, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_EXIT_CTLS, meta); + + return (err); +} + +static int +vmx_restore_tsc(void *arg, int vcpu, uint64_t offset) +{ + struct vmcs *vmcs; + struct vmx *vmx = (struct vmx *)arg; + int error, running, hostcpu; + + KASSERT(arg != NULL, ("%s: arg was NULL", __func__)); + vmcs = &vmx->vmcs[vcpu]; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) { + printf("%s: %s%d is running", __func__, vm_name(vmx->vm), vcpu); + return (EINVAL); + } + + if (!running) + VMPTRLD(vmcs); + + error = vmx_set_tsc_offset(vmx, vcpu, offset); + + if (!running) + VMCLEAR(vmcs); + return (error); +} +#endif + struct vmm_ops vmm_ops_intel = { .init = vmx_init, .cleanup = vmx_cleanup, .resume = vmx_restore, .vminit = vmx_vminit, .vmrun = vmx_run, .vmcleanup = vmx_vmcleanup, .vmgetreg = vmx_getreg, .vmsetreg = vmx_setreg, .vmgetdesc = vmx_getdesc, .vmsetdesc = 
vmx_setdesc, .vmgetcap = vmx_getcap, .vmsetcap = vmx_setcap, .vmspace_alloc = ept_vmspace_alloc, .vmspace_free = ept_vmspace_free, .vlapic_init = vmx_vlapic_init, .vlapic_cleanup = vmx_vlapic_cleanup, +#ifdef BHYVE_SNAPSHOT + .vmsnapshot = vmx_snapshot_vmi, + .vmcx_snapshot = vmx_snapshot_vmcx, + .vm_restore_tsc = vmx_restore_tsc, +#endif }; diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c index ba4cd7785e7d..1e053d26c182 100644 --- a/sys/amd64/vmm/io/vatpic.c +++ b/sys/amd64/vmm/io/vatpic.c @@ -1,810 +1,853 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include +#include #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vioapic.h" #include "vatpic.h" static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); #define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) #define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) #define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) enum irqstate { IRQSTATE_ASSERT, IRQSTATE_DEASSERT, IRQSTATE_PULSE }; struct atpic { bool ready; int icw_num; int rd_cmd_reg; bool aeoi; bool poll; bool rotate; bool sfn; /* special fully-nested mode */ int irq_base; uint8_t request; /* Interrupt Request Register (IIR) */ uint8_t service; /* Interrupt Service (ISR) */ uint8_t mask; /* Interrupt Mask Register (IMR) */ uint8_t smm; /* special mask mode */ int acnt[8]; /* sum of pin asserts and deasserts */ int lowprio; /* lowest priority irq */ bool intr_raised; }; struct vatpic { struct vm *vm; struct mtx mtx; struct atpic atpic[2]; uint8_t elc[2]; }; #define VATPIC_CTR0(vatpic, fmt) \ VM_CTR0((vatpic)->vm, fmt) #define VATPIC_CTR1(vatpic, fmt, a1) \ VM_CTR1((vatpic)->vm, fmt, a1) #define VATPIC_CTR2(vatpic, fmt, a1, a2) \ VM_CTR2((vatpic)->vm, fmt, a1, a2) #define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) #define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) /* * Loop over all the pins in priority order from highest to lowest. 
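 *
 * For example, if a rotating EOI has just set lowprio to 5, the scan
 * visits pins 6, 7, 0, 1, 2, 3, 4, 5 in that order; with the reset
 * value of 7 it is simply pin 0 through pin 7.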
*/ #define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ tmpvar < 8; \ tmpvar++, pinvar = (pinvar + 1) & 0x7) static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); static __inline bool master_atpic(struct vatpic *vatpic, struct atpic *atpic) { if (atpic == &vatpic->atpic[0]) return (true); else return (false); } static __inline int vatpic_get_highest_isrpin(struct atpic *atpic) { int bit, pin; int i; ATPIC_PIN_FOREACH(pin, atpic, i) { bit = (1 << pin); if (atpic->service & bit) { /* * An IS bit that is masked by an IMR bit will not be * cleared by a non-specific EOI in Special Mask Mode. */ if (atpic->smm && (atpic->mask & bit) != 0) continue; else return (pin); } } return (-1); } static __inline int vatpic_get_highest_irrpin(struct atpic *atpic) { int serviced; int bit, pin, tmp; /* * In 'Special Fully-Nested Mode' when an interrupt request from * a slave is in service, the slave is not locked out from the * master's priority logic. */ serviced = atpic->service; if (atpic->sfn) serviced &= ~(1 << 2); /* * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits * further interrupts at that level and enables interrupts from all * other levels that are not masked. In other words the ISR has no * bearing on the levels that can generate interrupts. */ if (atpic->smm) serviced = 0; ATPIC_PIN_FOREACH(pin, atpic, tmp) { bit = 1 << pin; /* * If there is already an interrupt in service at the same * or higher priority then bail. */ if ((serviced & bit) != 0) break; /* * If an interrupt is asserted and not masked then return * the corresponding 'pin' to the caller. */ if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) return (pin); } return (-1); } static void vatpic_notify_intr(struct vatpic *vatpic) { struct atpic *atpic; int pin; KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); /* * First check the slave. */ atpic = &vatpic->atpic[1]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * Cascade the request from the slave to the master. */ atpic->intr_raised = true; vatpic_set_pinstate(vatpic, 2, true); vatpic_set_pinstate(vatpic, 2, false); } else { VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } /* * Then check the master. */ atpic = &vatpic->atpic[0]; if (!atpic->intr_raised && (pin = vatpic_get_highest_irrpin(atpic)) != -1) { VATPIC_CTR4(vatpic, "atpic master notify pin = %d " "(imr 0x%x irr 0x%x isr 0x%x)", pin, atpic->mask, atpic->request, atpic->service); /* * From Section 3.6.2, "Interrupt Modes", in the * MPtable Specification, Version 1.4 * * PIC interrupts are routed to both the Local APIC * and the I/O APIC to support operation in 1 of 3 * modes. * * 1. Legacy PIC Mode: the PIC effectively bypasses * all APIC components. In this mode the local APIC is * disabled and LINT0 is reconfigured as INTR to * deliver the PIC interrupt directly to the CPU. * * 2. Virtual Wire Mode: the APIC is treated as a * virtual wire which delivers interrupts from the PIC * to the CPU. In this mode LINT0 is programmed as * ExtINT to indicate that the PIC is the source of * the interrupt. * * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are * fielded by the I/O APIC and delivered to the appropriate * CPU. 
In this mode the I/O APIC input 0 is programmed * as ExtINT to indicate that the PIC is the source of the * interrupt. */ atpic->intr_raised = true; lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); vioapic_pulse_irq(vatpic->vm, 0); } else { VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " "(imr 0x%x irr 0x%x isr 0x%x)", atpic->mask, atpic->request, atpic->service); } } static int vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); atpic->ready = false; atpic->icw_num = 1; atpic->request = 0; atpic->mask = 0; atpic->lowprio = 7; atpic->rd_cmd_reg = 0; atpic->poll = 0; atpic->smm = 0; if ((val & ICW1_SNGL) != 0) { VATPIC_CTR0(vatpic, "vatpic cascade mode required"); return (-1); } if ((val & ICW1_IC4) == 0) { VATPIC_CTR0(vatpic, "vatpic icw4 required"); return (-1); } atpic->icw_num++; return (0); } static int vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); atpic->irq_base = val & 0xf8; atpic->icw_num++; return (0); } static int vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); atpic->icw_num++; return (0); } static int vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); if ((val & ICW4_8086) == 0) { VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); return (-1); } if ((val & ICW4_AEOI) != 0) atpic->aeoi = true; if ((val & ICW4_SFNM) != 0) { if (master_atpic(vatpic, atpic)) { atpic->sfn = true; } else { VATPIC_CTR1(vatpic, "Ignoring special fully nested " "mode on slave atpic: %#x", val); } } atpic->icw_num = 0; atpic->ready = true; return (0); } static int vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); atpic->mask = val & 0xff; return (0); } static int vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); atpic->rotate = ((val & OCW2_R) != 0); if ((val & OCW2_EOI) != 0) { int isr_bit; if ((val & OCW2_SL) != 0) { /* specific EOI */ isr_bit = val & 0x7; } else { /* non-specific EOI */ isr_bit = vatpic_get_highest_isrpin(atpic); } if (isr_bit != -1) { atpic->service &= ~(1 << isr_bit); if (atpic->rotate) atpic->lowprio = isr_bit; } } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { /* specific priority */ atpic->lowprio = val & 0x7; } return (0); } static int vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) { VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); if (val & OCW3_ESMM) { atpic->smm = val & OCW3_SMM ? 1 : 0; VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", master_atpic(vatpic, atpic) ? "master" : "slave", atpic->smm ? 
"enabled" : "disabled"); } if (val & OCW3_RR) { /* read register command */ atpic->rd_cmd_reg = val & OCW3_RIS; /* Polling mode */ atpic->poll = ((val & OCW3_P) != 0); } return (0); } static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) { struct atpic *atpic; int oldcnt, newcnt; bool level; KASSERT(pin >= 0 && pin < 16, ("vatpic_set_pinstate: invalid pin number %d", pin)); KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_set_pinstate: vatpic is not locked")); atpic = &vatpic->atpic[pin >> 3]; oldcnt = atpic->acnt[pin & 0x7]; if (newstate) atpic->acnt[pin & 0x7]++; else atpic->acnt[pin & 0x7]--; newcnt = atpic->acnt[pin & 0x7]; if (newcnt < 0) { VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); } level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { /* rising edge or level */ VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); atpic->request |= (1 << (pin & 0x7)); } else if (oldcnt == 1 && newcnt == 0) { /* falling edge */ VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); if (level) atpic->request &= ~(1 << (pin & 0x7)); } else { VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", pin, newstate ? "asserted" : "deasserted", newcnt); } vatpic_notify_intr(vatpic); } static int vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) { struct vatpic *vatpic; struct atpic *atpic; if (irq < 0 || irq > 15) return (EINVAL); vatpic = vm_atpic(vm); atpic = &vatpic->atpic[irq >> 3]; if (atpic->ready == false) return (0); VATPIC_LOCK(vatpic); switch (irqstate) { case IRQSTATE_ASSERT: vatpic_set_pinstate(vatpic, irq, true); break; case IRQSTATE_DEASSERT: vatpic_set_pinstate(vatpic, irq, false); break; case IRQSTATE_PULSE: vatpic_set_pinstate(vatpic, irq, true); vatpic_set_pinstate(vatpic, irq, false); break; default: panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); } VATPIC_UNLOCK(vatpic); return (0); } int vatpic_assert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); } int vatpic_deassert_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); } int vatpic_pulse_irq(struct vm *vm, int irq) { return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) { struct vatpic *vatpic; if (irq < 0 || irq > 15) return (EINVAL); /* * See comment in vatpic_elc_handler. These IRQs must be * edge triggered. */ if (trigger == LEVEL_TRIGGER) { switch (irq) { case 0: case 1: case 2: case 8: case 13: return (EINVAL); } } vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); if (trigger == LEVEL_TRIGGER) vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); else vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); VATPIC_UNLOCK(vatpic); return (0); } void vatpic_pending_intr(struct vm *vm, int *vecptr) { struct vatpic *vatpic; struct atpic *atpic; int pin; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; VATPIC_LOCK(vatpic); pin = vatpic_get_highest_irrpin(atpic); if (pin == 2) { atpic = &vatpic->atpic[1]; pin = vatpic_get_highest_irrpin(atpic); } /* * If there are no pins active at this moment then return the spurious * interrupt vector instead. 
*/ if (pin == -1) pin = 7; KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); *vecptr = atpic->irq_base + pin; VATPIC_UNLOCK(vatpic); } static void vatpic_pin_accepted(struct atpic *atpic, int pin) { atpic->intr_raised = false; if (atpic->acnt[pin] == 0) atpic->request &= ~(1 << pin); if (atpic->aeoi == true) { if (atpic->rotate == true) atpic->lowprio = pin; } else { atpic->service |= (1 << pin); } } void vatpic_intr_accepted(struct vm *vm, int vector) { struct vatpic *vatpic; int pin; vatpic = vm_atpic(vm); VATPIC_LOCK(vatpic); pin = vector & 0x7; if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { vatpic_pin_accepted(&vatpic->atpic[1], pin); /* * If this vector originated from the slave, * accept the cascaded interrupt too. */ vatpic_pin_accepted(&vatpic->atpic[0], 2); } else { vatpic_pin_accepted(&vatpic->atpic[0], pin); } vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); } static int vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int pin; VATPIC_LOCK(vatpic); if (atpic->poll) { atpic->poll = 0; pin = vatpic_get_highest_irrpin(atpic); if (pin >= 0) { vatpic_pin_accepted(atpic, pin); *eax = 0x80 | pin; } else { *eax = 0; } } else { if (port & ICU_IMR_OFFSET) { /* read interrrupt mask register */ *eax = atpic->mask; } else { if (atpic->rd_cmd_reg == OCW3_RIS) { /* read interrupt service register */ *eax = atpic->service; } else { /* read interrupt request register */ *eax = atpic->request; } } } VATPIC_UNLOCK(vatpic); return (0); } static int vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, int bytes, uint32_t *eax) { int error; uint8_t val; error = 0; val = *eax; VATPIC_LOCK(vatpic); if (port & ICU_IMR_OFFSET) { switch (atpic->icw_num) { case 2: error = vatpic_icw2(vatpic, atpic, val); break; case 3: error = vatpic_icw3(vatpic, atpic, val); break; case 4: error = vatpic_icw4(vatpic, atpic, val); break; default: error = vatpic_ocw1(vatpic, atpic, val); break; } } else { if (val & (1 << 4)) error = vatpic_icw1(vatpic, atpic, val); if (atpic->ready) { if (val & (1 << 3)) error = vatpic_ocw3(vatpic, atpic, val); else error = vatpic_ocw2(vatpic, atpic, val); } } if (atpic->ready) vatpic_notify_intr(vatpic); VATPIC_UNLOCK(vatpic); return (error); } int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[0]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; vatpic = vm_atpic(vm); atpic = &vatpic->atpic[1]; if (bytes != 1) return (-1); if (in) { return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); } return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); } int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpic *vatpic; bool is_master; vatpic = vm_atpic(vm); is_master = (port == IO_ELCR1); if (bytes != 1) return (-1); VATPIC_LOCK(vatpic); if (in) { if (is_master) *eax = vatpic->elc[0]; else *eax = vatpic->elc[1]; } else { /* * For the master PIC the cascade channel (IRQ2), the * heart beat timer (IRQ0), and the keyboard * controller (IRQ1) cannot be programmed for level * mode. 
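 *
 * (This is what the 0xf8 write mask applied to ELCR1 below encodes:
 * bits 0-2 are forced to zero, i.e. edge triggered.)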
* * For the slave PIC the real time clock (IRQ8) and * the floating point error interrupt (IRQ13) cannot * be programmed for level mode. */ if (is_master) vatpic->elc[0] = (*eax & 0xf8); else vatpic->elc[1] = (*eax & 0xde); } VATPIC_UNLOCK(vatpic); return (0); } struct vatpic * vatpic_init(struct vm *vm) { struct vatpic *vatpic; vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); vatpic->vm = vm; mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); return (vatpic); } void vatpic_cleanup(struct vatpic *vatpic) { free(vatpic, M_VATPIC); } + +#ifdef BHYVE_SNAPSHOT +int +vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct atpic *atpic; + + for (i = 0; i < nitems(vatpic->atpic); i++) { + atpic = &vatpic->atpic[i]; + + SNAPSHOT_VAR_OR_LEAVE(atpic->ready, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->icw_num, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->rd_cmd_reg, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atpic->aeoi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->poll, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->rotate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->sfn, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->irq_base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->request, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->service, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->mask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->smm, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(atpic->acnt, sizeof(atpic->acnt), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->lowprio, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->intr_raised, meta, ret, done); + + } + + SNAPSHOT_BUF_OR_LEAVE(vatpic->elc, sizeof(vatpic->elc), + meta, ret, done); + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vatpic.h b/sys/amd64/vmm/io/vatpic.h index d4a1be18208d..8990a2a5fcb0 100644 --- a/sys/amd64/vmm/io/vatpic.h +++ b/sys/amd64/vmm/io/vatpic.h @@ -1,57 +1,63 @@ /*- * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _VATPIC_H_ #define _VATPIC_H_ #include #define ICU_IMR_OFFSET 1 #define IO_ELCR1 0x4d0 #define IO_ELCR2 0x4d1 +struct vm_snapshot_meta; + struct vatpic *vatpic_init(struct vm *vm); void vatpic_cleanup(struct vatpic *vatpic); int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); int vatpic_assert_irq(struct vm *vm, int irq); int vatpic_deassert_irq(struct vm *vm, int irq); int vatpic_pulse_irq(struct vm *vm, int irq); int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger); void vatpic_pending_intr(struct vm *vm, int *vecptr); void vatpic_intr_accepted(struct vm *vm, int vector); +#ifdef BHYVE_SNAPSHOT +int vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta); +#endif + #endif /* _VATPIC_H_ */ diff --git a/sys/amd64/vmm/io/vatpit.c b/sys/amd64/vmm/io/vatpit.c index 91d64af21233..4718a0557065 100644 --- a/sys/amd64/vmm/io/vatpit.c +++ b/sys/amd64/vmm/io/vatpit.c @@ -1,474 +1,516 @@ /*- * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include +#include #include "vmm_ktr.h" #include "vatpic.h" #include "vioapic.h" #include "vatpit.h" static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)"); #define VATPIT_LOCK(vatpit) mtx_lock_spin(&((vatpit)->mtx)) #define VATPIT_UNLOCK(vatpit) mtx_unlock_spin(&((vatpit)->mtx)) #define VATPIT_LOCKED(vatpit) mtx_owned(&((vatpit)->mtx)) #define TIMER_SEL_MASK 0xc0 #define TIMER_RW_MASK 0x30 #define TIMER_MODE_MASK 0x0f #define TIMER_SEL_READBACK 0xc0 #define TIMER_STS_OUT 0x80 #define TIMER_STS_NULLCNT 0x40 #define TIMER_RB_LCTR 0x20 #define TIMER_RB_LSTATUS 0x10 #define TIMER_RB_CTR_2 0x08 #define TIMER_RB_CTR_1 0x04 #define TIMER_RB_CTR_0 0x02 #define TMR2_OUT_STS 0x20 #define PIT_8254_FREQ 1193182 #define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) struct vatpit_callout_arg { struct vatpit *vatpit; int channel_num; }; struct channel { int mode; uint16_t initial; /* initial counter value */ struct bintime now_bt; /* uptime when counter was loaded */ uint8_t cr[2]; uint8_t ol[2]; bool slatched; /* status latched */ uint8_t status; int crbyte; int olbyte; int frbyte; struct callout callout; struct bintime callout_bt; /* target time */ struct vatpit_callout_arg callout_arg; }; struct vatpit { struct vm *vm; struct mtx mtx; struct bintime freq_bt; struct channel channel[3]; }; static void pit_timer_start_cntr0(struct vatpit *vatpit); static uint64_t vatpit_delta_ticks(struct vatpit *vatpit, struct channel *c) { struct bintime delta; uint64_t result; binuptime(&delta); bintime_sub(&delta, &c->now_bt); result = delta.sec * PIT_8254_FREQ; result += delta.frac / vatpit->freq_bt.frac; return (result); } static int vatpit_get_out(struct vatpit *vatpit, int channel) { struct channel *c; uint64_t delta_ticks; int out; c = &vatpit->channel[channel]; switch (c->mode) { case TIMER_INTTC: delta_ticks = vatpit_delta_ticks(vatpit, c); out = (delta_ticks >= c->initial); break; default: out = 0; break; } return (out); } static void vatpit_callout_handler(void *a) { struct vatpit_callout_arg *arg = a; struct vatpit *vatpit; struct callout *callout; struct channel *c; vatpit = arg->vatpit; c = &vatpit->channel[arg->channel_num]; callout = &c->callout; VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num); VATPIT_LOCK(vatpit); if (callout_pending(callout)) /* callout was reset */ goto done; if (!callout_active(callout)) /* callout was stopped */ goto done; callout_deactivate(callout); if (c->mode == TIMER_RATEGEN) { pit_timer_start_cntr0(vatpit); } vatpic_pulse_irq(vatpit->vm, 0); vioapic_pulse_irq(vatpit->vm, 2); done: VATPIT_UNLOCK(vatpit); return; } static void pit_timer_start_cntr0(struct vatpit *vatpit) { struct channel *c; struct bintime now, delta; sbintime_t precision; c = &vatpit->channel[0]; if (c->initial != 0) { delta.sec = 0; delta.frac = vatpit->freq_bt.frac * c->initial; bintime_add(&c->callout_bt, &delta); precision = bttosbt(delta) >> tc_precexp; /* * Reset 'callout_bt' if the time that the callout * was supposed to fire is more than 'c->initial' * ticks in the past. 
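 *
 * For example (the rate is illustrative): a guest programming a
 * 100 Hz tick loads c->initial = TIMER_DIV(1193182, 100) = 11932
 * counts, i.e. ~10 ms. If the deadline has slipped several periods
 * behind 'now', rescheduling one period after 'now' avoids a burst
 * of immediate catch-up callouts.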
*/ binuptime(&now); if (bintime_cmp(&c->callout_bt, &now, <)) { c->callout_bt = now; bintime_add(&c->callout_bt, &delta); } callout_reset_sbt(&c->callout, bttosbt(c->callout_bt), precision, vatpit_callout_handler, &c->callout_arg, C_ABSOLUTE); } } static uint16_t pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) { uint16_t lval; uint64_t delta_ticks; /* cannot latch a new value until the old one has been consumed */ if (latch && c->olbyte != 0) return (0); if (c->initial == 0) { /* * This is possibly an o/s bug - reading the value of * the timer without having set up the initial value. * * The original user-space version of this code set * the timer to 100hz in this condition; do the same * here. */ c->initial = TIMER_DIV(PIT_8254_FREQ, 100); binuptime(&c->now_bt); c->status &= ~TIMER_STS_NULLCNT; } delta_ticks = vatpit_delta_ticks(vatpit, c); lval = c->initial - delta_ticks % c->initial; if (latch) { c->olbyte = 2; c->ol[1] = lval; /* LSB */ c->ol[0] = lval >> 8; /* MSB */ } return (lval); } static int pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd) { struct channel *c; c = &vatpit->channel[channel]; /* * Latch the count/status of the timer if not already latched. * N.B. that the count/status latch-select bits are active-low. */ if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) { (void) pit_update_counter(vatpit, c, true); } if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) { c->slatched = true; /* * For mode 0, see if the elapsed time is greater * than the initial value - this results in the * output pin being set to 1 in the status byte. */ if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel)) c->status |= TIMER_STS_OUT; else c->status &= ~TIMER_STS_OUT; } return (0); } static int pit_readback(struct vatpit *vatpit, uint8_t cmd) { int error; /* * The readback command can apply to all timers. */ error = 0; if (cmd & TIMER_RB_CTR_0) error = pit_readback1(vatpit, 0, cmd); if (!error && cmd & TIMER_RB_CTR_1) error = pit_readback1(vatpit, 1, cmd); if (!error && cmd & TIMER_RB_CTR_2) error = pit_readback1(vatpit, 2, cmd); return (error); } static int vatpit_update_mode(struct vatpit *vatpit, uint8_t val) { struct channel *c; int sel, rw, mode; sel = val & TIMER_SEL_MASK; rw = val & TIMER_RW_MASK; mode = val & TIMER_MODE_MASK; if (sel == TIMER_SEL_READBACK) return (pit_readback(vatpit, val)); if (rw != TIMER_LATCH && rw != TIMER_16BIT) return (-1); if (rw != TIMER_LATCH) { /* * Counter mode is not affected when issuing a * latch command. 
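 *
 * For example, writing 0x00 to the mode port (counter 0 selected,
 * read/write bits 5:4 both zero) is a latch command: it only latches
 * the current count for reading and skips the mode checks below.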
*/ if (mode != TIMER_INTTC && mode != TIMER_RATEGEN && mode != TIMER_SQWAVE && mode != TIMER_SWSTROBE) return (-1); } c = &vatpit->channel[sel >> 6]; if (rw == TIMER_LATCH) pit_update_counter(vatpit, c, true); else { c->mode = mode; c->olbyte = 0; /* reset latch after reprogramming */ c->status |= TIMER_STS_NULLCNT; } return (0); } int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpit *vatpit; struct channel *c; uint8_t val; int error; vatpit = vm_atpit(vm); if (bytes != 1) return (-1); val = *eax; if (port == TIMER_MODE) { if (in) { VM_CTR0(vatpit->vm, "vatpit attempt to read mode"); return (-1); } VATPIT_LOCK(vatpit); error = vatpit_update_mode(vatpit, val); VATPIT_UNLOCK(vatpit); return (error); } /* counter ports */ KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2, ("invalid port 0x%x", port)); c = &vatpit->channel[port - TIMER_CNTR0]; VATPIT_LOCK(vatpit); if (in && c->slatched) { /* * Return the status byte if latched */ *eax = c->status; c->slatched = false; c->status = 0; } else if (in) { /* * The spec says that once the output latch is completely * read it should revert to "following" the counter. Use * the free running counter for this case (i.e. Linux * TSC calibration). Assuming the access mode is 16-bit, * toggle the MSB/LSB bit on each read. */ if (c->olbyte == 0) { uint16_t tmp; tmp = pit_update_counter(vatpit, c, false); if (c->frbyte) tmp >>= 8; tmp &= 0xff; *eax = tmp; c->frbyte ^= 1; } else *eax = c->ol[--c->olbyte]; } else { c->cr[c->crbyte++] = *eax; if (c->crbyte == 2) { c->status &= ~TIMER_STS_NULLCNT; c->frbyte = 0; c->crbyte = 0; c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; binuptime(&c->now_bt); /* Start an interval timer for channel 0 */ if (port == TIMER_CNTR0) { c->callout_bt = c->now_bt; pit_timer_start_cntr0(vatpit); } if (c->initial == 0) c->initial = 0xffff; } } VATPIT_UNLOCK(vatpit); return (0); } int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax) { struct vatpit *vatpit; vatpit = vm_atpit(vm); if (in) { VATPIT_LOCK(vatpit); if (vatpit_get_out(vatpit, 2)) *eax = TMR2_OUT_STS; else *eax = 0; VATPIT_UNLOCK(vatpit); } return (0); } struct vatpit * vatpit_init(struct vm *vm) { struct vatpit *vatpit; struct vatpit_callout_arg *arg; int i; vatpit = malloc(sizeof(struct vatpit), M_VATPIT, M_WAITOK | M_ZERO); vatpit->vm = vm; mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); FREQ2BT(PIT_8254_FREQ, &vatpit->freq_bt); for (i = 0; i < 3; i++) { callout_init(&vatpit->channel[i].callout, 1); arg = &vatpit->channel[i].callout_arg; arg->vatpit = vatpit; arg->channel_num = i; } return (vatpit); } void vatpit_cleanup(struct vatpit *vatpit) { int i; for (i = 0; i < 3; i++) callout_drain(&vatpit->channel[i].callout); free(vatpit, M_VATPIT); } + +#ifdef BHYVE_SNAPSHOT +int +vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct channel *channel; + + SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.frac, meta, ret, done); + + /* properly restore timers; they will NOT work currently */ + printf("%s: snapshot restore does not reset timers!\r\n", __func__); + + for (i = 0; i < nitems(vatpit->channel); i++) { + channel = &vatpit->channel[i]; + + SNAPSHOT_VAR_OR_LEAVE(channel->mode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->initial, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.frac, meta, ret, 
done); + SNAPSHOT_BUF_OR_LEAVE(channel->cr, sizeof(channel->cr), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(channel->ol, sizeof(channel->ol), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->slatched, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->crbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->frbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.frac, meta, ret, + done); + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vatpit.h b/sys/amd64/vmm/io/vatpit.h index 090d1a6172a7..65e06ec9bf58 100644 --- a/sys/amd64/vmm/io/vatpit.h +++ b/sys/amd64/vmm/io/vatpit.h @@ -1,47 +1,52 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VATPIT_H_ #define _VATPIT_H_ #include #define NMISC_PORT 0x61 +struct vm_snapshot_meta; + struct vatpit *vatpit_init(struct vm *vm); void vatpit_cleanup(struct vatpit *vatpit); int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); +#ifdef BHYVE_SNAPSHOT +int vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta); +#endif #endif /* _VATPIT_H_ */ diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c index 8f91f9fe6d78..530f5d49f8f1 100644 --- a/sys/amd64/vmm/io/vhpet.c +++ b/sys/amd64/vmm/io/vhpet.c @@ -1,763 +1,812 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include +#include #include "vmm_lapic.h" #include "vatpic.h" #include "vioapic.h" #include "vhpet.h" #include "vmm_ktr.h" static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); #define HPET_FREQ 16777216 /* 16.7 (2^24) Mhz */ #define FS_PER_S 1000000000000000ul /* Timer N Configuration and Capabilities Register */ #define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \ HPET_TCAP_FSB_INT_DEL | \ HPET_TCAP_SIZE | \ HPET_TCAP_PER_INT) /* * HPET requires at least 3 timers and up to 32 timers per block. */ #define VHPET_NUM_TIMERS 8 CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32); struct vhpet_callout_arg { struct vhpet *vhpet; int timer_num; }; struct vhpet { struct vm *vm; struct mtx mtx; sbintime_t freq_sbt; uint64_t config; /* Configuration */ uint64_t isr; /* Interrupt Status */ uint32_t countbase; /* HPET counter base value */ sbintime_t countbase_sbt; /* uptime corresponding to base value */ struct { uint64_t cap_config; /* Configuration */ uint64_t msireg; /* FSB interrupt routing */ uint32_t compval; /* Comparator */ uint32_t comprate; struct callout callout; sbintime_t callout_sbt; /* time when counter==compval */ struct vhpet_callout_arg arg; } timer[VHPET_NUM_TIMERS]; }; #define VHPET_LOCK(vhp) mtx_lock(&((vhp)->mtx)) #define VHPET_UNLOCK(vhp) mtx_unlock(&((vhp)->mtx)) static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now); static uint64_t vhpet_capabilities(void) { uint64_t cap = 0; cap |= 0x8086 << 16; /* vendor id */ cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ cap |= 1; /* revision */ cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ cap &= 0xffffffff; cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */ return (cap); } static __inline bool vhpet_counter_enabled(struct vhpet *vhpet) { return ((vhpet->config & HPET_CNF_ENABLE) ? true : false); } static __inline bool vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) { const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) return (true); else return (false); } static __inline int vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) { /* * If the timer is configured to use MSI then treat it as if the * timer is not connected to the ioapic. 
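 *
 * (A return value of 0 doubles as the "not routed" marker: e.g.
 * vhpet_timer_interrupt() drops the interrupt when the pin is 0.)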
*/ if (vhpet_timer_msi_enabled(vhpet, n)) return (0); return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); } static uint32_t vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) { uint32_t val; sbintime_t now, delta; val = vhpet->countbase; if (vhpet_counter_enabled(vhpet)) { now = sbinuptime(); delta = now - vhpet->countbase_sbt; KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: " "%#lx to %#lx", vhpet->countbase_sbt, now)); val += delta / vhpet->freq_sbt; if (nowptr != NULL) *nowptr = now; } else { /* * The sbinuptime corresponding to the 'countbase' is * meaningless when the counter is disabled. Make sure * that the caller doesn't want to use it. */ KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL")); } return (val); } static void vhpet_timer_clear_isr(struct vhpet *vhpet, int n) { int pin; if (vhpet->isr & (1 << n)) { pin = vhpet_timer_ioapic_pin(vhpet, n); KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); vioapic_deassert_irq(vhpet->vm, pin); vhpet->isr &= ~(1 << n); } } static __inline bool vhpet_periodic_timer(struct vhpet *vhpet, int n) { return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0); } static __inline bool vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n) { return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0); } static __inline bool vhpet_timer_edge_trig(struct vhpet *vhpet, int n) { KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " "timer %d is using MSI", n)); if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) return (true); else return (false); } static void vhpet_timer_interrupt(struct vhpet *vhpet, int n) { int pin; /* If interrupts are not enabled for this timer then just return. */ if (!vhpet_timer_interrupt_enabled(vhpet, n)) return; /* * If a level triggered interrupt is already asserted then just return. */ if ((vhpet->isr & (1 << n)) != 0) { VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n); return; } if (vhpet_timer_msi_enabled(vhpet, n)) { lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32, vhpet->timer[n].msireg & 0xffffffff); return; } pin = vhpet_timer_ioapic_pin(vhpet, n); if (pin == 0) { VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n); return; } if (vhpet_timer_edge_trig(vhpet, n)) { vioapic_pulse_irq(vhpet->vm, pin); } else { vhpet->isr |= 1 << n; vioapic_assert_irq(vhpet->vm, pin); } } static void vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter) { uint32_t compval, comprate, compnext; KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n)); compval = vhpet->timer[n].compval; comprate = vhpet->timer[n].comprate; /* * Calculate the comparator value to be used for the next periodic * interrupt. * * This function is commonly called from the callout handler. * In this scenario the 'counter' is ahead of 'compval'. To find * the next value to program into the accumulator we divide the * number space between 'compval' and 'counter' into 'comprate' * sized units. The 'compval' is rounded up such that it is "ahead" * of 'counter'.
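 *
 * A worked example with illustrative values: compval = 100,
 * comprate = 50 and counter = 233 give
 *
 *	compnext = 100 + ((233 - 100) / 50 + 1) * 50 = 250
 *
 * the first comparator value strictly past 'counter'; the unsigned
 * 32-bit arithmetic gives the same result across a main counter wrap.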
*/ compnext = compval + ((counter - compval) / comprate + 1) * comprate; vhpet->timer[n].compval = compnext; } static void vhpet_handler(void *a) { int n; uint32_t counter; sbintime_t now; struct vhpet *vhpet; struct callout *callout; struct vhpet_callout_arg *arg; arg = a; vhpet = arg->vhpet; n = arg->timer_num; callout = &vhpet->timer[n].callout; VM_CTR1(vhpet->vm, "hpet t%d fired", n); VHPET_LOCK(vhpet); if (callout_pending(callout)) /* callout was reset */ goto done; if (!callout_active(callout)) /* callout was stopped */ goto done; callout_deactivate(callout); if (!vhpet_counter_enabled(vhpet)) panic("vhpet(%p) callout with counter disabled", vhpet); counter = vhpet_counter(vhpet, &now); vhpet_start_timer(vhpet, n, counter, now); vhpet_timer_interrupt(vhpet, n); done: VHPET_UNLOCK(vhpet); return; } static void vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now) { VM_CTR1(vhpet->vm, "hpet t%d stopped", n); callout_stop(&vhpet->timer[n].callout); /* * If the callout was scheduled to expire in the past but hasn't * had a chance to execute yet then trigger the timer interrupt * here. Failing to do so will result in a missed timer interrupt * in the guest. This is especially bad in one-shot mode because * the next interrupt has to wait for the counter to wrap around. */ if (vhpet->timer[n].callout_sbt < now) { VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after " "stopping timer", n); vhpet_timer_interrupt(vhpet, n); } } static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) { sbintime_t delta, precision; if (vhpet->timer[n].comprate != 0) vhpet_adjust_compval(vhpet, n, counter); else { /* * In one-shot mode it is the guest's responsibility to make * sure that the comparator value is not in the "past". The * hardware doesn't have any belt-and-suspenders to deal with * this so we don't either. */ } delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt; precision = delta >> tc_precexp; vhpet->timer[n].callout_sbt = now + delta; callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt, precision, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE); } static void vhpet_start_counting(struct vhpet *vhpet) { int i; vhpet->countbase_sbt = sbinuptime(); for (i = 0; i < VHPET_NUM_TIMERS; i++) { /* * Restart the timers based on the value of the main counter * when it stopped counting. 
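 *
 * Note that 'countbase_sbt' was refreshed just above, so every
 * timer is re-armed against the same uptime snapshot that
 * vhpet_counter() will use as its base from now on.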
*/ vhpet_start_timer(vhpet, i, vhpet->countbase, vhpet->countbase_sbt); } } static void vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now) { int i; vhpet->countbase = counter; for (i = 0; i < VHPET_NUM_TIMERS; i++) vhpet_stop_timer(vhpet, i, now); } static __inline void update_register(uint64_t *regptr, uint64_t data, uint64_t mask) { *regptr &= ~mask; *regptr |= (data & mask); } static void vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data, uint64_t mask) { bool clear_isr; int old_pin, new_pin; uint32_t allowed_irqs; uint64_t oldval, newval; if (vhpet_timer_msi_enabled(vhpet, n) || vhpet_timer_edge_trig(vhpet, n)) { if (vhpet->isr & (1 << n)) panic("vhpet timer %d isr should not be asserted", n); } old_pin = vhpet_timer_ioapic_pin(vhpet, n); oldval = vhpet->timer[n].cap_config; newval = oldval; update_register(&newval, data, mask); newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE); newval |= oldval & HPET_TCAP_RO_MASK; if (newval == oldval) return; vhpet->timer[n].cap_config = newval; VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016x", n, newval); /* * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field. * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set * it to the default value of 0. */ allowed_irqs = vhpet->timer[n].cap_config >> 32; new_pin = vhpet_timer_ioapic_pin(vhpet, n); if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) { VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, " "allowed_irqs 0x%08x", n, new_pin, allowed_irqs); new_pin = 0; vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE; } if (!vhpet_periodic_timer(vhpet, n)) vhpet->timer[n].comprate = 0; /* * If the timer's ISR bit is set then clear it in the following cases: * - interrupt is disabled * - interrupt type is changed from level to edge or fsb. * - interrupt routing is changed * * This is to ensure that this timer's level triggered interrupt does * not remain asserted forever. */ if (vhpet->isr & (1 << n)) { KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d", n, old_pin)); if (!vhpet_timer_interrupt_enabled(vhpet, n)) clear_isr = true; else if (vhpet_timer_msi_enabled(vhpet, n)) clear_isr = true; else if (vhpet_timer_edge_trig(vhpet, n)) clear_isr = true; else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin) clear_isr = true; else clear_isr = false; if (clear_isr) { VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to " "configuration change", n); vioapic_deassert_irq(vhpet->vm, old_pin); vhpet->isr &= ~(1 << n); } } } int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, void *arg) { struct vhpet *vhpet; uint64_t data, mask, oldval, val64; uint32_t isr_clear_mask, old_compval, old_comprate, counter; sbintime_t now, *nowptr; int i, offset; vhpet = vm_hpet(vm); offset = gpa - VHPET_BASE; VHPET_LOCK(vhpet); /* Accesses to the HPET should be 4 or 8 bytes wide */ switch (size) { case 8: mask = 0xffffffffffffffff; data = val; break; case 4: mask = 0xffffffff; data = val; if ((offset & 0x4) != 0) { mask <<= 32; data <<= 32; } break; default: VM_CTR2(vhpet->vm, "hpet invalid mmio write: " "offset 0x%08x, size %d", offset, size); goto done; } /* Access to the HPET should be naturally aligned to its width */ if (offset & (size - 1)) { VM_CTR2(vhpet->vm, "hpet invalid mmio write: " "offset 0x%08x, size %d", offset, size); goto done; } if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { /* * Get the most recent value of the counter before updating * the 'config' register. 
If the HPET is going to be disabled * then we need to update 'countbase' with the value right * before it is disabled. */ nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL; counter = vhpet_counter(vhpet, nowptr); oldval = vhpet->config; update_register(&vhpet->config, data, mask); /* * LegacyReplacement Routing is not supported so clear the * bit explicitly. */ vhpet->config &= ~HPET_CNF_LEG_RT; if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { if (vhpet_counter_enabled(vhpet)) { vhpet_start_counting(vhpet); VM_CTR0(vhpet->vm, "hpet enabled"); } else { vhpet_stop_counting(vhpet, counter, now); VM_CTR0(vhpet->vm, "hpet disabled"); } } goto done; } if (offset == HPET_ISR || offset == HPET_ISR + 4) { isr_clear_mask = vhpet->isr & data; for (i = 0; i < VHPET_NUM_TIMERS; i++) { if ((isr_clear_mask & (1 << i)) != 0) { VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i); vhpet_timer_clear_isr(vhpet, i); } } goto done; } if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { /* Zero-extend the counter to 64-bits before updating it */ val64 = vhpet_counter(vhpet, NULL); update_register(&val64, data, mask); vhpet->countbase = val64; if (vhpet_counter_enabled(vhpet)) vhpet_start_counting(vhpet); goto done; } for (i = 0; i < VHPET_NUM_TIMERS; i++) { if (offset == HPET_TIMER_CAP_CNF(i) || offset == HPET_TIMER_CAP_CNF(i) + 4) { vhpet_timer_update_config(vhpet, i, data, mask); break; } if (offset == HPET_TIMER_COMPARATOR(i) || offset == HPET_TIMER_COMPARATOR(i) + 4) { old_compval = vhpet->timer[i].compval; old_comprate = vhpet->timer[i].comprate; if (vhpet_periodic_timer(vhpet, i)) { /* * In periodic mode writes to the comparator * change the 'compval' register only if the * HPET_TCNF_VAL_SET bit is set in the config * register. */ val64 = vhpet->timer[i].comprate; update_register(&val64, data, mask); vhpet->timer[i].comprate = val64; if ((vhpet->timer[i].cap_config & HPET_TCNF_VAL_SET) != 0) { vhpet->timer[i].compval = val64; } } else { KASSERT(vhpet->timer[i].comprate == 0, ("vhpet one-shot timer %d has invalid " "rate %u", i, vhpet->timer[i].comprate)); val64 = vhpet->timer[i].compval; update_register(&val64, data, mask); vhpet->timer[i].compval = val64; } vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET; if (vhpet->timer[i].compval != old_compval || vhpet->timer[i].comprate != old_comprate) { if (vhpet_counter_enabled(vhpet)) { counter = vhpet_counter(vhpet, &now); vhpet_start_timer(vhpet, i, counter, now); } } break; } if (offset == HPET_TIMER_FSB_VAL(i) || offset == HPET_TIMER_FSB_ADDR(i)) { update_register(&vhpet->timer[i].msireg, data, mask); break; } } done: VHPET_UNLOCK(vhpet); return (0); } int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size, void *arg) { int i, offset; struct vhpet *vhpet; uint64_t data; vhpet = vm_hpet(vm); offset = gpa - VHPET_BASE; VHPET_LOCK(vhpet); /* Accesses to the HPET should be 4 or 8 bytes wide */ if (size != 4 && size != 8) { VM_CTR2(vhpet->vm, "hpet invalid mmio read: " "offset 0x%08x, size %d", offset, size); data = 0; goto done; } /* Access to the HPET should be naturally aligned to its width */ if (offset & (size - 1)) { VM_CTR2(vhpet->vm, "hpet invalid mmio read: " "offset 0x%08x, size %d", offset, size); data = 0; goto done; } if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) { data = vhpet_capabilities(); goto done; } if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { data = vhpet->config; goto done; } if (offset == HPET_ISR || offset == HPET_ISR + 4) { data = vhpet->isr; goto done; } if 
(offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { data = vhpet_counter(vhpet, NULL); goto done; } for (i = 0; i < VHPET_NUM_TIMERS; i++) { if (offset == HPET_TIMER_CAP_CNF(i) || offset == HPET_TIMER_CAP_CNF(i) + 4) { data = vhpet->timer[i].cap_config; break; } if (offset == HPET_TIMER_COMPARATOR(i) || offset == HPET_TIMER_COMPARATOR(i) + 4) { data = vhpet->timer[i].compval; break; } if (offset == HPET_TIMER_FSB_VAL(i) || offset == HPET_TIMER_FSB_ADDR(i)) { data = vhpet->timer[i].msireg; break; } } if (i >= VHPET_NUM_TIMERS) data = 0; done: VHPET_UNLOCK(vhpet); if (size == 4) { if (offset & 0x4) data >>= 32; } *rval = data; return (0); } struct vhpet * vhpet_init(struct vm *vm) { int i, pincount; struct vhpet *vhpet; uint64_t allowed_irqs; struct vhpet_callout_arg *arg; struct bintime bt; vhpet = malloc(sizeof(struct vhpet), M_VHPET, M_WAITOK | M_ZERO); vhpet->vm = vm; mtx_init(&vhpet->mtx, "vhpet lock", NULL, MTX_DEF); FREQ2BT(HPET_FREQ, &bt); vhpet->freq_sbt = bttosbt(bt); pincount = vioapic_pincount(vm); if (pincount >= 32) allowed_irqs = 0xff000000; /* irqs 24-31 */ else if (pincount >= 20) allowed_irqs = 0xf << (pincount - 4); /* 4 upper irqs */ else allowed_irqs = 0; /* * Initialize HPET timer hardware state. */ for (i = 0; i < VHPET_NUM_TIMERS; i++) { vhpet->timer[i].cap_config = allowed_irqs << 32; vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT; vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL; vhpet->timer[i].compval = 0xffffffff; callout_init(&vhpet->timer[i].callout, 1); arg = &vhpet->timer[i].arg; arg->vhpet = vhpet; arg->timer_num = i; } return (vhpet); } void vhpet_cleanup(struct vhpet *vhpet) { int i; for (i = 0; i < VHPET_NUM_TIMERS; i++) callout_drain(&vhpet->timer[i].callout); free(vhpet, M_VHPET); } int vhpet_getcap(struct vm_hpet_cap *cap) { cap->capabilities = vhpet_capabilities(); return (0); } + +#ifdef BHYVE_SNAPSHOT +int +vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta) +{ + int i, ret; + uint32_t countbase; + + SNAPSHOT_VAR_OR_LEAVE(vhpet->freq_sbt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->isr, meta, ret, done); + + /* at restore time the countbase should have the value it had when the + * snapshot was created; since the value is not directly kept in + * vhpet->countbase, but rather computed relative to the current system + * uptime using countbase_sbt, save the value retured by vhpet_counter + */ + if (meta->op == VM_SNAPSHOT_SAVE) + countbase = vhpet_counter(vhpet, NULL); + SNAPSHOT_VAR_OR_LEAVE(countbase, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + vhpet->countbase = countbase; + + for (i = 0; i < nitems(vhpet->timer); i++) { + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].cap_config, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].msireg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].compval, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].comprate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].callout_sbt, + meta, ret, done); + } + +done: + return (ret); +} + +int +vhpet_restore_time(struct vhpet *vhpet) +{ + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + + return (0); +} +#endif diff --git a/sys/amd64/vmm/io/vhpet.h b/sys/amd64/vmm/io/vhpet.h index 3d6b653055c9..113683c09b33 100644 --- a/sys/amd64/vmm/io/vhpet.h +++ b/sys/amd64/vmm/io/vhpet.h @@ -1,46 +1,52 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * 
All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VHPET_H_ #define _VHPET_H_ #define VHPET_BASE 0xfed00000 #define VHPET_SIZE 1024 +struct vm_snapshot_meta; + struct vhpet *vhpet_init(struct vm *vm); void vhpet_cleanup(struct vhpet *vhpet); int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, void *arg); int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, int size, void *arg); int vhpet_getcap(struct vm_hpet_cap *cap); +#ifdef BHYVE_SNAPSHOT +int vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta); +int vhpet_restore_time(struct vhpet *vhpet); +#endif #endif /* _VHPET_H_ */ diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c index 31c1cabab094..a8117da4b879 100644 --- a/sys/amd64/vmm/io/vioapic.c +++ b/sys/amd64/vmm/io/vioapic.c @@ -1,501 +1,523 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include +#include #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vlapic.h" #include "vioapic.h" #define IOREGSEL 0x00 #define IOWIN 0x10 #define REDIR_ENTRIES 32 #define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) struct vioapic { struct vm *vm; struct mtx mtx; uint32_t id; uint32_t ioregsel; struct { uint64_t reg; int acnt; /* sum of pin asserts (+1) and deasserts (-1) */ } rtbl[REDIR_ENTRIES]; }; #define VIOAPIC_LOCK(vioapic) mtx_lock_spin(&((vioapic)->mtx)) #define VIOAPIC_UNLOCK(vioapic) mtx_unlock_spin(&((vioapic)->mtx)) #define VIOAPIC_LOCKED(vioapic) mtx_owned(&((vioapic)->mtx)) static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic"); #define VIOAPIC_CTR1(vioapic, fmt, a1) \ VM_CTR1((vioapic)->vm, fmt, a1) #define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \ VM_CTR2((vioapic)->vm, fmt, a1, a2) #define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \ VM_CTR3((vioapic)->vm, fmt, a1, a2, a3) #define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \ VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4) #ifdef KTR static const char * pinstate_str(bool asserted) { if (asserted) return ("asserted"); else return ("deasserted"); } #endif static void vioapic_send_intr(struct vioapic *vioapic, int pin) { int vector, delmode; uint32_t low, high, dest; bool level, phys; KASSERT(pin >= 0 && pin < REDIR_ENTRIES, ("vioapic_set_pinstate: invalid pin number %d", pin)); KASSERT(VIOAPIC_LOCKED(vioapic), ("vioapic_set_pinstate: vioapic is not locked")); low = vioapic->rtbl[pin].reg; high = vioapic->rtbl[pin].reg >> 32; if ((low & IOART_INTMASK) == IOART_INTMSET) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); return; } phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); delmode = low & IOART_DELMOD; level = low & IOART_TRGRLVL ? 
true : false; if (level) vioapic->rtbl[pin].reg |= IOART_REM_IRR; vector = low & IOART_INTVEC; dest = high >> APIC_ID_SHIFT; vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); } static void vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate) { int oldcnt, newcnt; bool needintr; KASSERT(pin >= 0 && pin < REDIR_ENTRIES, ("vioapic_set_pinstate: invalid pin number %d", pin)); KASSERT(VIOAPIC_LOCKED(vioapic), ("vioapic_set_pinstate: vioapic is not locked")); oldcnt = vioapic->rtbl[pin].acnt; if (newstate) vioapic->rtbl[pin].acnt++; else vioapic->rtbl[pin].acnt--; newcnt = vioapic->rtbl[pin].acnt; if (newcnt < 0) { VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d", pin, newcnt); } needintr = false; if (oldcnt == 0 && newcnt == 1) { needintr = true; VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin); } else if (oldcnt == 1 && newcnt == 0) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin); } else { VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d", pin, pinstate_str(newstate), newcnt); } if (needintr) vioapic_send_intr(vioapic, pin); } enum irqstate { IRQSTATE_ASSERT, IRQSTATE_DEASSERT, IRQSTATE_PULSE }; static int vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) { struct vioapic *vioapic; if (irq < 0 || irq >= REDIR_ENTRIES) return (EINVAL); vioapic = vm_ioapic(vm); VIOAPIC_LOCK(vioapic); switch (irqstate) { case IRQSTATE_ASSERT: vioapic_set_pinstate(vioapic, irq, true); break; case IRQSTATE_DEASSERT: vioapic_set_pinstate(vioapic, irq, false); break; case IRQSTATE_PULSE: vioapic_set_pinstate(vioapic, irq, true); vioapic_set_pinstate(vioapic, irq, false); break; default: panic("vioapic_set_irqstate: invalid irqstate %d", irqstate); } VIOAPIC_UNLOCK(vioapic); return (0); } int vioapic_assert_irq(struct vm *vm, int irq) { return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); } int vioapic_deassert_irq(struct vm *vm, int irq) { return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); } int vioapic_pulse_irq(struct vm *vm, int irq) { return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } /* * Reset the vlapic's trigger-mode register to reflect the ioapic pin * configuration. */ static void vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg) { struct vioapic *vioapic; struct vlapic *vlapic; uint32_t low, high, dest; int delmode, pin, vector; bool level, phys; vlapic = vm_lapic(vm, vcpuid); vioapic = vm_ioapic(vm); VIOAPIC_LOCK(vioapic); /* * Reset all vectors to be edge-triggered. */ vlapic_reset_tmr(vlapic); for (pin = 0; pin < REDIR_ENTRIES; pin++) { low = vioapic->rtbl[pin].reg; high = vioapic->rtbl[pin].reg >> 32; level = low & IOART_TRGRLVL ? true : false; if (!level) continue; /* * For a level-triggered 'pin' let the vlapic figure out if * an assertion on this 'pin' would result in an interrupt * being delivered to it. If yes, then it will modify the * TMR bit associated with this vector to level-triggered. 
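 *
 * That TMR bit is what later causes vlapic_process_eoi() to hand
 * the EOI on to vioapic_process_eoi(), so a still-asserted
 * level-triggered pin can be re-sampled and delivered again.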
*/ phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); delmode = low & IOART_DELMOD; vector = low & IOART_INTVEC; dest = high >> APIC_ID_SHIFT; vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); } VIOAPIC_UNLOCK(vioapic); } static uint32_t vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr) { int regnum, pin, rshift; regnum = addr & 0xff; switch (regnum) { case IOAPIC_ID: return (vioapic->id); break; case IOAPIC_VER: return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11); break; case IOAPIC_ARB: return (vioapic->id); break; default: break; } /* redirection table entries */ if (regnum >= IOAPIC_REDTBL && regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { pin = (regnum - IOAPIC_REDTBL) / 2; if ((regnum - IOAPIC_REDTBL) % 2) rshift = 32; else rshift = 0; return (vioapic->rtbl[pin].reg >> rshift); } return (0); } static void vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) { uint64_t data64, mask64; uint64_t last, changed; int regnum, pin, lshift; cpuset_t allvcpus; regnum = addr & 0xff; switch (regnum) { case IOAPIC_ID: vioapic->id = data & APIC_ID_MASK; break; case IOAPIC_VER: case IOAPIC_ARB: /* readonly */ break; default: break; } /* redirection table entries */ if (regnum >= IOAPIC_REDTBL && regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { pin = (regnum - IOAPIC_REDTBL) / 2; if ((regnum - IOAPIC_REDTBL) % 2) lshift = 32; else lshift = 0; last = vioapic->rtbl[pin].reg; data64 = (uint64_t)data << lshift; mask64 = (uint64_t)0xffffffff << lshift; vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS; VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx", pin, vioapic->rtbl[pin].reg); /* * If any fields in the redirection table entry (except mask * or polarity) have changed then rendezvous all the vcpus * to update their vlapic trigger-mode registers. */ changed = last ^ vioapic->rtbl[pin].reg; if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " "vlapic trigger-mode register", pin); VIOAPIC_UNLOCK(vioapic); allvcpus = vm_active_cpus(vioapic->vm); (void)vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, vioapic_update_tmr, NULL); VIOAPIC_LOCK(vioapic); } /* * Generate an interrupt if the following conditions are met: * - pin is not masked * - previous interrupt has been EOIed * - pin level is asserted */ if ((vioapic->rtbl[pin].reg & IOART_INTMASK) == IOART_INTMCLR && (vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0 && (vioapic->rtbl[pin].acnt > 0)) { VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl " "write, acnt %d", pin, vioapic->rtbl[pin].acnt); vioapic_send_intr(vioapic, pin); } } } static int vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa, uint64_t *data, int size, bool doread) { uint64_t offset; offset = gpa - VIOAPIC_BASE; /* * The IOAPIC specification allows 32-bit wide accesses to the * IOREGSEL (offset 0) and IOWIN (offset 16) registers. 
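 *
 * Access is indirect: the guest writes a register index to
 * IOREGSEL and then reads or writes the selected 32-bit register
 * through the IOWIN data window, which is how the two offsets are
 * dispatched below.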
*/ if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { if (doread) *data = 0; return (0); } VIOAPIC_LOCK(vioapic); if (offset == IOREGSEL) { if (doread) *data = vioapic->ioregsel; else vioapic->ioregsel = *data; } else { if (doread) { *data = vioapic_read(vioapic, vcpuid, vioapic->ioregsel); } else { vioapic_write(vioapic, vcpuid, vioapic->ioregsel, *data); } } VIOAPIC_UNLOCK(vioapic); return (0); } int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size, void *arg) { int error; struct vioapic *vioapic; vioapic = vm_ioapic(vm); error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true); return (error); } int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, int size, void *arg) { int error; struct vioapic *vioapic; vioapic = vm_ioapic(vm); error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false); return (error); } void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector) { struct vioapic *vioapic; int pin; KASSERT(vector >= 0 && vector < 256, ("vioapic_process_eoi: invalid vector %d", vector)); vioapic = vm_ioapic(vm); VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector); /* * XXX keep track of the pins associated with this vector instead * of iterating on every single pin each time. */ VIOAPIC_LOCK(vioapic); for (pin = 0; pin < REDIR_ENTRIES; pin++) { if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0) continue; if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector) continue; vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; if (vioapic->rtbl[pin].acnt > 0) { VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, " "acnt %d", pin, vioapic->rtbl[pin].acnt); vioapic_send_intr(vioapic, pin); } } VIOAPIC_UNLOCK(vioapic); } struct vioapic * vioapic_init(struct vm *vm) { int i; struct vioapic *vioapic; vioapic = malloc(sizeof(struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO); vioapic->vm = vm; mtx_init(&vioapic->mtx, "vioapic lock", NULL, MTX_SPIN); /* Initialize all redirection entries to mask all interrupts */ for (i = 0; i < REDIR_ENTRIES; i++) vioapic->rtbl[i].reg = 0x0001000000010000UL; return (vioapic); } void vioapic_cleanup(struct vioapic *vioapic) { free(vioapic, M_VIOAPIC); } int vioapic_pincount(struct vm *vm) { return (REDIR_ENTRIES); } + +#ifdef BHYVE_SNAPSHOT +int +vioapic_snapshot(struct vioapic *vioapic, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + + SNAPSHOT_VAR_OR_LEAVE(vioapic->ioregsel, meta, ret, done); + + for (i = 0; i < nitems(vioapic->rtbl); i++) { + SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].reg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].acnt, meta, ret, done); + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vioapic.h b/sys/amd64/vmm/io/vioapic.h index 730c4b3f2ad9..19dbffe3ec24 100644 --- a/sys/amd64/vmm/io/vioapic.h +++ b/sys/amd64/vmm/io/vioapic.h @@ -1,52 +1,59 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VIOAPIC_H_ #define _VIOAPIC_H_ +struct vm_snapshot_meta; + #define VIOAPIC_BASE 0xFEC00000 #define VIOAPIC_SIZE 4096 struct vioapic *vioapic_init(struct vm *vm); void vioapic_cleanup(struct vioapic *vioapic); int vioapic_assert_irq(struct vm *vm, int irq); int vioapic_deassert_irq(struct vm *vm, int irq); int vioapic_pulse_irq(struct vm *vm, int irq); int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, int size, void *arg); int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size, void *arg); int vioapic_pincount(struct vm *vm); void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector); +#ifdef BHYVE_SNAPSHOT +int vioapic_snapshot(struct vioapic *vioapic, + struct vm_snapshot_meta *meta); +#endif + #endif diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 069989f12386..be944bf097d2 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -1,1652 +1,1758 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "vmm_lapic.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vlapic.h" #include "vlapic_priv.h" #include "vioapic.h" #define PRIO(x) ((x) >> 4) #define VLAPIC_VERSION (16) #define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) /* * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the * vlapic_callout_handler() and vcpu accesses to: * - timer_freq_bt, timer_period_bt, timer_fire_bt * - timer LVT register */ #define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) /* * APIC timer frequency: * - arbitrary but chosen to be in the ballpark of contemporary hardware. * - power-of-two to avoid loss of precision when converted to a bintime. */ #define VLAPIC_BUS_FREQ (128 * 1024 * 1024) static void vlapic_set_error(struct vlapic *, uint32_t, bool); static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) { if (x2apic(vlapic)) return (vlapic->vcpuid); else return (vlapic->vcpuid << 24); } static uint32_t x2apic_ldr(struct vlapic *vlapic) { int apicid; uint32_t ldr; apicid = vlapic_get_id(vlapic); ldr = 1 << (apicid & 0xf); ldr |= (apicid & 0xffff0) << 12; return (ldr); } void vlapic_dfr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; if (x2apic(vlapic)) { VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", lapic->dfr); lapic->dfr = 0; return; } lapic->dfr &= APIC_DFR_MODEL_MASK; lapic->dfr |= APIC_DFR_RESERVED; if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); else VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); } void vlapic_ldr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; /* LDR is read-only in x2apic mode */ if (x2apic(vlapic)) { VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", lapic->ldr); lapic->ldr = x2apic_ldr(vlapic); } else { lapic->ldr &= ~APIC_LDR_RESERVED; VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); } } void vlapic_id_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; /* * We don't allow the ID register to be modified so reset it back to * its default value. 
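 *
 * The default value comes from vlapic_get_id(): the raw vcpuid in
 * x2APIC mode, or the vcpuid shifted into bits 31:24 in xAPIC mode.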
*/ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); } static int vlapic_timer_divisor(uint32_t dcr) { switch (dcr & 0xB) { case APIC_TDCR_1: return (1); case APIC_TDCR_2: return (2); case APIC_TDCR_4: return (4); case APIC_TDCR_8: return (8); case APIC_TDCR_16: return (16); case APIC_TDCR_32: return (32); case APIC_TDCR_64: return (64); case APIC_TDCR_128: return (128); default: panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); } } #if 0 static inline void vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) { printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, *lvt & APIC_LVTT_M); } #endif static uint32_t vlapic_get_ccr(struct vlapic *vlapic) { struct bintime bt_now, bt_rem; struct LAPIC *lapic; uint32_t ccr; ccr = 0; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); if (callout_active(&vlapic->callout)) { /* * If the timer is scheduled to expire in the future then * compute the value of 'ccr' based on the remaining time. */ binuptime(&bt_now); if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { bt_rem = vlapic->timer_fire_bt; bintime_sub(&bt_rem, &bt_now); ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; } } KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " "icr_timer is %#x", ccr, lapic->icr_timer)); VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", ccr, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); return (ccr); } void vlapic_dcr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; int divisor; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); divisor = vlapic_timer_divisor(lapic->dcr_timer); VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", lapic->dcr_timer, divisor); /* * Update the timer frequency and the timer period. * * XXX changes to the frequency divider will not take effect until * the timer is reloaded. */ FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_esr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->esr = vlapic->esr_pending; vlapic->esr_pending = 0; } int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *irrptr, *tmrptr, mask; int idx; KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); lapic = vlapic->apic_page; if (!(lapic->svr & APIC_SVR_ENABLE)) { VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " "interrupt %d", vector); return (0); } if (vector < 16) { vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", vector); return (1); } if (vlapic->ops.set_intr_ready) return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); idx = (vector / 32) * 4; mask = 1 << (vector % 32); irrptr = &lapic->irr0; atomic_set_int(&irrptr[idx], mask); /* * Verify that the trigger-mode of the interrupt matches with * the vlapic TMR registers. */ tmrptr = &lapic->tmr0; if ((tmrptr[idx] & mask) != (level ? mask : 0)) { VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " "interrupt is %s-triggered", idx / 4, tmrptr[idx], level ? 
"level" : "edge"); } VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); return (1); } static __inline uint32_t * vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) { struct LAPIC *lapic = vlapic->apic_page; int i; switch (offset) { case APIC_OFFSET_CMCI_LVT: return (&lapic->lvt_cmci); case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; return ((&lapic->lvt_timer) + i); default: panic("vlapic_get_lvt: invalid LVT\n"); } } static __inline int lvt_off_to_idx(uint32_t offset) { int index; switch (offset) { case APIC_OFFSET_CMCI_LVT: index = APIC_LVT_CMCI; break; case APIC_OFFSET_TIMER_LVT: index = APIC_LVT_TIMER; break; case APIC_OFFSET_THERM_LVT: index = APIC_LVT_THERMAL; break; case APIC_OFFSET_PERF_LVT: index = APIC_LVT_PMC; break; case APIC_OFFSET_LINT0_LVT: index = APIC_LVT_LINT0; break; case APIC_OFFSET_LINT1_LVT: index = APIC_LVT_LINT1; break; case APIC_OFFSET_ERROR_LVT: index = APIC_LVT_ERROR; break; default: index = -1; break; } KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " "invalid lvt index %d for offset %#x", index, offset)); return (index); } static __inline uint32_t vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) { int idx; uint32_t val; idx = lvt_off_to_idx(offset); val = atomic_load_acq_32(&vlapic->lvt_last[idx]); return (val); } void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) { uint32_t *lvtptr, mask, val; struct LAPIC *lapic; int idx; lapic = vlapic->apic_page; lvtptr = vlapic_get_lvtptr(vlapic, offset); val = *lvtptr; idx = lvt_off_to_idx(offset); if (!(lapic->svr & APIC_SVR_ENABLE)) val |= APIC_LVT_M; mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; switch (offset) { case APIC_OFFSET_TIMER_LVT: mask |= APIC_LVTT_TM; break; case APIC_OFFSET_ERROR_LVT: break; case APIC_OFFSET_LINT0_LVT: case APIC_OFFSET_LINT1_LVT: mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; /* FALLTHROUGH */ default: mask |= APIC_LVT_DM; break; } val &= mask; *lvtptr = val; atomic_store_rel_32(&vlapic->lvt_last[idx], val); } static void vlapic_mask_lvts(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; lapic->lvt_cmci |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); lapic->lvt_timer |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); lapic->lvt_thermal |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); lapic->lvt_pcint |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); lapic->lvt_lint0 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); lapic->lvt_lint1 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); lapic->lvt_error |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); } static int vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) { uint32_t mode, reg, vec; reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); if (reg & APIC_LVT_M) return (0); vec = reg & APIC_LVT_VECTOR; mode = reg & APIC_LVT_DM; switch (mode) { case APIC_LVT_DM_FIXED: if (vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, lvt == APIC_LVT_ERROR); return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true); break; case APIC_LVT_DM_NMI: vm_inject_nmi(vlapic->vm, vlapic->vcpuid); break; case APIC_LVT_DM_EXTINT: vm_inject_extint(vlapic->vm, vlapic->vcpuid); break; default: // Other modes ignored return (0); } return (1); } #if 1 static void dump_isrvec_stk(struct vlapic *vlapic) { int i; uint32_t *isrptr; 
isrptr = &vlapic->apic_page->isr0; for (i = 0; i < 8; i++) printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); for (i = 0; i <= vlapic->isrvec_stk_top; i++) printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); } #endif /* * Algorithm adopted from section "Interrupt, Task and Processor Priority" * in Intel Architecture Manual Vol 3a. */ static void vlapic_update_ppr(struct vlapic *vlapic) { int isrvec, tpr, ppr; /* * Note that the value on the stack at index 0 is always 0. * * This is a placeholder for the value of ISRV when none of the * bits is set in the ISRx registers. */ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; tpr = vlapic->apic_page->tpr; #if 1 { int i, lastprio, curprio, vector, idx; uint32_t *isrptr; if (vlapic->isrvec_stk_top == 0 && isrvec != 0) panic("isrvec_stk is corrupted: %d", isrvec); /* * Make sure that the priority of the nested interrupts is * always increasing. */ lastprio = -1; for (i = 1; i <= vlapic->isrvec_stk_top; i++) { curprio = PRIO(vlapic->isrvec_stk[i]); if (curprio <= lastprio) { dump_isrvec_stk(vlapic); panic("isrvec_stk does not satisfy invariant"); } lastprio = curprio; } /* * Make sure that each bit set in the ISRx registers has a * corresponding entry on the isrvec stack. */ i = 1; isrptr = &vlapic->apic_page->isr0; for (vector = 0; vector < 256; vector++) { idx = (vector / 32) * 4; if (isrptr[idx] & (1 << (vector % 32))) { if (i > vlapic->isrvec_stk_top || vlapic->isrvec_stk[i] != vector) { dump_isrvec_stk(vlapic); panic("ISR and isrvec_stk out of sync"); } i++; } } } #endif if (PRIO(tpr) >= PRIO(isrvec)) ppr = tpr; else ppr = isrvec & 0xf0; vlapic->apic_page->ppr = ppr; VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } void vlapic_sync_tpr(struct vlapic *vlapic) { vlapic_update_ppr(vlapic); } static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); static void vlapic_process_eoi(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *isrptr, *tmrptr; int i, idx, bitpos, vector; isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { if (vlapic->isrvec_stk_top <= 0) { panic("invalid vlapic isrvec_stk_top %d", vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); vector = i * 32 + bitpos; VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, vector); } return; } } VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) { return (lvt & mask); } static __inline int vlapic_periodic_timer(struct vlapic *vlapic) { uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); } static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); static void vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) { vlapic->esr_pending |= mask; /* * Avoid infinite recursion if the error LVT itself is configured with * an illegal vector. 
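 *
 * vlapic_fire_lvt() calls back into this function with 'lvt_error'
 * set when the error LVT's own vector is below 16, so returning
 * early here is what breaks that cycle.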
*/ if (lvt_error) return; if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); } } static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { VLAPIC_CTR0(vlapic, "vlapic timer fired"); vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); } } static VMM_STAT(VLAPIC_INTR_CMC, "corrected machine check interrupts generated by vlapic"); void vlapic_fire_cmci(struct vlapic *vlapic) { if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); } } static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, "lvts triggered"); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector) { if (vlapic_enabled(vlapic) == false) { /* * When the local APIC is global/hardware disabled, * LINT[1:0] pins are configured as INTR and NMI pins, * respectively. */ switch (vector) { case APIC_LVT_LINT0: vm_inject_extint(vlapic->vm, vlapic->vcpuid); break; case APIC_LVT_LINT1: vm_inject_nmi(vlapic->vm, vlapic->vcpuid); break; default: break; } return (0); } switch (vector) { case APIC_LVT_LINT0: case APIC_LVT_LINT1: case APIC_LVT_TIMER: case APIC_LVT_ERROR: case APIC_LVT_PMC: case APIC_LVT_THERMAL: case APIC_LVT_CMCI: if (vlapic_fire_lvt(vlapic, vector)) { vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, LVTS_TRIGGERRED, vector, 1); } break; default: return (EINVAL); } return (0); } static void vlapic_callout_handler(void *arg) { struct vlapic *vlapic; struct bintime bt, btnow; sbintime_t rem_sbt; vlapic = arg; VLAPIC_TIMER_LOCK(vlapic); if (callout_pending(&vlapic->callout)) /* callout was reset */ goto done; if (!callout_active(&vlapic->callout)) /* callout was stopped */ goto done; callout_deactivate(&vlapic->callout); vlapic_fire_timer(vlapic); if (vlapic_periodic_timer(vlapic)) { binuptime(&btnow); KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, vlapic->timer_fire_bt.frac)); /* * Compute the delta between when the timer was supposed to * fire and the present time. */ bt = btnow; bintime_sub(&bt, &vlapic->timer_fire_bt); rem_sbt = bttosbt(vlapic->timer_period_bt); if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { /* * Adjust the time until the next countdown downward * to account for the lost time. */ rem_sbt -= bttosbt(bt); } else { /* * If the delta is greater than the timer period then * just reset our time base instead of trying to catch * up. 
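 *
 * Illustrative example (assumed values): with a 1 ms period and a
 * callout that ran 3 ms late, the expired intermediate ticks are
 * simply dropped and the next interrupt is scheduled one full
 * period from now instead of being replayed back to back.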
*/ vlapic->timer_fire_bt = btnow; VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " "usecs, period is %lu usecs - resetting time base", bttosbt(bt) / SBT_1US, bttosbt(vlapic->timer_period_bt) / SBT_1US); } bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); callout_reset_sbt(&vlapic->callout, rem_sbt, 0, vlapic_callout_handler, vlapic, 0); } done: VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_icrtmr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; sbintime_t sbt; uint32_t icr_timer; VLAPIC_TIMER_LOCK(vlapic); lapic = vlapic->apic_page; icr_timer = lapic->icr_timer; vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, icr_timer); if (icr_timer != 0) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); sbt = bttosbt(vlapic->timer_period_bt); callout_reset_sbt(&vlapic->callout, sbt, 0, vlapic_callout_handler, vlapic, 0); } else callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); } /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ static void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { struct vlapic *vlapic; uint32_t dfr, ldr, ldest, cluster; uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; cpuset_t amask; int vcpuid; if ((x2apic_dest && dest == 0xffffffff) || (!x2apic_dest && dest == 0xff)) { /* * Broadcast in both logical and physical modes. */ *dmask = vm_active_cpus(vm); return; } if (phys) { /* * Physical mode: destination is APIC ID. */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); amask = vm_active_cpus(vm); if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask)) CPU_SET(vcpuid, dmask); } else { /* * In the "Flat Model" the MDA is interpreted as an 8-bit wide * bitmask. This model is only available in the xAPIC mode. */ mda_flat_ldest = dest & 0xff; /* * In the "Cluster Model" the MDA is used to identify a * specific cluster and a set of APICs in that cluster. */ if (x2apic_dest) { mda_cluster_id = dest >> 16; mda_cluster_ldest = dest & 0xffff; } else { mda_cluster_id = (dest >> 4) & 0xf; mda_cluster_ldest = dest & 0xf; } /* * Logical mode: match each APIC that has a bit set * in its LDR that matches a bit in the ldest. */ CPU_ZERO(dmask); amask = vm_active_cpus(vm); while ((vcpuid = CPU_FFS(&amask)) != 0) { vcpuid--; CPU_CLR(vcpuid, &amask); vlapic = vm_lapic(vm, vcpuid); dfr = vlapic->apic_page->dfr; ldr = vlapic->apic_page->ldr; if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) { ldest = ldr >> 24; mda_ldest = mda_flat_ldest; } else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) { if (x2apic(vlapic)) { cluster = ldr >> 16; ldest = ldr & 0xffff; } else { cluster = ldr >> 28; ldest = (ldr >> 24) & 0xf; } if (cluster != mda_cluster_id) continue; mda_ldest = mda_cluster_ldest; } else { /* * Guest has configured a bad logical * model for this vcpu - skip it. 
*/ VLAPIC_CTR1(vlapic, "vlapic has bad logical " "model %x - cannot deliver interrupt", dfr); continue; } if ((mda_ldest & ldest) != 0) { CPU_SET(vcpuid, dmask); if (lowprio) break; } } } } static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); static void vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) { struct LAPIC *lapic = vlapic->apic_page; if (lapic->tpr != val) { VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed " "from %#x to %#x", lapic->tpr, val); lapic->tpr = val; vlapic_update_ppr(vlapic); } } static uint8_t vlapic_get_tpr(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; return (lapic->tpr); } void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) { uint8_t tpr; if (val & ~0xf) { vm_inject_gp(vlapic->vm, vlapic->vcpuid); return; } tpr = val << 4; vlapic_set_tpr(vlapic, tpr); } uint64_t vlapic_get_cr8(struct vlapic *vlapic) { uint8_t tpr; tpr = vlapic_get_tpr(vlapic); return (tpr >> 4); } int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) { int i; bool phys; cpuset_t dmask; uint64_t icrval; uint32_t dest, vec, mode; struct vlapic *vlapic2; struct vm_exit *vmexit; struct LAPIC *lapic; uint16_t maxcpus; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; if (x2apic(vlapic)) dest = icrval >> 32; else dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; if (mode == APIC_DELMODE_FIXED && vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { switch (icrval & APIC_DEST_MASK) { case APIC_DEST_DESTFLD: phys = ((icrval & APIC_DESTMODE_LOG) == 0); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic)); break; case APIC_DEST_SELF: CPU_SETOF(vlapic->vcpuid, &dmask); break; case APIC_DEST_ALLISELF: dmask = vm_active_cpus(vlapic->vm); break; case APIC_DEST_ALLESELF: dmask = vm_active_cpus(vlapic->vm); CPU_CLR(vlapic->vcpuid, &dmask); break; default: CPU_ZERO(&dmask); /* satisfy gcc */ break; } while ((i = CPU_FFS(&dmask)) != 0) { i--; CPU_CLR(i, &dmask); if (mode == APIC_DELMODE_FIXED) { lapic_intr_edge(vlapic->vm, i, vec); vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, i, 1); VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " "to vcpuid %d", vec, i); } else { vm_inject_nmi(vlapic->vm, i); VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " "to vcpuid %d", i); } } return (0); /* handled completely in the kernel */ } maxcpus = vm_get_maxcpus(vlapic->vm); if (mode == APIC_DELMODE_INIT) { if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) return (0); if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { vlapic2 = vm_lapic(vlapic->vm, dest); /* move from INIT to waiting-for-SIPI state */ if (vlapic2->boot_state == BS_INIT) { vlapic2->boot_state = BS_SIPI; } return (0); } } if (mode == APIC_DELMODE_STARTUP) { if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { vlapic2 = vm_lapic(vlapic->vm, dest); /* * Ignore SIPIs in any state other than wait-for-SIPI */ if (vlapic2->boot_state != BS_SIPI) return (0); vlapic2->boot_state = BS_RUNNING; *retu = true; vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); vmexit->exitcode = VM_EXITCODE_SPINUP_AP; vmexit->u.spinup_ap.vcpu = dest; vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; return (0); } } /* * This will cause a return to userland. 
*/ return (1); } void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) { int vec; KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode")); vec = val & 0xff; lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec); vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, vlapic->vcpuid, 1); VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); } int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) { struct LAPIC *lapic = vlapic->apic_page; int idx, i, bitpos, vector; uint32_t *irrptr, val; vlapic_update_ppr(vlapic); if (vlapic->ops.pending_intr) return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); irrptr = &lapic->irr0; for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); if (bitpos != 0) { vector = i * 32 + (bitpos - 1); if (PRIO(vector) > PRIO(lapic->ppr)) { VLAPIC_CTR1(vlapic, "pending intr %d", vector); if (vecptr != NULL) *vecptr = vector; return (1); } else break; } } return (0); } void vlapic_intr_accepted(struct vlapic *vlapic, int vector) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *irrptr, *isrptr; int idx, stk_top; if (vlapic->ops.intr_accepted) return ((*vlapic->ops.intr_accepted)(vlapic, vector)); /* * clear the ready bit for vector being accepted in irr * and set the vector as in service in isr. */ idx = (vector / 32) * 4; irrptr = &lapic->irr0; atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); isrptr = &lapic->isr0; isrptr[idx] |= 1 << (vector % 32); VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); /* * Update the PPR */ vlapic->isrvec_stk_top++; stk_top = vlapic->isrvec_stk_top; if (stk_top >= ISRVEC_STK_SIZE) panic("isrvec_stk_top overflow %d", stk_top); vlapic->isrvec_stk[stk_top] = vector; } void vlapic_svr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; uint32_t old, new, changed; lapic = vlapic->apic_page; new = lapic->svr; old = vlapic->svr_last; vlapic->svr_last = new; changed = old ^ new; if ((changed & APIC_SVR_ENABLE) != 0) { if ((new & APIC_SVR_ENABLE) == 0) { /* * The apic is now disabled so stop the apic timer * and mask all the LVT entries. */ VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); VLAPIC_TIMER_LOCK(vlapic); callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); vlapic_mask_lvts(vlapic); } else { /* * The apic is now enabled so restart the apic timer * if it is configured in periodic mode. 
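 *
 * Re-arming goes through vlapic_icrtmr_write_handler(), which
 * recomputes the period from icr_timer and restarts the callout.
 * A one-shot countdown that was stopped when the APIC was disabled
 * is not re-armed here and would have to be reprogrammed by the
 * guest.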
*/ VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); if (vlapic_periodic_timer(vlapic)) vlapic_icrtmr_write_handler(vlapic); } } } int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t *data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *reg; int i; /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", offset); *data = 0; goto done; } if (!x2apic(vlapic) && !mmio_access) { /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " "xAPIC mode", offset); *data = 0; goto done; } if (offset > sizeof(*lapic)) { *data = 0; goto done; } offset &= ~3; switch(offset) { case APIC_OFFSET_ID: *data = lapic->id; break; case APIC_OFFSET_VER: *data = lapic->version; break; case APIC_OFFSET_TPR: *data = vlapic_get_tpr(vlapic); break; case APIC_OFFSET_APR: *data = lapic->apr; break; case APIC_OFFSET_PPR: *data = lapic->ppr; break; case APIC_OFFSET_EOI: *data = lapic->eoi; break; case APIC_OFFSET_LDR: *data = lapic->ldr; break; case APIC_OFFSET_DFR: *data = lapic->dfr; break; case APIC_OFFSET_SVR: *data = lapic->svr; break; case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: i = (offset - APIC_OFFSET_ISR0) >> 2; reg = &lapic->isr0; *data = *(reg + i); break; case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: i = (offset - APIC_OFFSET_TMR0) >> 2; reg = &lapic->tmr0; *data = *(reg + i); break; case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: i = (offset - APIC_OFFSET_IRR0) >> 2; reg = &lapic->irr0; *data = atomic_load_acq_int(reg + i); break; case APIC_OFFSET_ESR: *data = lapic->esr; break; case APIC_OFFSET_ICR_LOW: *data = lapic->icr_lo; if (x2apic(vlapic)) *data |= (uint64_t)lapic->icr_hi << 32; break; case APIC_OFFSET_ICR_HI: *data = lapic->icr_hi; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: *data = vlapic_get_lvt(vlapic, offset); #ifdef INVARIANTS reg = vlapic_get_lvtptr(vlapic, offset); KASSERT(*data == *reg, ("inconsistent lvt value at " "offset %#lx: %#lx/%#x", offset, *data, *reg)); #endif break; case APIC_OFFSET_TIMER_ICR: *data = lapic->icr_timer; break; case APIC_OFFSET_TIMER_CCR: *data = vlapic_get_ccr(vlapic); break; case APIC_OFFSET_TIMER_DCR: *data = lapic->dcr_timer; break; case APIC_OFFSET_SELF_IPI: /* * XXX generate a GP fault if vlapic is in x2apic mode */ *data = 0; break; case APIC_OFFSET_RRR: default: *data = 0; break; } done: VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); return 0; } int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *regptr; int retval; KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, ("vlapic_write: invalid offset %#lx", offset)); VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", offset, data); if (offset > sizeof(*lapic)) return (0); /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " "in x2APIC mode", data, offset); return (0); } /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ if (!x2apic(vlapic) && !mmio_access) { VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " "in xAPIC mode", data, offset); return (0); } retval = 0; switch(offset) { case APIC_OFFSET_ID: lapic->id = data; vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_TPR: vlapic_set_tpr(vlapic, data & 0xff); break; case APIC_OFFSET_EOI: vlapic_process_eoi(vlapic); break; case APIC_OFFSET_LDR: lapic->ldr = data; vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: lapic->dfr = data; vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: lapic->svr = data; vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: lapic->icr_lo = data; if (x2apic(vlapic)) lapic->icr_hi = data >> 32; retval = vlapic_icrlo_write_handler(vlapic, retu); break; case APIC_OFFSET_ICR_HI: lapic->icr_hi = data; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: regptr = vlapic_get_lvtptr(vlapic, offset); *regptr = data; vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: lapic->icr_timer = data; vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: lapic->dcr_timer = data; vlapic_dcr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_SELF_IPI: if (x2apic(vlapic)) vlapic_self_ipi_handler(vlapic, data); break; case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_TIMER_CCR: default: // Read only. 
break; } return (retval); } static void vlapic_reset(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; bzero(lapic, sizeof(struct LAPIC)); lapic->id = vlapic_get_id(vlapic); lapic->version = VLAPIC_VERSION; lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); vlapic_reset_tmr(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); if (vlapic->vcpuid == 0) vlapic->boot_state = BS_RUNNING; /* BSP */ else vlapic->boot_state = BS_INIT; /* AP */ vlapic->svr_last = lapic->svr; } void vlapic_init(struct vlapic *vlapic) { KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), ("vlapic_init: vcpuid is not initialized")); KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " "initialized")); /* * If the vlapic is configured in x2apic mode then it will be * accessed in the critical section via the MSR emulation code. * * Therefore the timer mutex must be a spinlock because blockable * mutexes cannot be acquired in a critical section. */ mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); callout_init(&vlapic->callout, 1); vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vlapic->vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; vlapic_reset(vlapic); } void vlapic_cleanup(struct vlapic *vlapic) { callout_drain(&vlapic->callout); } uint64_t vlapic_get_apicbase(struct vlapic *vlapic) { return (vlapic->msr_apicbase); } int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) { if (vlapic->msr_apicbase != new) { VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " "not supported", vlapic->msr_apicbase, new); return (-1); } return (0); } void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { struct vlapic *vlapic; struct LAPIC *lapic; vlapic = vm_lapic(vm, vcpuid); if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; else vlapic->msr_apicbase |= APICBASE_X2APIC; /* * Reset the local APIC registers whose values are mode-dependent. * * XXX this works because the APIC mode can be changed only at vcpu * initialization time. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); if (x2apic(vlapic)) { lapic->ldr = x2apic_ldr(vlapic); lapic->dfr = 0; } else { lapic->ldr = 0; lapic->dfr = 0xffffffff; } if (state == X2APIC_ENABLED) { if (vlapic->ops.enable_x2apic_mode) (*vlapic->ops.enable_x2apic_mode)(vlapic); } } void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec) { bool lowprio; int vcpuid; cpuset_t dmask; if (delmode != IOART_DELFIXED && delmode != IOART_DELLOPRI && delmode != IOART_DELEXINT) { VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); return; } lowprio = (delmode == IOART_DELLOPRI); /* * We don't provide any virtual interrupt redirection hardware so * all interrupts originating from the ioapic or MSI specify the * 'dest' in the legacy xAPIC format. */ vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); while ((vcpuid = CPU_FFS(&dmask)) != 0) { vcpuid--; CPU_CLR(vcpuid, &dmask); if (delmode == IOART_DELEXINT) { vm_inject_extint(vm, vcpuid); } else { lapic_set_intr(vm, vcpuid, vec, level); } } } void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) { /* * Post an interrupt to the vcpu currently running on 'hostcpu'. 
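/*
 * A small userspace model of the destination walk in vlapic_deliver_intr()
 * above, assuming a 64-bit mask stands in for cpuset_t and the compiler's
 * __builtin_ffsll() for CPU_FFS(): the find-first-set result is 1-based, so
 * each pass delivers to one vcpu and clears its bit until the mask is empty.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t dmask = (1ull << 1) | (1ull << 3) | (1ull << 6);
	int vcpuid;

	while ((vcpuid = __builtin_ffsll((long long)dmask)) != 0) {
		vcpuid--;			/* ffs is 1-based */
		dmask &= ~(1ull << vcpuid);
		printf("deliver fixed interrupt to vcpu %d\n", vcpuid);
	}
	return (0);
}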
* * This is done by leveraging features like Posted Interrupts (Intel) * Doorbell MSR (AMD AVIC) that avoid a VM exit. * * If neither of these features are available then fallback to * sending an IPI to 'hostcpu'. */ if (vlapic->ops.post_intr) (*vlapic->ops.post_intr)(vlapic, hostcpu); else ipi_cpu(hostcpu, ipinum); } bool vlapic_enabled(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && (lapic->svr & APIC_SVR_ENABLE) != 0) return (true); else return (false); } static void vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *tmrptr, mask; int idx; lapic = vlapic->apic_page; tmrptr = &lapic->tmr0; idx = (vector / 32) * 4; mask = 1 << (vector % 32); if (level) tmrptr[idx] |= mask; else tmrptr[idx] &= ~mask; if (vlapic->ops.set_tmr != NULL) (*vlapic->ops.set_tmr)(vlapic, vector, level); } void vlapic_reset_tmr(struct vlapic *vlapic) { int vector; VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); for (vector = 0; vector <= 255; vector++) vlapic_set_tmr(vlapic, vector, false); } void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, int delmode, int vector) { cpuset_t dmask; bool lowprio; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); /* * A level trigger is valid only for fixed and lowprio delivery modes. */ if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " "delivery-mode %d", delmode); return; } lowprio = (delmode == APIC_DELMODE_LOWPRIO); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); if (!CPU_ISSET(vlapic->vcpuid, &dmask)) return; VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); vlapic_set_tmr(vlapic, vector, true); } + +#ifdef BHYVE_SNAPSHOT +static void +vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr) +{ + /* The implementation is similar to the one in the + * `vlapic_icrtmr_write_handler` function + */ + sbintime_t sbt; + struct bintime bt; + + VLAPIC_TIMER_LOCK(vlapic); + + bt = vlapic->timer_freq_bt; + bintime_mul(&bt, ccr); + + if (ccr != 0) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, &bt); + + sbt = bttosbt(bt); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } else { + /* even if the CCR was 0, periodic timers should be reset */ + if (vlapic_periodic_timer(vlapic)) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, + &vlapic->timer_period_bt); + sbt = bttosbt(vlapic->timer_period_bt); + + callout_stop(&vlapic->callout); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } + } + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +int +vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int i, ret; + struct vlapic *vlapic; + struct LAPIC *lapic; + uint32_t ccr; + + KASSERT(vm != NULL, ("%s: arg was NULL", __func__)); + + ret = 0; + + for (i = 0; i < VM_MAXCPU; i++) { + vlapic = vm_lapic(vm, i); + + /* snapshot the page first; timer period depends on icr_timer */ + lapic = vlapic->apic_page; + SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac, + meta, ret, done); + + /* + * Timer period is equal to 'icr_timer' ticks at a frequency of + * 'timer_freq_bt'. 
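/*
 * A userspace sketch of the arithmetic behind vlapic_reset_callout() above:
 * on restore, the time left on a one-shot timer is the saved current-count
 * register (CCR) multiplied by the tick period recorded in 'timer_freq_bt'.
 * The bus frequency and divider below are illustrative assumptions only,
 * not values taken from this patch.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t bus_hz = 100000000;	/* assumed 100 MHz APIC bus  */
	const uint32_t divisor = 16;		/* assumed DCR divide-by-16  */
	uint32_t ccr = 250000;			/* CCR saved in the snapshot */
	uint64_t tick_ns, remaining_ns;

	tick_ns = (1000000000ull * divisor) / bus_hz;	/* ns per timer tick */
	remaining_ns = (uint64_t)ccr * tick_ns;
	printf("re-arm callout to fire in %ju ns\n", (uintmax_t)remaining_ns);
	return (0);
}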
+ */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + } + + SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk, + sizeof(vlapic->isrvec_stk), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->boot_state, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last, + sizeof(vlapic->lvt_last), + meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) + ccr = vlapic_get_ccr(vlapic); + + SNAPSHOT_VAR_OR_LEAVE(ccr, meta, ret, done); + + if (meta->op == VM_SNAPSHOT_RESTORE) { + /* Reset the value of the 'timer_fire_bt' and the vlapic + * callout based on the value of the current count + * register saved when the VM snapshot was created + */ + vlapic_reset_callout(vlapic, ccr); + } + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index bd650efa8cc1..b87657c8bb51 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -1,112 +1,118 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VLAPIC_H_ #define _VLAPIC_H_ struct vm; +struct vm_snapshot_meta; enum x2apic_state; int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t data, bool *retu); int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t *data, bool *retu); /* * Returns 0 if there is no eligible vector that can be delivered to the * guest at this time and non-zero otherwise. * * If an eligible vector number is found and 'vecptr' is not NULL then it will * be stored in the location pointed to by 'vecptr'. * * Note that the vector does not automatically transition to the ISR as a * result of calling this function. */ int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); /* * Transition 'vector' from IRR to ISR. This function is called with the * vector returned by 'vlapic_pending_intr()' when the guest is able to * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that * block interrupt delivery). 
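/*
 * A self-contained model of the SNAPSHOT_*_OR_LEAVE pattern used in
 * vlapic_snapshot() above.  The types and names here are hypothetical
 * stand-ins, not the kernel's vm_snapshot_meta API: one macro either copies
 * a field out to the snapshot buffer or copies it back in, depending on the
 * operation, and bails out to the 'done' label on error.
 */
#include <stdio.h>
#include <string.h>

enum snap_op { SNAP_SAVE, SNAP_RESTORE };

struct snap_meta {
	enum snap_op	op;
	char		*buf;
	size_t		off;
	size_t		len;
};

static int
snap_field(void *field, size_t size, struct snap_meta *meta)
{
	if (meta->off + size > meta->len)
		return (-1);			/* buffer exhausted */
	if (meta->op == SNAP_SAVE)
		memcpy(meta->buf + meta->off, field, size);
	else
		memcpy(field, meta->buf + meta->off, size);
	meta->off += size;
	return (0);
}

#define SNAP_FIELD_OR_LEAVE(var, meta, ret, label)		\
	do {							\
		(ret) = snap_field(&(var), sizeof(var), (meta));\
		if ((ret) != 0)					\
			goto label;				\
	} while (0)

int
main(void)
{
	char storage[64];
	struct snap_meta meta = { SNAP_SAVE, storage, 0, sizeof(storage) };
	unsigned int esr_pending = 0x40;
	int isrvec_top = 2;
	int ret;

	SNAP_FIELD_OR_LEAVE(esr_pending, &meta, ret, done);
	SNAP_FIELD_OR_LEAVE(isrvec_top, &meta, ret, done);
	printf("saved %zu bytes\n", meta.off);
done:
	return (ret);
}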
*/ void vlapic_intr_accepted(struct vlapic *vlapic, int vector); /* * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. */ int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); /* * Post an interrupt to the vcpu running on 'hostcpu'. This will use a * hardware assist if available (e.g. Posted Interrupt) or fall back to * sending an 'ipinum' to interrupt the 'hostcpu'. */ void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); void vlapic_fire_cmci(struct vlapic *vlapic); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); void vlapic_sync_tpr(struct vlapic *vlapic); uint64_t vlapic_get_apicbase(struct vlapic *vlapic); int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); bool vlapic_enabled(struct vlapic *vlapic); void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec); /* Reset the trigger-mode bits for all vectors to be edge-triggered */ void vlapic_reset_tmr(struct vlapic *vlapic); /* * Set the trigger-mode bit associated with 'vector' to level-triggered if * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to * this 'vlapic'. */ void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, int delmode, int vector); void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); uint64_t vlapic_get_cr8(struct vlapic *vlapic); /* APIC write handlers */ void vlapic_id_write_handler(struct vlapic *vlapic); void vlapic_ldr_write_handler(struct vlapic *vlapic); void vlapic_dfr_write_handler(struct vlapic *vlapic); void vlapic_svr_write_handler(struct vlapic *vlapic); void vlapic_esr_write_handler(struct vlapic *vlapic); int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu); void vlapic_icrtmr_write_handler(struct vlapic *vlapic); void vlapic_dcr_write_handler(struct vlapic *vlapic); void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); + +#ifdef BHYVE_SNAPSHOT +int vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta); +#endif + #endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/io/vpmtmr.c b/sys/amd64/vmm/io/vpmtmr.c index 4df909777d88..f79e94f6d0fe 100644 --- a/sys/amd64/vmm/io/vpmtmr.c +++ b/sys/amd64/vmm/io/vpmtmr.c @@ -1,105 +1,121 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include +#include #include "vpmtmr.h" /* * The ACPI Power Management timer is a free-running 24- or 32-bit * timer with a frequency of 3.579545MHz * * This implementation will be 32-bits */ #define PMTMR_FREQ 3579545 /* 3.579545MHz */ struct vpmtmr { sbintime_t freq_sbt; sbintime_t baseuptime; uint32_t baseval; }; static MALLOC_DEFINE(M_VPMTMR, "vpmtmr", "bhyve virtual acpi timer"); struct vpmtmr * vpmtmr_init(struct vm *vm) { struct vpmtmr *vpmtmr; struct bintime bt; vpmtmr = malloc(sizeof(struct vpmtmr), M_VPMTMR, M_WAITOK | M_ZERO); vpmtmr->baseuptime = sbinuptime(); vpmtmr->baseval = 0; FREQ2BT(PMTMR_FREQ, &bt); vpmtmr->freq_sbt = bttosbt(bt); return (vpmtmr); } void vpmtmr_cleanup(struct vpmtmr *vpmtmr) { free(vpmtmr, M_VPMTMR); } int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vpmtmr *vpmtmr; sbintime_t now, delta; if (!in || bytes != 4) return (-1); vpmtmr = vm_pmtmr(vm); /* * No locking needed because 'baseuptime' and 'baseval' are * written only during initialization. */ now = sbinuptime(); delta = now - vpmtmr->baseuptime; KASSERT(delta >= 0, ("vpmtmr_handler: uptime went backwards: " "%#lx to %#lx", vpmtmr->baseuptime, now)); *val = vpmtmr->baseval + delta / vpmtmr->freq_sbt; return (0); } + +#ifdef BHYVE_SNAPSHOT +int +vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(vpmtmr->baseval, meta, ret, done); + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vpmtmr.h b/sys/amd64/vmm/io/vpmtmr.h index e6562da5c02e..a10c0b4e8309 100644 --- a/sys/amd64/vmm/io/vpmtmr.h +++ b/sys/amd64/vmm/io/vpmtmr.h @@ -1,44 +1,49 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
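/*
 * A userspace sketch of the ACPI PM timer read in vpmtmr_handler() above:
 * the counter free-runs at 3.579545 MHz, so the value returned is the base
 * count plus the number of whole periods elapsed since 'baseuptime'.  Here
 * elapsed time is taken in plain nanoseconds instead of sbintime_t.
 */
#include <stdint.h>
#include <stdio.h>

#define PMTMR_FREQ	3579545u		/* Hz */

static uint32_t
pmtmr_read(uint32_t baseval, uint64_t elapsed_ns)
{
	/* counter ticks elapsed = elapsed_ns * freq / 1e9 */
	return (baseval + (uint32_t)((elapsed_ns * PMTMR_FREQ) / 1000000000ull));
}

int
main(void)
{
	/* 2 ms after the base point the counter has advanced ~7159 ticks. */
	printf("pmtmr = %u\n", pmtmr_read(0, 2000000));
	return (0);
}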
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VPMTMR_H_ #define _VPMTMR_H_ #define IO_PMTMR 0x408 struct vpmtmr; +struct vm_snapshot_meta; struct vpmtmr *vpmtmr_init(struct vm *vm); void vpmtmr_cleanup(struct vpmtmr *pmtmr); int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); +#ifdef BHYVE_SNAPSHOT +int vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta); +#endif + #endif diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c index 954a78efb588..5d6968e3583e 100644 --- a/sys/amd64/vmm/io/vrtc.c +++ b/sys/amd64/vmm/io/vrtc.c @@ -1,1022 +1,1067 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include +#include #include #include "vmm_ktr.h" #include "vatpic.h" #include "vioapic.h" #include "vrtc.h" /* Register layout of the RTC */ struct rtcdev { uint8_t sec; uint8_t alarm_sec; uint8_t min; uint8_t alarm_min; uint8_t hour; uint8_t alarm_hour; uint8_t day_of_week; uint8_t day_of_month; uint8_t month; uint8_t year; uint8_t reg_a; uint8_t reg_b; uint8_t reg_c; uint8_t reg_d; uint8_t nvram[36]; uint8_t century; uint8_t nvram2[128 - 51]; } __packed; CTASSERT(sizeof(struct rtcdev) == 128); CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); struct vrtc { struct vm *vm; struct mtx mtx; struct callout callout; u_int addr; /* RTC register to read or write */ sbintime_t base_uptime; time_t base_rtctime; struct rtcdev rtcdev; }; #define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) #define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) #define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) /* * RTC time is considered "broken" if: * - RTC updates are halted by the guest * - RTC date/time fields have invalid values */ #define VRTC_BROKEN_TIME ((time_t)-1) #define RTC_IRQ 8 #define RTCSB_BIN 0x04 #define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) #define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) #define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) #define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) #define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) static void vrtc_callout_handler(void *arg); static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); static int rtc_flag_broken_time = 1; SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); static __inline bool divider_enabled(int reg_a) { /* * The RTC is counting only when dividers are not held in reset. */ return ((reg_a & 0x70) == 0x20); } static __inline bool update_enabled(struct vrtc *vrtc) { /* * RTC date/time can be updated only if: * - divider is not held in reset * - guest has not disabled updates * - the date/time fields have valid contents */ if (!divider_enabled(vrtc->rtcdev.reg_a)) return (false); if (rtc_halted(vrtc)) return (false); if (vrtc->base_rtctime == VRTC_BROKEN_TIME) return (false); return (true); } static time_t vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) { sbintime_t now, delta; time_t t, secs; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); t = vrtc->base_rtctime; *basetime = vrtc->base_uptime; if (update_enabled(vrtc)) { now = sbinuptime(); delta = now - vrtc->base_uptime; KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " "%#lx to %#lx", vrtc->base_uptime, now)); secs = delta / SBT_1S; t += secs; *basetime += secs * SBT_1S; } return (t); } static __inline uint8_t rtcset(struct rtcdev *rtc, int val) { KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", __func__, val)); return ((rtc->reg_b & RTCSB_BIN) ? 
val : bin2bcd_data[val]); } static void secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) { struct clocktime ct; struct timespec ts; struct rtcdev *rtc; int hour; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); if (rtctime < 0) { KASSERT(rtctime == VRTC_BROKEN_TIME, ("%s: invalid vrtc time %#lx", __func__, rtctime)); return; } /* * If the RTC is halted then the guest has "ownership" of the * date/time fields. Don't update the RTC date/time fields in * this case (unless forced). */ if (rtc_halted(vrtc) && !force_update) return; ts.tv_sec = rtctime; ts.tv_nsec = 0; clock_ts_to_ct(&ts, &ct); KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", ct.sec)); KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", ct.min)); KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", ct.hour)); KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", ct.dow)); KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", ct.day)); KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", ct.mon)); KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", ct.year)); rtc = &vrtc->rtcdev; rtc->sec = rtcset(rtc, ct.sec); rtc->min = rtcset(rtc, ct.min); if (rtc->reg_b & RTCSB_24HR) { hour = ct.hour; } else { /* * Convert to the 12-hour format. */ switch (ct.hour) { case 0: /* 12 AM */ case 12: /* 12 PM */ hour = 12; break; default: /* * The remaining 'ct.hour' values are interpreted as: * [1 - 11] -> 1 - 11 AM * [13 - 23] -> 1 - 11 PM */ hour = ct.hour % 12; break; } } rtc->hour = rtcset(rtc, hour); if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) rtc->hour |= 0x80; /* set MSB to indicate PM */ rtc->day_of_week = rtcset(rtc, ct.dow + 1); rtc->day_of_month = rtcset(rtc, ct.day); rtc->month = rtcset(rtc, ct.mon); rtc->year = rtcset(rtc, ct.year % 100); rtc->century = rtcset(rtc, ct.year / 100); } static int rtcget(struct rtcdev *rtc, int val, int *retval) { uint8_t upper, lower; if (rtc->reg_b & RTCSB_BIN) { *retval = val; return (0); } lower = val & 0xf; upper = (val >> 4) & 0xf; if (lower > 9 || upper > 9) return (-1); *retval = upper * 10 + lower; return (0); } static time_t rtc_to_secs(struct vrtc *vrtc) { struct clocktime ct; struct timespec ts; struct rtcdev *rtc; struct vm *vm; int century, error, hour, pm, year; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); vm = vrtc->vm; rtc = &vrtc->rtcdev; bzero(&ct, sizeof(struct clocktime)); error = rtcget(rtc, rtc->sec, &ct.sec); if (error || ct.sec < 0 || ct.sec > 59) { VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); goto fail; } error = rtcget(rtc, rtc->min, &ct.min); if (error || ct.min < 0 || ct.min > 59) { VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); goto fail; } pm = 0; hour = rtc->hour; if ((rtc->reg_b & RTCSB_24HR) == 0) { if (hour & 0x80) { hour &= ~0x80; pm = 1; } } error = rtcget(rtc, hour, &ct.hour); if ((rtc->reg_b & RTCSB_24HR) == 0) { if (ct.hour >= 1 && ct.hour <= 12) { /* * Convert from 12-hour format to internal 24-hour * representation as follows: * * 12-hour format ct.hour * 12 AM 0 * 1 - 11 AM 1 - 11 * 12 PM 12 * 1 - 11 PM 13 - 23 */ if (ct.hour == 12) ct.hour = 0; if (pm) ct.hour += 12; } else { VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", rtc->hour, ct.hour); goto fail; } } if (error || ct.hour < 0 || ct.hour > 23) { VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); goto fail; } /* * Ignore 'rtc->dow' because some guests like Linux don't bother * setting it at all while others like 
OpenBSD/i386 set it incorrectly. * * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. */ ct.dow = -1; error = rtcget(rtc, rtc->day_of_month, &ct.day); if (error || ct.day < 1 || ct.day > 31) { VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, ct.day); goto fail; } error = rtcget(rtc, rtc->month, &ct.mon); if (error || ct.mon < 1 || ct.mon > 12) { VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); goto fail; } error = rtcget(rtc, rtc->year, &year); if (error || year < 0 || year > 99) { VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); goto fail; } error = rtcget(rtc, rtc->century, ¢ury); ct.year = century * 100 + year; if (error || ct.year < POSIX_BASE_YEAR) { VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, ct.year); goto fail; } error = clock_ct_to_ts(&ct, &ts); if (error || ts.tv_sec < 0) { VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", ct.year, ct.mon, ct.day); VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", ct.hour, ct.min, ct.sec); goto fail; } return (ts.tv_sec); /* success */ fail: /* * Stop updating the RTC if the date/time fields programmed by * the guest are invalid. */ VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); return (VRTC_BROKEN_TIME); } static int vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) { struct rtcdev *rtc; sbintime_t oldbase; time_t oldtime; uint8_t alarm_sec, alarm_min, alarm_hour; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; alarm_sec = rtc->alarm_sec; alarm_min = rtc->alarm_min; alarm_hour = rtc->alarm_hour; oldtime = vrtc->base_rtctime; VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", oldtime, newtime); oldbase = vrtc->base_uptime; VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", oldbase, newbase); vrtc->base_uptime = newbase; if (newtime == oldtime) return (0); /* * If 'newtime' indicates that RTC updates are disabled then just * record that and return. There is no need to do alarm interrupt * processing in this case. */ if (newtime == VRTC_BROKEN_TIME) { vrtc->base_rtctime = VRTC_BROKEN_TIME; return (0); } /* * Return an error if RTC updates are halted by the guest. */ if (rtc_halted(vrtc)) { VM_CTR0(vrtc->vm, "RTC update halted by guest"); return (EBUSY); } do { /* * If the alarm interrupt is enabled and 'oldtime' is valid * then visit all the seconds between 'oldtime' and 'newtime' * to check for the alarm condition. * * Otherwise move the RTC time forward directly to 'newtime'. */ if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) vrtc->base_rtctime++; else vrtc->base_rtctime = newtime; if (aintr_enabled(vrtc)) { /* * Update the RTC date/time fields before checking * if the alarm conditions are satisfied. 
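/*
 * A stand-alone illustration of the BCD handling done by rtcset()/rtcget()
 * above: when RTCSB_BIN is clear the RTC date/time bytes are binary-coded
 * decimal, so 59 seconds is stored as 0x59, and a nibble above 9 marks the
 * value as invalid.
 */
#include <stdio.h>
#include <stdint.h>

static uint8_t
bcd_encode(int val)			/* assumes 0 <= val <= 99 */
{
	return ((uint8_t)(((val / 10) << 4) | (val % 10)));
}

static int
bcd_decode(uint8_t bcd, int *out)
{
	uint8_t lo = bcd & 0xf, hi = (bcd >> 4) & 0xf;

	if (lo > 9 || hi > 9)
		return (-1);		/* not valid BCD */
	*out = hi * 10 + lo;
	return (0);
}

int
main(void)
{
	int v;

	printf("59 -> %#x\n", bcd_encode(59));		/* 0x59 */
	if (bcd_decode(0x23, &v) == 0)
		printf("0x23 -> %d\n", v);		/* 23 */
	return (0);
}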
*/ secs_to_rtc(vrtc->base_rtctime, vrtc, 0); if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && (alarm_min >= 0xC0 || alarm_min == rtc->min) && (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); } } } while (vrtc->base_rtctime != newtime); if (uintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); return (0); } static sbintime_t vrtc_freq(struct vrtc *vrtc) { int ratesel; static sbintime_t pf[16] = { 0, SBT_1S / 256, SBT_1S / 128, SBT_1S / 8192, SBT_1S / 4096, SBT_1S / 2048, SBT_1S / 1024, SBT_1S / 512, SBT_1S / 256, SBT_1S / 128, SBT_1S / 64, SBT_1S / 32, SBT_1S / 16, SBT_1S / 8, SBT_1S / 4, SBT_1S / 2, }; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); /* * If both periodic and alarm interrupts are enabled then use the * periodic frequency to drive the callout. The minimum periodic * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so * piggyback the alarm on top of it. The same argument applies to * the update interrupt. */ if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { ratesel = vrtc->rtcdev.reg_a & 0xf; return (pf[ratesel]); } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { return (SBT_1S); } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { return (SBT_1S); } else { return (0); } } static void vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) { KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); if (freqsbt == 0) { if (callout_active(&vrtc->callout)) { VM_CTR0(vrtc->vm, "RTC callout stopped"); callout_stop(&vrtc->callout); } return; } VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, vrtc, 0); } static void vrtc_callout_handler(void *arg) { struct vrtc *vrtc = arg; sbintime_t freqsbt, basetime; time_t rtctime; int error; VM_CTR0(vrtc->vm, "vrtc callout fired"); VRTC_LOCK(vrtc); if (callout_pending(&vrtc->callout)) /* callout was reset */ goto done; if (!callout_active(&vrtc->callout)) /* callout was stopped */ goto done; callout_deactivate(&vrtc->callout); KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, ("gratuitous vrtc callout")); if (pintr_enabled(vrtc)) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { rtctime = vrtc_curtime(vrtc, &basetime); error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("%s: vrtc_time_update error %d", __func__, error)); } freqsbt = vrtc_freq(vrtc); KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); vrtc_callout_reset(vrtc, freqsbt); done: VRTC_UNLOCK(vrtc); } static __inline void vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) { int active; active = callout_active(&vrtc->callout) ? 1 : 0; KASSERT((freq == 0 && !active) || (freq != 0 && active), ("vrtc callout %s with frequency %#lx", active ? 
"active" : "inactive", freq)); } static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; int oldirqf, newirqf; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; oldirqf = rtc->reg_c & RTCIR_INT; if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { newirqf = RTCIR_INT; } else { newirqf = 0; } oldval = rtc->reg_c; rtc->reg_c = newirqf | newval; changed = oldval ^ rtc->reg_c; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", oldval, rtc->reg_c); } if (!oldirqf && newirqf) { VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); vatpic_pulse_irq(vrtc->vm, RTC_IRQ); vioapic_pulse_irq(vrtc->vm, RTC_IRQ); } else if (oldirqf && !newirqf) { VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); } } static int vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) { struct rtcdev *rtc; sbintime_t oldfreq, newfreq, basetime; time_t curtime, rtctime; int error; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); rtc = &vrtc->rtcdev; oldval = rtc->reg_b; oldfreq = vrtc_freq(vrtc); rtc->reg_b = newval; changed = oldval ^ newval; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", oldval, newval); } if (changed & RTCSB_HALT) { if ((newval & RTCSB_HALT) == 0) { rtctime = rtc_to_secs(vrtc); basetime = sbinuptime(); if (rtctime == VRTC_BROKEN_TIME) { if (rtc_flag_broken_time) return (-1); } } else { curtime = vrtc_curtime(vrtc, &basetime); KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " "between vrtc basetime (%#lx) and curtime (%#lx)", __func__, vrtc->base_rtctime, curtime)); /* * Force a refresh of the RTC date/time fields so * they reflect the time right before the guest set * the HALT bit. */ secs_to_rtc(curtime, vrtc, 1); /* * Updates are halted so mark 'base_rtctime' to denote * that the RTC date/time is in flux. */ rtctime = VRTC_BROKEN_TIME; rtc->reg_b &= ~RTCSB_UINTR; } error = vrtc_time_update(vrtc, rtctime, basetime); KASSERT(error == 0, ("vrtc_time_update error %d", error)); } /* * Side effect of changes to the interrupt enable bits. */ if (changed & RTCSB_ALL_INTRS) vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); /* * Change the callout frequency if it has changed. */ newfreq = vrtc_freq(vrtc); if (newfreq != oldfreq) vrtc_callout_reset(vrtc, newfreq); else vrtc_callout_check(vrtc, newfreq); /* * The side effect of bits that control the RTC date/time format * is handled lazily when those fields are actually read. */ return (0); } static void vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) { sbintime_t oldfreq, newfreq; uint8_t oldval, changed; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); newval &= ~RTCSA_TUP; oldval = vrtc->rtcdev.reg_a; oldfreq = vrtc_freq(vrtc); if (divider_enabled(oldval) && !divider_enabled(newval)) { VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", vrtc->base_rtctime, vrtc->base_uptime); } else if (!divider_enabled(oldval) && divider_enabled(newval)) { /* * If the dividers are coming out of reset then update * 'base_uptime' before this happens. This is done to * maintain the illusion that the RTC date/time was frozen * while the dividers were disabled. 
*/ vrtc->base_uptime = sbinuptime(); VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", vrtc->base_rtctime, vrtc->base_uptime); } else { /* NOTHING */ } vrtc->rtcdev.reg_a = newval; changed = oldval ^ newval; if (changed) { VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", oldval, newval); } /* * Side effect of changes to rate select and divider enable bits. */ newfreq = vrtc_freq(vrtc); if (newfreq != oldfreq) vrtc_callout_reset(vrtc, newfreq); else vrtc_callout_check(vrtc, newfreq); } int vrtc_set_time(struct vm *vm, time_t secs) { struct vrtc *vrtc; int error; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); error = vrtc_time_update(vrtc, secs, sbinuptime()); VRTC_UNLOCK(vrtc); if (error) { VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, secs); } else { VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); } return (error); } time_t vrtc_get_time(struct vm *vm) { struct vrtc *vrtc; sbintime_t basetime; time_t t; vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); t = vrtc_curtime(vrtc, &basetime); VRTC_UNLOCK(vrtc); return (t); } int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) { struct vrtc *vrtc; uint8_t *ptr; vrtc = vm_rtc(vm); /* * Don't allow writes to RTC control registers or the date/time fields. */ if (offset < offsetof(struct rtcdev, nvram[0]) || offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", offset); return (EINVAL); } VRTC_LOCK(vrtc); ptr = (uint8_t *)(&vrtc->rtcdev); ptr[offset] = value; VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); VRTC_UNLOCK(vrtc); return (0); } int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) { struct vrtc *vrtc; sbintime_t basetime; time_t curtime; uint8_t *ptr; /* * Allow all offsets in the RTC to be read. */ if (offset < 0 || offset >= sizeof(struct rtcdev)) return (EINVAL); vrtc = vm_rtc(vm); VRTC_LOCK(vrtc); /* * Update RTC date/time fields if necessary. */ if (offset < 10 || offset == RTC_CENTURY) { curtime = vrtc_curtime(vrtc, &basetime); secs_to_rtc(curtime, vrtc, 0); } ptr = (uint8_t *)(&vrtc->rtcdev); *retval = ptr[offset]; VRTC_UNLOCK(vrtc); return (0); } int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vrtc *vrtc; vrtc = vm_rtc(vm); if (bytes != 1) return (-1); if (in) { *val = 0xff; return (0); } VRTC_LOCK(vrtc); vrtc->addr = *val & 0x7f; VRTC_UNLOCK(vrtc); return (0); } int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val) { struct vrtc *vrtc; struct rtcdev *rtc; sbintime_t basetime; time_t curtime; int error, offset; vrtc = vm_rtc(vm); rtc = &vrtc->rtcdev; if (bytes != 1) return (-1); VRTC_LOCK(vrtc); offset = vrtc->addr; if (offset >= sizeof(struct rtcdev)) { VRTC_UNLOCK(vrtc); return (-1); } error = 0; curtime = vrtc_curtime(vrtc, &basetime); vrtc_time_update(vrtc, curtime, basetime); /* * Update RTC date/time fields if necessary. * * This is not just for reads of the RTC. The side-effect of writing * the century byte requires other RTC date/time fields (e.g. sec) * to be updated here. */ if (offset < 10 || offset == RTC_CENTURY) secs_to_rtc(curtime, vrtc, 0); if (in) { if (offset == 12) { /* * XXX * reg_c interrupt flags are updated only if the * corresponding interrupt enable bit in reg_b is set. 
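/*
 * A userspace model of the index/data pair served by vrtc_addr_handler()
 * and vrtc_data_handler() above: the guest writes a register index to the
 * traditional PC RTC index port (the top bit is the NMI-disable latch and
 * is masked off, as in the handler) and then reads or writes that register
 * through the data port.  The port numbers are the conventional 0x70/0x71
 * and are not spelled out in this hunk.
 */
#include <stdio.h>
#include <stdint.h>

static uint8_t rtc_regs[128];	/* stand-in for struct rtcdev */
static uint8_t rtc_addr;	/* currently selected register */

static void
rtc_port70_write(uint8_t val)
{
	rtc_addr = val & 0x7f;		/* drop the NMI-disable bit */
}

static uint8_t
rtc_port71_read(void)
{
	return (rtc_regs[rtc_addr]);
}

static void
rtc_port71_write(uint8_t val)
{
	rtc_regs[rtc_addr] = val;
}

int
main(void)
{
	rtc_port70_write(0x8e);		/* select offset 0x0e, NMI bit set */
	rtc_port71_write(0x55);
	rtc_port70_write(0x0e);
	printf("reg 0x0e = %#x\n", rtc_port71_read());
	return (0);
}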
*/ *val = vrtc->rtcdev.reg_c; vrtc_set_reg_c(vrtc, 0); } else { *val = *((uint8_t *)rtc + offset); } VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", *val, offset); } else { switch (offset) { case 10: VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); vrtc_set_reg_a(vrtc, *val); break; case 11: VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); error = vrtc_set_reg_b(vrtc, *val); break; case 12: VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", *val); break; case 13: VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", *val); break; case 0: /* * High order bit of 'seconds' is readonly. */ *val &= 0x7f; /* FALLTHRU */ default: VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", offset, *val); *((uint8_t *)rtc + offset) = *val; break; } /* * XXX some guests (e.g. OpenBSD) write the century byte * outside of RTCSB_HALT so re-calculate the RTC date/time. */ if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { curtime = rtc_to_secs(vrtc); error = vrtc_time_update(vrtc, curtime, sbinuptime()); KASSERT(!error, ("vrtc_time_update error %d", error)); if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) error = -1; } } VRTC_UNLOCK(vrtc); return (error); } void vrtc_reset(struct vrtc *vrtc) { struct rtcdev *rtc; VRTC_LOCK(vrtc); rtc = &vrtc->rtcdev; vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); vrtc_set_reg_c(vrtc, 0); KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); VRTC_UNLOCK(vrtc); } struct vrtc * vrtc_init(struct vm *vm) { struct vrtc *vrtc; struct rtcdev *rtc; time_t curtime; vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); vrtc->vm = vm; mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); callout_init(&vrtc->callout, 1); /* Allow dividers to keep time but disable everything else */ rtc = &vrtc->rtcdev; rtc->reg_a = 0x20; rtc->reg_b = RTCSB_24HR; rtc->reg_c = 0; rtc->reg_d = RTCSD_PWR; /* Reset the index register to a safe value. */ vrtc->addr = RTC_STATUSD; /* * Initialize RTC time to 00:00:00 Jan 1, 1970. 
*/ curtime = 0; VRTC_LOCK(vrtc); vrtc->base_rtctime = VRTC_BROKEN_TIME; vrtc_time_update(vrtc, curtime, sbinuptime()); secs_to_rtc(curtime, vrtc, 0); VRTC_UNLOCK(vrtc); return (vrtc); } void vrtc_cleanup(struct vrtc *vrtc) { callout_drain(&vrtc->callout); free(vrtc, M_VRTC); } + +#ifdef BHYVE_SNAPSHOT +int +vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta) +{ + int ret; + + VRTC_LOCK(vrtc); + + SNAPSHOT_VAR_OR_LEAVE(vrtc->addr, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + vrtc->base_uptime = sbinuptime(); + SNAPSHOT_VAR_OR_LEAVE(vrtc->base_rtctime, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.min, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_min, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.hour, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_hour, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_week, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_month, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.month, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.year, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_a, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_b, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_c, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_d, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram, sizeof(vrtc->rtcdev.nvram), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.century, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram2, sizeof(vrtc->rtcdev.nvram2), + meta, ret, done); + + vrtc_callout_reset(vrtc, vrtc_freq(vrtc)); + + VRTC_UNLOCK(vrtc); + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vrtc.h b/sys/amd64/vmm/io/vrtc.h index 836561c7b93b..791fb7db3e26 100644 --- a/sys/amd64/vmm/io/vrtc.h +++ b/sys/amd64/vmm/io/vrtc.h @@ -1,52 +1,57 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Neel Natu (neel@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
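/*
 * A minimal model of the restore-side adjustment in vrtc_snapshot() above:
 * the guest-visible RTC seconds ('base_rtctime') come from the snapshot,
 * while the host-side reference point ('base_uptime') is re-anchored to the
 * current uptime, so time the VM spent suspended does not leak into the
 * guest clock.  time() stands in here for the kernel's sbinuptime().
 */
#include <stdio.h>
#include <time.h>

struct vrtc_model {
	time_t base_rtctime;	/* guest RTC seconds */
	time_t base_uptime;	/* host reference for elapsed time */
};

static time_t
vrtc_model_curtime(const struct vrtc_model *v, time_t now)
{
	return (v->base_rtctime + (now - v->base_uptime));
}

int
main(void)
{
	struct vrtc_model v;
	time_t now = time(NULL);

	/* Pretend the snapshot recorded the guest clock at 1000000000. */
	v.base_rtctime = 1000000000;
	v.base_uptime = now;		/* re-anchored at restore time */

	printf("guest RTC right after restore: %lld\n",
	    (long long)vrtc_model_curtime(&v, now));
	printf("guest RTC 5s later:            %lld\n",
	    (long long)vrtc_model_curtime(&v, now + 5));
	return (0);
}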
* * $FreeBSD$ */ #ifndef _VRTC_H_ #define _VRTC_H_ #include struct vrtc; +struct vm_snapshot_meta; struct vrtc *vrtc_init(struct vm *vm); void vrtc_cleanup(struct vrtc *vrtc); void vrtc_reset(struct vrtc *vrtc); time_t vrtc_get_time(struct vm *vm); int vrtc_set_time(struct vm *vm, time_t secs); int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); +#ifdef BHYVE_SNAPSHOT +int vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta); +#endif + #endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 0f6b803098d6..b2f5fa62efe5 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1,2732 +1,2924 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include +#include +#include +#include +#include +#include #include #include #include #include #include #include #include #include #include +#include #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_host.h" #include "vmm_mem.h" #include "vmm_util.h" #include "vatpic.h" #include "vatpit.h" #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" #include "vpmtmr.h" #include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; /* * Initialization: * (a) allocated when vcpu is created * (i) initialized when vcpu is created and when it is reinitialized * (o) initialized the first time the vcpu is created * (x) initialized before use */ struct vcpu { struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ enum vcpu_state state; /* (o) vcpu state */ int hostcpu; /* (o) vcpu's host cpu */ int reqidle; /* (i) request vcpu to idle */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ uint64_t exitintinfo; /* (i) events pending at VM exit */ int nmi_pending; /* (i) NMI pending */ int extint_pending; /* (i) INTR pending */ int exception_pending; /* (i) exception pending */ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ + uint64_t tsc_offset; /* (o) TSC offsetting */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { size_t len; bool sysmem; struct vm_object *object; }; #define VM_MAX_MEMSEGS 3 struct mem_map { vm_paddr_t gpa; size_t len; vm_ooffset_t segoff; int segid; int prot; int flags; }; #define VM_MAX_MEMMAPS 8 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ void *iommu; /* (x) iommu-specific data */ struct vhpet *vhpet; /* (i) virtual HPET */ struct vioapic *vioapic; /* (i) virtual ioapic */ struct vatpic *vatpic; /* (i) virtual atpic */ struct vatpit *vatpit; /* (i) virtual atpit */ struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ struct vrtc *vrtc; /* (o) virtual RTC */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ int suspend; /* (i) stop VM execution */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ void *rendezvous_arg; /* (x) rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest 
address space */ struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ struct vmspace *vmspace; /* (o) guest's address space */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ /* The following describe the vm cpu topology */ uint16_t sockets; /* (o) num of sockets */ uint16_t cores; /* (o) num of cores/socket */ uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max pluggable cpus */ }; static int vmm_initialized; static struct vmm_ops *ops; #define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) #define VMRUN(vmi, vcpu, rip, pmap, evinfo) \ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMSPACE_ALLOC(min, max) \ (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) #define VMSPACE_FREE(vmspace) \ (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) #define VMGETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMSETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMGETCAP(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) #define VLAPIC_INIT(vmi, vcpu) \ (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) #define VLAPIC_CLEANUP(vmi, vlapic) \ (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) +#ifdef BHYVE_SNAPSHOT +#define VM_SNAPSHOT_VMI(vmi, meta) \ + (ops != NULL ? (*ops->vmsnapshot)(vmi, meta) : ENXIO) +#define VM_SNAPSHOT_VMCX(vmi, meta, vcpuid) \ + (ops != NULL ? (*ops->vmcx_snapshot)(vmi, meta, vcpuid) : ENXIO) +#define VM_RESTORE_TSC(vmi, vcpuid, offset) \ + (ops != NULL ? (*ops->vm_restore_tsc)(vmi, vcpuid, offset) : ENXIO) +#endif #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() SDT_PROVIDER_DEFINE(vmm); static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); /* * Halt the guest if all vcpus are executing a HLT instruction with * interrupts disabled. 
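/*
 * A self-contained model of the dispatch pattern behind the new
 * VM_SNAPSHOT_VMI/VM_SNAPSHOT_VMCX/VM_RESTORE_TSC macros above: each call
 * is routed through a backend ops table when one is registered and fails
 * with ENXIO otherwise.  The ops structure and backend here are
 * hypothetical stand-ins, not the kernel's vmm_ops.
 */
#include <errno.h>
#include <stdio.h>
#include <stddef.h>

struct demo_ops {
	int (*vmsnapshot)(void *cookie, void *meta);
};

static struct demo_ops *ops;	/* NULL until a backend registers */

#define DEMO_SNAPSHOT_VMI(vmi, meta) \
	(ops != NULL ? (*ops->vmsnapshot)((vmi), (meta)) : ENXIO)

static int
demo_backend_snapshot(void *cookie, void *meta)
{
	(void)cookie;
	(void)meta;
	printf("backend snapshotted CPU-specific state\n");
	return (0);
}

static struct demo_ops demo_backend = { demo_backend_snapshot };

int
main(void)
{
	printf("no backend: %d (ENXIO=%d)\n",
	    DEMO_SNAPSHOT_VMI(NULL, NULL), ENXIO);
	ops = &demo_backend;
	printf("with backend: %d\n", DEMO_SNAPSHOT_VMI(NULL, NULL));
	return (0);
}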
*/ static int halt_detection_enabled = 1; SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, &halt_detection_enabled, 0, "Halt VM if all vcpus execute HLT with interrupts disabled"); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); static int trace_guest_exceptions; SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) { switch (state) { case VCPU_IDLE: return ("idle"); case VCPU_FROZEN: return ("frozen"); case VCPU_RUNNING: return ("running"); case VCPU_SLEEPING: return ("sleeping"); default: return ("unknown"); } } #endif static void vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); } } static void vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_init: invalid vcpu %d", vcpu_id)); vcpu = &vm->vcpu[vcpu_id]; if (create) { KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " "initialized", vcpu_id)); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); + vcpu->tsc_offset = 0; } vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } int vcpu_trace_exceptions(struct vm *vm, int vcpuid) { return (trace_guest_exceptions); } struct vm_exit * vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; if (cpuid < 0 || cpuid >= vm->maxcpus) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; return (&vcpu->exitinfo); } static void vmm_resume(void) { VMM_RESUME(); } static int vmm_init(void) { int error; vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) return (error); if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_svm()) ops = &vmm_ops_amd; else return (ENXIO); vmm_resume_p = vmm_resume; return (VMM_INIT(vmm_ipinum)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: vmmdev_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); if (vmm_ipinum != IPI_AST) lapic_ipi_free(vmm_ipinum); error = VMM_CLEANUP(); /* * Something bad happened - prevent new * VMs from being created */ if (error) vmm_initialized = 0; } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). 
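/*
 * A small illustration of per-vcpu TSC offsetting, the quantity the new
 * 'tsc_offset' field initialized in vcpu_init() tracks: the guest observes
 * host_tsc + offset, so after a restore the offset can be chosen so the
 * guest TSC resumes from its saved value instead of inheriting the host's
 * counter.  This is a sketch of the general technique under that
 * assumption; the fake reader below is a plain function, not rdtsc.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t fake_host_tsc = 900000000000ull;	/* host counter "now" */

static uint64_t
read_host_tsc(void)
{
	return (fake_host_tsc);
}

int
main(void)
{
	uint64_t guest_tsc_saved = 123456789ull;	/* from the snapshot */
	int64_t tsc_offset;

	/* Choose the offset so host_tsc + offset equals the saved value. */
	tsc_offset = (int64_t)(guest_tsc_saved - read_host_tsc());
	printf("guest TSC after restore: %ju\n",
	    (uintmax_t)(read_host_tsc() + (uint64_t)tsc_offset));
	return (0);
}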
*/ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); vm->iommu = NULL; vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); vm->vatpic = vatpic_init(vm); vm->vatpit = vatpit_init(vm); vm->vpmtmr = vpmtmr_init(vm); if (create) vm->vrtc = vrtc_init(vm); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); for (i = 0; i < vm->maxcpus; i++) vcpu_init(vm, i, create); } /* * The default CPU topology is a single thread per package. */ u_int cores_per_package = 1; u_int threads_per_core = 1; int vm_create(const char *name, struct vm **retvm) { struct vm *vm; struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); if (vmspace == NULL) return (ENOMEM); vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm->sockets = 1; vm->cores = cores_per_package; /* XXX backwards compatibility */ vm->threads = threads_per_core; /* XXX backwards compatibility */ vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { if (maxcpus != 0) return (EINVAL); /* XXX remove when supported */ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); /* XXX need to check sockets * cores * threads == vCPU, how? */ vm->sockets = sockets; vm->cores = cores; vm->threads = threads; vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { struct mem_map *mm; int i; ppt_unassign_all(vm); if (vm->iommu != NULL) iommu_destroy_domain(vm->iommu); if (destroy) vrtc_cleanup(vm->vrtc); else vrtc_reset(vm->vrtc); vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); for (i = 0; i < vm->maxcpus; i++) vcpu_cleanup(vm, i, destroy); VMCLEANUP(vm->cookie); /* * System memory is removed from the guest address space only when * the VM is destroyed. This is because the mapping remains the same * across VM reset. * * Device memory can be relocated by the guest (e.g. using PCI BARs) * so those mappings are removed on a VM reset. */ for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (destroy || !sysmem_mapping(vm, mm)) vm_free_memmap(vm, i); } if (destroy) { for (i = 0; i < VM_MAX_MEMSEGS; i++) vm_free_memseg(vm, i); VMSPACE_FREE(vm->vmspace); vm->vmspace = NULL; } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. 
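 * Otherwise EBUSY is returned and the reinitialization is not attempted.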
*/ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { vm_object_t obj; if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); else return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { vmm_mmio_free(vm->vmspace, gpa, len); return (0); } /* * Return 'true' if 'gpa' is allocated in the guest address space. * * This function is called in the context of a running vcpu which acts as * an implicit lock on 'vm->mem_maps[]'. */ bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) { struct mem_map *mm; int i; #ifdef INVARIANTS int hostcpu, state; state = vcpu_get_state(vm, vcpuid, &hostcpu); KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); #endif for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) return (true); /* 'gpa' is sysmem or devmem */ } if (ppt_is_mmio(vm, gpa)) return (true); /* 'gpa' is pci passthru mmio */ return (false); } int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { struct mem_seg *seg; vm_object_t obj; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); if (len == 0 || (len & PAGE_MASK)) return (EINVAL); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { if (seg->len == len && seg->sysmem == sysmem) return (EEXIST); else return (EINVAL); } obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); if (obj == NULL) return (ENOMEM); seg->len = len; seg->object = obj; seg->sysmem = sysmem; return (0); } int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, vm_object_t *objptr) { struct mem_seg *seg; if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[ident]; if (len) *len = seg->len; if (sysmem) *sysmem = seg->sysmem; if (objptr) *objptr = seg->object; return (0); } void vm_free_memseg(struct vm *vm, int ident) { struct mem_seg *seg; KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, ("%s: invalid memseg ident %d", __func__, ident)); seg = &vm->mem_segs[ident]; if (seg->object != NULL) { vm_object_deallocate(seg->object); bzero(seg, sizeof(struct mem_seg)); } } int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, size_t len, int prot, int flags) { struct mem_seg *seg; struct mem_map *m, *map; vm_ooffset_t last; int i, error; if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) return (EINVAL); if (flags & ~VM_MEMMAP_F_WIRED) return (EINVAL); if (segid < 0 || segid >= VM_MAX_MEMSEGS) return (EINVAL); seg = &vm->mem_segs[segid]; if (seg->object == NULL) return (EINVAL); last = first + len; if (first < 0 || first >= last || last > seg->len) return (EINVAL); if ((gpa | first | last) & PAGE_MASK) return (EINVAL); map = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { m = &vm->mem_maps[i]; if (m->len == 0) { map = m; break; } } if (map == NULL) return (ENOSPC); error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, len, 0, VMFS_NO_SPACE, prot, prot, 0); if (error != KERN_SUCCESS) return (EFAULT); vm_object_reference(seg->object); if (flags & VM_MEMMAP_F_WIRED) { error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); return (error 
== KERN_RESOURCE_SHORTAGE ? ENOMEM : EFAULT); } } map->gpa = gpa; map->len = len; map->segoff = first; map->segid = segid; map->prot = prot; map->flags = flags; return (0); } int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct mem_map *mm, *mmnext; int i; mmnext = NULL; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (mm->len == 0 || mm->gpa < *gpa) continue; if (mmnext == NULL || mm->gpa < mmnext->gpa) mmnext = mm; } if (mmnext != NULL) { *gpa = mmnext->gpa; if (segid) *segid = mmnext->segid; if (segoff) *segoff = mmnext->segoff; if (len) *len = mmnext->len; if (prot) *prot = mmnext->prot; if (flags) *flags = mmnext->flags; return (0); } else { return (ENOENT); } } static void vm_free_memmap(struct vm *vm, int ident) { struct mem_map *mm; int error; mm = &vm->mem_maps[ident]; if (mm->len) { error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, mm->gpa + mm->len); KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", __func__, error)); bzero(mm, sizeof(struct mem_map)); } } static __inline bool sysmem_mapping(struct vm *vm, struct mem_map *mm) { if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) return (true); else return (false); } vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm) { struct mem_map *mm; vm_paddr_t maxaddr; int i; maxaddr = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (sysmem_mapping(vm, mm)) { if (maxaddr < mm->gpa + mm->len) maxaddr = mm->gpa + mm->len; } } return (maxaddr); } static void vm_iommu_modify(struct vm *vm, bool map) { int i, sz; vm_paddr_t gpa, hpa; struct mem_map *mm; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (!sysmem_mapping(vm, mm)) continue; if (map) { KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, ("iommu map found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) continue; mm->flags |= VM_MEMMAP_F_IOMMU; } else { if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) continue; mm->flags &= ~VM_MEMMAP_F_IOMMU; KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, ("iommu unmap found invalid memmap %#lx/%#lx/%#x", mm->gpa, mm->len, mm->flags)); } gpa = mm->gpa; while (gpa < mm->gpa + mm->len) { vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); vm_gpa_release(cookie); hpa = DMAP_TO_PHYS((uintptr_t)vp); if (map) { iommu_create_mapping(vm->iommu, gpa, hpa, sz); iommu_remove_mapping(host_domain, hpa, sz); } else { iommu_remove_mapping(vm->iommu, gpa, sz); iommu_create_mapping(host_domain, hpa, hpa, sz); } gpa += PAGE_SIZE; } } /* * Invalidate the cached translations associated with the domain * from which pages were removed. 
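 * When mapping, pages were removed from the host domain; when unmapping,
 * they were removed from the guest's IOMMU domain.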
*/ if (map) iommu_invalidate_tlb(host_domain); else iommu_invalidate_tlb(vm->iommu); } #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false) #define vm_iommu_map(vm) vm_iommu_modify((vm), true) int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; error = ppt_unassign_device(vm, bus, slot, func); if (error) return (error); if (ppt_assigned_devices(vm) == 0) vm_iommu_unmap(vm); return (0); } int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { int error; vm_paddr_t maxaddr; /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); maxaddr = vmm_sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); vm_iommu_map(vm); } error = ppt_assign_device(vm, bus, slot, func); return (error); } void * vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { int i, count, pageoff; struct mem_map *mm; vm_page_t m; #ifdef INVARIANTS /* * All vcpus are frozen by ioctls that modify the memory map * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is * guaranteed if at least one vcpu is in the VCPU_FROZEN state. */ int state; KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d", __func__, vcpuid)); for (i = 0; i < vm->maxcpus; i++) { if (vcpuid != -1 && vcpuid != i) continue; state = vcpu_get_state(vm, i, NULL); KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", __func__, state)); } #endif pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); count = 0; for (i = 0; i < VM_MAX_MEMMAPS; i++) { mm = &vm->mem_maps[i]; if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && gpa < mm->gpa + mm->len) { count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); break; } } if (count == 1) { *cookie = m; return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); } else { *cookie = NULL; return (NULL); } } void vm_gpa_release(void *cookie) { vm_page_t m = cookie; vm_page_unwire(m, PQ_ACTIVE); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMGETREG(vm->cookie, vcpu, reg, retval)); } int vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) { struct vcpu *vcpu; int error; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); error = VMSETREG(vm->cookie, vcpuid, reg, val); if (error || reg != VM_REG_GUEST_RIP) return (error); /* Set 'nextrip' to match the value of %rip */ VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); vcpu = &vm->vcpu[vcpuid]; vcpu->nextrip = val; return (0); } static bool is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (true); default: return (false); } } static bool is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (true); default: return (false); } } int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMGETDESC(vm->cookie, vcpu, reg, desc)); } 
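/*
 * As with vm_get_seg_desc() above, only segment registers and descriptor
 * table registers are accepted before the update is handed to the backend.
 */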
int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); /* restore guest FPU state */ fpu_stop_emulating(); fpurestore(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) load_xcr(0, vcpu->guest_xcr0); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ fpu_start_emulating(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); /* save guest XCR0 and restore host XCR0 */ if (rcr4() & CR4_XSAVE) { vcpu->guest_xcr0 = rxcr(0); load_xcr(0, vmm_get_host_xcr0()); } /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); fpu_start_emulating(); } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { struct vcpu *vcpu; int error; vcpu = &vm->vcpu[vcpuid]; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. */ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu->reqidle = 1; vcpu_notify_event_locked(vcpu, false); VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ do { \ if (vcpuid >= 0) \ VCPU_CTR0(vm, vcpuid, fmt); \ else \ VM_CTR0(vm, fmt); \ } while (0) static int vm_handle_rendezvous(struct vm *vm, int vcpuid) { struct thread *td; int error; KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus), 
("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); error = 0; td = curthread; mtx_lock(&vm->rendezvous_mtx); while (vm->rendezvous_func != NULL) { /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); if (vcpuid != -1 && CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); CPU_SET(vcpuid, &vm->rendezvous_done_cpus); } if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); vm->rendezvous_func = NULL; wakeup(&vm->rendezvous_func); break; } RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, "vmrndv", hz); if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { mtx_unlock(&vm->rendezvous_mtx); error = thread_check_susp(td, true); if (error != 0) return (error); mtx_lock(&vm->rendezvous_mtx); } } mtx_unlock(&vm->rendezvous_mtx); return (0); } /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { struct vcpu *vcpu; const char *wmesg; struct thread *td; int error, t, vcpu_halted, vm_halted; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu = &vm->vcpu[vcpuid]; vcpu_halted = 0; vm_halted = 0; error = 0; td = curthread; vcpu_lock(vcpu); while (1) { /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. Also check for * software events that would cause this vcpu to wakeup. * * These interrupts/events could have happened after the * vcpu returned from VMRUN() and before it acquired the * vcpu lock above. */ if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) break; if (vm_nmi_pending(vm, vcpuid)) break; if (!intr_disabled) { if (vm_extint_pending(vm, vcpuid) || vlapic_pending_intr(vcpu->vlapic, NULL)) { break; } } /* Don't go to sleep if the vcpu thread needs to yield */ if (vcpu_should_yield(vm, vcpuid)) break; if (vcpu_debugged(vm, vcpuid)) break; /* * Some Linux guests implement "halt" by having all vcpus * execute HLT with interrupts disabled. 'halted_cpus' keeps * track of the vcpus that have entered this state. When all * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { wmesg = "vmhalt"; VCPU_CTR0(vm, vcpuid, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); } if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { vm_halted = 1; break; } } else { wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. 
*/ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); if (error != 0) return (error); vcpu_lock(vcpu); } } if (vcpu_halted) CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); vcpu_unlock(vcpu); if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); return (0); } static int vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) { int rv, ftype; struct vm_map *map; struct vcpu *vcpu; struct vm_exit *vme; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); ftype = vme->u.paging.fault_type; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, ("vm_handle_paging: invalid fault_type %d", ftype)); if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), vme->u.paging.gpa, ftype); if (rv == 0) { VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", ftype == VM_PROT_READ ? "accessed" : "dirty", vme->u.paging.gpa); goto done; } } map = &vm->vmspace->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " "ftype = %d", rv, vme->u.paging.gpa, ftype); if (rv != KERN_SUCCESS) return (EFAULT); done: return (0); } static int vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) { struct vie *vie; struct vcpu *vcpu; struct vm_exit *vme; uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; int cs_d, error, fault; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; cpu_mode = paging->cpu_mode; VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ error = fault = 0; } if (error || fault) return (error); if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", vme->rip + cs_base); *retu = true; /* dump instruction bytes in userspace */ return (0); } /* * Update 'nextrip' based on the length of the emulated instruction. 
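 * Both the exit's inst_length and the vcpu's nextrip advance by the number
 * of bytes that were decoded.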
*/ vme->inst_length = vie->num_processed; vcpu->nextrip += vie->num_processed; VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " "decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { *retu = true; return (0); } error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, mread, mwrite, retu); return (error); } static int vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) { int error, i; struct vcpu *vcpu; struct thread *td; error = 0; vcpu = &vm->vcpu[vcpuid]; td = curthread; CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); break; } if (vm->rendezvous_func == NULL) { VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } else { VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); vcpu_unlock(vcpu); error = vm_handle_rendezvous(vm, vcpuid); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm, i, false); } } *retu = true; return (error); } static int vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) { struct vcpu *vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); vcpu->reqidle = 0; vcpu_unlock(vcpu); *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. 
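 * Each notified vcpu will eventually exit to userspace with
 * VM_EXITCODE_SUSPENDED (see vm_exit_suspended() below).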
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm, i, false); } return (0); } void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_DEBUG; } void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); } void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_REQIDLE; vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); } void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); } int vm_run(struct vm *vm, struct vm_run *vmrun) { struct vm_eventinfo evinfo; int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; pmap_t pmap; vcpuid = vmrun->cpuid; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; evinfo.rptr = &vm->rendezvous_func; evinfo.sptr = &vm->suspend; evinfo.iptr = &vcpu->reqidle; restart: critical_enter(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("vm_run: absurd pm_active")); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); restore_guest_fpustate(vcpu); vcpu_require_state(vm, vcpuid, VCPU_RUNNING); error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); save_guest_fpustate(vcpu); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); critical_exit(); if (error == 0) { retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { case VM_EXITCODE_REQIDLE: error = vm_handle_reqidle(vm, vcpuid, &retu); break; case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vm, vcpuid, &retu); break; case VM_EXITCODE_IOAPIC_EOI: vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector); break; case VM_EXITCODE_RENDEZVOUS: error = vm_handle_rendezvous(vm, vcpuid); break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vm, vcpuid, &retu); break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; case VM_EXITCODE_INOUT: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vm, vcpuid, vme, &retu); break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: case VM_EXITCODE_VMINSN: vm_inject_ud(vm, 
vcpuid); break; default: retu = true; /* handled in userland */ break; } } if (error == 0 && retu == false) goto restart; VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); /* copy the exit information */ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); return (error); } int vm_restart_instruction(void *arg, int vcpuid) { struct vm *vm; struct vcpu *vcpu; enum vcpu_state state; uint64_t rip; int error; vm = arg; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; state = vcpu_get_state(vm, vcpuid, NULL); if (state == VCPU_RUNNING) { /* * When a vcpu is "running" the next instruction is determined * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. * Thus setting 'inst_length' to zero will cause the current * instruction to be restarted. */ vcpu->exitinfo.inst_length = 0; VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " "setting inst_length to zero", vcpu->exitinfo.rip); } else if (state == VCPU_FROZEN) { /* * When a vcpu is "frozen" it is outside the critical section * around VMRUN() and 'nextrip' points to the next instruction. * Thus instruction restart is achieved by setting 'nextrip' * to the vcpu's %rip. */ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); KASSERT(!error, ("%s: error %d getting rip", __func__, error)); VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " "nextrip from %#lx to %#lx", vcpu->nextrip, rip); vcpu->nextrip = rip; } else { panic("%s: invalid state %d", __func__, state); } return (0); } int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) { struct vcpu *vcpu; int type, vector; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; vector = info & 0xff; if (type == VM_INTINFO_NMI && vector != IDT_NMI) return (EINVAL); if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) return (EINVAL); if (info & VM_INTINFO_RSVD) return (EINVAL); } else { info = 0; } VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); vcpu->exitintinfo = info; return (0); } enum exc_class { EXC_BENIGN, EXC_CONTRIBUTORY, EXC_PAGEFAULT }; #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ static enum exc_class exception_class(uint64_t info) { int type, vector; KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); type = info & VM_INTINFO_TYPE; vector = info & 0xff; /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ switch (type) { case VM_INTINFO_HWINTR: case VM_INTINFO_SWINTR: case VM_INTINFO_NMI: return (EXC_BENIGN); default: /* * Hardware exception. * * SVM and VT-x use identical type values to represent NMI, * hardware interrupt and software interrupt. * * SVM uses type '3' for all exceptions. VT-x uses type '3' * for exceptions except #BP and #OF. #BP and #OF use a type * value of '5' or '6'. Therefore we don't check for explicit * values of 'type' to classify 'intinfo' into a hardware * exception. 
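 * The vector number alone is used for the classification below.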
*/ break; } switch (vector) { case IDT_PF: case IDT_VE: return (EXC_PAGEFAULT); case IDT_DE: case IDT_TS: case IDT_NP: case IDT_SS: case IDT_GP: return (EXC_CONTRIBUTORY); default: return (EXC_BENIGN); } } static int nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, uint64_t *retinfo) { enum exc_class exc1, exc2; int type1, vector1; KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); /* * If an exception occurs while attempting to call the double-fault * handler the processor enters shutdown mode (aka triple fault). */ type1 = info1 & VM_INTINFO_TYPE; vector1 = info1 & 0xff; if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); *retinfo = 0; return (0); } /* * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 */ exc1 = exception_class(info1); exc2 = exception_class(info2); if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { /* Convert nested fault into a double fault. */ *retinfo = IDT_DF; *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; *retinfo |= VM_INTINFO_DEL_ERRCODE; } else { /* Handle exceptions serially */ *retinfo = info2; } return (1); } static uint64_t vcpu_exception_intinfo(struct vcpu *vcpu) { uint64_t info = 0; if (vcpu->exception_pending) { info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); } int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) { struct vcpu *vcpu; uint64_t info1, info2; int valid; KASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); vcpu = &vm->vcpu[vcpuid]; info1 = vcpu->exitintinfo; vcpu->exitintinfo = 0; info2 = 0; if (vcpu->exception_pending) { info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { valid = nested_fault(vm, vcpuid, info1, info2, retinfo); } else if (info1 & VM_INTINFO_VALID) { *retinfo = info1; valid = 1; } else if (info2 & VM_INTINFO_VALID) { *retinfo = info2; valid = 1; } else { valid = 0; } if (valid) { VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " "retinfo(%#lx)", __func__, info1, info2, *retinfo); } return (valid); } int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; *info1 = vcpu->exitintinfo; *info2 = vcpu_exception_intinfo(vcpu); return (0); } int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vcpu *vcpu; uint64_t regval; int error; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (vector < 0 || vector >= 32) return (EINVAL); /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. 
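 * Double faults are instead synthesized by nested_fault() above when
 * exceptions of the appropriate classes collide.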
*/ if (vector == IDT_DF) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } if (errcode_valid) { /* * Exceptions don't deliver an error code in real mode. */ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); if (!(regval & CR0_PE)) errcode_valid = 0; } /* * From section 26.6.1 "Interruptibility State" in Intel SDM: * * Event blocking by "STI" or "MOV SS" is cleared after guest executes * one instruction or incurs an exception. */ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", __func__, error)); if (restart_instruction) vm_restart_instruction(vm, vcpuid); vcpu->exception_pending = 1; vcpu->exc_vector = vector; vcpu->exc_errcode = errcode; vcpu->exc_errcode_valid = errcode_valid; VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); return (0); } void vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, int errcode) { struct vm *vm; int error, restart_instruction; vm = vmarg; restart_instruction = 1; error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); } void vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) { struct vm *vm; int error; vm = vmarg; VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", error_code, cr2); error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); } static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); int vm_inject_nmi(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_nmi_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->nmi_pending); } void vm_nmi_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->nmi_pending == 0) panic("vm_nmi_clear: inconsistent nmi_pending state"); vcpu->nmi_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); } static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); int vm_inject_extint(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; vcpu_notify_event(vm, vcpuid, false); return (0); } int vm_extint_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; return (vcpu->extint_pending); } void vm_extint_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (vcpu->extint_pending == 0) panic("vm_extint_clear: inconsistent extint_pending state"); vcpu->extint_pending = 0; vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); } int vm_get_capability(struct vm 
*vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMGETCAP(vm->cookie, vcpu, type, retval)); } int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMSETCAP(vm->cookie, vcpu, type, val)); } struct vlapic * vm_lapic(struct vm *vm, int cpu) { return (vm->vcpu[cpu].vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { return (vm->vioapic); } struct vhpet * vm_hpet(struct vm *vm) { return (vm->vhpet); } bool vmm_is_pptdev(int bus, int slot, int func) { int b, f, i, n, s; char *val, *cp, *cp2; bool found; /* * XXX * The length of an environment variable is limited to 128 bytes which * puts an upper limit on the number of passthru devices that may be * specified using a single environment variable. * * Work around this by scanning multiple environment variable * names instead of a single one - yuck! */ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = false; for (i = 0; names[i] != NULL && !found; i++) { cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = true; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); } return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } int vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { int error; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) { struct vcpu *vcpu; enum vcpu_state state; if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_activate_cpu(struct vm *vm, int vcpuid) { if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->active_cpus)) return (EBUSY); VCPU_CTR0(vm, vcpuid, "activated"); CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); return (0); } int vm_suspend_cpu(struct vm *vm, int vcpuid) { int i; if (vcpuid < -1 || vcpuid >= vm->maxcpus) return (EINVAL); if (vcpuid == -1) { vm->debug_cpus = vm->active_cpus; for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm, i, false); } } else { if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); vcpu_notify_event(vm, vcpuid, false); } return (0); } int vm_resume_cpu(struct vm *vm, int vcpuid) { if (vcpuid < -1 || vcpuid >= vm->maxcpus) return (EINVAL); if (vcpuid == -1) { CPU_ZERO(&vm->debug_cpus); } else { if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) return (EINVAL); CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vm *vm, int vcpuid) { return (CPU_ISSET(vcpuid, &vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) 
{ return (vm->suspended_cpus); } void * vcpu_stats(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid].stats); } int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; return (0); } int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (state >= X2APIC_STATE_LAST) return (EINVAL); vm->vcpu[vcpuid].x2apic_state = state; vlapic_set_x2apic_state(vm, vcpuid, state); return (0); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { if (lapic_intr) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { ipi_cpu(hostcpu, vmm_ipinum); } } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) { struct vcpu *vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } struct vmspace * vm_get_vmspace(struct vm *vm) { return (vm->vmspace); } int vm_apicid2vcpuid(struct vm *vm, int apicid) { /* * XXX apic id is assumed to be numerically identical to vcpu id */ return (apicid); } int vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, vm_rendezvous_func_t func, void *arg) { int error, i; /* * Enforce that this function is called without any locks */ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus), ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); restart: mtx_lock(&vm->rendezvous_mtx); if (vm->rendezvous_func != NULL) { /* * If a rendezvous is already in progress then we need to * call the rendezvous handler in case this 'vcpuid' is one * of the targets of the rendezvous. */ RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); mtx_unlock(&vm->rendezvous_mtx); error = vm_handle_rendezvous(vm, vcpuid); if (error != 0) return (error); goto restart; } KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " "rendezvous is still in progress")); RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); vm->rendezvous_req_cpus = dest; CPU_ZERO(&vm->rendezvous_done_cpus); vm->rendezvous_arg = arg; vm->rendezvous_func = func; mtx_unlock(&vm->rendezvous_mtx); /* * Wake up any sleeping vcpus and trigger a VM-exit in any running * vcpus so they handle the rendezvous as soon as possible. 
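 * The initiating vcpu then participates in the rendezvous itself via the
 * vm_handle_rendezvous() call below.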
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &dest)) vcpu_notify_event(vm, i, false); } return (vm_handle_rendezvous(vm, vcpuid)); } struct vatpic * vm_atpic(struct vm *vm) { return (vm->vatpic); } struct vatpit * vm_atpit(struct vm *vm) { return (vm->vatpit); } struct vpmtmr * vm_pmtmr(struct vm *vm) { return (vm->vpmtmr); } struct vrtc * vm_rtc(struct vm *vm) { return (vm->vrtc); } enum vm_reg_name vm_segment_name(int seg) { static enum vm_reg_name seg_names[] = { VM_REG_GUEST_ES, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; KASSERT(seg >= 0 && seg < nitems(seg_names), ("%s: invalid segment encoding %d", __func__, seg)); return (seg_names[seg]); } void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo) { int idx; for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; void *hva, *cookie; uint64_t gpa; bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); nused = 0; remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); copyinfo[nused].gpa = gpa; copyinfo[nused].len = n; remaining -= n; gla += n; nused++; } for (idx = 0; idx < nused; idx++) { hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa, copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; copyinfo[idx].cookie = cookie; } if (idx != nused) { vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); return (EFAULT); } else { *fault = 0; return (0); } } void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len) { char *dst; int idx; dst = kaddr; idx = 0; while (len > 0) { bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); len -= copyinfo[idx].len; dst += copyinfo[idx].len; idx++; } } void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) { const char *src; int idx; src = kaddr; idx = 0; while (len > 0) { bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); len -= copyinfo[idx].len; src += copyinfo[idx].len; idx++; } } /* * Return the amount of in-use and wired memory for the VM. 
Since * these are global stats, only return the values with for vCPU 0 */ VMM_STAT_DECLARE(VMM_MEM_RESIDENT); VMM_STAT_DECLARE(VMM_MEM_WIRED); static void vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * vmspace_resident_count(vm->vmspace)); } } static void vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) { if (vcpu == 0) { vmm_stat_set(vm, vcpu, VMM_MEM_WIRED, PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); } } VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); + +#ifdef BHYVE_SNAPSHOT +static int +vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct vcpu *vcpu; + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu = &vm->vcpu[i]; + + SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); + /* XXX we're cheating here, since the value of tsc_offset as + * saved here is actually the value of the guest's TSC value. + * + * It will be turned turned back into an actual offset when the + * TSC restore function is called + */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->tsc_offset, meta, ret, done); + } + +done: + return (ret); +} + +static int +vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + uint64_t now; + + ret = 0; + now = rdtsc(); + + if (meta->op == VM_SNAPSHOT_SAVE) { + /* XXX make tsc_offset take the value TSC proper as seen by the + * guest + */ + for (i = 0; i < VM_MAXCPU; i++) + vm->vcpu[i].tsc_offset += now; + } + + ret = vm_snapshot_vcpus(vm, meta); + if (ret != 0) { + printf("%s: failed to copy vm data to user buffer", __func__); + goto done; + } + + if (meta->op == VM_SNAPSHOT_SAVE) { + /* XXX turn tsc_offset back into an offset; actual value is only + * required for restore; using it otherwise would be wrong + */ + for (i = 0; i < VM_MAXCPU; i++) + vm->vcpu[i].tsc_offset -= now; + } + +done: + return (ret); +} + +static int +vm_snapshot_vmcx(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int i, error; + + error = 0; + + for (i = 0; i < VM_MAXCPU; i++) { + error = VM_SNAPSHOT_VMCX(vm->cookie, meta, i); + if (error != 0) { + printf("%s: failed to snapshot vmcs/vmcb data for " + "vCPU: %d; error: %d\n", __func__, i, error); + goto done; + } + } + +done: + return (error); +} + +/* + * Save kernel-side structures to user-space for snapshotting. 
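+ * The 'dev_req' field of the snapshot metadata selects which device or
+ * structure is copied.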
+ */ +int +vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret = 0; + + switch (meta->dev_req) { + case STRUCT_VMX: + ret = VM_SNAPSHOT_VMI(vm->cookie, meta); + break; + case STRUCT_VMCX: + ret = vm_snapshot_vmcx(vm, meta); + break; + case STRUCT_VM: + ret = vm_snapshot_vm(vm, meta); + break; + case STRUCT_VIOAPIC: + ret = vioapic_snapshot(vm_ioapic(vm), meta); + break; + case STRUCT_VLAPIC: + ret = vlapic_snapshot(vm, meta); + break; + case STRUCT_VHPET: + ret = vhpet_snapshot(vm_hpet(vm), meta); + break; + case STRUCT_VATPIC: + ret = vatpic_snapshot(vm_atpic(vm), meta); + break; + case STRUCT_VATPIT: + ret = vatpit_snapshot(vm_atpit(vm), meta); + break; + case STRUCT_VPMTMR: + ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); + break; + case STRUCT_VRTC: + ret = vrtc_snapshot(vm_rtc(vm), meta); + break; + default: + printf("%s: failed to find the requested type %#x\n", + __func__, meta->dev_req); + ret = (EINVAL); + } + return (ret); +} + +int +vm_set_tsc_offset(struct vm *vm, int vcpuid, uint64_t offset) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vcpu->tsc_offset = offset; + + return (0); +} + +int +vm_restore_time(struct vm *vm) +{ + int error, i; + uint64_t now; + struct vcpu *vcpu; + + now = rdtsc(); + + error = vhpet_restore_time(vm_hpet(vm)); + if (error) + return (error); + + for (i = 0; i < nitems(vm->vcpu); i++) { + vcpu = &vm->vcpu[i]; + + error = VM_RESTORE_TSC(vm->cookie, i, vcpu->tsc_offset - now); + if (error) + return (error); + } + + return (0); +} +#endif diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 9818f300efec..e47b7081b795 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -1,1167 +1,1182 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include +#include +#include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" #include "io/ppt.h" #include "io/vatpic.h" #include "io/vioapic.h" #include "io/vhpet.h" #include "io/vrtc.h" struct devmem_softc { int segid; char *name; struct cdev *cdev; struct vmmdev_softc *sc; SLIST_ENTRY(devmem_softc) link; }; struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; SLIST_ENTRY(vmmdev_softc) link; SLIST_HEAD(, devmem_softc) devmem; int flags; }; #define VSC_LINKED 0x01 static SLIST_HEAD(, vmmdev_softc) head; static unsigned pr_allow_flag; static struct mtx vmmdev_mtx; static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); static int vmm_priv_check(struct ucred *ucred); static int devmem_create_cdev(const char *vmname, int id, char *devmem); static void devmem_destroy(void *arg); static int vmm_priv_check(struct ucred *ucred) { if (jailed(ucred) && !(ucred->cr_prison->pr_allow & pr_allow_flag)) return (EPERM); return (0); } static int vcpu_lock_one(struct vmmdev_softc *sc, int vcpu) { int error; if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm)) return (EINVAL); error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); return (error); } static void vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu) { enum vcpu_state state; state = vcpu_get_state(sc->vm, vcpu, NULL); if (state != VCPU_FROZEN) { panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm), vcpu, state); } vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); } static int vcpu_lock_all(struct vmmdev_softc *sc) { int error, vcpu; uint16_t maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (vcpu = 0; vcpu < maxcpus; vcpu++) { error = vcpu_lock_one(sc, vcpu); if (error) break; } if (error) { while (--vcpu >= 0) vcpu_unlock_one(sc, vcpu); } return (error); } static void vcpu_unlock_all(struct vmmdev_softc *sc) { int vcpu; uint16_t maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (vcpu = 0; vcpu < maxcpus; vcpu++) vcpu_unlock_one(sc, vcpu); } static struct vmmdev_softc * vmmdev_lookup(const char *name) { struct vmmdev_softc *sc; #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa, maxaddr; void *hpa, *cookie; struct vmmdev_softc *sc; uint16_t lastcpu; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); /* * Get a read lock on the guest memory map by freezing any vcpu. */ lastcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, lastcpu); if (error) return (error); prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); maxaddr = vmm_sysmem_maxaddr(sc->vm); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. 
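	 * Reads of such ranges below the system memory maxaddr therefore return
	 * zeroes (from the zero_region); other accesses fail with EFAULT.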
* * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ && gpa < maxaddr) error = uiomove(__DECONST(void *, zero_region), c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } vcpu_unlock_one(sc, lastcpu); return (error); } CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1); static int get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) { struct devmem_softc *dsc; int error; bool sysmem; error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); if (error || mseg->len == 0) return (error); if (!sysmem) { SLIST_FOREACH(dsc, &sc->devmem, link) { if (dsc->segid == mseg->segid) break; } KASSERT(dsc != NULL, ("%s: devmem segment %d not found", __func__, mseg->segid)); error = copystr(dsc->name, mseg->name, sizeof(mseg->name), NULL); } else { bzero(mseg->name, sizeof(mseg->name)); } return (error); } static int alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) { char *name; int error; bool sysmem; error = 0; name = NULL; sysmem = true; /* * The allocation is lengthened by 1 to hold a terminating NUL. It'll * by stripped off when devfs processes the full string. */ if (VM_MEMSEG_NAME(mseg)) { sysmem = false; name = malloc(sizeof(mseg->name), M_VMMDEV, M_WAITOK); error = copystr(mseg->name, name, sizeof(mseg->name), NULL); if (error) goto done; } error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); if (error) goto done; if (VM_MEMSEG_NAME(mseg)) { error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); if (error) vm_free_memseg(sc->vm, mseg->segid); else name = NULL; /* freed when 'cdev' is destroyed */ } done: free(name, M_VMMDEV); return (error); } static int vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_get_register(vm, vcpu, regnum[i], ®val[i]); if (error) break; } return (error); } static int vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_set_register(vm, vcpu, regnum[i], regval[i]); if (error) break; } return (error); } static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error, vcpu, state_changed, size; cpuset_t *cpuset; struct vmmdev_softc *sc; struct vm_register *vmreg; struct vm_seg_desc *vmsegdesc; struct vm_register_set *vmregset; struct vm_run *vmrun; struct vm_exception *vmexc; struct vm_lapic_irq *vmirq; struct vm_lapic_msi *vmmsi; struct vm_ioapic_irq *ioapic_irq; struct vm_isa_irq *isa_irq; struct vm_isa_irq_trigger *isa_irq_trigger; struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; struct vm_nmi *vmnmi; struct vm_stats *vmstats; struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; struct vm_gpa_pte *gpapte; struct vm_suspend *vmsuspend; struct vm_gla2gpa *gg; struct vm_activate_cpu *vac; struct vm_cpuset *vm_cpuset; struct vm_intinfo *vmii; struct vm_rtc_time *rtctime; struct vm_rtc_data *rtcdata; struct vm_memmap *mm; struct vm_cpu_topology *topology; uint64_t *regvals; int *regnums; +#ifdef BHYVE_SNAPSHOT + struct vm_snapshot_meta *snapshot_meta; +#endif error = vmm_priv_check(curthread->td_ucred); if (error) return 
(error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); vcpu = -1; state_changed = 0; /* * Some VMM ioctls can operate only on vcpus that are not running. */ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: case VM_GET_REGISTER_SET: case VM_SET_REGISTER_SET: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: case VM_GLA2GPA_NOFAULT: case VM_ACTIVATE_CPU: case VM_SET_INTINFO: case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: /* * XXX fragile, handle with care * Assumes that the first field of the ioctl data is the vcpu. */ vcpu = *(int *)data; error = vcpu_lock_one(sc, vcpu); if (error) goto done; state_changed = 1; break; case VM_MAP_PPTDEV_MMIO: case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: case VM_ALLOC_MEMSEG: case VM_MMAP_MEMSEG: case VM_REINIT: /* * ioctls that operate on the entire virtual machine must * prevent all vcpus from running. */ error = vcpu_lock_all(sc); if (error) goto done; state_changed = 2; break; case VM_GET_MEMSEG: case VM_MMAP_GETNEXT: /* * Lock a vcpu to make sure that the memory map cannot be * modified while it is being inspected. */ vcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, vcpu); if (error) goto done; state_changed = 1; break; default: break; } switch(cmd) { case VM_RUN: vmrun = (struct vm_run *)data; error = vm_run(sc->vm, vmrun); break; case VM_SUSPEND: vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; case VM_REINIT: error = vm_reinit(sc->vm); break; case VM_STAT_DESC: { statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } case VM_STATS: { CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(sc->vm, vmstats->cpuid, &vmstats->num_entries, vmstats->statbuf); break; } case VM_PPTDEV_MSI: pptmsi = (struct vm_pptdev_msi *)data; error = ppt_setup_msi(sc->vm, pptmsi->vcpu, pptmsi->bus, pptmsi->slot, pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec); break; case VM_PPTDEV_MSIX: pptmsix = (struct vm_pptdev_msix *)data; error = ppt_setup_msix(sc->vm, pptmsix->vcpu, pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg, pptmsix->vector_control); break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_UNBIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_INJECT_EXCEPTION: vmexc = (struct vm_exception *)data; error = vm_inject_exception(sc->vm, vmexc->cpuid, vmexc->vector, vmexc->error_code_valid, vmexc->error_code, vmexc->restart_instruction); break; case VM_INJECT_NMI: vmnmi = (struct vm_nmi *)data; error = vm_inject_nmi(sc->vm, vmnmi->cpuid); break; case VM_LAPIC_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); break; case VM_LAPIC_LOCAL_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_set_local_intr(sc->vm, vmirq->cpuid, vmirq->vector); break; case VM_LAPIC_MSI: vmmsi = (struct 
vm_lapic_msi *)data; error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); break; case VM_IOAPIC_ASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_DEASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PULSE_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PINCOUNT: *(int *)data = vioapic_pincount(sc->vm); break; case VM_ISA_ASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_assert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_DEASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_deassert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_PULSE_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_SET_IRQ_TRIGGER: isa_irq_trigger = (struct vm_isa_irq_trigger *)data; error = vatpic_set_irq_trigger(sc->vm, isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); break; case VM_MMAP_GETNEXT: mm = (struct vm_memmap *)data; error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; case VM_MMAP_MEMSEG: mm = (struct vm_memmap *)data; error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, mm->len, mm->prot, mm->flags); break; case VM_ALLOC_MEMSEG: error = alloc_memseg(sc, (struct vm_memseg *)data); break; case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data); break; case VM_GET_REGISTER: vmreg = (struct vm_register *)data; error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, &vmreg->regval); break; case VM_SET_REGISTER: vmreg = (struct vm_register *)data; error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, vmreg->regval); break; case VM_SET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = vm_get_register_set(sc->vm, vmregset->cpuid, vmregset->count, regnums, regvals); if (error == 0) error = copyout(regvals, vmregset->regvals, sizeof(regvals[0]) * vmregset->count); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_SET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = copyin(vmregset->regvals, regvals, 
sizeof(regvals[0]) * vmregset->count); if (error == 0) error = vm_set_register_set(sc->vm, vmregset->cpuid, vmregset->count, regnums, regvals); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_GET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_get_capability(sc->vm, vmcap->cpuid, vmcap->captype, &vmcap->capval); break; case VM_SET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_set_capability(sc->vm, vmcap->cpuid, vmcap->captype, vmcap->capval); break; case VM_SET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_set_x2apic_state(sc->vm, x2apic->cpuid, x2apic->state); break; case VM_GET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_get_x2apic_state(sc->vm, x2apic->cpuid, &x2apic->state); break; case VM_GET_GPA_PMAP: gpapte = (struct vm_gpa_pte *)data; pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), gpapte->gpa, gpapte->pte, &gpapte->ptenum); error = 0; break; case VM_GET_HPET_CAPABILITIES: error = vhpet_getcap((struct vm_hpet_cap *)data); break; case VM_GLA2GPA: { CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; } case VM_GLA2GPA_NOFAULT: gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; case VM_ACTIVATE_CPU: vac = (struct vm_activate_cpu *)data; error = vm_activate_cpu(sc->vm, vac->vcpuid); break; case VM_GET_CPUS: error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else if (vm_cpuset->which == VM_DEBUG_CPUS) *cpuset = vm_debug_cpus(sc->vm); else error = EINVAL; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; case VM_SUSPEND_CPU: vac = (struct vm_activate_cpu *)data; error = vm_suspend_cpu(sc->vm, vac->vcpuid); break; case VM_RESUME_CPU: vac = (struct vm_activate_cpu *)data; error = vm_resume_cpu(sc->vm, vac->vcpuid); break; case VM_SET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1); break; case VM_GET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1, &vmii->info2); break; case VM_RTC_WRITE: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_write(sc->vm, rtcdata->offset, rtcdata->value); break; case VM_RTC_READ: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_read(sc->vm, rtcdata->offset, &rtcdata->value); break; case VM_RTC_SETTIME: rtctime = (struct vm_rtc_time *)data; error = vrtc_set_time(sc->vm, rtctime->secs); break; case VM_RTC_GETTIME: error = 0; rtctime = (struct vm_rtc_time *)data; rtctime->secs = vrtc_get_time(sc->vm); break; case VM_RESTART_INSTRUCTION: error = vm_restart_instruction(sc->vm, vcpu); break; case VM_SET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; error = vm_set_topology(sc->vm, topology->sockets, topology->cores, topology->threads, 
topology->maxcpus); break; case VM_GET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; vm_get_topology(sc->vm, &topology->sockets, &topology->cores, &topology->threads, &topology->maxcpus); error = 0; break; +#ifdef BHYVE_SNAPSHOT + case VM_SNAPSHOT_REQ: + snapshot_meta = (struct vm_snapshot_meta *)data; + error = vm_snapshot_req(sc->vm, snapshot_meta); + break; + case VM_RESTORE_TIME: + error = vm_restore_time(sc->vm); + break; +#endif default: error = ENOTTY; break; } if (state_changed == 1) vcpu_unlock_one(sc, vcpu); else if (state_changed == 2) vcpu_unlock_all(sc); done: /* * Make sure that no handler returns a kernel-internal * error value to userspace. */ KASSERT(error == ERESTART || error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, struct vm_object **objp, int nprot) { struct vmmdev_softc *sc; vm_paddr_t gpa; size_t len; vm_ooffset_t segoff, first, last; int error, found, segid; uint16_t lastcpu; bool sysmem; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); first = *offset; last = first + mapsize; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); sc = vmmdev_lookup2(cdev); if (sc == NULL) { /* virtual machine is in the process of being created */ return (EINVAL); } /* * Get a read lock on the guest memory map by freezing any vcpu. */ lastcpu = vm_get_maxcpus(sc->vm) - 1; error = vcpu_lock_one(sc, lastcpu); if (error) return (error); gpa = 0; found = 0; while (!found) { error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, NULL, NULL); if (error) break; if (first >= gpa && last <= gpa + len) found = 1; else gpa += len; } if (found) { error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); KASSERT(error == 0 && *objp != NULL, ("%s: invalid memory segment %d", __func__, segid)); if (sysmem) { vm_object_reference(*objp); *offset = segoff + (first - gpa); } else { error = EINVAL; } } vcpu_unlock_one(sc, lastcpu); return (error); } static void vmmdev_destroy(void *arg) { struct vmmdev_softc *sc = arg; struct devmem_softc *dsc; int error; error = vcpu_lock_all(sc); KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); SLIST_REMOVE_HEAD(&sc->devmem, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } if (sc->cdev != NULL) destroy_dev(sc->cdev); if (sc->vm != NULL) vm_destroy(sc->vm); if ((sc->flags & VSC_LINKED) != 0) { mtx_lock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); mtx_unlock(&vmmdev_mtx); } free(sc, M_VMMDEV); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL || sc->cdev == NULL) { mtx_unlock(&vmmdev_mtx); error = EINVAL; goto out; } /* * The 'cdev' will be destroyed asynchronously when 'si_threadcount' * goes down to 0 so we should not do it again in the callback. * * Setting 'sc->cdev' to NULL is also used to indicate that the VM * is scheduled for destruction. 
*/ cdev = sc->cdev; sc->cdev = NULL; mtx_unlock(&vmmdev_mtx); /* * Schedule all cdevs to be destroyed: * * - any new operations on the 'cdev' will return an error (ENXIO). * * - when the 'si_threadcount' dwindles down to zero the 'cdev' will * be destroyed and the callback will be invoked in a taskqueue * context. * * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ SLIST_FOREACH(dsc, &sc->devmem, link) { KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc); } destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); error = 0; out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", NULL); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { struct vm *vm; struct cdev *cdev; struct vmmdev_softc *sc, *sc2; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); mtx_unlock(&vmmdev_mtx); if (sc != NULL) { error = EEXIST; goto out; } error = vm_create(buf, &vm); if (error != 0) goto out; sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->vm = vm; SLIST_INIT(&sc->devmem); /* * Lookup the name again just in case somebody sneaked in when we * dropped the lock. 
*/ mtx_lock(&vmmdev_mtx); sc2 = vmmdev_lookup(buf); if (sc2 == NULL) { SLIST_INSERT_HEAD(&head, sc, link); sc->flags |= VSC_LINKED; } mtx_unlock(&vmmdev_mtx); if (sc2 != NULL) { vmmdev_destroy(sc); error = EEXIST; goto out; } error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); if (error != 0) { vmmdev_destroy(sc); goto out; } mtx_lock(&vmmdev_mtx); sc->cdev = cdev; sc->cdev->si_drv1 = sc; mtx_unlock(&vmmdev_mtx); out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", NULL); void vmmdev_init(void) { mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, "Allow use of vmm in a jail."); } int vmmdev_cleanup(void) { int error; if (SLIST_EMPTY(&head)) error = 0; else error = EBUSY; return (error); } static int devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, struct vm_object **objp, int nprot) { struct devmem_softc *dsc; vm_ooffset_t first, last; size_t seglen; int error; uint16_t lastcpu; bool sysmem; dsc = cdev->si_drv1; if (dsc == NULL) { /* 'cdev' has been created but is not ready for use */ return (ENXIO); } first = *offset; last = *offset + len; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1; error = vcpu_lock_one(dsc->sc, lastcpu); if (error) return (error); error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); KASSERT(error == 0 && !sysmem && *objp != NULL, ("%s: invalid devmem segment %d", __func__, dsc->segid)); vcpu_unlock_one(dsc->sc, lastcpu); if (seglen >= last) { vm_object_reference(*objp); return (0); } else { return (EINVAL); } } static struct cdevsw devmemsw = { .d_name = "devmem", .d_version = D_VERSION, .d_mmap_single = devmem_mmap_single, }; static int devmem_create_cdev(const char *vmname, int segid, char *devname) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; int error; error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); if (error) return (error); dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(vmname); KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); if (sc->cdev == NULL) { /* virtual machine is being created or destroyed */ mtx_unlock(&vmmdev_mtx); free(dsc, M_VMMDEV); destroy_dev_sched_cb(cdev, NULL, 0); return (ENODEV); } dsc->segid = segid; dsc->name = devname; dsc->cdev = cdev; dsc->sc = sc; SLIST_INSERT_HEAD(&sc->devmem, dsc, link); mtx_unlock(&vmmdev_mtx); /* The 'cdev' is ready for use after 'si_drv1' is initialized */ cdev->si_drv1 = dsc; return (0); } static void devmem_destroy(void *arg) { struct devmem_softc *dsc = arg; KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); dsc->cdev = NULL; dsc->sc = NULL; } diff --git a/sys/amd64/vmm/vmm_snapshot.c b/sys/amd64/vmm/vmm_snapshot.c new file mode 100644 index 000000000000..c77bb05f76b7 --- /dev/null +++ b/sys/amd64/vmm/vmm_snapshot.c @@ -0,0 +1,141 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Flavius Anton + * Copyright (c) 2016 Mihai Tiganus + * Copyright (c) 2016-2019 Mihai Carabas + * Copyright (c) 2017-2019 Darius Mihai + * Copyright (c) 2017-2019 Elena Mihailescu + * Copyright (c) 2018-2019 Sergiu Weisz + * All rights reserved. 
+ * The bhyve-snapshot feature was developed under sponsorships + * from Matthew Grooms. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +void +vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) +{ + const char *opstr; + + if (op == VM_SNAPSHOT_SAVE) + opstr = "save"; + else if (op == VM_SNAPSHOT_RESTORE) + opstr = "restore"; + else + opstr = "unknown"; + + printf("%s: snapshot-%s failed for %s\r\n", __func__, opstr, bufname); +} + +int +vm_snapshot_buf(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + void *nv_data; + + nv_data = __DEVOLATILE(void *, data); + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + printf("%s: buffer too small\r\n", __func__); + return (E2BIG); + } + + if (op == VM_SNAPSHOT_SAVE) + copyout(nv_data, buffer->buf, data_size); + else if (op == VM_SNAPSHOT_RESTORE) + copyin(buffer->buf, nv_data, data_size); + else + return (EINVAL); + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + + return (0); +} + +size_t +vm_get_snapshot_size(struct vm_snapshot_meta *meta) +{ + size_t length; + struct vm_snapshot_buffer *buffer; + + buffer = &meta->buffer; + + if (buffer->buf_size < buffer->buf_rem) { + printf("%s: Invalid buffer: size = %zu, rem = %zu\r\n", + __func__, buffer->buf_size, buffer->buf_rem); + length = 0; + } else { + length = buffer->buf_size - buffer->buf_rem; + } + + return (length); +} + +int +vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + int ret; + void *_data = *(void **)(void *)&data; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + printf("%s: buffer too small\r\n", __func__); + ret = E2BIG; + goto done; + } + + if (op == VM_SNAPSHOT_SAVE) { + ret = 0; + copyout(_data, buffer->buf, data_size); + } else if (op == VM_SNAPSHOT_RESTORE) { + ret = memcmp(_data, buffer->buf, data_size); + } else { + ret = EINVAL; + goto done; + } + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + +done: + return (ret); +} diff --git a/sys/conf/config.mk b/sys/conf/config.mk index 6b405890458e..50188eee923b 
100644 --- a/sys/conf/config.mk +++ b/sys/conf/config.mk @@ -1,65 +1,72 @@ # $FreeBSD$ # # Common code to marry kernel config(8) goo and module building goo. # # Generate options files that otherwise would be built # in substantially similar ways through the tree. Move # the code here when they all produce identical results # (or should) .if !defined(KERNBUILDDIR) opt_global.h: touch ${.TARGET} .if ${MACHINE} != "mips" @echo "#define SMP 1" >> ${.TARGET} @echo "#define MAC 1" >> ${.TARGET} @echo "#define VIMAGE 1" >> ${.TARGET} .endif +.if ${MK_BHYVE_SNAPSHOT} != "no" +opt_bhyve_snapshot.h: + @echo "#define BHYVE_SNAPSHOT 1" > ${.TARGET} +.endif opt_bpf.h: echo "#define DEV_BPF 1" > ${.TARGET} .if ${MK_INET_SUPPORT} != "no" opt_inet.h: @echo "#define INET 1" > ${.TARGET} @echo "#define TCP_OFFLOAD 1" >> ${.TARGET} .endif .if ${MK_INET6_SUPPORT} != "no" opt_inet6.h: @echo "#define INET6 1" > ${.TARGET} .endif .if ${MK_RATELIMIT} != "no" opt_ratelimit.h: @echo "#define RATELIMIT 1" > ${.TARGET} .endif opt_mrouting.h: echo "#define MROUTING 1" > ${.TARGET} opt_printf.h: echo "#define PRINTF_BUFR_SIZE 128" > ${.TARGET} opt_scsi.h: echo "#define SCSI_DELAY 15000" > ${.TARGET} opt_wlan.h: echo "#define IEEE80211_DEBUG 1" > ${.TARGET} echo "#define IEEE80211_SUPPORT_MESH 1" >> ${.TARGET} KERN_OPTS.i386=NEW_PCIB DEV_PCI KERN_OPTS.amd64=NEW_PCIB DEV_PCI KERN_OPTS.powerpc=NEW_PCIB DEV_PCI KERN_OPTS=MROUTING IEEE80211_DEBUG \ IEEE80211_SUPPORT_MESH DEV_BPF \ ${KERN_OPTS.${MACHINE}} ${KERN_OPTS_EXTRA} +.if ${MK_BHYVE_SNAPSHOT} != "no" +KERN_OPTS+= BHYVE_SNAPSHOT +.endif .if ${MK_INET_SUPPORT} != "no" KERN_OPTS+= INET TCP_OFFLOAD .endif .if ${MK_INET6_SUPPORT} != "no" KERN_OPTS+= INET6 .endif .elif !defined(KERN_OPTS) # Add all the options that are mentioned in any opt_*.h file when we # have a kernel build directory to pull them from. KERN_OPTS!=cat ${KERNBUILDDIR}/opt*.h | awk '{print $$2;}' | sort -u .export KERN_OPTS .endif .if !defined(NO_MODULES) && !defined(__MPATH) && !make(install) && \ (empty(.MAKEFLAGS:M-V) || defined(NO_SKIP_MPATH)) __MPATH!=find ${SYSDIR:tA}/ -name \*_if.m .export __MPATH .endif diff --git a/sys/conf/kern.opts.mk b/sys/conf/kern.opts.mk index 078d79913634..bc7ddfd06e7c 100644 --- a/sys/conf/kern.opts.mk +++ b/sys/conf/kern.opts.mk @@ -1,177 +1,178 @@ # $FreeBSD$ # Options set in the build system that affect the kernel somehow. # # Define MK_* variables (which are either "yes" or "no") for users # to set via WITH_*/WITHOUT_* in /etc/src.conf and override in the # make(1) environment. # These should be tested with `== "no"' or `!= "no"' in makefiles. # The NO_* variables should only be set by makefiles for variables # that haven't been converted over. # # Note: bsd.own.mk must be included before the rest of kern.opts.mk to make # building on 10.x and earlier work. This should be removed when that's no # longer supported since it confounds the defaults (since it uses the host's # notion of defaults rather than what's default in current when building # within sys/modules). .include # These options are used by the kernel build process (kern.mk and kmod.mk) # They have to be listed here so we can build modules outside of the # src tree. 
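[Editorial note, a minimal sketch of how the new knob is selected; the file location is the standard src.conf path already mentioned in the kern.opts.mk comment above, not something introduced by this patch.] The hunk that follows adds BHYVE_SNAPSHOT to __DEFAULT_NO_OPTIONS, so the snapshot code stays disabled unless the option is turned on through the usual WITH_*/WITHOUT_* mechanism:

    WITH_BHYVE_SNAPSHOT=yes    # e.g. in /etc/src.conf; yields MK_BHYVE_SNAPSHOT=yes

With the option enabled, the config.mk hunk above generates opt_bhyve_snapshot.h containing "#define BHYVE_SNAPSHOT 1" and appends BHYVE_SNAPSHOT to KERN_OPTS; further down in this diff, the vmm module Makefile then compiles vmm_snapshot.c, and the bhyve userland Makefile adds snapshot.c, the ucl/xo libraries, and -DBHYVE_SNAPSHOT.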
KLDXREF_CMD?= kldxref __DEFAULT_YES_OPTIONS = \ AUTOFS \ BHYVE \ BLUETOOTH \ CCD \ CDDL \ CRYPT \ CUSE \ EFI \ FORMAT_EXTENSIONS \ INET \ INET6 \ IPFILTER \ IPSEC_SUPPORT \ ISCSI \ KERNEL_SYMBOLS \ NETGRAPH \ PF \ SOURCELESS_HOST \ SOURCELESS_UCODE \ TESTS \ USB_GADGET_EXAMPLES \ ZFS __DEFAULT_NO_OPTIONS = \ + BHYVE_SNAPSHOT \ EXTRA_TCP_STACKS \ KERNEL_RETPOLINE \ OFED \ RATELIMIT \ REPRODUCIBLE_BUILD # Some options are totally broken on some architectures. We disable # them. If you need to enable them on an experimental basis, you # must change this code. # Note: These only apply to the list of modules we build by default # and sometimes what is in the opt_*.h files by default. # Kernel config files are unaffected, though some targets can be # affected by KERNEL_SYMBOLS, FORMAT_EXTENSIONS, CTF and SSP. # Things that don't work based on the CPU .if ${MACHINE_CPUARCH} == "arm" . if ${MACHINE_ARCH:Marmv[67]*} == "" BROKEN_OPTIONS+= CDDL ZFS . endif .endif .if ${MACHINE_CPUARCH} == "mips" BROKEN_OPTIONS+= CDDL ZFS SSP .endif .if ${MACHINE_CPUARCH} == "powerpc" && ${MACHINE_ARCH} == "powerpc" BROKEN_OPTIONS+= ZFS .endif .if ${MACHINE_CPUARCH} == "riscv" BROKEN_OPTIONS+= FORMAT_EXTENSIONS .endif # Things that don't work because the kernel doesn't have the support # for them. .if ${MACHINE} != "i386" && ${MACHINE} != "amd64" BROKEN_OPTIONS+= OFED .endif # Things that don't work based on toolchain support. .if ${MACHINE} != "i386" && ${MACHINE} != "amd64" BROKEN_OPTIONS+= KERNEL_RETPOLINE .endif # EFI doesn't exist on mips, powerpc, or riscv. .if ${MACHINE:Mmips} || ${MACHINE:Mpowerpc} || ${MACHINE:Mriscv} BROKEN_OPTIONS+=EFI .endif # expanded inline from bsd.mkopt.mk to avoid share/mk dependency # Those that default to yes .for var in ${__DEFAULT_YES_OPTIONS} .if !defined(MK_${var}) .if defined(WITHOUT_${var}) # WITHOUT always wins MK_${var}:= no .else MK_${var}:= yes .endif .else .if ${MK_${var}} != "yes" && ${MK_${var}} != "no" .error "Illegal value for MK_${var}: ${MK_${var}}" .endif .endif # !defined(MK_${var}) .endfor .undef __DEFAULT_YES_OPTIONS # Those that default to no .for var in ${__DEFAULT_NO_OPTIONS} .if !defined(MK_${var}) .if defined(WITH_${var}) && !defined(WITHOUT_${var}) # WITHOUT always wins MK_${var}:= yes .else MK_${var}:= no .endif .else .if ${MK_${var}} != "yes" && ${MK_${var}} != "no" .error "Illegal value for MK_${var}: ${MK_${var}}" .endif .endif # !defined(MK_${var}) .endfor .undef __DEFAULT_NO_OPTIONS # # MK_* options which are always no, usually because they are # unsupported/badly broken on this architecture. # .for var in ${BROKEN_OPTIONS} MK_${var}:= no .endfor .undef BROKEN_OPTIONS #end of bsd.mkopt.mk expanded inline. # # MK_*_SUPPORT options which default to "yes" unless their corresponding # MK_* variable is set to "no". # .for var in \ INET \ INET6 .if defined(WITHOUT_${var}_SUPPORT) || ${MK_${var}} == "no" MK_${var}_SUPPORT:= no .else .if defined(KERNBUILDDIR) # See if there's an opt_foo.h .if !defined(OPT_${var}) OPT_${var}!= cat ${KERNBUILDDIR}/opt_${var:tl}.h; echo .export OPT_${var} .endif .if ${OPT_${var}} == "" # nothing -> no MK_${var}_SUPPORT:= no .else MK_${var}_SUPPORT:= yes .endif .else # otherwise, yes MK_${var}_SUPPORT:= yes .endif .endif .endfor # Some modules only compile successfully if option FDT is set, due to #ifdef FDT # wrapped around declarations. 
Module makefiles can optionally compile such # things using .if !empty(OPT_FDT) .if !defined(OPT_FDT) && defined(KERNBUILDDIR) OPT_FDT!= sed -n '/FDT/p' ${KERNBUILDDIR}/opt_platform.h .export OPT_FDT .endif diff --git a/sys/conf/options.amd64 b/sys/conf/options.amd64 index 8939ddaf6246..cd90747ba732 100644 --- a/sys/conf/options.amd64 +++ b/sys/conf/options.amd64 @@ -1,70 +1,71 @@ # $FreeBSD$ # Options specific to AMD64 platform kernels AUTO_EOI_1 opt_auto_eoi.h AUTO_EOI_2 opt_auto_eoi.h +BHYVE_SNAPSHOT COUNT_XINVLTLB_HITS opt_smp.h COUNT_IPIS opt_smp.h MAXMEM MPTABLE_FORCE_HTT MP_WATCHDOG NKPT opt_pmap.h PV_STATS opt_pmap.h # Options for emulators. These should only be used at config time, so # they are handled like options for static filesystems # (see src/sys/conf/options), except for broken debugging options. COMPAT_AOUT opt_dontuse.h COMPAT_FREEBSD32 opt_global.h #COMPAT_LINUX opt_dontuse.h COMPAT_LINUX32 opt_compat.h LINPROCFS opt_dontuse.h LINSYSFS opt_dontuse.h NDISAPI opt_dontuse.h TIMER_FREQ opt_clock.h # options for serial support COM_ESP opt_sio.h COM_MULTIPORT opt_sio.h CONSPEED opt_sio.h GDBSPEED opt_sio.h COM_NO_ACPI opt_sio.h VGA_ALT_SEQACCESS opt_vga.h VGA_DEBUG opt_vga.h VGA_NO_FONT_LOADING opt_vga.h VGA_NO_MODE_CHANGE opt_vga.h VGA_SLOW_IOACCESS opt_vga.h VGA_WIDTH90 opt_vga.h VESA VESA_DEBUG opt_vesa.h # AGP debugging support AGP_DEBUG opt_agp.h ATKBD_DFLT_KEYMAP opt_atkbd.h # iWARP client interface support in ixl IXL_IW opt_ixl.h # ------------------------------- # EOF # ------------------------------- HAMMER opt_cpu.h PSM_HOOKRESUME opt_psm.h PSM_RESETAFTERSUSPEND opt_psm.h PSM_DEBUG opt_psm.h DEV_ATPIC opt_atpic.h # BPF just-in-time compiler BPF_JITTER opt_bpf.h XENHVM opt_global.h # options for the Intel C600 SAS driver (isci) ISCI_LOGGING opt_isci.h # EFI Runtime services support EFIRT opt_efirt.h diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile index 9471fc9074dc..b5d62c358272 100644 --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -1,83 +1,90 @@ # $FreeBSD$ +.include + KMOD= vmm -SRCS= opt_acpi.h opt_ddb.h device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h +SRCS= opt_acpi.h opt_bhyve_snapshot.h opt_ddb.h +SRCS+= device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h vnode_if.h DPSRCS+= vmx_assym.h svm_assym.h DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc CFLAGS+= -DVMM_KEEP_STATS CFLAGS+= -I${SRCTOP}/sys/amd64/vmm CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/io CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd # generic vmm support .PATH: ${SRCTOP}/sys/amd64/vmm SRCS+= vmm.c \ vmm_dev.c \ vmm_host.c \ vmm_instruction_emul.c \ vmm_ioport.c \ vmm_lapic.c \ vmm_mem.c \ vmm_stat.c \ vmm_util.c \ x86.c .PATH: ${SRCTOP}/sys/amd64/vmm/io SRCS+= iommu.c \ ppt.c \ vatpic.c \ vatpit.c \ vhpet.c \ vioapic.c \ vlapic.c \ vpmtmr.c \ vrtc.c # intel-specific files .PATH: ${SRCTOP}/sys/amd64/vmm/intel SRCS+= ept.c \ vmcs.c \ vmx_msr.c \ vmx_support.S \ vmx.c \ vtd.c # amd-specific files .PATH: ${SRCTOP}/sys/amd64/vmm/amd SRCS+= vmcb.c \ svm.c \ svm_support.S \ npt.c \ ivrs_drv.c \ amdvi_hw.c \ svm_msr.c +.if ${KERN_OPTS:MBHYVE_SNAPSHOT} != "" +SRCS+= vmm_snapshot.c +.endif + CLEANFILES= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h OBJS_DEPEND_GUESS.svm_support.o+= svm_assym.h vmx_assym.h: vmx_genassym.o sh ${SYSDIR}/kern/genassym.sh vmx_genassym.o > ${.TARGET} svm_assym.h: svm_genassym.o sh ${SYSDIR}/kern/genassym.sh svm_genassym.o > ${.TARGET} vmx_support.o: ${CC} -c -x 
assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} svm_support.o: ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} vmx_genassym.o: offset.inc ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} svm_genassym.o: offset.inc ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} .include diff --git a/tools/build/options/WITH_BHYVE_SNAPSHOT b/tools/build/options/WITH_BHYVE_SNAPSHOT new file mode 100644 index 000000000000..7e673f51c8bb --- /dev/null +++ b/tools/build/options/WITH_BHYVE_SNAPSHOT @@ -0,0 +1,7 @@ +.\" $FreeBSD$ +Set to include support for save and restore (snapshots) in +.Xr bhyve 8 +and +.Xr bhyvectl 8 . +.Pp +This option only affects amd64/amd64. diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index 12bd477825bf..9a4460a1b90f 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -1,102 +1,117 @@ # # $FreeBSD$ # .include CFLAGS+=-I${SRCTOP}/sys .PATH: ${SRCTOP}/sys/cam/ctl PROG= bhyve PACKAGE= bhyve MAN= bhyve.8 BHYVE_SYSDIR?=${SRCTOP} SRCS= \ atkbdc.c \ acpi.c \ audio.c \ bhyvegc.c \ bhyverun.c \ block_if.c \ bootrom.c \ console.c \ consport.c \ ctl_util.c \ ctl_scsi_all.c \ dbgport.c \ fwctl.c \ gdb.c \ hda_codec.c \ inout.c \ ioapic.c \ mem.c \ mevent.c \ mptbl.c \ net_backends.c \ net_utils.c \ pci_ahci.c \ pci_e82545.c \ pci_emul.c \ pci_hda.c \ pci_fbuf.c \ pci_hostbridge.c \ pci_irq.c \ pci_lpc.c \ pci_nvme.c \ pci_passthru.c \ pci_virtio_block.c \ pci_virtio_console.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ pci_virtio_scsi.c \ pci_uart.c \ pci_xhci.c \ pm.c \ post.c \ ps2kbd.c \ ps2mouse.c \ rfb.c \ rtc.c \ smbiostbl.c \ sockstream.c \ task_switch.c \ uart_emul.c \ usb_emul.c \ usb_mouse.c \ virtio.c \ vga.c \ vmgenc.c \ xmsr.c \ spinup_ap.c \ iov.c +.if ${MK_BHYVE_SNAPSHOT} != "no" +SRCS+= snapshot.c +.endif + .PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm SRCS+= vmm_instruction_emul.c LIBADD= vmmapi md pthread z util sbuf cam +.if ${MK_BHYVE_SNAPSHOT} != "no" +LIBADD+= ucl xo +.endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET .endif .if ${MK_INET6_SUPPORT} != "no" CFLAGS+=-DINET6 .endif .if ${MK_OPENSSL} == "no" CFLAGS+=-DNO_OPENSSL .else LIBADD+= crypto .endif CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller +.if ${MK_BHYVE_SNAPSHOT} != "no" +CFLAGS+= -I${SRCTOP}/contrib/libucl/include + +# Temporary disable capsicum, until we integrate checkpoint code with it. +CFLAGS+= -DWITHOUT_CAPSICUM + +CFLAGS+= -DBHYVE_SNAPSHOT +.endif .ifdef GDB_LOG CFLAGS+=-DGDB_LOG .endif WARNS?= 2 .include diff --git a/usr.sbin/bhyve/Makefile.depend b/usr.sbin/bhyve/Makefile.depend index 8d3ff079d277..8222ceb6ad25 100644 --- a/usr.sbin/bhyve/Makefile.depend +++ b/usr.sbin/bhyve/Makefile.depend @@ -1,25 +1,27 @@ # $FreeBSD$ # Autogenerated - do NOT edit! DIRDEPS = \ include \ include/arpa \ include/xlocale \ lib/${CSU_DIR} \ lib/libc \ lib/libcam \ lib/libcapsicum \ lib/libcasper/libcasper \ lib/libcompiler_rt \ lib/libsbuf \ lib/libthr \ + lib/libucl \ lib/libutil \ lib/libvmmapi \ + lib/libxo \ lib/libz \ .include .if ${DEP_RELDIR} == ${_DEP_RELDIR} # local dependencies - needed for -jN in clean tree .endif diff --git a/usr.sbin/bhyve/atkbdc.c b/usr.sbin/bhyve/atkbdc.c index 1c1838c2e80c..a08f58f84b22 100644 --- a/usr.sbin/bhyve/atkbdc.c +++ b/usr.sbin/bhyve/atkbdc.c @@ -1,586 +1,632 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include #include #include #include #include #include #include #include "acpi.h" #include "atkbdc.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "ps2kbd.h" #include "ps2mouse.h" #define KBD_DATA_PORT 0x60 #define KBD_STS_CTL_PORT 0x64 #define KBDC_RESET 0xfe #define KBD_DEV_IRQ 1 #define AUX_DEV_IRQ 12 /* controller commands */ #define KBDC_SET_COMMAND_BYTE 0x60 #define KBDC_GET_COMMAND_BYTE 0x20 #define KBDC_DISABLE_AUX_PORT 0xa7 #define KBDC_ENABLE_AUX_PORT 0xa8 #define KBDC_TEST_AUX_PORT 0xa9 #define KBDC_TEST_CTRL 0xaa #define KBDC_TEST_KBD_PORT 0xab #define KBDC_DISABLE_KBD_PORT 0xad #define KBDC_ENABLE_KBD_PORT 0xae #define KBDC_READ_INPORT 0xc0 #define KBDC_READ_OUTPORT 0xd0 #define KBDC_WRITE_OUTPORT 0xd1 #define KBDC_WRITE_KBD_OUTBUF 0xd2 #define KBDC_WRITE_AUX_OUTBUF 0xd3 #define KBDC_WRITE_TO_AUX 0xd4 /* controller command byte (set by KBDC_SET_COMMAND_BYTE) */ #define KBD_TRANSLATION 0x40 #define KBD_SYS_FLAG_BIT 0x04 #define KBD_DISABLE_KBD_PORT 0x10 #define KBD_DISABLE_AUX_PORT 0x20 #define KBD_ENABLE_AUX_INT 0x02 #define KBD_ENABLE_KBD_INT 0x01 #define KBD_KBD_CONTROL_BITS (KBD_DISABLE_KBD_PORT | KBD_ENABLE_KBD_INT) #define KBD_AUX_CONTROL_BITS (KBD_DISABLE_AUX_PORT | KBD_ENABLE_AUX_INT) /* controller status bits */ #define KBDS_KBD_BUFFER_FULL 0x01 #define KBDS_SYS_FLAG 0x04 #define KBDS_CTRL_FLAG 0x08 #define KBDS_AUX_BUFFER_FULL 0x20 /* controller output port */ #define KBDO_KBD_OUTFULL 0x10 #define KBDO_AUX_OUTFULL 0x20 #define RAMSZ 32 #define FIFOSZ 15 #define CTRL_CMD_FLAG 0x8000 struct kbd_dev { bool irq_active; int irq; uint8_t buffer[FIFOSZ]; int brd, bwr; int bcnt; }; struct aux_dev { bool irq_active; int irq; }; struct atkbdc_softc { struct vmctx *ctx; pthread_mutex_t mtx; struct ps2kbd_softc *ps2kbd_sc; struct ps2mouse_softc *ps2mouse_sc; uint8_t status; /* status register */ uint8_t outport; /* controller output port */ uint8_t ram[RAMSZ]; /* byte0 = controller config */ uint32_t curcmd; /* current command for next byte */ uint32_t ctrlbyte; struct kbd_dev kbd; struct aux_dev aux; }; +#ifdef BHYVE_SNAPSHOT +static struct atkbdc_softc *atkbdc_sc = NULL; +#endif + static void 
atkbdc_assert_kbd_intr(struct atkbdc_softc *sc) { if ((sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { sc->kbd.irq_active = true; vm_isa_pulse_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); } } static void atkbdc_assert_aux_intr(struct atkbdc_softc *sc) { if ((sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { sc->aux.irq_active = true; vm_isa_pulse_irq(sc->ctx, sc->aux.irq, sc->aux.irq); } } static int atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val) { assert(pthread_mutex_isowned_np(&sc->mtx)); if (sc->kbd.bcnt < FIFOSZ) { sc->kbd.buffer[sc->kbd.bwr] = val; sc->kbd.bwr = (sc->kbd.bwr + 1) % FIFOSZ; sc->kbd.bcnt++; sc->status |= KBDS_KBD_BUFFER_FULL; sc->outport |= KBDO_KBD_OUTFULL; } else { printf("atkbd data buffer full\n"); } return (sc->kbd.bcnt < FIFOSZ); } static void atkbdc_kbd_read(struct atkbdc_softc *sc) { const uint8_t translation[256] = { 0xff, 0x43, 0x41, 0x3f, 0x3d, 0x3b, 0x3c, 0x58, 0x64, 0x44, 0x42, 0x40, 0x3e, 0x0f, 0x29, 0x59, 0x65, 0x38, 0x2a, 0x70, 0x1d, 0x10, 0x02, 0x5a, 0x66, 0x71, 0x2c, 0x1f, 0x1e, 0x11, 0x03, 0x5b, 0x67, 0x2e, 0x2d, 0x20, 0x12, 0x05, 0x04, 0x5c, 0x68, 0x39, 0x2f, 0x21, 0x14, 0x13, 0x06, 0x5d, 0x69, 0x31, 0x30, 0x23, 0x22, 0x15, 0x07, 0x5e, 0x6a, 0x72, 0x32, 0x24, 0x16, 0x08, 0x09, 0x5f, 0x6b, 0x33, 0x25, 0x17, 0x18, 0x0b, 0x0a, 0x60, 0x6c, 0x34, 0x35, 0x26, 0x27, 0x19, 0x0c, 0x61, 0x6d, 0x73, 0x28, 0x74, 0x1a, 0x0d, 0x62, 0x6e, 0x3a, 0x36, 0x1c, 0x1b, 0x75, 0x2b, 0x63, 0x76, 0x55, 0x56, 0x77, 0x78, 0x79, 0x7a, 0x0e, 0x7b, 0x7c, 0x4f, 0x7d, 0x4b, 0x47, 0x7e, 0x7f, 0x6f, 0x52, 0x53, 0x50, 0x4c, 0x4d, 0x48, 0x01, 0x45, 0x57, 0x4e, 0x51, 0x4a, 0x37, 0x49, 0x46, 0x54, 0x80, 0x81, 0x82, 0x41, 0x54, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }; uint8_t val; uint8_t release = 0; assert(pthread_mutex_isowned_np(&sc->mtx)); if (sc->ram[0] & KBD_TRANSLATION) { while (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) { if (val == 0xf0) { release = 0x80; continue; } else { val = translation[val] | release; } atkbdc_kbd_queue_data(sc, val); break; } } else { while (sc->kbd.bcnt < FIFOSZ) { if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) atkbdc_kbd_queue_data(sc, val); else break; } } if (((sc->ram[0] & KBD_DISABLE_AUX_PORT) || ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) && sc->kbd.bcnt > 0) atkbdc_assert_kbd_intr(sc); } static void atkbdc_aux_poll(struct atkbdc_softc *sc) { if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) { sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; sc->outport |= KBDO_AUX_OUTFULL; atkbdc_assert_aux_intr(sc); } } static void atkbdc_kbd_poll(struct atkbdc_softc *sc) { assert(pthread_mutex_isowned_np(&sc->mtx)); atkbdc_kbd_read(sc); } static void atkbdc_poll(struct atkbdc_softc *sc) { atkbdc_aux_poll(sc); atkbdc_kbd_poll(sc); } static void atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf) { assert(pthread_mutex_isowned_np(&sc->mtx)); if (ps2mouse_read(sc->ps2mouse_sc, buf) == 0) 
{ if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) { if (sc->kbd.bcnt == 0) sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); else sc->status &= ~(KBDS_AUX_BUFFER_FULL); sc->outport &= ~KBDO_AUX_OUTFULL; } atkbdc_poll(sc); return; } if (sc->kbd.bcnt > 0) { *buf = sc->kbd.buffer[sc->kbd.brd]; sc->kbd.brd = (sc->kbd.brd + 1) % FIFOSZ; sc->kbd.bcnt--; if (sc->kbd.bcnt == 0) { sc->status &= ~KBDS_KBD_BUFFER_FULL; sc->outport &= ~KBDO_KBD_OUTFULL; } atkbdc_poll(sc); } if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0 && sc->kbd.bcnt == 0) { sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); } } static int atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { struct atkbdc_softc *sc; uint8_t buf; int retval; if (bytes != 1) return (-1); sc = arg; retval = 0; pthread_mutex_lock(&sc->mtx); if (in) { sc->curcmd = 0; if (sc->ctrlbyte != 0) { *eax = sc->ctrlbyte & 0xff; sc->ctrlbyte = 0; } else { /* read device buffer; includes kbd cmd responses */ atkbdc_dequeue_data(sc, &buf); *eax = buf; } sc->status &= ~KBDS_CTRL_FLAG; pthread_mutex_unlock(&sc->mtx); return (retval); } if (sc->status & KBDS_CTRL_FLAG) { /* * Command byte for the controller. */ switch (sc->curcmd) { case KBDC_SET_COMMAND_BYTE: sc->ram[0] = *eax; if (sc->ram[0] & KBD_SYS_FLAG_BIT) sc->status |= KBDS_SYS_FLAG; else sc->status &= ~KBDS_SYS_FLAG; break; case KBDC_WRITE_OUTPORT: sc->outport = *eax; break; case KBDC_WRITE_TO_AUX: ps2mouse_write(sc->ps2mouse_sc, *eax, 0); atkbdc_poll(sc); break; case KBDC_WRITE_KBD_OUTBUF: atkbdc_kbd_queue_data(sc, *eax); break; case KBDC_WRITE_AUX_OUTBUF: ps2mouse_write(sc->ps2mouse_sc, *eax, 1); sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); atkbdc_aux_poll(sc); break; default: /* write to particular RAM byte */ if (sc->curcmd >= 0x61 && sc->curcmd <= 0x7f) { int byten; byten = (sc->curcmd - 0x60) & 0x1f; sc->ram[byten] = *eax & 0xff; } break; } sc->curcmd = 0; sc->status &= ~KBDS_CTRL_FLAG; pthread_mutex_unlock(&sc->mtx); return (retval); } /* * Data byte for the device. 
*/ ps2kbd_write(sc->ps2kbd_sc, *eax); atkbdc_poll(sc); pthread_mutex_unlock(&sc->mtx); return (retval); } static int atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { struct atkbdc_softc *sc; int error, retval; if (bytes != 1) return (-1); sc = arg; retval = 0; pthread_mutex_lock(&sc->mtx); if (in) { /* read status register */ *eax = sc->status; pthread_mutex_unlock(&sc->mtx); return (retval); } sc->curcmd = 0; sc->status |= KBDS_CTRL_FLAG; sc->ctrlbyte = 0; switch (*eax) { case KBDC_GET_COMMAND_BYTE: sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[0]; break; case KBDC_TEST_CTRL: sc->ctrlbyte = CTRL_CMD_FLAG | 0x55; break; case KBDC_TEST_AUX_PORT: case KBDC_TEST_KBD_PORT: sc->ctrlbyte = CTRL_CMD_FLAG | 0; break; case KBDC_READ_INPORT: sc->ctrlbyte = CTRL_CMD_FLAG | 0; break; case KBDC_READ_OUTPORT: sc->ctrlbyte = CTRL_CMD_FLAG | sc->outport; break; case KBDC_SET_COMMAND_BYTE: case KBDC_WRITE_OUTPORT: case KBDC_WRITE_KBD_OUTBUF: case KBDC_WRITE_AUX_OUTBUF: sc->curcmd = *eax; break; case KBDC_DISABLE_KBD_PORT: sc->ram[0] |= KBD_DISABLE_KBD_PORT; break; case KBDC_ENABLE_KBD_PORT: sc->ram[0] &= ~KBD_DISABLE_KBD_PORT; if (sc->kbd.bcnt > 0) sc->status |= KBDS_KBD_BUFFER_FULL; atkbdc_poll(sc); break; case KBDC_WRITE_TO_AUX: sc->curcmd = *eax; break; case KBDC_DISABLE_AUX_PORT: sc->ram[0] |= KBD_DISABLE_AUX_PORT; ps2mouse_toggle(sc->ps2mouse_sc, 0); sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); sc->outport &= ~KBDS_AUX_BUFFER_FULL; break; case KBDC_ENABLE_AUX_PORT: sc->ram[0] &= ~KBD_DISABLE_AUX_PORT; ps2mouse_toggle(sc->ps2mouse_sc, 1); if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; break; case KBDC_RESET: /* Pulse "reset" line */ error = vm_suspend(ctx, VM_SUSPEND_RESET); assert(error == 0 || errno == EALREADY); break; default: if (*eax >= 0x21 && *eax <= 0x3f) { /* read "byte N" from RAM */ int byten; byten = (*eax - 0x20) & 0x1f; sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[byten]; } break; } pthread_mutex_unlock(&sc->mtx); if (sc->ctrlbyte != 0) { sc->status |= KBDS_KBD_BUFFER_FULL; sc->status &= ~KBDS_AUX_BUFFER_FULL; atkbdc_assert_kbd_intr(sc); } else if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0 && (sc->ram[0] & KBD_DISABLE_AUX_PORT) == 0) { sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; atkbdc_assert_aux_intr(sc); } else if (sc->kbd.bcnt > 0 && (sc->ram[0] & KBD_DISABLE_KBD_PORT) == 0) { sc->status |= KBDS_KBD_BUFFER_FULL; atkbdc_assert_kbd_intr(sc); } return (retval); } void atkbdc_event(struct atkbdc_softc *sc, int iskbd) { pthread_mutex_lock(&sc->mtx); if (iskbd) atkbdc_kbd_poll(sc); else atkbdc_aux_poll(sc); pthread_mutex_unlock(&sc->mtx); } void atkbdc_init(struct vmctx *ctx) { struct inout_port iop; struct atkbdc_softc *sc; int error; sc = calloc(1, sizeof(struct atkbdc_softc)); sc->ctx = ctx; pthread_mutex_init(&sc->mtx, NULL); bzero(&iop, sizeof(struct inout_port)); iop.name = "atkdbc"; iop.port = KBD_STS_CTL_PORT; iop.size = 1; iop.flags = IOPORT_F_INOUT; iop.handler = atkbdc_sts_ctl_handler; iop.arg = sc; error = register_inout(&iop); assert(error == 0); bzero(&iop, sizeof(struct inout_port)); iop.name = "atkdbc"; iop.port = KBD_DATA_PORT; iop.size = 1; iop.flags = IOPORT_F_INOUT; iop.handler = atkbdc_data_handler; iop.arg = sc; error = register_inout(&iop); assert(error == 0); pci_irq_reserve(KBD_DEV_IRQ); sc->kbd.irq = KBD_DEV_IRQ; pci_irq_reserve(AUX_DEV_IRQ); sc->aux.irq = AUX_DEV_IRQ; sc->ps2kbd_sc = ps2kbd_init(sc); sc->ps2mouse_sc = ps2mouse_init(sc); 
+ +#ifdef BHYVE_SNAPSHOT + assert(atkbdc_sc == NULL); + atkbdc_sc = sc; +#endif +} + +#ifdef BHYVE_SNAPSHOT +int +atkbdc_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->outport, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->ram, + sizeof(atkbdc_sc->ram), meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->curcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->ctrlbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->kbd.buffer, + sizeof(atkbdc_sc->kbd.buffer), meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.brd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bwr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bcnt, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq, meta, ret, done); + + ret = ps2kbd_snapshot(atkbdc_sc->ps2kbd_sc, meta); + if (ret != 0) + goto done; + + ret = ps2mouse_snapshot(atkbdc_sc->ps2mouse_sc, meta); + +done: + return (ret); } +#endif static void atkbdc_dsdt(void) { dsdt_line(""); dsdt_line("Device (KBD)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0303\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(KBD_DATA_PORT, 1); dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1); dsdt_fixed_irq(1); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_line(""); dsdt_line("Device (MOU)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0F13\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(KBD_DATA_PORT, 1); dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1); dsdt_fixed_irq(12); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } LPC_DSDT(atkbdc_dsdt); diff --git a/usr.sbin/bhyve/atkbdc.h b/usr.sbin/bhyve/atkbdc.h index 85c8a7141eb2..14c00ed9ae88 100644 --- a/usr.sbin/bhyve/atkbdc.h +++ b/usr.sbin/bhyve/atkbdc.h @@ -1,38 +1,43 @@ /*- * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _ATKBDC_H_ #define _ATKBDC_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct vmctx; void atkbdc_init(struct vmctx *ctx); void atkbdc_event(struct atkbdc_softc *sc, int iskbd); +#ifdef BHYVE_SNAPSHOT +int atkbdc_snapshot(struct vm_snapshot_meta *meta); +#endif + #endif /* _ATKBDC_H_ */ diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index 5d329a54491e..85e5f0256fbf 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -1,669 +1,685 @@ .\" Copyright (c) 2013 Peter Grehan .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd April 22, 2020 +.Dd May 04, 2020 .Dt BHYVE 8 .Os .Sh NAME .Nm bhyve .Nd "run a guest operating system inside a virtual machine" .Sh SYNOPSIS .Nm .Op Fl AabCeHhPSuWwxY .Oo .Sm off .Fl c\~ .Oo .Op Cm cpus= .Ar numcpus .Oc .Op Cm ,sockets= Ar n .Op Cm ,cores= Ar n .Op Cm ,threads= Ar n .Oc .Sm on .Op Fl G Ar port .Op Fl g Ar gdbport .Oo Fl l .Sm off .Cm help | Ar lpcdev Op Cm \&, Ar conf .Sm on .Oc .Oo Fl m .Sm off .Ar memsize .Oo .Cm K No | Cm k No | Cm M No | Cm m No | Cm G No | Cm g No | Cm T No | Cm t .Oc .Sm on .Oc .Op Fl p Ar vcpu Ns Cm \&: Ns Ar hostcpu +.Op Fl r Ar file .Oo Fl s .Sm off .Cm help | Ar slot Cm \&, Ar emulation Op Cm \&, Ar conf .Sm on .Oc .Op Fl U Ar uuid .Ar vmname .Sh DESCRIPTION .Nm is a hypervisor that runs guest operating systems inside a virtual machine. .Pp Parameters such as the number of virtual CPUs, amount of guest memory, and I/O connectivity can be specified with command-line parameters. .Pp If not using a boot ROM, the guest operating system must be loaded with .Xr bhyveload 8 or a similar boot loader before running .Nm , otherwise, it is enough to run .Nm with a boot ROM of choice. .Pp .Nm runs until the guest operating system reboots or an unhandled hypervisor exit is detected. .Sh OPTIONS .Bl -tag -width 10n .It Fl a The guest's local APIC is configured in xAPIC mode. The xAPIC mode is the default setting so this option is redundant. It will be deprecated in a future version. .It Fl A Generate ACPI tables. Required for .Fx Ns /amd64 guests. .It Fl b Enable a low-level console device supported by .Fx kernels compiled with .Cd "device bvmconsole" . This option will be deprecated in a future version. 
.It Fl c Op Ar setting ... Number of guest virtual CPUs and/or the CPU topology. The default value for each of .Ar numcpus , .Ar sockets , .Ar cores , and .Ar threads is 1. The current maximum number of guest virtual CPUs is 16. If .Ar numcpus is not specified then it will be calculated from the other arguments. The topology must be consistent in that the .Ar numcpus must equal the product of .Ar sockets , .Ar cores , and .Ar threads . If a .Ar setting is specified more than once the last one has precedence. .It Fl C Include guest memory in core file. .It Fl e Force .Nm to exit when a guest issues an access to an I/O port that is not emulated. This is intended for debug purposes. .It Fl g Ar gdbport For .Fx kernels compiled with .Cd "device bvmdebug" , allow a remote kernel kgdb to be relayed to the guest kernel gdb stub via a local IPv4 address and this port. This option will be deprecated in a future version. .It Fl G Ar port Start a debug server that uses the GDB protocol to export guest state to a debugger. An IPv4 TCP socket will be bound to the supplied .Ar port to listen for debugger connections. Only a single debugger may be attached to the debug server at a time. If .Ar port begins with .Sq w , .Nm will pause execution at the first instruction waiting for a debugger to attach. .It Fl h Print help message and exit. .It Fl H Yield the virtual CPU thread when a HLT instruction is detected. If this option is not specified, virtual CPUs will use 100% of a host CPU. .It Fl l Op Ar help|lpcdev Ns Op , Ns Ar conf Allow devices behind the LPC PCI-ISA bridge to be configured. The only supported devices are the TTY-class devices .Ar com1 and .Ar com2 and the boot ROM device .Ar bootrom . .Pp .Ar help print a list of supported LPC devices. .It Fl m Ar memsize Ns Op Ar K|k|M|m|G|g|T|t Guest physical memory size in bytes. This must be the same size that was given to .Xr bhyveload 8 . .Pp The size argument may be suffixed with one of K, M, G or T (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. .Pp .Ar memsize defaults to 256M. .It Fl p Ar vcpu:hostcpu Pin guest's virtual CPU .Em vcpu to .Em hostcpu . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. +.It Fl r Ar file +Resume a guest from a snapshot. +The guest memory contents are restored from +.Ar file , +and the guest device and vCPU state are restored from the file +.Dq Ar file Ns .kern . +.Pp +Note that the current snapshot file format requires that the configuration of +devices in the new VM match the VM from which the snapshot was taken by specifying the +same +.Op Fl s +and +.Op Fl l +options. +The count of vCPUs and memory configuration are read from the snapshot. .It Fl s Op Ar help|slot,emulation Ns Op , Ns Ar conf Configure a virtual PCI slot and function. .Pp .Nm provides PCI bus emulation and virtual devices that can be attached to slots on the bus. There are 32 available slots, with the option of providing up to 8 functions per slot. .Bl -tag -width 10n .It Ar help print a list of supported PCI devices. .It Ar slot .Ar pcislot[:function] .Ar bus:pcislot:function .Pp The .Ar pcislot value is 0 to 31. The optional .Ar function value is 0 to 7. The optional .Ar bus value is 0 to 255. If not specified, the .Ar function value defaults to 0. If not specified, the .Ar bus value defaults to 0. 
.It Ar emulation .Bl -tag -width 10n .It Li hostbridge | Li amd_hostbridge .Pp Provide a simple host bridge. This is usually configured at slot 0, and is required by most guest operating systems. The .Li amd_hostbridge emulation is identical but uses a PCI vendor ID of .Li AMD . .It Li passthru PCI pass-through device. .It Li virtio-net Virtio network interface. .It Li virtio-blk Virtio block storage interface. .It Li virtio-scsi Virtio SCSI interface. .It Li virtio-rnd Virtio RNG interface. .It Li virtio-console Virtio console interface, which exposes multiple ports to the guest in the form of simple char devices for simple IO between the guest and host userspaces. .It Li ahci AHCI controller attached to arbitrary devices. .It Li ahci-cd AHCI controller attached to an ATAPI CD/DVD. .It Li ahci-hd AHCI controller attached to a SATA hard-drive. .It Li e1000 Intel e82545 network interface. .It Li uart PCI 16550 serial device. .It Li lpc LPC PCI-ISA bridge with COM1 and COM2 16550 serial ports and a boot ROM. The LPC bridge emulation can only be configured on bus 0. .It Li fbuf Raw framebuffer device attached to VNC server. .It Li xhci eXtensible Host Controller Interface (xHCI) USB controller. .It Li nvme NVM Express (NVMe) controller. .It Li hda High Definition Audio Controller. .El .It Op Ar conf This optional parameter describes the backend for device emulations. If .Ar conf is not specified, the device emulation has no backend and can be considered unconnected. .Pp Network devices: .Bl -tag -width 10n .It Ar tapN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx .It Ar vmnetN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx .Pp If .Ar mac is not specified, the MAC address is derived from a fixed OUI and the remaining bytes from an MD5 hash of the slot and function numbers and the device name. .Pp The MAC address is an ASCII string in .Xr ethers 5 format. .El .Pp Block storage devices: .Bl -tag -width 10n .It Pa /filename Ns Oo , Ns Ar block-device-options Oc .It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc .El .Pp The .Ar block-device-options are: .Bl -tag -width 8n .It Li nocache Open the file with .Dv O_DIRECT . .It Li direct Open the file using .Dv O_SYNC . .It Li ro Force the file to be opened read-only. .It Li sectorsize= Ns Ar logical Ns Oo / Ns Ar physical Oc Specify the logical and physical sector sizes of the emulated disk. The physical sector size is optional and is equal to the logical sector size if not explicitly specified. .El .Pp SCSI devices: .Bl -tag -width 10n .It Pa /dev/cam/ctl Ns Oo Ar pp . Ns Ar vp Oc Ns Oo , Ns Ar scsi-device-options Oc .El .Pp The .Ar scsi-device-options are: .Bl -tag -width 10n .It Li iid= Ns Ar IID Initiator ID to use when sending requests to specified CTL port. The default value is 0. .El .Pp TTY devices: .Bl -tag -width 10n .It Li stdio Connect the serial port to the standard input and output of the .Nm process. .It Pa /dev/xxx Use the host TTY device for serial port I/O. .El .Pp Boot ROM device: .Bl -tag -width 10n .It Pa romfile Map .Ar romfile in the guest address space reserved for boot firmware. .El .Pp Pass-through devices: .Bl -tag -width 10n .It Ns Ar slot Ns / Ns Ar bus Ns / Ns Ar function Connect to a PCI device on the host at the selector described by .Ar slot , .Ar bus , and .Ar function numbers. .El .Pp Guest memory must be wired using the .Fl S option when a pass-through device is configured. .Pp The host device must have been reserved at boot-time using the .Va pptdevs loader variable as described in .Xr vmm 4 . 
.Pp Virtio console devices: .Bl -tag -width 10n .It Li port1= Ns Pa /path/to/port1.sock Ns ,anotherport= Ns Pa ... A maximum of 16 ports per device can be created. Every port is named and corresponds to a Unix domain socket created by .Nm . .Nm accepts at most one connection per port at a time. .Pp Limitations: .Bl -bullet -offset 2n .It Due to lack of destructors in .Nm , sockets on the filesystem must be cleaned up manually after .Nm exits. .It There is no way to use the "console port" feature, nor the console port resize at present. .It Emergency write is advertised, but no-op at present. .El .El .Pp Framebuffer devices: .Bl -tag -width 10n .It Xo .Oo rfb= Ns Oo Ar IP\&: Oc Ns Ar port Oc Ns Oo ,w= Ns Ar width Oc Ns Oo ,h= Ns .Ar height Oc Ns Oo ,vga= Ns Ar vgaconf Oc Ns Oo Ns ,wait Oc Ns Oo ,password= Ns .Ar password Oc .Xc .Bl -tag -width 8n .It Ar IPv4:port No or Ar [IPv6%zone]:port An .Ar IP address and a .Ar port VNC should listen on. The default is to listen on localhost IPv4 address and default VNC port 5900. An IPv6 address must be enclosed in square brackets and may contain an optional zone identifier. .It Ar width No and Ar height A display resolution, width and height, respectively. If not specified, a default resolution of 1024x768 pixels will be used. Minimal supported resolution is 640x480 pixels, and maximum is 1920x1200 pixels. .It Ar vgaconf Possible values for this option are .Dq io (default), .Dq on , and .Dq off . PCI graphics cards have a dual personality in that they are standard PCI devices with BAR addressing, but may also implicitly decode legacy VGA I/O space .Pq Ad 0x3c0-3df and memory space .Pq 64KB at Ad 0xA0000 . The default .Dq io option should be used for guests that attempt to issue BIOS calls which result in I/O port queries, and fail to boot if I/O decode is disabled. .Pp The .Dq on option should be used along with the CSM BIOS capability in UEFI to boot traditional BIOS guests that require the legacy VGA I/O and memory regions to be available. .Pp The .Dq off option should be used for the UEFI guests that assume that VGA adapter is present if they detect the I/O ports. An example of such a guest is .Ox in UEFI mode. .Pp Please refer to the .Nm .Fx wiki page .Pq Lk https://wiki.freebsd.org/bhyve for configuration notes of particular guests. .It wait Instruct .Nm to only boot upon the initiation of a VNC connection, simplifying the installation of operating systems that require immediate keyboard input. This can be removed for post-installation use. .It password This type of authentication is known to be cryptographically weak and is not intended for use on untrusted networks. Many implementations will want to use stronger security, such as running the session over an encrypted channel provided by IPsec or SSH. .El .El .Pp xHCI USB devices: .Bl -tag -width 10n .It Li tablet A USB tablet device which provides precise cursor synchronization when using VNC. .El .Pp NVMe devices: .Bl -tag -width 10n .It Li devpath Accepted device paths are: .Ar /dev/blockdev or .Ar /path/to/image or .Ar ram=size_in_MiB . .It Li maxq Max number of queues. .It Li qsz Max elements in each queue. .It Li ioslots Max number of concurrent I/O requests. .It Li sectsz Sector size (defaults to blockif sector size). .It Li ser Serial number with maximum 20 characters. .El .Pp HD Audio devices: .Bl -tag -width 10n .It Li play Playback device, typically .Ar /dev/dsp0 . .It Li rec Recording device, typically .Ar /dev/dsp0 . .El .El .It Fl S Wire guest memory. .It Fl u RTC keeps UTC time. 
.It Fl U Ar uuid Set the universally unique identifier .Pq UUID in the guest's System Management BIOS System Information structure. By default a UUID is generated from the host's hostname and .Ar vmname . .It Fl w Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. .It Fl W Force virtio PCI device emulations to use MSI interrupts instead of MSI-X interrupts. .It Fl x The guest's local APIC is configured in x2APIC mode. .It Fl Y Disable MPtable generation. .It Ar vmname Alphanumeric name of the guest. This should be the same as that created by .Xr bhyveload 8 . .El .Sh DEBUG SERVER The current debug server provides limited support for debuggers. .Ss Registers Each virtual CPU is exposed to the debugger as a thread. .Pp General purpose registers can be queried for each virtual CPU, but other registers such as floating-point and system registers cannot be queried. .Ss Memory Memory (including memory mapped I/O regions) can be read and written by the debugger. Memory operations use virtual addresses that are resolved to physical addresses via the current virtual CPU's active address translation. .Ss Control The running guest can be interrupted by the debugger at any time .Pq for example, by pressing Ctrl-C in the debugger . .Pp Single stepping is only supported on Intel CPUs supporting the MTRAP VM exit. .Pp Breakpoints are supported on Intel CPUs that support single stepping. Note that continuing from a breakpoint while interrupts are enabled in the guest may not work as expected due to timer interrupts firing while single stepping over the breakpoint. .Sh SIGNAL HANDLING .Nm deals with the following signals: .Pp .Bl -tag -width indent -compact .It SIGTERM Trigger ACPI poweroff for a VM .El .Sh EXIT STATUS Exit status indicates how the VM was terminated: .Pp .Bl -tag -width indent -compact .It 0 rebooted .It 1 powered off .It 2 halted .It 3 triple fault .It 4 exited due to an error .El .Sh EXAMPLES If not using a boot ROM, the guest operating system must have been loaded with .Xr bhyveload 8 or a similar boot loader before .Xr bhyve 4 can be run. Otherwise, the boot loader is not needed. .Pp To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio block device backed by the .Pa /my/image filesystem image, and a serial port for the console: .Bd -literal -offset indent bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ -l com1,stdio -A -H -P -m 1G vm1 .Ed .Pp Run a 24GB single-CPU virtual machine with three network ports, one of which has a MAC address specified: .Bd -literal -offset indent bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ -s 2:1,virtio-net,tap1 \\ -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ -s 3,virtio-blk,/my/image -l com1,stdio \\ -A -H -P -m 24G bigvm .Ed .Pp Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI CD-ROM, a single virtio network port, an AMD hostbridge, and the console port connected to an .Xr nmdm 4 null-modem device. .Bd -literal -offset indent bhyve -c 4 \\ -s 0,amd_hostbridge -s 1,lpc \\ -s 1:0,ahci,hd:/images/disk.1,hd:/images/disk.2,\\ hd:/images/disk.3,hd:/images/disk.4,\\ hd:/images/disk.5,hd:/images/disk.6,\\ hd:/images/disk.7,hd:/images/disk.8,\\ cd:/images/install.iso \\ -s 3,virtio-net,tap0 \\ -l com1,/dev/nmdm0A \\ -A -H -P -m 8G .Ed .Pp Run a UEFI virtual machine with a display resolution of 800 by 600 pixels that can be accessed via VNC at: 0.0.0.0:5900. 
.Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 3,ahci-cd,/path/to/uefi-OS-install.iso \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=0.0.0.0:5900,w=800,h=600,wait \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Pp Run a UEFI virtual machine with a VNC display that is bound to all IPv6 addresses on port 5900. .Bd -literal -offset indent bhyve -c 2 -m 4G -w -H \\ -s 0,hostbridge \\ -s 4,ahci-hd,disk.img \\ -s 5,virtio-net,tap0 \\ -s 29,fbuf,tcp=[::]:5900,w=800,h=600 \\ -s 30,xhci,tablet \\ -s 31,lpc -l com1,stdio \\ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ uefivm .Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr nmdm 4 , .Xr vmm 4 , .Xr ethers 5 , .Xr bhyvectl 8 , .Xr bhyveload 8 .Pp .Rs .%A Intel .%B 64 and IA-32 Architectures Software Developer’s Manual .%V Volume 3 .Re .Sh HISTORY .Nm first appeared in .Fx 10.0 . .Sh AUTHORS .An Neel Natu Aq Mt neel@freebsd.org .An Peter Grehan Aq Mt grehan@freebsd.org diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 324e40d2cda2..8d73bd38cae4 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1254 +1,1428 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include +#ifdef BHYVE_SNAPSHOT +#include +#include +#endif #include +#ifdef BHYVE_SNAPSHOT +#include +#endif #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include +#ifdef BHYVE_SNAPSHOT +#include +#endif #include #include #include #include #include #include #include #include +#ifdef BHYVE_SNAPSHOT +#include +#include + +#include +#endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "bootrom.h" #include "inout.h" #include "dbgport.h" #include "debug.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" +#ifdef BHYVE_SNAPSHOT +#include "snapshot.h" +#endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", 
[EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); -char *vmname; +const char *vmname; int guest_ncpus; uint16_t cores, maxcpus, sockets, threads; char *guest_uuid_str; int raw_stdio = 0; static int gdb_port = 0; static int guest_vmexit_on_hlt, guest_vmexit_on_pause; static int virtio_msix = 1; static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; static int acpi; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; static void usage(int code) { fprintf(stderr, "Usage: %s [-abehuwxACHPSWY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-g ] [-l ]\n" " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create ACPI tables\n" " -c: number of cpus and/or topology specification\n" " -C: include guest memory in core file\n" " -e: exit on unhandled I/O access\n" " -g: gdb port\n" " -h: help\n" " -H: vmexit from the guest on hlt\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" +#ifdef BHYVE_SNAPSHOT + " -r: path to checkpoint file\n" +#endif " -p: pin 'vcpu' to 'hostcpu'\n" " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" " -S: guest memory cannot be swapped\n" " -u: RTC keeps UTC time\n" " -U: uuid\n" " -w: ignore unimplemented MSRs\n" " -W: force virtio to use single-vector MSI\n" " -x: local apic is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",,". * 2. It accepts whitespace after = and before value. * 3. Values out of range of INT are silently wrapped. * 4. It doesn't check non-final values. * 5. The apparently bogus limits of UINT16_MAX are for future expansion. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. 
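* * For example, '-c 4,sockets=2,cores=2' is parsed as cpus=4, sockets=2, * cores=2 and threads=1, and is accepted because 4 equals 2 * 2 * 1.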
*/ static int topology_parse(const char *opt) { uint64_t ncpus; int c, chk, n, s, t, tmp; char *cp, *str; bool ns, scts; c = 1, n = 1, s = 1, t = 1; ns = false, scts = false; str = strdup(opt); if (str == NULL) goto out; while ((cp = strsep(&str, ",")) != NULL) { if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { n = tmp; ns = true; } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) { n = tmp; ns = true; } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) { s = tmp; scts = true; } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) { c = tmp; scts = true; } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) { t = tmp; scts = true; #ifdef notyet /* Do not expose this until vmm.ko implements it */ } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) { m = tmp; #endif /* Skip the empty argument case from -c "" */ } else if (cp[0] == '\0') continue; else goto out; /* Any trailing garbage causes an error */ if (cp[chk] != '\0') goto out; } free(str); str = NULL; /* * Range check 1 <= n <= UINT16_MAX all values */ if (n < 1 || s < 1 || c < 1 || t < 1 || n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX || t > UINT16_MAX) return (-1); /* If only the cpus was specified, use that as sockets */ if (!scts) s = n; /* * Compute sockets * cores * threads avoiding overflow * The range check above insures these are 16 bit values * If n was specified check it against computed ncpus */ ncpus = (uint64_t)s * c * t; if (ncpus > UINT16_MAX || (ns && n != ncpus)) return (-1); guest_ncpus = ncpus; sockets = s; cores = c; threads = t; return(0); out: free(str); return (-1); } static int pincpu_parse(const char *opt) { int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0 || vcpu >= VM_MAXCPU) { fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", vcpu, VM_MAXCPU - 1); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } if (vcpumap[vcpu] == NULL) { if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { perror("malloc"); return (-1); } CPU_ZERO(vcpumap[vcpu]); } CPU_SET(pcpu, vcpumap[vcpu]); return (0); } void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; int error, restart_instruction; ctx = arg; restart_instruction = 1; error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } +#ifdef BHYVE_SNAPSHOT +uintptr_t +paddr_host2guest(struct vmctx *ctx, void *addr) +{ + return (vm_rev_map_gpa(ctx, addr)); +} +#endif + int fbsdrun_vmexit_on_pause(void) { return (guest_vmexit_on_pause); } int fbsdrun_vmexit_on_hlt(void) { return (guest_vmexit_on_hlt); } int fbsdrun_virtio_msix(void) { return (virtio_msix); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_add(vcpu); +#endif if (gdb_port != 0) gdb_cpu_add(vcpu); vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) { int error; assert(fromcpu == BSP); /* * The 'newcpu' must be activated in the context of 
'fromcpu'. If * vm_activate_cpu() is delayed until newcpu's pthread starts running * then vmm.ko is out-of-sync with bhyve and this can create a race * with vm_suspend(). */ error = vm_activate_cpu(ctx, newcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); /* * Set up the vmexit struct to allow execution to start * at the given RIP */ vmexit[newcpu].rip = rip; vmexit[newcpu].inst_length = 0; mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); assert(error == 0); } static int fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme, strictio); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vmexit->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { (void)spinup_ap(ctx, *pvcpu, vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", 
vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason, vmexit_vmx_desc(vmexit->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(ctx, *pvcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. 
*/ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_mtrap++; - if (gdb_port == 0) { - fprintf(stderr, "vm_loop: unexpected VMEXIT_MTRAP\n"); - exit(4); - } - gdb_cpu_mtrap(*pvcpu); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_suspend(*pvcpu); +#endif + if (gdb_port != 0) + gdb_cpu_mtrap(*pvcpu); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_resume(*pvcpu); +#endif + return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { int err, i; struct vie *vie; stats.vmexit_inst_emul++; vie = &vmexit->u.inst_emul.vie; err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, vie, &vmexit->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vmexit->u.inst_emul.gpa); } fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vmexit->rip); return (VMEXIT_ABORT); } return (VMEXIT_CONTINUE); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { enum vm_suspend_how how; how = vmexit->u.suspended.how; fbsdrun_deletecpu(ctx, *pvcpu); if (*pvcpu != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { - if (gdb_port == 0) { - fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); - exit(4); - } - gdb_cpu_suspend(*pvcpu); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_suspend(*pvcpu); +#endif + if (gdb_port != 0) + gdb_cpu_suspend(*pvcpu); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_resume(*pvcpu); +#endif return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { if (gdb_port == 0) { fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); exit(4); } gdb_cpu_breakpoint(*pvcpu, vmexit); return (VMEXIT_CONTINUE); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, }; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) { int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; if (vcpumap[vcpu] != NULL) { error = 
pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); while (1) { error = vm_run(ctx, vcpu, &vmexit[vcpu]); if (error != 0) break; exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { int tmp, error; error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ if (error == 0) return (VM_MAXCPU); else return (1); } void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; if (fbsdrun_vmexit_on_hlt()) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (fbsdrun_vmexit_on_pause()) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (x2apic_mode) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; const cap_ioctl_t *cmds; size_t ncmds; #endif reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. 
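* Without a bootrom the VM is expected to already exist, having been set up * by bhyveload(8); in that case vm_create() above fails with EEXIST and the * existing VM is reused.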
*/ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); vm_get_ioctls(&ncmds); cmds = vm_get_ioctls(NULL); if (cmds == NULL) errx(EX_OSERR, "out of memory"); if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); free((cap_ioctl_t *)cmds); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } +void +spinup_vcpu(struct vmctx *ctx, int vcpu) +{ + int error; + uint64_t rip; + + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + fbsdrun_set_capabilities(ctx, vcpu); + error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + assert(error == 0); + + fbsdrun_addcpu(ctx, BSP, vcpu, rip); +} + int main(int argc, char *argv[]) { int c, error, dbg_port, err, bvmcons; int max_vcpus, mptgen, memflags; int rtc_localtime; bool gdb_stop; struct vmctx *ctx; uint64_t rip; size_t memsize; char *optstr; +#ifdef BHYVE_SNAPSHOT + char *restore_file; + struct restore_state rstate; + int vcpu; + + restore_file = NULL; +#endif bvmcons = 0; progname = basename(argv[0]); dbg_port = 0; gdb_stop = false; guest_ncpus = 1; sockets = cores = threads = 1; maxcpus = 0; memsize = 256 * MB; mptgen = 1; rtc_localtime = 1; memflags = 0; +#ifdef BHYVE_SNAPSHOT + optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:r:"; +#else optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:"; +#endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': x2apic_mode = 0; break; case 'A': acpi = 1; break; case 'b': bvmcons = 1; break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': memflags |= VM_MEM_F_INCORE; break; case 'g': dbg_port = atoi(optarg); break; case 'G': if (optarg[0] == 'w') { gdb_stop = true; optarg++; } gdb_port = atoi(optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; +#ifdef BHYVE_SNAPSHOT + case 'r': + restore_file = optarg; + break; +#endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': memflags |= VM_MEM_F_WIRED; break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", optarg); break; case 'H': guest_vmexit_on_hlt = 1; break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. 
*/ break; case 'P': guest_vmexit_on_pause = 1; break; case 'e': strictio = 1; break; case 'u': rtc_localtime = 0; break; case 'U': guest_uuid_str = optarg; break; case 'w': strictmsr = 0; break; case 'W': virtio_msix = 0; break; case 'x': x2apic_mode = 1; break; case 'Y': mptgen = 0; break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; +#ifdef BHYVE_SNAPSHOT + if (argc > 1 || (argc == 0 && restore_file == NULL)) + usage(1); + + if (restore_file != NULL) { + error = load_restore_file(restore_file, &rstate); + if (error) { + fprintf(stderr, "Failed to read checkpoint info from " + "file: '%s'.\n", restore_file); + exit(1); + } + } + + if (argc == 1) { + vmname = argv[0]; + } else { + vmname = lookup_vmname(&rstate); + if (vmname == NULL) { + fprintf(stderr, "Cannot find VM name in restore file. " + "Please specify one.\n"); + exit(1); + } + } +#else if (argc != 1) usage(1); vmname = argv[0]; +#endif ctx = do_open(vmname); +#ifdef BHYVE_SNAPSHOT + if (restore_file != NULL) { + guest_ncpus = lookup_guest_ncpus(&rstate); + memflags = lookup_memflags(&rstate); + memsize = lookup_memsize(&rstate); + } + + if (guest_ncpus < 1) { + fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); + exit(1); + } +#endif + max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(ctx, BSP); vm_set_memflags(ctx, memflags); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(); init_inout(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx, rtc_localtime); sci_init(ctx); /* * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (acpi) vmgenc_init(ctx); if (dbg_port != 0) init_dbgport(dbg_port); if (gdb_port != 0) init_gdb(ctx, gdb_port, gdb_stop); if (bvmcons) init_bvmcons(); if (lpc_bootrom()) { if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(ctx, BSP); assert(error == 0); } +#ifdef BHYVE_SNAPSHOT + if (restore_file != NULL) { + fprintf(stdout, "Pausing pci devs...\r\n"); + if (vm_pause_user_devs(ctx) != 0) { + fprintf(stderr, "Failed to pause PCI device state.\n"); + exit(1); + } + + fprintf(stdout, "Restoring vm mem...\r\n"); + if (restore_vm_mem(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore VM memory.\n"); + exit(1); + } + + fprintf(stdout, "Restoring pci devs...\r\n"); + if (vm_restore_user_devs(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore PCI device state.\n"); + exit(1); + } + + fprintf(stdout, "Restoring kernel structs...\r\n"); + if (vm_restore_kern_structs(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore kernel structs.\n"); + exit(1); + } + + fprintf(stdout, "Resuming pci devs...\r\n"); + if (vm_resume_user_devs(ctx) != 0) { + fprintf(stderr, "Failed to resume PCI device state.\n"); + exit(1); + } + } +#endif + error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. 
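* (the MP table unless -Y was given, the SMBIOS tables, and the ACPI tables * when -A was given).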
*/ if (mptgen) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); assert(error == 0); if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } if (lpc_bootrom()) fwctl_init(); /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif +#ifdef BHYVE_SNAPSHOT + if (restore_file != NULL) + destroy_restore_state(&rstate); + + /* + * checkpointing thread for communication with bhyvectl + */ + if (init_checkpoint_thread(ctx) < 0) + printf("Failed to start checkpoint thread!\r\n"); + + if (restore_file != NULL) + vm_restore_time(ctx); +#endif + /* * Add CPU 0 */ fbsdrun_addcpu(ctx, BSP, BSP, rip); +#ifdef BHYVE_SNAPSHOT + /* + * If we restore a VM, start all vCPUs now (including APs); otherwise, + * let the guest OS spin them up later via vmexits. + */ + if (restore_file != NULL) { + for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { + if (vcpu == BSP) + continue; + + fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu); + spinup_vcpu(ctx, vcpu); + } + } +#endif + /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h index 0b23a6a5c3ae..0177baca14e9 100644 --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -1,52 +1,55 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * $FreeBSD$ */ #ifndef _FBSDRUN_H_ #define _FBSDRUN_H_ #define VMEXIT_CONTINUE (0) #define VMEXIT_ABORT (-1) struct vmctx; extern int guest_ncpus; extern uint16_t cores, sockets, threads; extern char *guest_uuid_str; -extern char *vmname; +extern const char *vmname; void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); +#ifdef BHYVE_SNAPSHOT +uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr); +#endif void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); int fbsdrun_muxed(void); int fbsdrun_vmexit_on_hlt(void); int fbsdrun_vmexit_on_pause(void); int fbsdrun_disable_x2apic(void); int fbsdrun_virtio_msix(void); #endif diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c index 50e1eed12f90..4c91038ca765 100644 --- a/usr.sbin/bhyve/block_if.c +++ b/usr.sbin/bhyve/block_if.c @@ -1,857 +1,988 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Peter Grehan * All rights reserved. * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include +#include #include "bhyverun.h" #include "debug.h" #include "mevent.h" #include "block_if.h" #define BLOCKIF_SIG 0xb109b109 #define BLOCKIF_NUMTHR 8 #define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) enum blockop { BOP_READ, BOP_WRITE, BOP_FLUSH, BOP_DELETE }; enum blockstat { BST_FREE, BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE }; struct blockif_elem { TAILQ_ENTRY(blockif_elem) be_link; struct blockif_req *be_req; enum blockop be_op; enum blockstat be_status; pthread_t be_tid; off_t be_block; }; struct blockif_ctxt { int bc_magic; int bc_fd; int bc_ischr; int bc_isgeom; int bc_candelete; int bc_rdonly; off_t bc_size; int bc_sectsz; int bc_psectsz; int bc_psectoff; int bc_closing; + int bc_paused; + int bc_work_count; pthread_t bc_btid[BLOCKIF_NUMTHR]; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; + pthread_cond_t bc_paused_cond; + pthread_cond_t bc_work_done_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_pendq; TAILQ_HEAD(, blockif_elem) bc_busyq; struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; }; static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; struct blockif_sig_elem { pthread_mutex_t bse_mtx; pthread_cond_t bse_cond; int bse_pending; struct blockif_sig_elem *bse_next; }; static struct blockif_sig_elem *blockif_bse_head; static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { struct blockif_elem *be, *tbe; off_t off; int i; be = TAILQ_FIRST(&bc->bc_freeq); assert(be != NULL); assert(be->be_status == BST_FREE); TAILQ_REMOVE(&bc->bc_freeq, be, be_link); be->be_req = breq; be->be_op = op; switch (op) { case BOP_READ: case BOP_WRITE: case BOP_DELETE: off = breq->br_offset; for (i = 0; i < breq->br_iovcnt; i++) off += breq->br_iov[i].iov_len; break; default: off = OFF_MAX; } be->be_block = off; TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_block == breq->br_offset) break; } if (tbe == NULL) { TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { if (tbe->be_block == breq->br_offset) break; } } if (tbe == NULL) be->be_status = BST_PEND; else be->be_status = BST_BLOCK; TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); return (be->be_status == BST_PEND); } static int blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_status == BST_PEND) break; assert(be->be_status == BST_BLOCK); } if (be == NULL) return (0); TAILQ_REMOVE(&bc->bc_pendq, be, be_link); be->be_status = BST_BUSY; be->be_tid = t; TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); *bep = be; return (1); } static void blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) { struct blockif_elem *tbe; if (be->be_status == BST_DONE || be->be_status == BST_BUSY) TAILQ_REMOVE(&bc->bc_busyq, be, be_link); else TAILQ_REMOVE(&bc->bc_pendq, be, be_link); TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_req->br_offset == be->be_block) tbe->be_status = BST_PEND; } be->be_tid = 0; be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); } +static int +blockif_flush_bc(struct blockif_ctxt *bc) +{ + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + return (errno); + } 
else if (fsync(bc->bc_fd)) + return (errno); + + return (0); +} + static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct blockif_req *br; off_t arg[2]; ssize_t clen, len, off, boff, voff; int i, err; br = be->be_req; if (br->br_iovcnt <= 1) buf = NULL; err = 0; switch (be->be_op) { case BOP_READ: if (buf == NULL) { if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= len; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); if (pread(bc->bc_fd, buf, len, br->br_offset + off) < 0) { err = errno; break; } boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(br->br_iov[i].iov_base + voff, buf + boff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); off += len; br->br_resid -= len; } break; case BOP_WRITE: if (bc->bc_rdonly) { err = EROFS; break; } if (buf == NULL) { if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= len; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(buf + boff, br->br_iov[i].iov_base + voff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); if (pwrite(bc->bc_fd, buf, len, br->br_offset + off) < 0) { err = errno; break; } off += len; br->br_resid -= len; } break; case BOP_FLUSH: - if (bc->bc_ischr) { - if (ioctl(bc->bc_fd, DIOCGFLUSH)) - err = errno; - } else if (fsync(bc->bc_fd)) - err = errno; + err = blockif_flush_bc(bc); break; case BOP_DELETE: if (!bc->bc_candelete) err = EOPNOTSUPP; else if (bc->bc_rdonly) err = EROFS; else if (bc->bc_ischr) { arg[0] = br->br_offset; arg[1] = br->br_resid; if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) err = errno; else br->br_resid = 0; } else err = EOPNOTSUPP; break; default: err = EINVAL; break; } be->be_status = BST_DONE; (*br->br_callback)(br, err); } static void * blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; pthread_t t; uint8_t *buf; bc = arg; if (bc->bc_isgeom) buf = malloc(MAXPHYS); else buf = NULL; t = pthread_self(); pthread_mutex_lock(&bc->bc_mtx); for (;;) { - while (blockif_dequeue(bc, t, &be)) { + bc->bc_work_count++; + + /* We cannot process work if the interface is paused */ + while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } + + bc->bc_work_count--; + + /* If none of the workers are busy, notify the main thread */ + if (bc->bc_work_count == 0) + pthread_cond_broadcast(&bc->bc_work_done_cond); + /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) break; + + /* Make all worker threads wait here if the device is paused */ + while (bc->bc_paused) + pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx); + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } pthread_mutex_unlock(&bc->bc_mtx); if (buf) free(buf); pthread_exit(NULL); return (NULL); } static void blockif_sigcont_handler(int signal, enum ev_type type, void *arg) { struct blockif_sig_elem *bse; for (;;) { /* * Process the entire list even if not intended for * this thread. 
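* Each entry is detached from the list with atomic_cmpset_ptr() so that * concurrent handler invocations pop distinct entries instead of racing on * the list head.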
*/ do { bse = blockif_bse_head; if (bse == NULL) return; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)bse, (uintptr_t)bse->bse_next)); pthread_mutex_lock(&bse->bse_mtx); bse->bse_pending = 0; pthread_cond_signal(&bse->bse_cond); pthread_mutex_unlock(&bse->bse_mtx); } } static void blockif_init(void) { mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); (void) signal(SIGCONT, SIG_IGN); } struct blockif_ctxt * blockif_open(const char *optstr, const char *ident) { char tname[MAXCOMLEN + 1]; char name[MAXPATHLEN]; char *nopt, *xopts, *cp; struct blockif_ctxt *bc; struct stat sbuf; struct diocgattr_arg arg; off_t size, psectsz, psectoff; int extra, fd, i, sectsz; int nocache, sync, ro, candelete, geom, ssopt, pssopt; int nodelete; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; #endif pthread_once(&blockif_once, blockif_init); fd = -1; ssopt = 0; nocache = 0; sync = 0; ro = 0; nodelete = 0; /* * The first element in the optstring is always a pathname. * Optional elements follow */ nopt = xopts = strdup(optstr); while (xopts != NULL) { cp = strsep(&xopts, ","); if (cp == nopt) /* file or device pathname */ continue; else if (!strcmp(cp, "nocache")) nocache = 1; else if (!strcmp(cp, "nodelete")) nodelete = 1; else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) sync = 1; else if (!strcmp(cp, "ro")) ro = 1; else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) ; else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) pssopt = ssopt; else { EPRINTLN("Invalid device option \"%s\"", cp); goto err; } } extra = 0; if (nocache) extra |= O_DIRECT; if (sync) extra |= O_SYNC; fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); if (fd < 0 && !ro) { /* Attempt a r/w fail with a r/o open */ fd = open(nopt, O_RDONLY | extra); ro = 1; } if (fd < 0) { warn("Could not open backing file: %s", nopt); goto err; } if (fstat(fd, &sbuf) < 0) { warn("Could not stat backing file %s", nopt); goto err; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, CAP_WRITE); if (ro) cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; psectsz = psectoff = 0; candelete = geom = 0; if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, §sz)) { perror("Could not fetch dev blk/sector size"); goto err; } assert(size != 0); assert(sectsz != 0); if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) candelete = arg.value.i; if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) geom = 1; } else psectsz = sbuf.st_blksize; #ifndef WITHOUT_CAPSICUM if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (ssopt != 0) { if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || ssopt > pssopt) { EPRINTLN("Invalid sector size %d/%d", ssopt, pssopt); goto err; } /* * Some backend drivers (e.g. cd0, ada0) require that the I/O * size be a multiple of the device's sector size. * * Validate that the emulated sector size complies with this * requirement. 
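 * For example, on a character device that reports 4096-byte native sectors
 * (sectsz == 4096), "sectorsize=512" is rejected because 512 < 4096, while
 * "sectorsize=4096" or "sectorsize=8192" passes the ssopt % sectsz check.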
*/ if (S_ISCHR(sbuf.st_mode)) { if (ssopt < sectsz || (ssopt % sectsz) != 0) { EPRINTLN("Sector size %d incompatible " "with underlying device sector size %d", ssopt, sectsz); goto err; } } sectsz = ssopt; psectsz = pssopt; psectoff = 0; } bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { perror("calloc"); goto err; } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; bc->bc_ischr = S_ISCHR(sbuf.st_mode); bc->bc_isgeom = geom; bc->bc_candelete = candelete; bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; bc->bc_psectsz = psectsz; bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); + bc->bc_paused = 0; + bc->bc_work_count = 0; + pthread_cond_init(&bc->bc_paused_cond, NULL); + pthread_cond_init(&bc->bc_work_done_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } for (i = 0; i < BLOCKIF_NUMTHR; i++) { pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); pthread_set_name_np(bc->bc_btid[i], tname); } return (bc); err: if (fd >= 0) close(fd); free(nopt); return (NULL); } static int blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { int err; err = 0; pthread_mutex_lock(&bc->bc_mtx); if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ if (blockif_enqueue(bc, breq, op)) pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than * the specified blockif queue limit. Return an * error to indicate that the queue length has been * exceeded. */ err = E2BIG; } pthread_mutex_unlock(&bc->bc_mtx); return (err); } int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_READ)); } int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_WRITE)); } int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_FLUSH)); } int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_DELETE)); } int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { struct blockif_elem *be; assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); + /* XXX: not waiting while paused */ + /* * Check pending requests. */ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_req == breq) break; } if (be != NULL) { /* * Found it. */ blockif_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); } /* * Check in-flight requests. */ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { if (be->be_req == breq) break; } if (be == NULL) { /* * Didn't find it. */ pthread_mutex_unlock(&bc->bc_mtx); return (EINVAL); } /* * Interrupt the processing thread to force it return * prematurely via it's normal callback path. 
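As a usage note (not part of the patch): a device emulation drives this API by filling in a struct blockif_req and handing it to blockif_read()/blockif_write(); the completion callback later runs on one of the worker threads above. A minimal sketch with illustrative names:

#include <sys/types.h>
#include <string.h>

#include "block_if.h"

/* Completion callback: invoked on a blockif worker thread. */
static void
example_done(struct blockif_req *br, int err)
{

	/* 'err' is 0 or an errno value; br->br_param identifies the request. */
	(void)br;
	(void)err;
}

/*
 * Submit a single-segment read of 'len' bytes at byte offset 'off'.
 * Returns 0 on success, or E2BIG once the internal request queue is full.
 */
static int
example_submit_read(struct blockif_ctxt *bc, void *buf, size_t len, off_t off)
{
	static struct blockif_req br;	/* sketch: a single outstanding request */

	memset(&br, 0, sizeof(br));
	br.br_callback = example_done;
	br.br_param = NULL;
	br.br_offset = off;
	br.br_resid = len;
	br.br_iovcnt = 1;
	br.br_iov[0].iov_base = buf;
	br.br_iov[0].iov_len = len;
	return (blockif_read(bc, &br));
}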
*/ while (be->be_status == BST_BUSY) { struct blockif_sig_elem bse, *old_head; pthread_mutex_init(&bse.bse_mtx, NULL); pthread_cond_init(&bse.bse_cond, NULL); bse.bse_pending = 1; do { old_head = blockif_bse_head; bse.bse_next = old_head; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)old_head, (uintptr_t)&bse)); pthread_kill(be->be_tid, SIGCONT); pthread_mutex_lock(&bse.bse_mtx); while (bse.bse_pending) pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); pthread_mutex_unlock(&bse.bse_mtx); } pthread_mutex_unlock(&bc->bc_mtx); /* * The processing thread has been interrupted. Since it's not * clear if the callback has been invoked yet, return EBUSY. */ return (EBUSY); } int blockif_close(struct blockif_ctxt *bc) { void *jval; int i; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ pthread_mutex_lock(&bc->bc_mtx); bc->bc_closing = 1; pthread_mutex_unlock(&bc->bc_mtx); pthread_cond_broadcast(&bc->bc_cond); for (i = 0; i < BLOCKIF_NUMTHR; i++) pthread_join(bc->bc_btid[i], &jval); /* XXX Cancel queued i/o's ??? */ /* * Release resources */ bc->bc_magic = 0; close(bc->bc_fd); free(bc); return (0); } /* * Return virtual C/H/S values for a given block. Use the algorithm * outlined in the VHD specification to calculate values. */ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { off_t sectors; /* total sectors of the block dev */ off_t hcyl; /* cylinders times heads */ uint16_t secpt; /* sectors per track */ uint8_t heads; assert(bc->bc_magic == BLOCKIF_SIG); sectors = bc->bc_size / bc->bc_sectsz; /* Clamp the size to the largest possible with CHS */ if (sectors > 65535UL*16*255) sectors = 65535UL*16*255; if (sectors >= 65536UL*16*63) { secpt = 255; heads = 16; hcyl = sectors / secpt; } else { secpt = 17; hcyl = sectors / secpt; heads = (hcyl + 1023) / 1024; if (heads < 4) heads = 4; if (hcyl >= (heads * 1024) || heads > 16) { secpt = 31; heads = 16; hcyl = sectors / secpt; } if (hcyl >= (heads * 1024)) { secpt = 63; heads = 16; hcyl = sectors / secpt; } } *c = hcyl / heads; *h = heads; *s = secpt; } /* * Accessors */ off_t blockif_size(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_size); } int blockif_sectsz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_sectsz); } void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) { assert(bc->bc_magic == BLOCKIF_SIG); *size = bc->bc_psectsz; *off = bc->bc_psectoff; } int blockif_queuesz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (BLOCKIF_MAXREQ - 1); } int blockif_is_ro(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } int blockif_candelete(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_candelete); } + +#ifdef BHYVE_SNAPSHOT +void +blockif_pause(struct blockif_ctxt *bc) +{ + assert(bc != NULL); + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_paused = 1; + + /* The interface is paused. 
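The check in blockif_snapshot() below implies the save-side ordering pause -> snapshot -> resume. A minimal sketch of that sequence (illustrative function name; assumes BHYVE_SNAPSHOT is defined and that 'meta' comes from the snapshot framework):

#include "block_if.h"

static int
example_snapshot_blockif(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
{
	int error;

	blockif_pause(bc);	/* quiesce the workers, flush the backing store */
	error = blockif_snapshot(bc, meta);
	blockif_resume(bc);	/* reopen the gate and kick the queue */
	return (error);
}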
Wait for workers to finish their work */ + while (bc->bc_work_count) + pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx); + pthread_mutex_unlock(&bc->bc_mtx); + + if (blockif_flush_bc(bc)) + fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n", + __func__); +} + +void +blockif_resume(struct blockif_ctxt *bc) +{ + assert(bc != NULL); + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_paused = 0; + /* resume the threads waiting for paused */ + pthread_cond_broadcast(&bc->bc_paused_cond); + /* kick the threads after restore */ + pthread_cond_broadcast(&bc->bc_cond); + pthread_mutex_unlock(&bc->bc_mtx); +} + +int +blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta) +{ + int i; + struct iovec *iov; + int ret; + + SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done); + + /* + * XXX: The callback and parameter must be filled by the virtualized + * device that uses the interface, during its init; we're not touching + * them here. + */ + + /* Snapshot the iovecs. */ + for (i = 0; i < br->br_iovcnt; i++) { + iov = &br->br_iov[i]; + + SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done); + + /* We assume the iov is a guest-mapped address. */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len, + false, meta, ret, done); + } + +done: + return (ret); +} + +int +blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta) +{ + int ret; + + if (bc->bc_paused == 0) { + fprintf(stderr, "%s: Snapshot failed: " + "interface not paused.\r\n", __func__); + return (ENXIO); + } + + pthread_mutex_lock(&bc->bc_mtx); + + SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done); + +done: + pthread_mutex_unlock(&bc->bc_mtx); + return (ret); +} +#endif diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h index 75c016447ac2..f3b5b6938ef1 100644 --- a/usr.sbin/bhyve/block_if.h +++ b/usr.sbin/bhyve/block_if.h @@ -1,78 +1,89 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * The block API to be used by bhyve block-device emulations. The routines * are thread safe, with no assumptions about the context of the completion * callback - it may occur in the caller's context, or asynchronously in * another thread. */ #ifndef _BLOCK_IF_H_ #define _BLOCK_IF_H_ #include #include +struct vm_snapshot_meta; + + /* * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in * a single request. BLOCKIF_RING_MAX is the maxmimum number of * pending requests that can be queued. */ #define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */ #define BLOCKIF_RING_MAX 128 struct blockif_req { int br_iovcnt; off_t br_offset; ssize_t br_resid; void (*br_callback)(struct blockif_req *req, int err); void *br_param; struct iovec br_iov[BLOCKIF_IOV_MAX]; }; struct blockif_ctxt; struct blockif_ctxt *blockif_open(const char *optstr, const char *ident); off_t blockif_size(struct blockif_ctxt *bc); void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s); int blockif_sectsz(struct blockif_ctxt *bc); void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); int blockif_queuesz(struct blockif_ctxt *bc); int blockif_is_ro(struct blockif_ctxt *bc); int blockif_candelete(struct blockif_ctxt *bc); int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_close(struct blockif_ctxt *bc); +#ifdef BHYVE_SNAPSHOT +void blockif_pause(struct blockif_ctxt *bc); +void blockif_resume(struct blockif_ctxt *bc); +int blockif_snapshot_req(struct blockif_req *br, + struct vm_snapshot_meta *meta); +int blockif_snapshot(struct blockif_ctxt *bc, + struct vm_snapshot_meta *meta); +#endif #endif /* _BLOCK_IF_H_ */ diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c index c0c69d37f311..649a6b09cb34 100644 --- a/usr.sbin/bhyve/mevent.c +++ b/usr.sbin/bhyve/mevent.c @@ -1,492 +1,492 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Micro event library for FreeBSD, designed for a single i/o thread * using kqueue, and having events be persistent by default. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include "mevent.h" #define MEVENT_MAX 64 -extern char *vmname; +extern const char *vmname; static pthread_t mevent_tid; static int mevent_timid = 43; static int mevent_pipefd[2]; static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; struct mevent { void (*me_func)(int, enum ev_type, void *); #define me_msecs me_fd int me_fd; int me_timid; enum ev_type me_type; void *me_param; int me_cq; int me_state; /* Desired kevent flags. */ int me_closefd; LIST_ENTRY(mevent) me_list; }; static LIST_HEAD(listhead, mevent) global_head, change_head; static void mevent_qlock(void) { pthread_mutex_lock(&mevent_lmutex); } static void mevent_qunlock(void) { pthread_mutex_unlock(&mevent_lmutex); } static void mevent_pipe_read(int fd, enum ev_type type, void *param) { char buf[MEVENT_MAX]; int status; /* * Drain the pipe read side. The fd is non-blocking so this is * safe to do. */ do { status = read(fd, buf, sizeof(buf)); } while (status == MEVENT_MAX); } static void mevent_notify(void) { char c = '\0'; /* * If calling from outside the i/o thread, write a byte on the * pipe to force the i/o thread to exit the blocking kevent call. */ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { write(mevent_pipefd[1], &c, 1); } } static int mevent_kq_filter(struct mevent *mevp) { int retval; retval = 0; if (mevp->me_type == EVF_READ) retval = EVFILT_READ; if (mevp->me_type == EVF_WRITE) retval = EVFILT_WRITE; if (mevp->me_type == EVF_TIMER) retval = EVFILT_TIMER; if (mevp->me_type == EVF_SIGNAL) retval = EVFILT_SIGNAL; return (retval); } static int mevent_kq_flags(struct mevent *mevp) { return (mevp->me_state); } static int mevent_kq_fflags(struct mevent *mevp) { /* XXX nothing yet, perhaps EV_EOF for reads ? */ return (0); } static int mevent_build(int mfd, struct kevent *kev) { struct mevent *mevp, *tmpp; int i; i = 0; mevent_qlock(); LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { if (mevp->me_closefd) { /* * A close of the file descriptor will remove the * event */ close(mevp->me_fd); } else { if (mevp->me_type == EVF_TIMER) { kev[i].ident = mevp->me_timid; kev[i].data = mevp->me_msecs; } else { kev[i].ident = mevp->me_fd; kev[i].data = 0; } kev[i].filter = mevent_kq_filter(mevp); kev[i].flags = mevent_kq_flags(mevp); kev[i].fflags = mevent_kq_fflags(mevp); kev[i].udata = mevp; i++; } mevp->me_cq = 0; LIST_REMOVE(mevp, me_list); if (mevp->me_state & EV_DELETE) { free(mevp); } else { /* * We need to add the event only once, so we can * reset the EV_ADD bit after it has been propagated * to the kevent() arguments the first time. 
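mevent_notify() below relies on the classic self-pipe trick to break the dispatch thread out of its blocking kevent() call whenever another thread changes the event lists. A stripped-down illustration of the same pattern (illustrative names, not the mevent code itself):

#include <sys/types.h>
#include <sys/event.h>

#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>

static int wake_pipe[2];	/* [0] watched by kqueue, [1] written to */
static pthread_t loop_tid;

/* Called from any thread to make the event loop re-evaluate its state. */
static void
loop_wakeup(void)
{
	char c = '\0';

	if (pthread_self() != loop_tid)
		(void)write(wake_pipe[1], &c, 1);
}

static void
loop_run(int kq)
{
	struct kevent kev, ev;
	char buf[64];

	loop_tid = pthread_self();
	(void)pipe(wake_pipe);
	(void)fcntl(wake_pipe[0], F_SETFL, O_NONBLOCK);

	/* Watch the read side of the pipe alongside the real events. */
	EV_SET(&kev, wake_pipe[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);

	for (;;) {
		if (kevent(kq, NULL, 0, &ev, 1, NULL) < 1)
			continue;
		if ((int)ev.ident == wake_pipe[0]) {
			/* Drain the pipe; it is non-blocking, so this cannot hang. */
			while (read(wake_pipe[0], buf, sizeof(buf)) > 0)
				;
			/* ... apply queued changes to the kqueue here ... */
			continue;
		}
		/* ... dispatch the handler recorded in ev.udata here ... */
	}
}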
*/ mevp->me_state &= ~EV_ADD; LIST_INSERT_HEAD(&global_head, mevp, me_list); } assert(i < MEVENT_MAX); } mevent_qunlock(); return (i); } static void mevent_handle(struct kevent *kev, int numev) { struct mevent *mevp; int i; for (i = 0; i < numev; i++) { mevp = kev[i].udata; /* XXX check for EV_ERROR ? */ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); } } static struct mevent * mevent_add_state(int tfd, enum ev_type type, void (*func)(int, enum ev_type, void *), void *param, int state) { struct mevent *lp, *mevp; if (tfd < 0 || func == NULL) { return (NULL); } mevp = NULL; mevent_qlock(); /* * Verify that the fd/type tuple is not present in any list */ LIST_FOREACH(lp, &global_head, me_list) { if (type != EVF_TIMER && lp->me_fd == tfd && lp->me_type == type) { goto exit; } } LIST_FOREACH(lp, &change_head, me_list) { if (type != EVF_TIMER && lp->me_fd == tfd && lp->me_type == type) { goto exit; } } /* * Allocate an entry, populate it, and add it to the change list. */ mevp = calloc(1, sizeof(struct mevent)); if (mevp == NULL) { goto exit; } if (type == EVF_TIMER) { mevp->me_msecs = tfd; mevp->me_timid = mevent_timid++; } else mevp->me_fd = tfd; mevp->me_type = type; mevp->me_func = func; mevp->me_param = param; LIST_INSERT_HEAD(&change_head, mevp, me_list); mevp->me_cq = 1; mevp->me_state = state; mevent_notify(); exit: mevent_qunlock(); return (mevp); } struct mevent * mevent_add(int tfd, enum ev_type type, void (*func)(int, enum ev_type, void *), void *param) { return (mevent_add_state(tfd, type, func, param, EV_ADD)); } struct mevent * mevent_add_disabled(int tfd, enum ev_type type, void (*func)(int, enum ev_type, void *), void *param) { return (mevent_add_state(tfd, type, func, param, EV_ADD | EV_DISABLE)); } static int mevent_update(struct mevent *evp, bool enable) { int newstate; mevent_qlock(); /* * It's not possible to enable/disable a deleted event */ assert((evp->me_state & EV_DELETE) == 0); newstate = evp->me_state; if (enable) { newstate |= EV_ENABLE; newstate &= ~EV_DISABLE; } else { newstate |= EV_DISABLE; newstate &= ~EV_ENABLE; } /* * No update needed if state isn't changing */ if (evp->me_state != newstate) { evp->me_state = newstate; /* * Place the entry onto the changed list if not * already there. */ if (evp->me_cq == 0) { evp->me_cq = 1; LIST_REMOVE(evp, me_list); LIST_INSERT_HEAD(&change_head, evp, me_list); mevent_notify(); } } mevent_qunlock(); return (0); } int mevent_enable(struct mevent *evp) { return (mevent_update(evp, true)); } int mevent_disable(struct mevent *evp) { return (mevent_update(evp, false)); } static int mevent_delete_event(struct mevent *evp, int closefd) { mevent_qlock(); /* * Place the entry onto the changed list if not already there, and * mark as to be deleted. 
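Taken together, the add/enable/disable/delete entry points give consumers a small persistent-event API. A hypothetical consumer (only the mevent_* calls are real; everything else is illustrative):

#include "mevent.h"

static struct mevent *rx_event;

static void
rx_ready(int fd, enum ev_type type, void *param)
{

	/* Read from 'fd' until EAGAIN; the event stays armed (persistent). */
	(void)fd;
	(void)type;
	(void)param;
}

static void
example_setup(int fd)
{

	/* Register a read event, but keep it disabled until the device is ready. */
	rx_event = mevent_add_disabled(fd, EVF_READ, rx_ready, NULL);
	mevent_enable(rx_event);
}

static void
example_teardown(void)
{

	/* Remove the event and let mevent close the descriptor as well. */
	mevent_delete_close(rx_event);
	rx_event = NULL;
}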
*/ if (evp->me_cq == 0) { evp->me_cq = 1; LIST_REMOVE(evp, me_list); LIST_INSERT_HEAD(&change_head, evp, me_list); mevent_notify(); } evp->me_state = EV_DELETE; if (closefd) evp->me_closefd = 1; mevent_qunlock(); return (0); } int mevent_delete(struct mevent *evp) { return (mevent_delete_event(evp, 0)); } int mevent_delete_close(struct mevent *evp) { return (mevent_delete_event(evp, 1)); } static void mevent_set_name(void) { pthread_set_name_np(mevent_tid, "mevent"); } void mevent_dispatch(void) { struct kevent changelist[MEVENT_MAX]; struct kevent eventlist[MEVENT_MAX]; struct mevent *pipev; int mfd; int numev; int ret; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif mevent_tid = pthread_self(); mevent_set_name(); mfd = kqueue(); assert(mfd > 0); #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_KQUEUE); if (caph_rights_limit(mfd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Open the pipe that will be used for other threads to force * the blocking kqueue call to exit by writing to it. Set the * descriptor to non-blocking. */ ret = pipe(mevent_pipefd); if (ret < 0) { perror("pipe"); exit(0); } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Add internal event handler for the pipe write fd */ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); assert(pipev != NULL); for (;;) { /* * Build changelist if required. * XXX the changelist can be put into the blocking call * to eliminate the extra syscall. Currently better for * debug. */ numev = mevent_build(mfd, changelist); if (numev) { ret = kevent(mfd, changelist, numev, NULL, 0, NULL); if (ret == -1) { perror("Error return from kevent change"); } } /* * Block awaiting events */ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); if (ret == -1 && errno != EINTR) { perror("Error return from kevent monitor"); } /* * Handle reported events */ mevent_handle(eventlist, ret); } } diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 23aee0fdac6b..49e6452a355d 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -1,2474 +1,2770 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Zhixiang Yu * Copyright (c) 2015-2016 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include +#include + #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "pci_emul.h" #include "ahci.h" #include "block_if.h" #define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ enum sata_fis_type { FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ }; /* * SCSI opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define INQUIRY 0x12 #define START_STOP_UNIT 0x1B #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define POSITION_TO_ELEMENT 0x2B #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A #define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE /* * SCSI mode page codes */ #define MODEPAGE_RW_ERROR_RECOVERY 0x01 #define MODEPAGE_CD_CAPABILITIES 0x2A /* * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 #define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* * Debug printf */ #ifdef AHCI_DEBUG static FILE *dbg; #define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) #else #define DPRINTF(format, arg...) #endif #define WPRINTF(format, arg...) 
printf(format, ##arg) #define AHCI_PORT_IDENT 20 + 1 struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; STAILQ_ENTRY(ahci_ioreq) io_flist; TAILQ_ENTRY(ahci_ioreq) io_blist; uint8_t *cfis; uint32_t len; uint32_t done; int slot; int more; + int readop; }; struct ahci_port { struct blockif_ctxt *bctx; struct pci_ahci_softc *pr_sc; uint8_t *cmd_lst; uint8_t *rfis; char ident[AHCI_PORT_IDENT]; int port; int atapi; int reset; int waitforclear; int mult_sectors; uint8_t xfermode; uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; u_int ccs; uint32_t pending; uint32_t clb; uint32_t clbu; uint32_t fb; uint32_t fbu; uint32_t is; uint32_t ie; uint32_t cmd; uint32_t unused0; uint32_t tfd; uint32_t sig; uint32_t ssts; uint32_t sctl; uint32_t serr; uint32_t sact; uint32_t ci; uint32_t sntf; uint32_t fbs; /* * i/o request info */ struct ahci_ioreq *ioreq; int ioqsz; STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; }; struct ahci_cmd_hdr { uint16_t flags; uint16_t prdtl; uint32_t prdbc; uint64_t ctba; uint32_t reserved[4]; }; struct ahci_prdt_entry { uint64_t dba; uint32_t reserved; #define DBCMASK 0x3fffff uint32_t dbc; }; struct pci_ahci_softc { struct pci_devinst *asc_pi; pthread_mutex_t mtx; int ports; uint32_t cap; uint32_t ghc; uint32_t is; uint32_t pi; uint32_t vs; uint32_t ccc_ctl; uint32_t ccc_pts; uint32_t em_loc; uint32_t em_ctl; uint32_t cap2; uint32_t bohc; uint32_t lintr; struct ahci_port port[MAX_PORTS]; }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) static void ahci_handle_port(struct ahci_port *p); static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; buf[0] = (lba / 75) / 60; buf[1] = (lba / 75) % 60; buf[2] = lba % 75; } /* * Generate HBA interrupts on global IS register write. */ static void ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { struct pci_devinst *pi = sc->asc_pi; struct ahci_port *p; int i, nmsg; uint32_t mmask; /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { p = &sc->port[i]; if (p->is & p->ie) sc->is |= (1 << i); } DPRINTF("%s(%08x) %08x", __func__, mask, sc->is); /* If there is nothing enabled -- clear legacy interrupt and exit. */ if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { if (sc->lintr) { pci_lintr_deassert(pi); sc->lintr = 0; } return; } /* If there is anything and no MSI -- assert legacy interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (nmsg == 0) { if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } return; } /* Assert respective MSIs for ports that were touched. */ for (i = 0; i < nmsg; i++) { if (sc->ports <= nmsg || i < nmsg - 1) mmask = 1 << i; else mmask = 0xffffffff << i; if (sc->is & mask && mmask & mask) pci_generate_msi(pi, i); } } /* * Generate HBA interrupt on specific port event. */ static void ahci_port_intr(struct ahci_port *p) { struct pci_ahci_softc *sc = p->pr_sc; struct pci_devinst *pi = sc->asc_pi; int nmsg; DPRINTF("%s(%d) %08x/%08x %08x", __func__, p->port, p->is, p->ie, sc->is); /* If there is nothing enabled -- we are done. */ if ((p->is & p->ie) == 0) return; /* In case of non-shared MSI always generate interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (sc->ports <= nmsg || p->port < nmsg - 1) { sc->is |= (1 << p->port); if ((sc->ghc & AHCI_GHC_IE) == 0) return; pci_generate_msi(pi, p->port); return; } /* If IS for this port is already set -- do nothing. */ if (sc->is & (1 << p->port)) return; sc->is |= (1 << p->port); /* If interrupts are enabled -- generate one. 
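 * Routing summary: when the device exposes at least as many MSI messages as
 * ports, every port gets its own vector; otherwise ports past the last
 * message share the final vector; with no MSI at all, the legacy INTx line
 * is asserted while any enabled PxIS bit remains set.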
*/ if ((sc->ghc & AHCI_GHC_IE) == 0) return; if (nmsg > 0) { pci_generate_msi(pi, nmsg - 1); } else if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } } static void ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) { int offset, len, irq; if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) return; switch (ft) { case FIS_TYPE_REGD2H: offset = 0x40; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d", ft); return; } if (fis[2] & ATA_S_ERROR) { p->waitforclear = 1; irq |= AHCI_P_IX_TFE; } memcpy(p->rfis + offset, fis, len); if (irq) { if (~p->is & irq) { p->is |= irq; ahci_port_intr(p); } } } static void ahci_write_fis_piosetup(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_PIOSETUP; ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); } static void ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; tfd &= 0x77; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_SETDEVBITS; fis[1] = (1 << 6); fis[2] = tfd; fis[3] = error; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = slot; p->err_cfis[2] = tfd; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else { *(uint32_t *)(fis + 4) = (1 << slot); p->sact &= ~(1 << slot); } p->tfd &= ~0x77; p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } static void ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[20]; uint8_t error; error = (tfd >> 8) & 0xff; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = (1 << 6); fis[2] = tfd & 0xff; fis[3] = error; fis[4] = cfis[4]; fis[5] = cfis[5]; fis[6] = cfis[6]; fis[7] = cfis[7]; fis[8] = cfis[8]; fis[9] = cfis[9]; fis[10] = cfis[10]; fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = 0x80; p->err_cfis[2] = tfd & 0xff; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) { uint8_t fis[20]; p->tfd = ATA_S_READY | ATA_S_DSC; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = 0; /* No interrupt */ fis[2] = p->tfd; /* Status */ fis[3] = 0; /* No error */ p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[3] = 1; fis[4] = 1; if (p->atapi) { fis[5] = 0x14; fis[6] = 0xeb; } fis[12] = 1; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_check_stopped(struct ahci_port *p) { /* * If we are no longer processing the command list and nothing * is in-flight, clear the running bit, the current command * slot, the command issue and active bits. */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; p->waitforclear = 0; } } } static void ahci_port_stop(struct ahci_port *p) { struct ahci_ioreq *aior; uint8_t *cfis; int slot; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); TAILQ_FOREACH(aior, &p->iobhd, io_blist) { /* * Try to cancel the outstanding blockif request. 
*/ error = blockif_cancel(p->bctx, &aior->io_req); if (error != 0) continue; slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); /* * This command is now done. */ p->pending &= ~(1 << slot); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); } ahci_check_stopped(p); } static void ahci_port_reset(struct ahci_port *pr) { pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; pr->mult_sectors = 128; if (!pr->bctx) { pr->ssts = ATA_SS_DET_NO_DEVICE; pr->sig = 0xFFFFFFFF; pr->tfd = 0x7F; return; } pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; if (pr->sctl & ATA_SC_SPD_MASK) pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); else pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; pr->tfd |= ATA_S_READY; } else pr->sig = PxSIG_ATAPI; ahci_write_reset_fis_d2h(pr); } static void ahci_reset(struct pci_ahci_softc *sc) { int i; sc->ghc = AHCI_GHC_AE; sc->is = 0; if (sc->lintr) { pci_lintr_deassert(sc->asc_pi); sc->lintr = 0; } for (i = 0; i < sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); if (sc->port[i].bctx) sc->port[i].cmd |= AHCI_P_CMD_CPS; sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } static void ata_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i ^ 1] = *src++; else dest[i ^ 1] = ' '; } } static void atapi_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i] = *src++; else dest[i] = ' '; } } /* * Build up the iovec based on the PRDT, 'done' and 'len'. */ static void ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, struct ahci_prdt_entry *prdt, uint16_t prdtl) { struct blockif_req *breq = &aior->io_req; int i, j, skip, todo, left, extra; uint32_t dbcsz; /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ skip = aior->done; left = aior->len - aior->done; todo = 0; for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; i++, prdt++) { dbcsz = (prdt->dbc & DBCMASK) + 1; /* Skip already done part of the PRDT */ if (dbcsz <= skip) { skip -= dbcsz; continue; } dbcsz -= skip; if (dbcsz > left) dbcsz = left; breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba + skip, dbcsz); breq->br_iov[j].iov_len = dbcsz; todo += dbcsz; left -= dbcsz; skip = 0; j++; } /* If we got limited by IOV length, round I/O down to sector size. 
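 * For example, with 512-byte sectors and a PRDT of 200 entries of 1023 bytes
 * each, the iovec fills up after 128 entries (130944 bytes); the transfer is
 * trimmed by 384 bytes to 130560 (255 whole sectors) by shrinking the last
 * iovec entry, and aior->more stays set so the remainder of the command is
 * issued again from the completion path.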
*/ if (j == BLOCKIF_IOV_MAX) { extra = todo % blockif_sectsz(p->bctx); todo -= extra; assert(todo > 0); while (extra > 0) { if (breq->br_iov[j - 1].iov_len > extra) { breq->br_iov[j - 1].iov_len -= extra; break; } extra -= breq->br_iov[j - 1].iov_len; j--; } } breq->br_iovcnt = j; breq->br_resid = todo; aior->done += todo; aior->more = (aior->done < aior->len && i < prdtl); } static void ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; int err, first, ncq, readop; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; first = (done == 0); if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[11] << 8 | cfis[3]; if (!len) len = 65536; ncq = 1; } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[13] << 8 | cfis[12]; if (!len) len = 65536; } else { lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | (cfis[5] << 8) | cfis[4]; len = cfis[12]; if (!len) len = 256; } lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); /* Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; + aior->readop = readop; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); } static void ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_ioreq *aior; struct blockif_req *breq; int err; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = 0; aior->done = 0; aior->more = 0; breq = &aior->io_req; /* * Mark this command in-flight. 
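 * p->pending is the bitmask of command slots with an outstanding blockif
 * request: ahci_handle_port() skips those slots when scanning PxCI, and
 * ahci_port_stop() walks the busy list to cancel them.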
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_flush(p->bctx, breq); assert(err == 0); } static inline void read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; void *to; int i, len; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; to = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(to, ptr, sublen); len -= sublen; to += sublen; prdt++; } } static void ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; uint8_t *entry; uint64_t elba; uint32_t len, elen; int err, first, ncq; uint8_t buf[512]; first = (done == 0); if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { len = (uint16_t)cfis[13] << 8 | cfis[12]; len *= 512; ncq = 0; } else { /* ATA_SEND_FPDMA_QUEUED */ len = (uint16_t)cfis[11] << 8 | cfis[3]; len *= 512; ncq = 1; } read_prdt(p, slot, cfis, buf, sizeof(buf)); next: entry = &buf[done]; elba = ((uint64_t)entry[5] << 40) | ((uint64_t)entry[4] << 32) | ((uint64_t)entry[3] << 24) | ((uint64_t)entry[2] << 16) | ((uint64_t)entry[1] << 8) | entry[0]; elen = (uint16_t)entry[7] << 8 | entry[6]; done += 8; if (elen == 0) { if (done >= len) { if (ncq) { if (first) ahci_write_fis_d2h_ncq(p, slot); ahci_write_fis_sdb(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } else { ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } p->pending &= ~(1 << slot); ahci_check_stopped(p); if (!first) ahci_handle_port(p); return; } goto next; } /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->more = (len != done); breq = &aior->io_req; breq->br_offset = elba * blockif_sectsz(p->bctx); breq->br_resid = elen * blockif_sectsz(p->bctx); /* * Mark this command in-flight. 
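 * Each 8-byte entry of the DSM buffer read via read_prdt() above encodes a
 * 48-bit starting LBA and a 16-bit sector count; entries are turned into
 * blockif_delete() requests one at a time, and a zero-length entry is
 * skipped or, at the end of the buffer, completes the command.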
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); err = blockif_delete(p->bctx, breq); assert(err == 0); } static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; void *from; int i, len; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; from = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; prdt++; } hdr->prdbc = size - len; } static void ahci_checksum(uint8_t *buf, int size) { int i; uint8_t sum = 0; for (i = 0; i < size - 1; i++) sum += buf[i]; buf[size - 1] = 0x100 - sum; } static void ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; uint32_t buf[128]; uint8_t *buf8 = (uint8_t *)buf; uint16_t *buf16 = (uint16_t *)buf; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } memset(buf, 0, sizeof(buf)); if (cfis[4] == 0x00) { /* Log directory */ buf16[0x00] = 1; /* Version -- 1 */ buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); ahci_checksum(buf8, sizeof(buf)); } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { buf[0x00] = 1; /* SFQ DSM supported */ buf[0x01] = 1; /* SFQ DSM TRIM supported */ } } else { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } if (cfis[2] == ATA_READ_LOG_EXT) ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; uint64_t sectors; int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; ro = blockif_is_ro(p->bctx); candelete = blockif_candelete(p->bctx); sectsz = blockif_sectsz(p->bctx); sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); blockif_psectsz(p->bctx, &psectsz, &psectoff); memset(buf, 0, sizeof(buf)); buf[0] = 0x0040; buf[1] = cyl; buf[3] = heads; buf[6] = sech; ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); buf[47] = (0x8000 | 128); buf[48] = 0; buf[49] = (1 << 8 | 1 << 9 | 1 << 11); buf[50] = (1 << 14); buf[53] = (1 << 1 | 1 << 2); if (p->mult_sectors) buf[59] = (0x100 | p->mult_sectors); if (sectors <= 0x0fffffff) { buf[60] = sectors; buf[61] = (sectors >> 16); } else { buf[60] = 0xffff; buf[61] = 0x0fff; } buf[63] = 0x7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << 
((p->xfermode & 7) + 8)); buf[64] = 0x3; buf[65] = 120; buf[66] = 120; buf[67] = 120; buf[68] = 120; buf[69] = 0; buf[75] = 31; buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | ATA_SUPPORT_NCQ); buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | (p->ssts & ATA_SS_SPD_MASK) >> 3); buf[80] = 0x3f0; buf[81] = 0x28; buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); buf[84] = (1 << 14); buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); buf[100] = sectors; buf[101] = (sectors >> 16); buf[102] = (sectors >> 32); buf[103] = (sectors >> 48); if (candelete && !ro) { buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; buf[105] = 1; buf[169] = ATA_SUPPORT_DSM_TRIM; } buf[106] = 0x4000; buf[209] = 0x4000; if (psectsz > sectsz) { buf[106] |= 0x2000; buf[106] |= ffsl(psectsz / sectsz) - 1; buf[209] |= (psectoff / sectsz); } if (sectsz > 512) { buf[106] |= 0x1000; buf[117] = sectsz / 2; buf[118] = ((sectsz / 2) >> 16); } buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); buf[222] = 0x1020; buf[255] = 0x00a5; ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; memset(buf, 0, sizeof(buf)); buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); buf[49] = (1 << 9 | 1 << 8); buf[50] = (1 << 14 | 1); buf[53] = (1 << 2 | 1 << 1); buf[62] = 0x3f; buf[63] = 7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 3; buf[65] = 120; buf[66] = 120; buf[67] = 120; buf[68] = 120; buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3); buf[78] = (1 << 5); buf[80] = 0x3f0; buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[83] = (1 << 14); buf[84] = (1 << 14); buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); buf[222] = 0x1020; buf[255] = 0x00a5; ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[36]; uint8_t *acmd; int len; uint32_t tfd; acmd = cfis + 0x40; if (acmd[1] & 1) { /* VPD */ if (acmd[2] == 0) { /* Supported VPD pages */ buf[0] = 0x05; buf[1] = 0; buf[2] = 0; buf[3] = 1; buf[4] = 0; len = 4 + buf[3]; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, 
tfd); return; } } else { buf[0] = 0x05; buf[1] = 0x80; buf[2] = 0x00; buf[3] = 0x21; buf[4] = 31; buf[5] = 0; buf[6] = 0; buf[7] = 0; atapi_string(buf + 8, "BHYVE", 8); atapi_string(buf + 16, "BHYVE DVD-ROM", 16); atapi_string(buf + 32, "001", 4); len = sizeof(buf); } if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, len); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[8]; uint64_t sectors; sectors = blockif_size(p->bctx) / 2048; be32enc(buf, sectors - 1); be32enc(buf + 4, 2048); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint8_t format; int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); format = acmd[9] >> 6; switch (format) { case 0: { int msf, size; uint64_t sectors; uint8_t start_track, buf[20], *bp; msf = (acmd[1] >> 1) & 1; start_track = acmd[6]; if (start_track > 1 && start_track != 0xaa) { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } bp = buf + 2; *bp++ = 1; *bp++ = 1; if (start_track <= 1) { *bp++ = 0; *bp++ = 0x14; *bp++ = 1; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } } *bp++ = 0; *bp++ = 0x14; *bp++ = 0xaa; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 1: { uint8_t buf[12]; memset(buf, 0, sizeof(buf)); buf[1] = 0xa; buf[2] = 0x1; buf[3] = 0x1; if (len > sizeof(buf)) len = sizeof(buf); write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 2: { int msf, size; uint64_t sectors; uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) & 1; bp = buf + 2; *bp++ = 1; *bp++ = 1; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa1; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa2; *bp++ = 0; *bp++ = 0; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } default: { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; 
cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); break; } } } static void atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[16]; memset(buf, 0, sizeof(buf)); buf[3] = 8; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; uint8_t *acmd; uint64_t lba; uint32_t len; int err; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); else len = be32dec(acmd + 6); if (len == 0) { cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } lba *= 2048; len *= 2048; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; + aior->readop = 1; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_read(p->bctx, breq); assert(err == 0); } static void atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[64]; uint8_t *acmd; int len; acmd = cfis + 0x40; len = acmd[4]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, len); buf[0] = 0x70 | (1 << 7); buf[2] = p->sense_key; buf[7] = 10; buf[12] = p->asc; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd = cfis + 0x40; uint32_t tfd; switch (acmd[4] & 3) { case 0: case 1: case 3: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; tfd = ATA_S_READY | ATA_S_DSC; break; case 2: /* TODO eject media */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x53; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; uint8_t pc, code; int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); pc = acmd[2] >> 6; code = acmd[2] & 0x3f; switch (pc) { case 0: switch (code) { case MODEPAGE_RW_ERROR_RECOVERY: { uint8_t buf[16]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 16 - 2); buf[2] = 0x70; buf[8] = 0x01; buf[9] = 16 - 10; buf[11] = 0x05; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } case MODEPAGE_CD_CAPABILITIES: { uint8_t buf[30]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 30 - 2); buf[2] = 0x70; buf[8] = 0x2A; buf[9] = 30 - 10; buf[10] = 0x08; buf[12] = 0x71; be16enc(&buf[18], 2); be16enc(&buf[20], 512); write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } default: goto error; break; } break; case 3: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x39; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; 
error: case 1: case 2: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_get_event_status_notification(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; acmd = cfis + 0x40; /* we don't support asynchronous operation */ if (!(acmd[1] & 1)) { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } else { uint8_t buf[8]; int len; len = be16dec(acmd + 7); if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 8 - 2); buf[2] = 0x04; buf[3] = 0x10; buf[5] = 0x02; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; acmd = cfis + 0x40; #ifdef AHCI_DEBUG { int i; DPRINTF("ACMD:"); for (i = 0; i < 16; i++) DPRINTF("%02x ", acmd[i]); DPRINTF(""); } #endif switch (acmd[0]) { case TEST_UNIT_READY: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case INQUIRY: atapi_inquiry(p, slot, cfis); break; case READ_CAPACITY: atapi_read_capacity(p, slot, cfis); break; case PREVENT_ALLOW: /* TODO */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case READ_TOC: atapi_read_toc(p, slot, cfis); break; case REPORT_LUNS: atapi_report_luns(p, slot, cfis); break; case READ_10: case READ_12: atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); break; case START_STOP_UNIT: atapi_start_stop_unit(p, slot, cfis); break; case MODE_SENSE_10: atapi_mode_sense(p, slot, cfis); break; case GET_EVENT_STATUS_NOTIFICATION: atapi_get_event_status_notification(p, slot, cfis); break; default: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x20; ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); break; case ATA_SETFEATURES: { switch (cfis[3]) { case ATA_SF_ENAB_SATA_SF: switch (cfis[12]) { case ATA_SATA_SF_AN: p->tfd = ATA_S_DSC | ATA_S_READY; break; default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } break; case ATA_SF_ENAB_WCACHE: case ATA_SF_DIS_WCACHE: case ATA_SF_ENAB_RCACHE: case ATA_SF_DIS_RCACHE: p->tfd = ATA_S_DSC | ATA_S_READY; break; case ATA_SF_SETXFER: { switch (cfis[12] & 0xf8) { case ATA_PIO: case ATA_PIO0: break; case ATA_WDMA0: case ATA_UDMA0: p->xfermode = (cfis[12] & 0x7); break; } p->tfd = ATA_S_DSC | ATA_S_READY; break; } default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; } case ATA_SET_MULTI: if (cfis[12] != 0 && (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); } else { p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; case ATA_READ: case ATA_WRITE: case ATA_READ48: case ATA_WRITE48: case ATA_READ_MUL: case ATA_WRITE_MUL: case 
ATA_READ_MUL48: case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; case ATA_DATA_SET_MANAGEMENT: if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && cfis[13] == 0 && cfis[12] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_SEND_FPDMA_QUEUED: if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && cfis[11] == 0 && cfis[3] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_READ_LOG_EXT: case ATA_READ_LOG_DMA_EXT: ahci_handle_read_log(p, slot, cfis); break; case ATA_SECURITY_FREEZE_LOCK: case ATA_SMART_CMD: case ATA_NOP: ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_CHECK_POWER_MODE: cfis[12] = 0xff; /* always on */ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: case ATA_READ_VERIFY: case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_ATAPI_IDENTIFY: handle_atapi_identify(p, slot, cfis); break; case ATA_PACKET_CMD: if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: WPRINTF("Unsupported cmd:%02x", cfis[2]); ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; #ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; #endif struct pci_ahci_softc *sc; uint8_t *cfis; #ifdef AHCI_DEBUG int cfl, i; #endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); #ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; #endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); #ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); DPRINTF("cfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) DPRINTF(""); DPRINTF("%02x ", cfis[i]); } DPRINTF(""); for (i = 0; i < hdr->prdtl; i++) { DPRINTF("%d@%08"PRIx64"", prdt->dbc & 0x3fffff, prdt->dba); prdt++; } #endif if (cfis[0] != FIS_TYPE_REGH2D) { WPRINTF("Not a H2D FIS:%02x", cfis[0]); return; } if (cfis[1] & 0x80) { ahci_handle_cmd(p, slot, cfis); } else { if (cfis[15] & (1 << 2)) p->reset = 1; else if (p->reset) { p->reset = 0; ahci_port_reset(p); } p->ci &= ~(1 << slot); } } static void ahci_handle_port(struct ahci_port *p) { if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that * are already in-flight. Stop if device is busy or in error. */ for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) break; if (p->waitforclear) break; if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; ahci_handle_slot(p, p->ccs); } } } /* * blockif callback routine - this runs in the context of the blockif * i/o thread, so the mutex needs to be acquired. 
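ahci_handle_port() above walks the 32 command slots round-robin, starting at the current command slot (CCS) and skipping slots whose commands are already in flight (the pending mask). A simplified sketch of that slot selection; next_pending_slot() is an illustrative helper, not code from this file:

static int
next_pending_slot(uint32_t ci, uint32_t pending, int ccs)
{
	int i, slot;

	for (i = 0; i < 32; i++) {
		slot = (ccs + i) & 31;
		if ((ci & ~pending & (1u << slot)) != 0)
			return (slot);
	}
	return (-1);		/* nothing issued that is not already in flight */
}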
*/ static void ata_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; int slot, ncq, dsm; DPRINTF("%s %d", __func__, err); ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; if (cfis[2] == ATA_DATA_SET_MANAGEMENT || (cfis[2] == ATA_SEND_FPDMA_QUEUED && (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) dsm = 1; pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { if (dsm) ahci_handle_dsm_trim(p, slot, cfis, aior->done); else ahci_handle_rw(p, slot, cfis, aior->done); goto out; } if (!err) tfd = ATA_S_READY | ATA_S_DSC; else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; if (ncq) ahci_write_fis_sdb(p, slot, cfis, tfd); else ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void atapi_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; int slot; DPRINTF("%s %d", __func__, err); aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { atapi_read(p, slot, cfis, aior->done); goto out; } if (!err) { tfd = ATA_S_READY | ATA_S_DSC; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. 
*/ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void pci_ahci_ioreq_init(struct ahci_port *pr) { struct ahci_ioreq *vr; int i; pr->ioqsz = blockif_queuesz(pr->bctx); pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); STAILQ_INIT(&pr->iofhd); /* * Add all i/o request entries to the free queue */ for (i = 0; i < pr->ioqsz; i++) { vr = &pr->ioreq[i]; vr->io_pr = pr; if (!pr->atapi) vr->io_req.br_callback = ata_ioreq_cb; else vr->io_req.br_callback = atapi_ioreq_cb; vr->io_req.br_param = vr; STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); } TAILQ_INIT(&pr->iobhd); } static void pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; struct ahci_port *p = &sc->port[port]; DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"", port, offset, value); switch (offset) { case AHCI_P_CLB: p->clb = value; break; case AHCI_P_CLBU: p->clbu = value; break; case AHCI_P_FB: p->fb = value; break; case AHCI_P_FBU: p->fbu = value; break; case AHCI_P_IS: p->is &= ~value; ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; ahci_port_intr(p); break; case AHCI_P_CMD: { p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { uint64_t clb; p->cmd |= AHCI_P_CMD_CR; clb = (uint64_t)p->clbu << 32 | p->clb; p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, AHCI_CL_SIZE * AHCI_MAX_SLOTS); } if (value & AHCI_P_CMD_FRE) { uint64_t fb; p->cmd |= AHCI_P_CMD_FR; fb = (uint64_t)p->fbu << 32 | p->fb; /* we don't support FBSCP, so rfis size is 256Bytes */ p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); } else { p->cmd &= ~AHCI_P_CMD_FR; } if (value & AHCI_P_CMD_CLO) { p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } if (value & AHCI_P_CMD_ICC_MASK) { p->cmd &= ~AHCI_P_CMD_ICC_MASK; } ahci_handle_port(p); break; } case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"", offset); break; case AHCI_P_SCTL: p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); } break; case AHCI_P_SERR: p->serr &= ~value; break; case AHCI_P_SACT: p->sact |= value; break; case AHCI_P_CI: p->ci |= value; ahci_handle_port(p); break; case AHCI_P_SNTF: case AHCI_P_FBS: default: break; } } static void pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"", offset, value); switch (offset) { case AHCI_CAP: case AHCI_PI: case AHCI_VS: case AHCI_CAP2: DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"", offset); break; case AHCI_GHC: if (value & AHCI_GHC_HR) { ahci_reset(sc); break; } if (value & AHCI_GHC_IE) sc->ghc |= AHCI_GHC_IE; else sc->ghc &= ~AHCI_GHC_IE; ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; ahci_generate_intr(sc, value); break; default: break; } } static void pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst 
*pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); if (offset < AHCI_OFFSET) pci_ahci_host_write(sc, offset, value); else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) pci_ahci_port_write(sc, offset, value); else WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"", offset); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; switch (offset) { case AHCI_CAP: case AHCI_GHC: case AHCI_IS: case AHCI_PI: case AHCI_VS: case AHCI_CCCC: case AHCI_CCCP: case AHCI_EM_LOC: case AHCI_EM_CTL: case AHCI_CAP2: { uint32_t *p = &sc->cap; p += (offset - AHCI_CAP) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x", offset, value); return (value); } static uint64_t pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; switch (offset) { case AHCI_P_CLB: case AHCI_P_CLBU: case AHCI_P_FB: case AHCI_P_FBU: case AHCI_P_IS: case AHCI_P_IE: case AHCI_P_CMD: case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: case AHCI_P_SCTL: case AHCI_P_SERR: case AHCI_P_SACT: case AHCI_P_CI: case AHCI_P_SNTF: case AHCI_P_FBS: { uint32_t *p= &sc->port[port].clb; p += (offset - AHCI_P_CLB) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x", port, offset, value); return value; } static uint64_t pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; uint64_t offset; uint32_t value; assert(baridx == 5); assert(size == 1 || size == 2 || size == 4); assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"", regoff); } value >>= 8 * (regoff & 0x3); pthread_mutex_unlock(&sc->mtx); return (value); } static int pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) { char bident[sizeof("XX:XX:XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; int ret, slots, p; MD5_CTX mdctx; u_char digest[16]; char *next, *next2; ret = 0; #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; pthread_mutex_init(&sc->mtx, NULL); sc->ports = 0; sc->pi = 0; slots = 32; for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) { /* Identify and cut off type of present port. */ if (strncmp(opts, "hd:", 3) == 0) { atapi = 0; opts += 3; } else if (strncmp(opts, "cd:", 3) == 0) { atapi = 1; opts += 3; } /* Find and cut off the next port options. */ next = strstr(opts, ",hd:"); next2 = strstr(opts, ",cd:"); if (next == NULL || (next2 != NULL && next2 < next)) next = next2; if (next != NULL) { next[0] = 0; next++; } if (opts[0] == 0) continue; /* * Attempt to open the backing image. Use the PCI slot/func * and the port number for the identifier string. 
*/ snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot, pi->pi_func, p); bctxt = blockif_open(opts, bident); if (bctxt == NULL) { sc->ports = p; ret = 1; goto open_fail; } sc->port[p].bctx = bctxt; sc->port[p].pr_sc = sc; sc->port[p].port = p; sc->port[p].atapi = atapi; /* * Create an identifier for the backing file. * Use parts of the md5 sum of the filename */ MD5Init(&mdctx); MD5Update(&mdctx, opts, strlen(opts)); MD5Final(digest, &mdctx); snprintf(sc->port[p].ident, AHCI_PORT_IDENT, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* * Allocate blockif request structures and add them * to the free list */ pci_ahci_ioreq_init(&sc->port[p]); sc->pi |= (1 << p); if (sc->port[p].ioqsz < slots) slots = sc->port[p].ioqsz; } sc->ports = p; /* Intel ICH8 AHCI */ --slots; if (sc->ports < DEF_PORTS) sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); p = MIN(sc->ports, 16); p = flsl(p) - ((p & (p - 1)) ? 0 : 1); pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); pci_lintr_request(pi); open_fail: if (ret) { for (p = 0; p < sc->ports; p++) { if (sc->port[p].bctx != NULL) blockif_close(sc->port[p].bctx); } free(sc); } return (ret); } static int pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { return (pci_ahci_init(ctx, pi, opts, 0)); } static int pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { return (pci_ahci_init(ctx, pi, opts, 1)); } +#ifdef BHYVE_SNAPSHOT +static int +pci_ahci_snapshot_save_queues(struct ahci_port *port, + struct vm_snapshot_meta *meta) +{ + int ret; + int idx; + struct ahci_ioreq *ioreq; + + STAILQ_FOREACH(ioreq, &port->iofhd, io_flist) { + idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq); + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + } + + idx = -1; + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + + TAILQ_FOREACH(ioreq, &port->iobhd, io_blist) { + idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq); + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + + /* + * Snapshot only the busy requests; other requests are + * not valid. + */ + ret = blockif_snapshot_req(&ioreq->io_req, meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to snapshot req\r\n", + __func__); + goto done; + } + } + + idx = -1; + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + +done: + return (ret); +} + +static int +pci_ahci_snapshot_restore_queues(struct ahci_port *port, + struct vm_snapshot_meta *meta) +{ + int ret; + int idx; + struct ahci_ioreq *ioreq; + + /* Empty the free queue before restoring. */ + while (!STAILQ_EMPTY(&port->iofhd)) + STAILQ_REMOVE_HEAD(&port->iofhd, io_flist); + + /* Restore the free queue. */ + while (1) { + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + if (idx == -1) + break; + + STAILQ_INSERT_TAIL(&port->iofhd, &port->ioreq[idx], io_flist); + } + + /* Restore the busy queue. 
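pci_ahci_snapshot_save_queues() above serializes each per-port queue as a -1-terminated list of indices into port->ioreq[], recovering the index from the element's byte offset within the array. An equivalent, purely illustrative formulation using element-typed pointer arithmetic (ioreq_index() is not part of this patch):

static int
ioreq_index(const struct ahci_port *port, const struct ahci_ioreq *ioreq)
{
	/* Same value as ((void *)ioreq - (void *)port->ioreq) / sizeof(*ioreq). */
	return (ioreq - port->ioreq);
}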
*/ + while (1) { + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + if (idx == -1) + break; + + ioreq = &port->ioreq[idx]; + TAILQ_INSERT_TAIL(&port->iobhd, ioreq, io_blist); + + /* + * Restore only the busy requests; other requests are + * not valid. + */ + ret = blockif_snapshot_req(&ioreq->io_req, meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to restore request\r\n", + __func__); + goto done; + } + + /* Re-enqueue the requests in the block interface. */ + if (ioreq->readop) + ret = blockif_read(port->bctx, &ioreq->io_req); + else + ret = blockif_write(port->bctx, &ioreq->io_req); + + if (ret != 0) { + fprintf(stderr, + "%s: failed to re-enqueue request\r\n", + __func__); + goto done; + } + } + +done: + return (ret); +} + +static int +pci_ahci_snapshot(struct vm_snapshot_meta *meta) +{ + int i, j, ret; + void *bctx; + struct pci_devinst *pi; + struct pci_ahci_softc *sc; + struct ahci_port *port; + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *ioreq; + + pi = meta->dev_data; + sc = pi->pi_arg; + + /* TODO: add mtx lock/unlock */ + + SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done); + + for (i = 0; i < MAX_PORTS; i++) { + port = &sc->port[i]; + + if (meta->op == VM_SNAPSHOT_SAVE) + bctx = port->bctx; + + SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done); + + /* Mostly for restore; save is ensured by the lines above. 
*/ + if (((bctx == NULL) && (port->bctx != NULL)) || + ((bctx != NULL) && (port->bctx == NULL))) { + fprintf(stderr, "%s: ports not matching\r\n", __func__); + ret = EINVAL; + goto done; + } + + if (port->bctx == NULL) + continue; + + if (port->port != i) { + fprintf(stderr, "%s: ports not matching: " + "actual: %d expected: %d\r\n", + __func__, port->port, i); + ret = EINVAL; + goto done; + } + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->cmd_lst, + AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->rfis, 256, false, meta, + ret, done); + + SNAPSHOT_VAR_OR_LEAVE(port->ident, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done); + + for (j = 0; j < port->ioqsz; j++) { + ioreq = &port->ioreq[j]; + + /* blockif_req snapshot done only for busy requests. */ + hdr = (struct ahci_cmd_hdr *)(port->cmd_lst + + ioreq->slot * AHCI_CL_SIZE); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ioreq->cfis, + 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry), + false, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(ioreq->len, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->done, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->slot, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->more, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->readop, meta, ret, done); + } + + /* Perform save / restore specific operations. 
*/ + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = pci_ahci_snapshot_save_queues(port, meta); + if (ret != 0) + goto done; + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + ret = pci_ahci_snapshot_restore_queues(port, meta); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + + ret = blockif_snapshot(port->bctx, meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to restore blockif\r\n", + __func__); + goto done; + } + } + +done: + return (ret); +} + +static int +pci_ahci_pause(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct pci_ahci_softc *sc; + struct blockif_ctxt *bctxt; + int i; + + sc = pi->pi_arg; + + for (i = 0; i < MAX_PORTS; i++) { + bctxt = sc->port[i].bctx; + if (bctxt == NULL) + continue; + + blockif_pause(bctxt); + } + + return (0); +} + +static int +pci_ahci_resume(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct pci_ahci_softc *sc; + struct blockif_ctxt *bctxt; + int i; + + sc = pi->pi_arg; + + for (i = 0; i < MAX_PORTS; i++) { + bctxt = sc->port[i].bctx; + if (bctxt == NULL) + continue; + + blockif_resume(bctxt); + } + + return (0); +} +#endif + /* * Use separate emulation names to distinguish drive and atapi devices */ struct pci_devemu pci_de_ahci = { .pe_emu = "ahci", .pe_init = pci_ahci_hd_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, +#endif }; PCI_EMUL_SET(pci_de_ahci); struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", .pe_init = pci_ahci_hd_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, +#endif }; PCI_EMUL_SET(pci_de_ahci_hd); struct pci_devemu pci_de_ahci_cd = { .pe_emu = "ahci-cd", .pe_init = pci_ahci_atapi_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, +#endif }; PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr.sbin/bhyve/pci_e82545.c b/usr.sbin/bhyve/pci_e82545.c index dca981be85fa..c1443b6aa613 100644 --- a/usr.sbin/bhyve/pci_e82545.c +++ b/usr.sbin/bhyve/pci_e82545.c @@ -1,2388 +1,2547 @@ /* * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Alexander Motin * Copyright (c) 2015 Peter Grehan * Copyright (c) 2013 Jeremiah Lott, Avere Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif +#include + #include #include #include #include #include #include #include #include #include #include #include #include "e1000_regs.h" #include "e1000_defines.h" #include "mii.h" #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "mevent.h" #include "net_utils.h" #include "net_backends.h" /* Hardware/register definitions XXX: move some to common code. */ #define E82545_VENDOR_ID_INTEL 0x8086 #define E82545_DEV_ID_82545EM_COPPER 0x100F #define E82545_SUBDEV_ID 0x1008 #define E82545_REVISION_4 4 #define E82545_MDIC_DATA_MASK 0x0000FFFF #define E82545_MDIC_OP_MASK 0x0c000000 #define E82545_MDIC_IE 0x20000000 #define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */ #define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */ #define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */ #define E82545_BAR_REGISTER 0 #define E82545_BAR_REGISTER_LEN (128*1024) #define E82545_BAR_FLASH 1 #define E82545_BAR_FLASH_LEN (64*1024) #define E82545_BAR_IO 2 #define E82545_BAR_IO_LEN 8 #define E82545_IOADDR 0x00000000 #define E82545_IODATA 0x00000004 #define E82545_IO_REGISTER_MAX 0x0001FFFF #define E82545_IO_FLASH_BASE 0x00080000 #define E82545_IO_FLASH_MAX 0x000FFFFF #define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2)) #define E82545_RAR_MAX 15 #define E82545_MTA_MAX 127 #define E82545_VFTA_MAX 127 /* Slightly modified from the driver versions, hardcoded for 3 opcode bits, * followed by 6 address bits. * TODO: make opcode bits and addr bits configurable? * NVM Commands - Microwire */ #define E82545_NVM_OPCODE_BITS 3 #define E82545_NVM_ADDR_BITS 6 #define E82545_NVM_DATA_BITS 16 #define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS) #define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1) #define E82545_NVM_OPCODE_MASK \ (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS) #define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */ #define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */ #define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */ #define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */ #define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128K */ #define E1000_ICR_SRPD 0x00010000 /* This is an arbitrary number. There is no hard limit on the chip. 
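The Microwire layout defined above packs a 3-bit opcode in front of a 6-bit word address, nine bits in total, shifted into the EEPROM MSB first. A small sketch of composing a READ command word from those macros; nvm_read_cmd() is illustrative only:

static uint16_t
nvm_read_cmd(uint16_t addr)
{
	/* 0x6 opcode in bits 8:6, word address in bits 5:0. */
	return (E82545_NVM_OPCODE_READ | (addr & E82545_NVM_ADDR_MASK));
}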
*/ #define I82545_MAX_TXSEGS 64 /* Legacy receive descriptor */ struct e1000_rx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ uint16_t length; /* Length of data DMAed into data buffer */ uint16_t csum; /* Packet checksum */ uint8_t status; /* Descriptor status */ uint8_t errors; /* Descriptor Errors */ uint16_t special; }; /* Transmit descriptor types */ #define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000) #define E1000_TXD_TYP_L (0) #define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C) #define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D) /* Legacy transmit descriptor */ struct e1000_tx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t cso; /* Checksum offset */ uint8_t cmd; /* Descriptor control */ } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t css; /* Checksum start */ uint16_t special; } fields; } upper; }; /* Context descriptor */ struct e1000_context_desc { union { uint32_t ip_config; struct { uint8_t ipcss; /* IP checksum start */ uint8_t ipcso; /* IP checksum offset */ uint16_t ipcse; /* IP checksum end */ } ip_fields; } lower_setup; union { uint32_t tcp_config; struct { uint8_t tucss; /* TCP checksum start */ uint8_t tucso; /* TCP checksum offset */ uint16_t tucse; /* TCP checksum end */ } tcp_fields; } upper_setup; uint32_t cmd_and_length; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t hdr_len; /* Header length */ uint16_t mss; /* Maximum segment size */ } fields; } tcp_seg_setup; }; /* Data descriptor */ struct e1000_data_desc { uint64_t buffer_addr; /* Address of the descriptor's buffer address */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t typ_len_ext; uint8_t cmd; } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t popts; /* Packet Options */ uint16_t special; } fields; } upper; }; union e1000_tx_udesc { struct e1000_tx_desc td; struct e1000_context_desc cd; struct e1000_data_desc dd; }; /* Tx checksum info for a packet. */ struct ck_info { int ck_valid; /* ck_info is valid */ uint8_t ck_start; /* start byte of cksum calcuation */ uint8_t ck_off; /* offset of cksum insertion */ uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */ }; /* * Debug printf */ static int e82545_debug = 0; #define WPRINTF(msg,params...) PRINTLN("e82545: " msg, params) #define DPRINTF(msg,params...) 
if (e82545_debug) WPRINTF(msg, params) #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) /* s/w representation of the RAL/RAH regs */ struct eth_uni { int eu_valid; int eu_addrsel; struct ether_addr eu_eth; }; struct e82545_softc { struct pci_devinst *esc_pi; struct vmctx *esc_ctx; struct mevent *esc_mevpitr; pthread_mutex_t esc_mtx; struct ether_addr esc_mac; net_backend_t *esc_be; /* General */ uint32_t esc_CTRL; /* x0000 device ctl */ uint32_t esc_FCAL; /* x0028 flow ctl addr lo */ uint32_t esc_FCAH; /* x002C flow ctl addr hi */ uint32_t esc_FCT; /* x0030 flow ctl type */ uint32_t esc_VET; /* x0038 VLAN eth type */ uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */ uint32_t esc_LEDCTL; /* x0E00 LED control */ uint32_t esc_PBA; /* x1000 pkt buffer allocation */ /* Interrupt control */ int esc_irq_asserted; uint32_t esc_ICR; /* x00C0 cause read/clear */ uint32_t esc_ITR; /* x00C4 intr throttling */ uint32_t esc_ICS; /* x00C8 cause set */ uint32_t esc_IMS; /* x00D0 mask set/read */ uint32_t esc_IMC; /* x00D8 mask clear */ /* Transmit */ union e1000_tx_udesc *esc_txdesc; struct e1000_context_desc esc_txctx; pthread_t esc_tx_tid; pthread_cond_t esc_tx_cond; int esc_tx_enabled; int esc_tx_active; uint32_t esc_TXCW; /* x0178 transmit config */ uint32_t esc_TCTL; /* x0400 transmit ctl */ uint32_t esc_TIPG; /* x0410 inter-packet gap */ uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */ uint64_t esc_tdba; /* verified 64-bit desc table addr */ uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */ uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */ uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */ uint16_t esc_TDH; /* x3810 desc table head idx */ uint16_t esc_TDHr; /* internal read version of TDH */ uint16_t esc_TDT; /* x3818 desc table tail idx */ uint32_t esc_TIDV; /* x3820 intr delay */ uint32_t esc_TXDCTL; /* x3828 desc control */ uint32_t esc_TADV; /* x382C intr absolute delay */ /* L2 frame acceptance */ struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */ uint32_t esc_fmcast[128]; /* Multicast filter bit-match */ uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */ /* Receive */ struct e1000_rx_desc *esc_rxdesc; pthread_cond_t esc_rx_cond; int esc_rx_enabled; int esc_rx_active; int esc_rx_loopback; uint32_t esc_RCTL; /* x0100 receive ctl */ uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */ uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */ uint64_t esc_rdba; /* verified 64-bit desc table addr */ uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */ uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/ uint32_t esc_RDLEN; /* x2808 #descriptors */ uint16_t esc_RDH; /* x2810 desc table head idx */ uint16_t esc_RDT; /* x2818 desc table tail idx */ uint32_t esc_RDTR; /* x2820 intr delay */ uint32_t esc_RXDCTL; /* x2828 desc control */ uint32_t esc_RADV; /* x282C intr absolute delay */ uint32_t esc_RSRPD; /* x2C00 recv small packet detect */ uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */ /* IO Port register access */ uint32_t io_addr; /* Shadow copy of MDIC */ uint32_t mdi_control; /* Shadow copy of EECD */ uint32_t eeprom_control; /* Latest NVM in/out */ uint16_t nvm_data; uint16_t nvm_opaddr; /* stats */ uint32_t missed_pkt_count; /* dropped for no room in rx queue */ uint32_t pkt_rx_by_size[6]; uint32_t pkt_tx_by_size[6]; uint32_t good_pkt_rx_count; uint32_t bcast_pkt_rx_count; uint32_t mcast_pkt_rx_count; uint32_t good_pkt_tx_count; uint32_t bcast_pkt_tx_count; uint32_t mcast_pkt_tx_count; uint32_t oversize_rx_count; 
uint32_t tso_tx_count; uint64_t good_octets_rx; uint64_t good_octets_tx; uint64_t missed_octets; /* counts missed and oversized */ uint8_t nvm_bits:6; /* number of bits remaining in/out */ uint8_t nvm_mode:2; #define E82545_NVM_MODE_OPADDR 0x0 #define E82545_NVM_MODE_DATAIN 0x1 #define E82545_NVM_MODE_DATAOUT 0x2 /* EEPROM data */ uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; }; static void e82545_reset(struct e82545_softc *sc, int dev); static void e82545_rx_enable(struct e82545_softc *sc); static void e82545_rx_disable(struct e82545_softc *sc); static void e82545_rx_callback(int fd, enum ev_type type, void *param); static void e82545_tx_start(struct e82545_softc *sc); static void e82545_tx_enable(struct e82545_softc *sc); static void e82545_tx_disable(struct e82545_softc *sc); static inline int e82545_size_stat_index(uint32_t size) { if (size <= 64) { return 0; } else if (size >= 1024) { return 5; } else { /* should be 1-4 */ return (ffs(size) - 6); } } static void e82545_init_eeprom(struct e82545_softc *sc) { uint16_t checksum, i; /* mac addr */ sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) | (((uint16_t)sc->esc_mac.octet[1]) << 8); sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) | (((uint16_t)sc->esc_mac.octet[3]) << 8); sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) | (((uint16_t)sc->esc_mac.octet[5]) << 8); /* pci ids */ sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID; sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL; sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER; sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL; /* fill in the checksum */ checksum = 0; for (i = 0; i < NVM_CHECKSUM_REG; i++) { checksum += sc->eeprom_data[i]; } checksum = NVM_SUM - checksum; sc->eeprom_data[NVM_CHECKSUM_REG] = checksum; DPRINTF("eeprom checksum: 0x%x", checksum); } static void e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr, uint8_t phy_addr, uint32_t data) { DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x", reg_addr, phy_addr, data); } static uint32_t e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr, uint8_t phy_addr) { //DPRINTF("Read mdi reg:0x%x phy:0x%x", reg_addr, phy_addr); switch (reg_addr) { case PHY_STATUS: return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | MII_SR_AUTONEG_COMPLETE); case PHY_AUTONEG_ADV: return NWAY_AR_SELECTOR_FIELD; case PHY_LP_ABILITY: return 0; case PHY_1000T_STATUS: return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS | SR_1000T_LOCAL_RX_STATUS); case PHY_ID1: return (M88E1011_I_PHY_ID >> 16) & 0xFFFF; case PHY_ID2: return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF; default: DPRINTF("Unknown mdi read reg:0x%x phy:0x%x", reg_addr, phy_addr); return 0; } /* not reached */ } static void e82545_eecd_strobe(struct e82545_softc *sc) { /* Microwire state machine */ /* DPRINTF("eeprom state machine srtobe " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data);*/ if (sc->nvm_bits == 0) { DPRINTF("eeprom state machine not expecting data! " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); return; } sc->nvm_bits--; if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) { /* shifting out */ if (sc->nvm_data & 0x8000) { sc->eeprom_control |= E1000_EECD_DO; } else { sc->eeprom_control &= ~E1000_EECD_DO; } sc->nvm_data <<= 1; if (sc->nvm_bits == 0) { /* read done, back to opcode mode. 
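e82545_init_eeprom() above fills NVM_CHECKSUM_REG so that the 16-bit sum of all words up to and including the checksum register equals NVM_SUM. A self-check sketch, assuming the NVM_* constants are the same ones the init routine uses; eeprom_checksum_ok() is not part of this patch:

static int
eeprom_checksum_ok(const uint16_t *eeprom_data)
{
	uint16_t sum = 0;
	int i;

	for (i = 0; i <= NVM_CHECKSUM_REG; i++)
		sum += eeprom_data[i];	/* 16-bit arithmetic wraps as intended */
	return (sum == NVM_SUM);
}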
*/ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) { /* shifting in */ sc->nvm_data <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_data |= 1; } if (sc->nvm_bits == 0) { /* eeprom write */ uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; if (op != E82545_NVM_OPCODE_WRITE) { DPRINTF("Illegal eeprom write op 0x%x", sc->nvm_opaddr); } else if (addr >= E82545_NVM_EEPROM_SIZE) { DPRINTF("Illegal eeprom write addr 0x%x", sc->nvm_opaddr); } else { DPRINTF("eeprom write eeprom[0x%x] = 0x%x", addr, sc->nvm_data); sc->eeprom_data[addr] = sc->nvm_data; } /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) { sc->nvm_opaddr <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_opaddr |= 1; } if (sc->nvm_bits == 0) { uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; switch (op) { case E82545_NVM_OPCODE_EWEN: DPRINTF("eeprom write enable: 0x%x", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; break; case E82545_NVM_OPCODE_READ: { uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; sc->nvm_mode = E82545_NVM_MODE_DATAOUT; sc->nvm_bits = E82545_NVM_DATA_BITS; if (addr < E82545_NVM_EEPROM_SIZE) { sc->nvm_data = sc->eeprom_data[addr]; DPRINTF("eeprom read: eeprom[0x%x] = 0x%x", addr, sc->nvm_data); } else { DPRINTF("eeprom illegal read: 0x%x", sc->nvm_opaddr); sc->nvm_data = 0; } break; } case E82545_NVM_OPCODE_WRITE: sc->nvm_mode = E82545_NVM_MODE_DATAIN; sc->nvm_bits = E82545_NVM_DATA_BITS; sc->nvm_data = 0; break; default: DPRINTF("eeprom unknown op: 0x%x", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } } else { DPRINTF("eeprom state machine wrong state! " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); } } static void e82545_itr_callback(int fd, enum ev_type type, void *param) { uint32_t new; struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); new = sc->esc_ICR & sc->esc_IMS; if (new && !sc->esc_irq_asserted) { DPRINTF("itr callback: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); } else { mevent_delete(sc->esc_mevpitr); sc->esc_mevpitr = NULL; } pthread_mutex_unlock(&sc->esc_mtx); } static void e82545_icr_assert(struct e82545_softc *sc, uint32_t bits) { uint32_t new; DPRINTF("icr assert: 0x%x", bits); /* * An interrupt is only generated if bits are set that * aren't already in the ICR, these bits are unmasked, * and there isn't an interrupt already pending. 
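The condition described by the comment in e82545_icr_assert() can be restated as a small predicate: the legacy interrupt line is only asserted for bits that are newly pending, unmasked, and only when no interrupt is already asserted or throttled by the ITR timer. A sketch, where should_assert_lintr() is a hypothetical helper and the throttled flag stands for an active esc_mevpitr timer:

static int
should_assert_lintr(uint32_t icr, uint32_t ims, uint32_t bits,
    int throttled, int asserted)
{
	uint32_t newbits = bits & ~icr & ims;	/* newly pending, unmasked */

	return (newbits != 0 && !throttled && !asserted);
}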
*/ new = bits & ~sc->esc_ICR & sc->esc_IMS; sc->esc_ICR |= bits; if (new == 0) { DPRINTF("icr assert: masked %x, ims %x", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("icr assert: throttled %x, ims %x", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("icr assert: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_ims_change(struct e82545_softc *sc, uint32_t bits) { uint32_t new; /* * Changing the mask may allow previously asserted * but masked interrupt requests to generate an interrupt. */ new = bits & sc->esc_ICR & ~sc->esc_IMS; sc->esc_IMS |= bits; if (new == 0) { DPRINTF("ims change: masked %x, ims %x", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("ims change: throttled %x, ims %x", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("ims change: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits) { DPRINTF("icr deassert: 0x%x", bits); sc->esc_ICR &= ~bits; /* * If there are no longer any interrupt sources and there * was an asserted interrupt, clear it */ if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { DPRINTF("icr deassert: lintr deassert %x", bits); pci_lintr_deassert(sc->esc_pi); sc->esc_irq_asserted = 0; } } static void e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value) { DPRINTF("intr_write: off %x, val %x", offset, value); switch (offset) { case E1000_ICR: e82545_icr_deassert(sc, value); break; case E1000_ITR: sc->esc_ITR = value; break; case E1000_ICS: sc->esc_ICS = value; /* not used: store for debug */ e82545_icr_assert(sc, value); break; case E1000_IMS: e82545_ims_change(sc, value); break; case E1000_IMC: sc->esc_IMC = value; /* for debug */ sc->esc_IMS &= ~value; // XXX clear interrupts if all ICR bits now masked // and interrupt was pending ? break; default: break; } } static uint32_t e82545_intr_read(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; retval = 0; DPRINTF("intr_read: off %x", offset); switch (offset) { case E1000_ICR: retval = sc->esc_ICR; sc->esc_ICR = 0; e82545_icr_deassert(sc, ~0); break; case E1000_ITR: retval = sc->esc_ITR; break; case E1000_ICS: /* write-only register */ break; case E1000_IMS: retval = sc->esc_IMS; break; case E1000_IMC: /* write-only register */ break; default: break; } return (retval); } static void e82545_devctl(struct e82545_softc *sc, uint32_t val) { sc->esc_CTRL = val & ~E1000_CTRL_RST; if (val & E1000_CTRL_RST) { DPRINTF("e1k: s/w reset, ctl %x", val); e82545_reset(sc, 1); } /* XXX check for phy reset ? */ } static void e82545_rx_update_rdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 | sc->esc_RDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, sc->esc_rdba, sc->esc_RDLEN); } static void e82545_rx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN); /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */ sc->esc_RCTL = val & ~0xF9204c01; DPRINTF("rx_ctl - %s RCTL %x, val %x", on ? 
"on" : "off", sc->esc_RCTL, val); /* state change requested */ if (on != sc->esc_rx_enabled) { if (on) { /* Catch disallowed/unimplemented settings */ //assert(!(val & E1000_RCTL_LBM_TCVR)); if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) { sc->esc_rx_loopback = 1; } else { sc->esc_rx_loopback = 0; } e82545_rx_update_rdba(sc); e82545_rx_enable(sc); } else { e82545_rx_disable(sc); sc->esc_rx_loopback = 0; sc->esc_rdba = 0; sc->esc_rxdesc = NULL; } } } static void e82545_tx_update_tdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba, sc->esc_TDLEN); } static void e82545_tx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN); /* ignore TCTL_EN settings that don't change state */ if (on == sc->esc_tx_enabled) return; if (on) { e82545_tx_update_tdba(sc); e82545_tx_enable(sc); } else { e82545_tx_disable(sc); sc->esc_tdba = 0; sc->esc_txdesc = NULL; } /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */ sc->esc_TCTL = val & ~0xFE800005; } int e82545_bufsz(uint32_t rctl) { switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) { case (E1000_RCTL_SZ_2048): return (2048); case (E1000_RCTL_SZ_1024): return (1024); case (E1000_RCTL_SZ_512): return (512); case (E1000_RCTL_SZ_256): return (256); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096); } return (256); /* Forbidden value. */ } /* XXX one packet at a time until this is debugged */ static void e82545_rx_callback(int fd, enum ev_type type, void *param) { struct e82545_softc *sc = param; struct e1000_rx_desc *rxd; struct iovec vec[64]; int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size; uint32_t cause = 0; uint16_t *tp, tag, head; pthread_mutex_lock(&sc->esc_mtx); DPRINTF("rx_run: head %x, tail %x", sc->esc_RDH, sc->esc_RDT); if (!sc->esc_rx_enabled || sc->esc_rx_loopback) { DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped", sc->esc_rx_enabled, sc->esc_rx_loopback); while (netbe_rx_discard(sc->esc_be) > 0) { } goto done1; } bufsz = e82545_bufsz(sc->esc_RCTL); maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522; maxpktdesc = (maxpktsz + bufsz - 1) / bufsz; size = sc->esc_RDLEN / 16; head = sc->esc_RDH; left = (size + sc->esc_RDT - head) % size; if (left < maxpktdesc) { DPRINTF("rx overflow (%d < %d) -- packet(s) dropped", left, maxpktdesc); while (netbe_rx_discard(sc->esc_be) > 0) { } goto done1; } sc->esc_rx_active = 1; pthread_mutex_unlock(&sc->esc_mtx); for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) { /* Grab rx descriptor pointed to by the head pointer */ for (i = 0; i < maxpktdesc; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; vec[i].iov_base = paddr_guest2host(sc->esc_ctx, rxd->buffer_addr, bufsz); vec[i].iov_len = bufsz; } len = netbe_recv(sc->esc_be, vec, maxpktdesc); if (len <= 0) { DPRINTF("netbe_recv() returned %d", len); goto done; } /* * Adjust the packet length based on whether the CRC needs * to be stripped or if the packet is less than the minimum * eth packet size. */ if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) len = ETHER_MIN_LEN - ETHER_CRC_LEN; if (!(sc->esc_RCTL & E1000_RCTL_SECRC)) len += ETHER_CRC_LEN; n = (len + bufsz - 1) / bufsz; DPRINTF("packet read %d bytes, %d segs, head %d", len, n, head); /* Apply VLAN filter. 
*/ tp = (uint16_t *)vec[0].iov_base + 6; if ((sc->esc_RCTL & E1000_RCTL_VFE) && (ntohs(tp[0]) == sc->esc_VET)) { tag = ntohs(tp[1]) & 0x0fff; if ((sc->esc_fvlan[tag >> 5] & (1 << (tag & 0x1f))) != 0) { DPRINTF("known VLAN %d", tag); } else { DPRINTF("unknown VLAN %d", tag); n = 0; continue; } } /* Update all consumed descriptors. */ for (i = 0; i < n - 1; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; rxd->status = E1000_RXD_STAT_DD; } rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = len % bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; /* XXX signal no checksum for now */ rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM | E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD; /* Schedule receive interrupts. */ if (len <= sc->esc_RSRPD) { cause |= E1000_ICR_SRPD | E1000_ICR_RXT0; } else { /* XXX: RDRT and RADV timers should be here. */ cause |= E1000_ICR_RXT0; } head = (head + n) % size; left -= n; } done: pthread_mutex_lock(&sc->esc_mtx); sc->esc_rx_active = 0; if (sc->esc_rx_enabled == 0) pthread_cond_signal(&sc->esc_rx_cond); sc->esc_RDH = head; /* Respect E1000_RCTL_RDMTS */ left = (size + sc->esc_RDT - head) % size; if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1))) cause |= E1000_ICR_RXDMT0; /* Assert all accumulated interrupts. */ if (cause != 0) e82545_icr_assert(sc, cause); done1: DPRINTF("rx_run done: head %x, tail %x", sc->esc_RDH, sc->esc_RDT); pthread_mutex_unlock(&sc->esc_mtx); } static uint16_t e82545_carry(uint32_t sum) { sum = (sum & 0xFFFF) + (sum >> 16); if (sum > 0xFFFF) sum -= 0xFFFF; return (sum); } static uint16_t e82545_buf_checksum(uint8_t *buf, int len) { int i; uint32_t sum = 0; /* Checksum all the pairs of bytes first... */ for (i = 0; i < (len & ~1U); i += 2) sum += *((u_int16_t *)(buf + i)); /* * If there's a single byte left over, checksum it, too. * Network byte order is big-endian, so the remaining byte is * the high byte. */ if (i < len) sum += htons(buf[i] << 8); return (e82545_carry(sum)); } static uint16_t e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len) { int now, odd; uint32_t sum = 0, s; /* Skip completely unneeded vectors. */ while (iovcnt > 0 && iov->iov_len <= off && off > 0) { off -= iov->iov_len; iov++; iovcnt--; } /* Calculate checksum of requested range. */ odd = 0; while (len > 0 && iovcnt > 0) { now = MIN(len, iov->iov_len - off); s = e82545_buf_checksum(iov->iov_base + off, now); sum += odd ? (s << 8) : s; odd ^= (now & 1); len -= now; off = 0; iov++; iovcnt--; } return (e82545_carry(sum)); } /* * Return the transmit descriptor type. */ int e82545_txdesc_type(uint32_t lower) { int type; type = 0; if (lower & E1000_TXD_CMD_DEXT) type = lower & E1000_TXD_MASK; return (type); } static void e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck) { uint16_t cksum; int cklen; DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d", iovcnt, ck->ck_start, ck->ck_off, ck->ck_len); cklen = ck->ck_len ? 
ck->ck_len - ck->ck_start + 1 : INT_MAX; cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen); *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum; } static void e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt) { if (sc->esc_be == NULL) return; (void) netbe_send(sc->esc_be, iov, iovcnt); } static void e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, int *tdwb) { union e1000_tx_udesc *dsc; for ( ; head != tail; head = (head + 1) % dsize) { dsc = &sc->esc_txdesc[head]; if (dsc->td.lower.data & E1000_TXD_CMD_RS) { dsc->td.upper.data |= E1000_TXD_STAT_DD; *tdwb = 1; } } } static int e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, uint16_t *rhead, int *tdwb) { uint8_t *hdr, *hdrp; struct iovec iovb[I82545_MAX_TXSEGS + 2]; struct iovec tiov[I82545_MAX_TXSEGS + 2]; struct e1000_context_desc *cd; struct ck_info ckinfo[2]; struct iovec *iov; union e1000_tx_udesc *dsc; int desc, dtype, len, ntype, iovcnt, tlen, tcp, tso; int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff; unsigned hdrlen, vlen; uint32_t tcpsum, tcpseq; uint16_t ipcs, tcpcs, ipid, ohead; ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0; iovcnt = 0; tlen = 0; ntype = 0; tso = 0; ohead = head; /* iovb[0/1] may be used for writable copy of headers. */ iov = &iovb[2]; for (desc = 0; ; desc++, head = (head + 1) % dsize) { if (head == tail) { *rhead = head; return (0); } dsc = &sc->esc_txdesc[head]; dtype = e82545_txdesc_type(dsc->td.lower.data); if (desc == 0) { switch (dtype) { case E1000_TXD_TYP_C: DPRINTF("tx ctxt desc idx %d: %016jx " "%08x%08x", head, dsc->td.buffer_addr, dsc->td.upper.data, dsc->td.lower.data); /* Save context and return */ sc->esc_txctx = dsc->cd; goto done; case E1000_TXD_TYP_L: DPRINTF("tx legacy desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); /* * legacy cksum start valid in first descriptor */ ntype = dtype; ckinfo[0].ck_start = dsc->td.upper.fields.css; break; case E1000_TXD_TYP_D: DPRINTF("tx data desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); ntype = dtype; break; default: break; } } else { /* Descriptor type must be consistent */ assert(dtype == ntype); DPRINTF("tx next desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); } len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length : dsc->dd.lower.data & 0xFFFFF; if (len > 0) { /* Strip checksum supplied by guest. */ if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 && (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0) len -= 2; tlen += len; if (iovcnt < I82545_MAX_TXSEGS) { iov[iovcnt].iov_base = paddr_guest2host( sc->esc_ctx, dsc->td.buffer_addr, len); iov[iovcnt].iov_len = len; } iovcnt++; } /* * Pull out info that is valid in the final descriptor * and exit descriptor loop. 
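e82545_transmit_checksum() folds the 32-bit accumulator produced by e82545_iov_checksum() back into a 16-bit ones'-complement sum through e82545_carry(). A worked illustration of that fold, purely as a sanity check against the function defined above (carry_fold_example() is not part of this patch):

static void
carry_fold_example(void)
{
	/* 0x2FFFE: low word 0xFFFE plus carry 0x2 gives 0x10000, and the
	 * end-around carry reduces that to 0x0001. */
	assert(e82545_carry(0x2FFFE) == 0x0001);
}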
*/ if (dsc->td.lower.data & E1000_TXD_CMD_EOP) { if (dtype == E1000_TXD_TYP_L) { if (dsc->td.lower.data & E1000_TXD_CMD_IC) { ckinfo[0].ck_valid = 1; ckinfo[0].ck_off = dsc->td.lower.flags.cso; ckinfo[0].ck_len = 0; } } else { cd = &sc->esc_txctx; if (dsc->dd.lower.data & E1000_TXD_CMD_TSE) tso = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM) ckinfo[0].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM || tso) { ckinfo[0].ck_start = cd->lower_setup.ip_fields.ipcss; ckinfo[0].ck_off = cd->lower_setup.ip_fields.ipcso; ckinfo[0].ck_len = cd->lower_setup.ip_fields.ipcse; } if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM) ckinfo[1].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM || tso) { ckinfo[1].ck_start = cd->upper_setup.tcp_fields.tucss; ckinfo[1].ck_off = cd->upper_setup.tcp_fields.tucso; ckinfo[1].ck_len = cd->upper_setup.tcp_fields.tucse; } } break; } } if (iovcnt > I82545_MAX_TXSEGS) { WPRINTF("tx too many descriptors (%d > %d) -- dropped", iovcnt, I82545_MAX_TXSEGS); goto done; } hdrlen = vlen = 0; /* Estimate writable space for VLAN header insertion. */ if ((sc->esc_CTRL & E1000_CTRL_VME) && (dsc->td.lower.data & E1000_TXD_CMD_VLE)) { hdrlen = ETHER_ADDR_LEN*2; vlen = ETHER_VLAN_ENCAP_LEN; } if (!tso) { /* Estimate required writable space for checksums. */ if (ckinfo[0].ck_valid) hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2); if (ckinfo[1].ck_valid) hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2); /* Round up writable space to the first vector. */ if (hdrlen != 0 && iov[0].iov_len > hdrlen && iov[0].iov_len < hdrlen + 100) hdrlen = iov[0].iov_len; } else { /* In case of TSO header length provided by software. */ hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len; /* * Cap the header length at 240 based on 7.2.4.5 of * the Intel 82576EB (Rev 2.63) datasheet. */ if (hdrlen > 240) { WPRINTF("TSO hdrlen too large: %d", hdrlen); goto done; } /* * If VLAN insertion is requested, ensure the header * at least holds the amount of data copied during * VLAN insertion below. * * XXX: Realistic packets will include a full Ethernet * header before the IP header at ckinfo[0].ck_start, * but this check is sufficient to prevent * out-of-bounds access below. */ if (vlen != 0 && hdrlen < ETHER_ADDR_LEN*2) { WPRINTF("TSO hdrlen too small for vlan insertion " "(%d vs %d) -- dropped", hdrlen, ETHER_ADDR_LEN*2); goto done; } /* * Ensure that the header length covers the used fields * in the IP and TCP headers as well as the IP and TCP * checksums. The following fields are accessed below: * * Header | Field | Offset | Length * -------+-------+--------+------- * IPv4 | len | 2 | 2 * IPv4 | ID | 4 | 2 * IPv6 | len | 4 | 2 * TCP | seq # | 4 | 4 * TCP | flags | 13 | 1 * UDP | len | 4 | 4 */ if (hdrlen < ckinfo[0].ck_start + 6 || hdrlen < ckinfo[0].ck_off + 2) { WPRINTF("TSO hdrlen too small for IP fields (%d) " "-- dropped", hdrlen); goto done; } if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) { if (hdrlen < ckinfo[1].ck_start + 14 || (ckinfo[1].ck_valid && hdrlen < ckinfo[1].ck_off + 2)) { WPRINTF("TSO hdrlen too small for TCP fields " "(%d) -- dropped", hdrlen); goto done; } } else { if (hdrlen < ckinfo[1].ck_start + 8) { WPRINTF("TSO hdrlen too small for UDP fields " "(%d) -- dropped", hdrlen); goto done; } } } /* Allocate, fill and prepend writable header vector. 
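For TSO, the segmentation loop further below emits one wire packet per MSS-sized chunk of the payload, each prefixed with a copy of the (possibly VLAN-extended) header. The resulting packet count is a ceiling division, sketched here with an illustrative helper:

static int
tso_segment_count(int paylen, int mss)
{
	/* Matches the loop: for (left = paylen; left > 0; left -= MIN(left, mss)) */
	return ((paylen + mss - 1) / mss);
}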
*/ if (hdrlen != 0) { hdr = __builtin_alloca(hdrlen + vlen); hdr += vlen; for (left = hdrlen, hdrp = hdr; left > 0; left -= now, hdrp += now) { now = MIN(left, iov->iov_len); memcpy(hdrp, iov->iov_base, now); iov->iov_base += now; iov->iov_len -= now; if (iov->iov_len == 0) { iov++; iovcnt--; } } iov--; iovcnt++; iov->iov_base = hdr; iov->iov_len = hdrlen; } else hdr = NULL; /* Insert VLAN tag. */ if (vlen != 0) { hdr -= ETHER_VLAN_ENCAP_LEN; memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2); hdrlen += ETHER_VLAN_ENCAP_LEN; hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8; hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff; hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8; hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff; iov->iov_base = hdr; iov->iov_len += ETHER_VLAN_ENCAP_LEN; /* Correct checksum offsets after VLAN tag insertion. */ ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[0].ck_len != 0) ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[1].ck_len != 0) ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN; } /* Simple non-TSO case. */ if (!tso) { /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]); if (ckinfo[1].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]); e82545_transmit_backend(sc, iov, iovcnt); goto done; } /* Doing TSO. */ tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0; mss = sc->esc_txctx.tcp_seg_setup.fields.mss; paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff); DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs", tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt); ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]); tcpseq = 0; if (tcp) tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]); ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off]; tcpcs = 0; if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */ tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off]; pv = 1; pvoff = 0; for (seg = 0, left = paylen; left > 0; seg++, left -= now) { now = MIN(left, mss); /* Construct IOVs for the segment. */ /* Include whole original header. */ tiov[0].iov_base = hdr; tiov[0].iov_len = hdrlen; tiovcnt = 1; /* Include respective part of payload IOV. */ for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) { nnow = MIN(nleft, iov[pv].iov_len - pvoff); tiov[tiovcnt].iov_base = iov[pv].iov_base + pvoff; tiov[tiovcnt++].iov_len = nnow; if (pvoff + nnow == iov[pv].iov_len) { pv++; pvoff = 0; } else pvoff += nnow; } DPRINTF("tx segment %d %d+%d bytes %d iovs", seg, hdrlen, now, tiovcnt); /* Update IP header. */ if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) { /* IPv4 -- set length and ID */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] = htons(hdrlen - ckinfo[0].ck_start + now); *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(ipid + seg); } else { /* IPv6 -- set length */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(hdrlen - ckinfo[0].ck_start - 40 + now); } /* Update pseudo-header checksum. */ tcpsum = tcpcs; tcpsum += htons(hdrlen - ckinfo[1].ck_start + now); /* Update TCP/UDP headers. */ if (tcp) { /* Update sequence number and FIN/PUSH flags. */ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = htonl(tcpseq + paylen - left); if (now < left) { hdr[ckinfo[1].ck_start + 13] &= ~(TH_FIN | TH_PUSH); } } else { /* Update payload length. 
*/ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = hdrlen - ckinfo[1].ck_start + now; } /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) { *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs; e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]); } if (ckinfo[1].ck_valid) { *(uint16_t *)&hdr[ckinfo[1].ck_off] = e82545_carry(tcpsum); e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]); } e82545_transmit_backend(sc, tiov, tiovcnt); } done: head = (head + 1) % dsize; e82545_transmit_done(sc, ohead, head, dsize, tdwb); *rhead = head; return (desc + 1); } static void e82545_tx_run(struct e82545_softc *sc) { uint32_t cause; uint16_t head, rhead, tail, size; int lim, tdwb, sent; head = sc->esc_TDH; tail = sc->esc_TDT; size = sc->esc_TDLEN / 16; DPRINTF("tx_run: head %x, rhead %x, tail %x", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); pthread_mutex_unlock(&sc->esc_mtx); rhead = head; tdwb = 0; for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) { sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb); if (sent == 0) break; head = rhead; } pthread_mutex_lock(&sc->esc_mtx); sc->esc_TDH = head; sc->esc_TDHr = rhead; cause = 0; if (tdwb) cause |= E1000_ICR_TXDW; if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT) cause |= E1000_ICR_TXQE; if (cause) e82545_icr_assert(sc, cause); DPRINTF("tx_run done: head %x, rhead %x, tail %x", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); } static _Noreturn void * e82545_tx_thread(void *param) { struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); for (;;) { while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) { if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT) break; sc->esc_tx_active = 0; if (sc->esc_tx_enabled == 0) pthread_cond_signal(&sc->esc_tx_cond); pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } sc->esc_tx_active = 1; /* Process some tx descriptors. Lock dropped inside. 
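 *
 * esc_mtx protects the ring registers; e82545_tx_run() drops it around
 * the actual copy out of guest memory so register writes are not
 * stalled behind a transmit, and esc_tx_active/esc_tx_cond let
 * e82545_tx_disable() wait for the worker to go idle.  A minimal
 * sketch of the same worker handshake with placeholder names (mtx,
 * cond, work_avail and busy are not softc fields; assumes <pthread.h>):
 *
 *	static void
 *	worker_loop(pthread_mutex_t *mtx, pthread_cond_t *cond,
 *	    int *work_avail, int *busy)
 *	{
 *		pthread_mutex_lock(mtx);
 *		for (;;) {
 *			while (*work_avail == 0)
 *				pthread_cond_wait(cond, mtx);
 *			*busy = 1;
 *			pthread_mutex_unlock(mtx);	// heavy work unlocked
 *			// ... copy packets out of guest memory ...
 *			pthread_mutex_lock(mtx);
 *			*work_avail = 0;
 *			*busy = 0;
 *			pthread_cond_signal(cond);	// wake a disable path
 *		}
 *	}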
*/ e82545_tx_run(sc); } } static void e82545_tx_start(struct e82545_softc *sc) { if (sc->esc_tx_active == 0) pthread_cond_signal(&sc->esc_tx_cond); } static void e82545_tx_enable(struct e82545_softc *sc) { sc->esc_tx_enabled = 1; } static void e82545_tx_disable(struct e82545_softc *sc) { sc->esc_tx_enabled = 0; while (sc->esc_tx_active) pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } static void e82545_rx_enable(struct e82545_softc *sc) { sc->esc_rx_enabled = 1; } static void e82545_rx_disable(struct e82545_softc *sc) { sc->esc_rx_enabled = 0; while (sc->esc_rx_active) pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx); } static void e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) { struct eth_uni *eu; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV); eu->eu_addrsel = (wval >> 16) & 0x3; eu->eu_eth.octet[5] = wval >> 8; eu->eu_eth.octet[4] = wval; } else { /* RAL */ eu->eu_eth.octet[3] = wval >> 24; eu->eu_eth.octet[2] = wval >> 16; eu->eu_eth.octet[1] = wval >> 8; eu->eu_eth.octet[0] = wval; } } static uint32_t e82545_read_ra(struct e82545_softc *sc, int reg) { struct eth_uni *eu; uint32_t retval; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ retval = (eu->eu_valid << 31) | (eu->eu_addrsel << 16) | (eu->eu_eth.octet[5] << 8) | eu->eu_eth.octet[4]; } else { /* RAL */ retval = (eu->eu_eth.octet[3] << 24) | (eu->eu_eth.octet[2] << 16) | (eu->eu_eth.octet[1] << 8) | eu->eu_eth.octet[0]; } return (retval); } static void e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value) { int ridx; if (offset & 0x3) { DPRINTF("Unaligned register write offset:0x%x value:0x%x", offset, value); return; } DPRINTF("Register write: 0x%x value: 0x%x", offset, value); switch (offset) { case E1000_CTRL: case E1000_CTRL_DUP: e82545_devctl(sc, value); break; case E1000_FCAL: sc->esc_FCAL = value; break; case E1000_FCAH: sc->esc_FCAH = value & ~0xFFFF0000; break; case E1000_FCT: sc->esc_FCT = value & ~0xFFFF0000; break; case E1000_VET: sc->esc_VET = value & ~0xFFFF0000; break; case E1000_FCTTV: sc->esc_FCTTV = value & ~0xFFFF0000; break; case E1000_LEDCTL: sc->esc_LEDCTL = value & ~0x30303000; break; case E1000_PBA: sc->esc_PBA = value & 0x0000FF80; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: e82545_intr_write(sc, offset, value); break; case E1000_RCTL: e82545_rx_ctl(sc, value); break; case E1000_FCRTL: sc->esc_FCRTL = value & ~0xFFFF0007; break; case E1000_FCRTH: sc->esc_FCRTH = value & ~0xFFFF0007; break; case E1000_RDBAL(0): sc->esc_RDBAL = value & ~0xF; if (sc->esc_rx_enabled) { /* Apparently legal: update cached address */ e82545_rx_update_rdba(sc); } break; case E1000_RDBAH(0): assert(!sc->esc_rx_enabled); sc->esc_RDBAH = value; break; case E1000_RDLEN(0): assert(!sc->esc_rx_enabled); sc->esc_RDLEN = value & ~0xFFF0007F; break; case E1000_RDH(0): /* XXX should only ever be zero ? Range check ? */ sc->esc_RDH = value; break; case E1000_RDT(0): /* XXX if this opens up the rx ring, do something ? 
*/ sc->esc_RDT = value; break; case E1000_RDTR: /* ignore FPD bit 31 */ sc->esc_RDTR = value & ~0xFFFF0000; break; case E1000_RXDCTL(0): sc->esc_RXDCTL = value & ~0xFEC0C0C0; break; case E1000_RADV: sc->esc_RADV = value & ~0xFFFF0000; break; case E1000_RSRPD: sc->esc_RSRPD = value & ~0xFFFFF000; break; case E1000_RXCSUM: sc->esc_RXCSUM = value & ~0xFFFFF800; break; case E1000_TXCW: sc->esc_TXCW = value & ~0x3FFF0000; break; case E1000_TCTL: e82545_tx_ctl(sc, value); break; case E1000_TIPG: sc->esc_TIPG = value; break; case E1000_AIT: sc->esc_AIT = value; break; case E1000_TDBAL(0): sc->esc_TDBAL = value & ~0xF; if (sc->esc_tx_enabled) { /* Apparently legal */ e82545_tx_update_tdba(sc); } break; case E1000_TDBAH(0): //assert(!sc->esc_tx_enabled); sc->esc_TDBAH = value; break; case E1000_TDLEN(0): //assert(!sc->esc_tx_enabled); sc->esc_TDLEN = value & ~0xFFF0007F; break; case E1000_TDH(0): //assert(!sc->esc_tx_enabled); /* XXX should only ever be zero ? Range check ? */ sc->esc_TDHr = sc->esc_TDH = value; break; case E1000_TDT(0): /* XXX range check ? */ sc->esc_TDT = value; if (sc->esc_tx_enabled) e82545_tx_start(sc); break; case E1000_TIDV: sc->esc_TIDV = value & ~0xFFFF0000; break; case E1000_TXDCTL(0): //assert(!sc->esc_tx_enabled); sc->esc_TXDCTL = value & ~0xC0C0C0; break; case E1000_TADV: sc->esc_TADV = value & ~0xFFFF0000; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; e82545_write_ra(sc, ridx, value); break; case E1000_MTA ... (E1000_MTA + (127*4)): sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value; break; case E1000_EECD: { //DPRINTF("EECD write 0x%x -> 0x%x", sc->eeprom_control, value); /* edge triggered low->high */ uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ? 0 : (value & E1000_EECD_SK)); uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS| E1000_EECD_DI|E1000_EECD_REQ); sc->eeprom_control &= ~eecd_mask; sc->eeprom_control |= (value & eecd_mask); /* grant/revoke immediately */ if (value & E1000_EECD_REQ) { sc->eeprom_control |= E1000_EECD_GNT; } else { sc->eeprom_control &= ~E1000_EECD_GNT; } if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) { e82545_eecd_strobe(sc); } return; } case E1000_MDIC: { uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >> E1000_MDIC_REG_SHIFT); uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT); sc->mdi_control = (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST)); if ((value & E1000_MDIC_READY) != 0) { DPRINTF("Incorrect MDIC ready bit: 0x%x", value); return; } switch (value & E82545_MDIC_OP_MASK) { case E1000_MDIC_OP_READ: sc->mdi_control &= ~E82545_MDIC_DATA_MASK; sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr); break; case E1000_MDIC_OP_WRITE: e82545_write_mdi(sc, reg_addr, phy_addr, value & E82545_MDIC_DATA_MASK); break; default: DPRINTF("Unknown MDIC op: 0x%x", value); return; } /* TODO: barrier? 
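 *
 * MDIC packs a whole PHY management transaction into one 32-bit word:
 * data in bits 15:0, the register and PHY addresses in the REG/PHY
 * fields, an opcode, and READY/ERROR status bits.  Since this
 * emulation completes the access synchronously, READY is visible on
 * the very next read.  A guest-side access would look roughly like the
 * following (mmio_read32/mmio_write32 are hypothetical BAR 0
 * accessors, not part of this file):
 *
 *	static uint16_t
 *	phy_read(uint8_t phy, uint8_t reg)
 *	{
 *		uint32_t mdic;
 *
 *		mdic = ((uint32_t)reg << E1000_MDIC_REG_SHIFT) |
 *		    ((uint32_t)phy << E1000_MDIC_PHY_SHIFT) |
 *		    E1000_MDIC_OP_READ;
 *		mmio_write32(E1000_MDIC, mdic);
 *		do {
 *			mdic = mmio_read32(E1000_MDIC);
 *		} while ((mdic & E1000_MDIC_READY) == 0);
 *		return (mdic & E82545_MDIC_DATA_MASK);
 *	}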
*/ sc->mdi_control |= E1000_MDIC_READY; if (value & E82545_MDIC_IE) { // TODO: generate interrupt } return; } case E1000_MANC: case E1000_STATUS: return; default: DPRINTF("Unknown write register: 0x%x value:%x", offset, value); return; } } static uint32_t e82545_read_register(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; int ridx; if (offset & 0x3) { DPRINTF("Unaligned register read offset:0x%x", offset); return 0; } DPRINTF("Register read: 0x%x", offset); switch (offset) { case E1000_CTRL: retval = sc->esc_CTRL; break; case E1000_STATUS: retval = E1000_STATUS_FD | E1000_STATUS_LU | E1000_STATUS_SPEED_1000; break; case E1000_FCAL: retval = sc->esc_FCAL; break; case E1000_FCAH: retval = sc->esc_FCAH; break; case E1000_FCT: retval = sc->esc_FCT; break; case E1000_VET: retval = sc->esc_VET; break; case E1000_FCTTV: retval = sc->esc_FCTTV; break; case E1000_LEDCTL: retval = sc->esc_LEDCTL; break; case E1000_PBA: retval = sc->esc_PBA; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: retval = e82545_intr_read(sc, offset); break; case E1000_RCTL: retval = sc->esc_RCTL; break; case E1000_FCRTL: retval = sc->esc_FCRTL; break; case E1000_FCRTH: retval = sc->esc_FCRTH; break; case E1000_RDBAL(0): retval = sc->esc_RDBAL; break; case E1000_RDBAH(0): retval = sc->esc_RDBAH; break; case E1000_RDLEN(0): retval = sc->esc_RDLEN; break; case E1000_RDH(0): retval = sc->esc_RDH; break; case E1000_RDT(0): retval = sc->esc_RDT; break; case E1000_RDTR: retval = sc->esc_RDTR; break; case E1000_RXDCTL(0): retval = sc->esc_RXDCTL; break; case E1000_RADV: retval = sc->esc_RADV; break; case E1000_RSRPD: retval = sc->esc_RSRPD; break; case E1000_RXCSUM: retval = sc->esc_RXCSUM; break; case E1000_TXCW: retval = sc->esc_TXCW; break; case E1000_TCTL: retval = sc->esc_TCTL; break; case E1000_TIPG: retval = sc->esc_TIPG; break; case E1000_AIT: retval = sc->esc_AIT; break; case E1000_TDBAL(0): retval = sc->esc_TDBAL; break; case E1000_TDBAH(0): retval = sc->esc_TDBAH; break; case E1000_TDLEN(0): retval = sc->esc_TDLEN; break; case E1000_TDH(0): retval = sc->esc_TDH; break; case E1000_TDT(0): retval = sc->esc_TDT; break; case E1000_TIDV: retval = sc->esc_TIDV; break; case E1000_TXDCTL(0): retval = sc->esc_TXDCTL; break; case E1000_TADV: retval = sc->esc_TADV; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; retval = e82545_read_ra(sc, ridx); break; case E1000_MTA ... (E1000_MTA + (127*4)): retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2]; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2]; break; case E1000_EECD: //DPRINTF("EECD read %x", sc->eeprom_control); retval = sc->eeprom_control; break; case E1000_MDIC: retval = sc->mdi_control; break; case E1000_MANC: retval = 0; break; /* stats that we emulate. 
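 *
 * Most of the counters below are plain 32-bit registers, but the octet
 * counts are kept as 64-bit values in the softc and exposed through
 * low/high register pairs (GORCL/GORCH, GOTCL/GOTCH, TORL/TORH), so a
 * driver reassembles them from two reads.  Roughly (read_reg is a
 * hypothetical register accessor, not part of this file):
 *
 *	static uint64_t
 *	read_counter64(uint32_t lo_reg, uint32_t hi_reg)
 *	{
 *		uint64_t lo = read_reg(lo_reg);	// e.g. E1000_GORCL
 *		uint64_t hi = read_reg(hi_reg);	// e.g. E1000_GORCH
 *
 *		return ((hi << 32) | lo);
 *	}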
*/ case E1000_MPC: retval = sc->missed_pkt_count; break; case E1000_PRC64: retval = sc->pkt_rx_by_size[0]; break; case E1000_PRC127: retval = sc->pkt_rx_by_size[1]; break; case E1000_PRC255: retval = sc->pkt_rx_by_size[2]; break; case E1000_PRC511: retval = sc->pkt_rx_by_size[3]; break; case E1000_PRC1023: retval = sc->pkt_rx_by_size[4]; break; case E1000_PRC1522: retval = sc->pkt_rx_by_size[5]; break; case E1000_GPRC: retval = sc->good_pkt_rx_count; break; case E1000_BPRC: retval = sc->bcast_pkt_rx_count; break; case E1000_MPRC: retval = sc->mcast_pkt_rx_count; break; case E1000_GPTC: case E1000_TPT: retval = sc->good_pkt_tx_count; break; case E1000_GORCL: retval = (uint32_t)sc->good_octets_rx; break; case E1000_GORCH: retval = (uint32_t)(sc->good_octets_rx >> 32); break; case E1000_TOTL: case E1000_GOTCL: retval = (uint32_t)sc->good_octets_tx; break; case E1000_TOTH: case E1000_GOTCH: retval = (uint32_t)(sc->good_octets_tx >> 32); break; case E1000_ROC: retval = sc->oversize_rx_count; break; case E1000_TORL: retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets); break; case E1000_TORH: retval = (uint32_t)((sc->good_octets_rx + sc->missed_octets) >> 32); break; case E1000_TPR: retval = sc->good_pkt_rx_count + sc->missed_pkt_count + sc->oversize_rx_count; break; case E1000_PTC64: retval = sc->pkt_tx_by_size[0]; break; case E1000_PTC127: retval = sc->pkt_tx_by_size[1]; break; case E1000_PTC255: retval = sc->pkt_tx_by_size[2]; break; case E1000_PTC511: retval = sc->pkt_tx_by_size[3]; break; case E1000_PTC1023: retval = sc->pkt_tx_by_size[4]; break; case E1000_PTC1522: retval = sc->pkt_tx_by_size[5]; break; case E1000_MPTC: retval = sc->mcast_pkt_tx_count; break; case E1000_BPTC: retval = sc->bcast_pkt_tx_count; break; case E1000_TSCTC: retval = sc->tso_tx_count; break; /* stats that are always 0. 
*/ case E1000_CRCERRS: case E1000_ALGNERRC: case E1000_SYMERRS: case E1000_RXERRC: case E1000_SCC: case E1000_ECOL: case E1000_MCC: case E1000_LATECOL: case E1000_COLC: case E1000_DC: case E1000_TNCRS: case E1000_SEC: case E1000_CEXTERR: case E1000_RLEC: case E1000_XONRXC: case E1000_XONTXC: case E1000_XOFFRXC: case E1000_XOFFTXC: case E1000_FCRUC: case E1000_RNBC: case E1000_RUC: case E1000_RFC: case E1000_RJC: case E1000_MGTPRC: case E1000_MGTPDC: case E1000_MGTPTC: case E1000_TSCTFC: retval = 0; break; default: DPRINTF("Unknown read register: 0x%x", offset); retval = 0; break; } return (retval); } static void e82545_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct e82545_softc *sc; //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d", baridx, offset, value, size); sc = pi->pi_arg; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr write sz:%d value:0x%lx", size, value); } else sc->io_addr = (uint32_t)value; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data write size:%d value:0x%lx", size, value); } else if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io write addr:0x%x value:0x%lx", sc->io_addr, value); } else e82545_write_register(sc, sc->io_addr, (uint32_t)value); break; default: DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d", offset, value, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx", size, offset, value); } else e82545_write_register(sc, (uint32_t)offset, (uint32_t)value); break; default: DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d", baridx, offset, value, size); } pthread_mutex_unlock(&sc->esc_mtx); } static uint64_t e82545_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct e82545_softc *sc; uint64_t retval; //DPRINTF("Read bar:%d offset:0x%lx size:%d", baridx, offset, size); sc = pi->pi_arg; retval = 0; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr read sz:%d", size); } else retval = sc->io_addr; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data read sz:%d", size); } if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io read addr:0x%x", sc->io_addr); } else retval = e82545_read_register(sc, sc->io_addr); break; default: DPRINTF("Unknown io bar read offset:0x%lx size:%d", offset, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register read size:%d offset:0x%lx", size, offset); } else retval = e82545_read_register(sc, (uint32_t)offset); break; default: DPRINTF("Unknown read bar:%d offset:0x%lx size:%d", baridx, offset, size); break; } pthread_mutex_unlock(&sc->esc_mtx); return (retval); } static void e82545_reset(struct e82545_softc *sc, int drvr) { int i; e82545_rx_disable(sc); e82545_tx_disable(sc); /* clear outstanding interrupts */ if (sc->esc_irq_asserted) pci_lintr_deassert(sc->esc_pi); /* misc */ if (!drvr) { sc->esc_FCAL = 0; sc->esc_FCAH = 0; sc->esc_FCT = 0; sc->esc_VET = 0; sc->esc_FCTTV = 0; } sc->esc_LEDCTL = 0x07061302; sc->esc_PBA = 0x00100030; /* start nvm in opcode mode. 
*/ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN; e82545_init_eeprom(sc); /* interrupt */ sc->esc_ICR = 0; sc->esc_ITR = 250; sc->esc_ICS = 0; sc->esc_IMS = 0; sc->esc_IMC = 0; /* L2 filters */ if (!drvr) { memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan)); memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast)); memset(sc->esc_uni, 0, sizeof(sc->esc_uni)); /* XXX not necessary on 82545 ?? */ sc->esc_uni[0].eu_valid = 1; memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet, ETHER_ADDR_LEN); } else { /* Clear RAH valid bits */ for (i = 0; i < 16; i++) sc->esc_uni[i].eu_valid = 0; } /* receive */ if (!drvr) { sc->esc_RDBAL = 0; sc->esc_RDBAH = 0; } sc->esc_RCTL = 0; sc->esc_FCRTL = 0; sc->esc_FCRTH = 0; sc->esc_RDLEN = 0; sc->esc_RDH = 0; sc->esc_RDT = 0; sc->esc_RDTR = 0; sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */ sc->esc_RADV = 0; sc->esc_RXCSUM = 0; /* transmit */ if (!drvr) { sc->esc_TDBAL = 0; sc->esc_TDBAH = 0; sc->esc_TIPG = 0; sc->esc_AIT = 0; sc->esc_TIDV = 0; sc->esc_TADV = 0; } sc->esc_tdba = 0; sc->esc_txdesc = NULL; sc->esc_TXCW = 0; sc->esc_TCTL = 0; sc->esc_TDLEN = 0; sc->esc_TDT = 0; sc->esc_TDHr = sc->esc_TDH = 0; sc->esc_TXDCTL = 0; } static int e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { char nstr[80]; struct e82545_softc *sc; char *devname; char *vtopts; int mac_provided; DPRINTF("Loading with options: %s", opts); /* Setup our softc */ sc = calloc(1, sizeof(*sc)); pi->pi_arg = sc; sc->esc_pi = pi; sc->esc_ctx = ctx; pthread_mutex_init(&sc->esc_mtx, NULL); pthread_cond_init(&sc->esc_rx_cond, NULL); pthread_cond_init(&sc->esc_tx_cond, NULL); pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc); snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->esc_tx_tid, nstr); pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER); pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1); /* TODO: this card also supports msi, but the freebsd driver for it * does not, so I have not implemented it. */ pci_lintr_request(pi); pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32, E82545_BAR_REGISTER_LEN); pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32, E82545_BAR_FLASH_LEN); pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO, E82545_BAR_IO_LEN); /* * Attempt to open the net backend and read the MAC address * if specified. Copied from virtio-net, slightly modified. */ mac_provided = 0; sc->esc_be = NULL; if (opts != NULL) { int err = 0; devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); /* * Parse the list of options in the form * key1=value1,...,keyN=valueN. 
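 *
 * The option string is the backend name followed by key=value pairs,
 * e.g. "tap0,mac=00:a0:98:00:00:01" (MAC value shown only as an
 * example).  The loop below peels one pair off per iteration with
 * strsep(3); the same pattern in isolation, assuming <string.h> and a
 * writable copy of the string:
 *
 *	char *cp = opts_copy;			// writable copy of opts
 *	char *backend = strsep(&cp, ",");	// -> "tap0"
 *	while (cp != NULL) {
 *		char *value = cp;
 *		char *key = strsep(&value, "=");
 *		if (value == NULL)
 *			break;			// not a key=value pair
 *		cp = value;
 *		(void) strsep(&cp, ",");	// terminate this value
 *		// key -> "mac", value -> "00:a0:98:00:00:01"
 *	}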
*/ while (vtopts != NULL) { char *value = vtopts; char *key; key = strsep(&value, "="); if (value == NULL) break; vtopts = value; (void) strsep(&vtopts, ","); if (strcmp(key, "mac") == 0) { err = net_parsemac(value, sc->esc_mac.octet); if (err) break; mac_provided = 1; } } if (err) { free(devname); return (err); } err = netbe_init(&sc->esc_be, devname, e82545_rx_callback, sc); free(devname); if (err) return (err); } if (!mac_provided) { net_genmac(pi, sc->esc_mac.octet); } netbe_rx_enable(sc->esc_be); /* H/w initiated reset */ e82545_reset(sc, 0); return (0); } +#ifdef BHYVE_SNAPSHOT +static int +e82545_snapshot(struct vm_snapshot_meta *meta) +{ + int i; + int ret; + struct e82545_softc *sc; + struct pci_devinst *pi; + uint64_t bitmap_value; + + pi = meta->dev_data; + sc = pi->pi_arg; + + /* esc_mevp and esc_mevpitr should be reinitiated at init. */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_mac, meta, ret, done); + + /* General */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_CTRL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_VET, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCTTV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_LEDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_PBA, meta, ret, done); + + /* Interrupt control */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_irq_asserted, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ITR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICS, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMS, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMC, meta, ret, done); + + /* + * Transmit + * + * The fields in the unions are in superposition to access certain + * bytes in the larger uint variables. + * e.g., ip_config = [ipcss|ipcso|ipcse0|ipcse1] + */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.lower_setup.ip_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.upper_setup.tcp_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.cmd_and_length, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.tcp_seg_setup.data, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXCW, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIPG, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_AIT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tdba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDLEN, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDHr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIDV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TADV, meta, ret, done); + + /* Has dependency on esc_TDLEN; reoreder of fields from struct. 
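 *
 * The descriptor-ring pointer itself is a host virtual address into
 * guest memory, so it cannot be stored verbatim; the macro below
 * translates it through the guest physical mapping, and the ring
 * base/length registers snapshotted above are needed before that
 * pointer can be re-derived and validated on restore, which is
 * presumably the point of the ordering note.  Fields that are not
 * plain fixed-width scalars go through a temporary, as done for the
 * NVM state further down; the shape of that pattern, with 'field'
 * standing in for any such member:
 *
 *	if (meta->op == VM_SNAPSHOT_SAVE)
 *		tmp = sc->field;
 *	SNAPSHOT_VAR_OR_LEAVE(tmp, meta, ret, done);
 *	if (meta->op == VM_SNAPSHOT_RESTORE)
 *		sc->field = tmp;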
*/ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_txdesc, sc->esc_TDLEN, + true, meta, ret, done); + + /* L2 frame acceptance */ + for (i = 0; i < nitems(sc->esc_uni); i++) { + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_valid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_addrsel, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_eth, meta, ret, done); + } + + SNAPSHOT_BUF_OR_LEAVE(sc->esc_fmcast, sizeof(sc->esc_fmcast), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->esc_fvlan, sizeof(sc->esc_fvlan), + meta, ret, done); + + /* Receive */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_loopback, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rdba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDLEN, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDTR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RADV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RSRPD, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXCSUM, meta, ret, done); + + /* Has dependency on esc_RDLEN; reoreder of fields from struct. */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_rxdesc, sc->esc_TDLEN, + true, meta, ret, done); + + /* IO Port register access */ + SNAPSHOT_VAR_OR_LEAVE(sc->io_addr, meta, ret, done); + + /* Shadow copy of MDIC */ + SNAPSHOT_VAR_OR_LEAVE(sc->mdi_control, meta, ret, done); + + /* Shadow copy of EECD */ + SNAPSHOT_VAR_OR_LEAVE(sc->eeprom_control, meta, ret, done); + + /* Latest NVM in/out */ + SNAPSHOT_VAR_OR_LEAVE(sc->nvm_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->nvm_opaddr, meta, ret, done); + + /* Stats */ + SNAPSHOT_VAR_OR_LEAVE(sc->missed_pkt_count, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->pkt_rx_by_size, sizeof(sc->pkt_rx_by_size), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->pkt_tx_by_size, sizeof(sc->pkt_tx_by_size), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->oversize_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->tso_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_rx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_tx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->missed_octets, meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) + bitmap_value = sc->nvm_bits; + SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + sc->nvm_bits = bitmap_value; + + if (meta->op == VM_SNAPSHOT_SAVE) + bitmap_value = sc->nvm_bits; + SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + sc->nvm_bits = bitmap_value; + + /* EEPROM data */ + SNAPSHOT_BUF_OR_LEAVE(sc->eeprom_data, 
sizeof(sc->eeprom_data), + meta, ret, done); + +done: + return (ret); +} +#endif + struct pci_devemu pci_de_e82545 = { .pe_emu = "e1000", .pe_init = e82545_init, .pe_barwrite = e82545_write, - .pe_barread = e82545_read + .pe_barread = e82545_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = e82545_snapshot, +#endif }; PCI_EMUL_SET(pci_de_e82545); diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 145b33b5ffd2..e4b83896241e 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -1,2142 +1,2340 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include "acpi.h" #include "bhyverun.h" #include "debug.h" #include "inout.h" #include "ioapic.h" #include "mem.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) struct funcinfo { char *fi_name; char *fi_param; struct pci_devinst *fi_devi; }; struct intxinfo { int ii_count; int ii_pirq_pin; int ii_ioapic_irq; }; struct slotinfo { struct intxinfo si_intpins[4]; struct funcinfo si_funcs[MAXFUNCS]; }; struct businfo { uint16_t iobase, iolimit; /* I/O window */ uint32_t membase32, memlimit32; /* mmio window below 4GB */ uint64_t membase64, memlimit64; /* mmio window above 4GB */ struct slotinfo slotinfo[MAXSLOTS]; }; static struct businfo *pci_businfo[MAXBUSES]; SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); #define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMBASE64 0xD000000000UL #define PCI_EMUL_MEMLIMIT64 0xFD00000000UL static struct pci_devemu *pci_emul_finddev(char *name); static void pci_lintr_route(struct pci_devinst *pi); static void pci_lintr_update(struct pci_devinst *pi); static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); static __inline void CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) { if (bytes == 1) pci_set_cfgdata8(pi, coff, val); else if (bytes == 2) pci_set_cfgdata16(pi, coff, val); else pci_set_cfgdata32(pi, coff, val); } static __inline uint32_t CFGREAD(struct pci_devinst *pi, int coff, int bytes) { if (bytes == 1) return (pci_get_cfgdata8(pi, coff)); else if (bytes == 2) return (pci_get_cfgdata16(pi, coff)); else return (pci_get_cfgdata32(pi, coff)); } /* * I/O access */ /* * Slot options are in the form: * * ::,[,] * [:],[,] * * slot is 0..31 * func is 0..7 * emul is a string describing the type of PCI device e.g. virtio-net * config is an optional string, depending on the device, that can be * used for configuration. 
* Examples are: * 1,virtio-net,tap0 * 3:0,dummy */ static void pci_parse_slot_usage(char *aopt) { EPRINTLN("Invalid PCI slot info field \"%s\"", aopt); } int pci_parse_slot(char *opt) { struct businfo *bi; struct slotinfo *si; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* :: */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* : */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } if (pci_businfo[bnum] == NULL) pci_businfo[bnum] = calloc(1, sizeof(struct businfo)); bi = pci_businfo[bnum]; si = &bi->slotinfo[snum]; if (si->si_funcs[fnum].fi_name != NULL) { EPRINTLN("pci slot %d:%d already occupied!", snum, fnum); goto done; } if (pci_emul_finddev(emul) == NULL) { EPRINTLN("pci slot %d:%d: unknown device \"%s\"", snum, fnum, emul); goto done; } error = 0; si->si_funcs[fnum].fi_name = emul; si->si_funcs[fnum].fi_param = config; done: if (error) free(str); return (error); } void pci_print_supported_devices() { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; printf("%s\n", pdp->pe_emu); } } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accommodate reads from * ddb. 
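 *
 * Each MSI-X table entry is MSIX_TABLE_ENTRY_SIZE (16) bytes: message
 * address low/high, message data, and vector control.  Both the read
 * and write paths therefore split a BAR offset into an entry index and
 * an offset within the entry:
 *
 *	int tab_index = offset / MSIX_TABLE_ENTRY_SIZE;	// which entry
 *	int entry_off = offset % MSIX_TABLE_ENTRY_SIZE;	// which field
 *	// entry_off  0..7  -> 64-bit message address
 *	// entry_off  8..11 -> message data
 *	// entry_off 12..15 -> vector control (the mask bit lives here)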
*/ if (size != 1 && size != 4 && size != 8) return (retval); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned reads */ if ((msix_entry_offset % size) != 0) { return (retval); } tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index < pi->pi_msix.table_count) { /* valid MSI-X Table access */ dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 1) retval = *((uint8_t *)dest); else if (size == 4) retval = *((uint32_t *)dest); else retval = *((uint64_t *)dest); } else if (pci_valid_pba_offset(pi, offset)) { /* return 0 for PBA access */ retval = 0; } return (retval); } int pci_msix_table_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.table_bar); else return (-1); } int pci_msix_pba_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.pba_bar); else return (-1); } static int pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int i; for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && port >= pdi->pi_bar[i].addr && port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, offset, bytes); else (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, bytes, *eax); return (0); } } return (-1); } static int pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int) arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || pdi->pi_bar[bidx].type == PCIBAR_MEM64); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_WRITE) { if (size == 8) { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, 4, *val & 0xffffffff); (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4, 4, *val >> 32); } else { (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val); } } else { if (size == 8) { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, 4); *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset + 4, 4) << 32; } else { *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size); } } return (0); } static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { uint64_t base; assert((size & (size - 1)) == 0); /* must be a power of 2 */ base = roundup2(*baseptr, size); if (base + size <= limit) { *addr = base; *baseptr = base + size; return (0); } else return (-1); } int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); } /* * Register (or unregister) the MMIO or I/O region associated with the BAR * register 'idx' of an emulated pci device. 
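 *
 * (The addresses being registered come from pci_emul_alloc_resource()
 * above, which is a simple bump allocator: the BAR size is rounded up
 * to a power of two and the base is aligned to that size.  Natural
 * alignment is what lets a guest size a BAR with the usual
 * write-ones-and-read-back probe; illustratively, with hypothetical
 * config accessors:
 *
 *	pci_write_config32(bar_reg, 0xffffffff);
 *	v = pci_read_config32(bar_reg) & PCIM_BAR_MEM_BASE;
 *	size = (uint32_t)-v;	// lowest settable address bit == size
 *
 * and that probe assumes the low 'size' bits of the base are always
 * zero, i.e. the base is size-aligned.)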
*/ static void modify_bar_registration(struct pci_devinst *pi, int idx, int registration) { int error; struct inout_port iop; struct mem_range mr; switch (pi->pi_bar[idx].type) { case PCIBAR_IO: bzero(&iop, sizeof(struct inout_port)); iop.name = pi->pi_name; iop.port = pi->pi_bar[idx].addr; iop.size = pi->pi_bar[idx].size; if (registration) { iop.flags = IOPORT_F_INOUT; iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); } else error = unregister_inout(&iop); break; case PCIBAR_MEM32: case PCIBAR_MEM64: bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_mem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); break; default: error = EINVAL; break; } assert(error == 0); } static void unregister_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 0); } static void register_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 1); } /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_PORTEN); } /* Are we decoding memory accesses for the emulated pci device? */ static int memen(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_MEMEN); } /* * Update the MMIO or I/O address that is decoded by the BAR register. * * If the pci device has enabled the address space decoding then intercept * the address range decoded by the BAR register. */ static void update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; if (pi->pi_bar[idx].type == PCIBAR_IO) decode = porten(pi); else decode = memen(pi); if (decode) unregister_bar(pi, idx); switch (type) { case PCIBAR_IO: case PCIBAR_MEM32: pi->pi_bar[idx].addr = addr; break; case PCIBAR_MEM64: pi->pi_bar[idx].addr &= ~0xffffffffUL; pi->pi_bar[idx].addr |= addr; break; case PCIBAR_MEMHI64: pi->pi_bar[idx].addr &= 0xffffffff; pi->pi_bar[idx].addr |= addr; break; default: assert(0); } if (decode) register_bar(pi, idx); } int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size) { int error; uint64_t *baseptr, limit, addr, mask, lobits, bar; uint16_t cmd, enbit; assert(idx >= 0 && idx <= PCI_BARMAX); if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ /* Enforce minimum BAR sizes required by the PCI standard */ if (type == PCIBAR_IO) { if (size < 4) size = 4; } else { if (size < 16) size = 16; } switch (type) { case PCIBAR_NONE: baseptr = NULL; addr = mask = lobits = enbit = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; enbit = PCIM_CMD_PORTEN; break; case PCIBAR_MEM64: /* * XXX * Some drivers do not work well if the 64-bit BAR is allocated * above 4GB. Allow for this by allocating small requests under * 4GB unless then allocation size is larger than some arbitrary * number (32MB currently). 
*/ if (size > 32 * 1024 * 1024) { /* * XXX special case for device requiring peer-peer DMA */ if (size == 0x100000000UL) baseptr = &hostbase; else baseptr = &pci_emul_membase64; limit = PCI_EMUL_MEMLIMIT64; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } enbit = PCIM_CMD_MEMEN; break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; enbit = PCIM_CMD_MEMEN; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); } if (baseptr != NULL) { error = pci_emul_alloc_resource(baseptr, limit, size, &addr); if (error != 0) return (error); } pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); if (type == PCIBAR_MEM64) { assert(idx + 1 <= PCI_BARMAX); pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND); if ((cmd & enbit) != enbit) pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit); register_bar(pdi, idx); return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * pci_emul_finddev(char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pdi->pi_lintr.pirq_pin = 0; pdi->pi_lintr.ioapic_irq = 0; pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(ctx, pdi, fi->fi_param); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) 
== 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. * Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off; off = offset - capoff; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. 
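 *
 * This is the usual pattern for emulated registers that mix read-only
 * and read-write bits: merge only the writable bits of the guest's
 * value into the current contents,
 *
 *	new = (cur & ~rwmask) | (val & rwmask);
 *
 * where rwmask here covers just the MSI enable bit and the MME
 * (multiple message enable) field; everything else in the message
 * control word keeps the value the device advertised.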
*/ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; } CFGWRITE(pi, offset, val, bytes); msgctrl = pci_get_cfgdata16(pi, capoff + 2); addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } void pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; bzero(&pciecap, sizeof(pciecap)); /* * Use the integrated endpoint type for endpoints on a root complex bus. * * NB: bhyve currently only supports a single PCI bus that is the root * complex bus, so all endpoints are integrated. */ if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0)) type = PCIEM_TYPE_ROOT_INT_EP; pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | type; if (type != PCIEM_TYPE_ROOT_INT_EP) { pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ } err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the * config space. */ static void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) { int capid; uint8_t capoff, nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; /* Find the capability that we want to update */ capoff = CAP_START_OFFSET; while (1) { nextoff = pci_get_cfgdata8(pi, capoff + 1); if (nextoff == 0) break; if (offset >= capoff && offset < nextoff) break; capoff = nextoff; } assert(offset >= capoff); /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. */ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } capid = pci_get_cfgdata8(pi, capoff); switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. 
*/ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } static int pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int bytes, uint64_t *val, void *arg1, long arg2) { int bus, slot, func, coff, in; coff = addr & 0xfff; func = (addr >> 12) & 0x7; slot = (addr >> 15) & 0x1f; bus = (addr >> 20) & 0xff; in = (dir == MEM_F_READ); if (in) *val = ~0UL; pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); return (0); } uint64_t pci_ecfg_base(void) { return (PCI_EMUL_ECFG_BASE); } #define BUSIO_ROUNDUP 32 #define BUSMEM_ROUNDUP (1024 * 1024) int init_pci(struct vmctx *ctx) { struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; size_t lowmem; int bus, slot, func; int error; pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = vm_get_lowmem_limit(ctx); pci_emul_membase64 = PCI_EMUL_MEMBASE64; for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; /* * Keep track of the i/o and memory resources allocated to * this bus. */ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_name == NULL) continue; pde = pci_emul_finddev(fi->fi_name); assert(pde != NULL); error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs. */ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. */ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } lpc_pirq_routed(); /* * The guest physical memory map looks like the following: * [0, lowmem) guest system memory * [lowmem, lowmem_limit) memory hole (may be absent) * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) * [0xE0000000, 0xF0000000) PCI extended config window * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) */ /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. 
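 *
 * (Within the PCI extended config window each bus gets 1MB and each
 * function 4KB of configuration space, ECAM style; the handler above
 * slices the address as bus = bits 27:20, slot = bits 19:15,
 * func = bits 14:12, register offset = bits 11:0.  Illustratively, the
 * guest physical address of register 'reg' of function (bus, slot,
 * func) is
 *
 *	ecfg_addr = PCI_EMUL_ECFG_BASE + ((uint64_t)bus << 20) +
 *	    ((uint64_t)slot << 15) + ((uint64_t)func << 12) + reg;
 *
 * which is the layout the decode below inverts.)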
*/ lowmem = vm_get_lowmem_size(ctx); bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI hole"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = lowmem; mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; mr.handler = pci_emul_fallback_handler; error = register_mem_fallback(&mr); assert(error == 0); /* PCI extended config space */ bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI ECFG"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = PCI_EMUL_ECFG_BASE; mr.size = PCI_EMUL_ECFG_SIZE; mr.handler = pci_emul_ecfg_handler; error = register_mem(&mr); assert(error == 0); return (0); } static void pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) { char *name; name = lpc_pirq_name(pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. */ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int count, func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. */ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit - bi->iobase); dsdt_line(" ,, , TypeStatic)"); /* mmio 
window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); count = pci_count_lintr(bus); if (count != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); done: dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just 
allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. */ si = &bi->slotinfo[pi->pi_slot]; bestpin = 0; bestcount = si->si_intpins[0].ii_count; for (pin = 1; pin < 4; pin++) { if (si->si_intpins[pin].ii_count < bestcount) { bestpin = pin; bestcount = si->si_intpins[pin].ii_count; } } si->si_intpins[bestpin].ii_count++; pi->pi_lintr.pin = bestpin + 1; pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); } static void pci_lintr_route(struct pci_devinst *pi) { struct businfo *bi; struct intxinfo *ii; if (pi->pi_lintr.pin == 0) return; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; /* * Attempt to allocate an I/O APIC pin for this intpin if one * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi); assert(ii->ii_ioapic_irq > 0); /* * Attempt to allocate a PIRQ pin for this intpin if one is * not yet assigned. */ if (ii->ii_pirq_pin == 0) ii->ii_pirq_pin = pirq_alloc_pin(pi); assert(ii->ii_pirq_pin > 0); pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin)); } void pci_lintr_assert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == IDLE) { if (pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } else pi->pi_lintr.state = PENDING; } pthread_mutex_unlock(&pi->pi_lintr.lock); } void pci_lintr_deassert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED) { pi->pi_lintr.state = IDLE; pci_irq_deassert(pi); } else if (pi->pi_lintr.state == PENDING) pi->pi_lintr.state = IDLE; pthread_mutex_unlock(&pi->pi_lintr.lock); } static void pci_lintr_update(struct pci_devinst *pi) { pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { pci_irq_deassert(pi); pi->pi_lintr.state = PENDING; } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); } int pci_count_lintr(int bus) { int count, slot, pin; struct slotinfo *slotinfo; count = 0; if (pci_businfo[bus] != NULL) { for (slot = 0; slot < MAXSLOTS; slot++) { slotinfo = &pci_businfo[bus]->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { if (slotinfo->si_intpins[pin].ii_count != 0) count++; } } } return (count); } void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) { struct businfo *bi; struct slotinfo *si; struct intxinfo *ii; int slot, pin; if ((bi = pci_businfo[bus]) == NULL) return; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { ii = &si->si_intpins[pin]; if (ii->ii_count != 0) cb(bus, slot, pin + 1, ii->ii_pirq_pin, ii->ii_ioapic_irq, arg); } } } /* * Return 1 if the emulated device in 'slot' is a multi-function device. * Return 0 otherwise. */ static int pci_emul_is_mfdev(int bus, int slot) { struct businfo *bi; struct slotinfo *si; int f, numfuncs; numfuncs = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; for (f = 0; f < MAXFUNCS; f++) { if (si->si_funcs[f].fi_devi != NULL) { numfuncs++; } } } return (numfuncs > 1); } /* * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on * whether or not is a multi-function being emulated in the pci 'slot'. 
*/ static void pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) { int mfdev; if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { mfdev = pci_emul_is_mfdev(bus, slot); switch (bytes) { case 1: case 2: *rv &= ~PCIM_MFDEV; if (mfdev) { *rv |= PCIM_MFDEV; } break; case 4: *rv &= ~(PCIM_MFDEV << 16); if (mfdev) { *rv |= (PCIM_MFDEV << 16); } break; } } } /* * Update device state in response to changes to the PCI command * register. */ void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old) { int i; uint16_t changed, new; new = pci_get_cfgdata16(pi, PCIR_COMMAND); changed = old ^ new; /* * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. */ for (i = 0; i <= PCI_BARMAX; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: break; case PCIBAR_IO: /* I/O address space decoding changed? */ if (changed & PCIM_CMD_PORTEN) { if (new & PCIM_CMD_PORTEN) register_bar(pi, i); else unregister_bar(pi, i); } break; case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ if (changed & PCIM_CMD_MEMEN) { if (new & PCIM_CMD_MEMEN) register_bar(pi, i); else unregister_bar(pi, i); } break; default: assert(0); } } /* * If INTx has been unmasked and is pending, assert the * interrupt. */ pci_lintr_update(pi); } static void pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { int rshift; uint32_t cmd, old, readonly; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. * * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are * 'write 1 to clear'. However these bits are not set to '1' by * any device emulation so it is simpler to treat them as readonly. */ rshift = (coff & 0x3) * 8; readonly = 0xFFFFF880 >> rshift; old = CFGREAD(pi, coff, bytes); new &= ~readonly; new |= (old & readonly); CFGWRITE(pi, coff, new, bytes); /* update config */ pci_emul_cmd_changed(pi, cmd); } static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *eax) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; uint64_t addr, bar, mask; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; pi = si->si_funcs[func].fi_devi; } else pi = NULL; /* * Just return if there is no device at this slot:func or if the * the guest is doing an un-aligned access. */ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || (coff & (bytes - 1)) != 0) { if (in) *eax = 0xffffffff; return; } /* * Ignore all writes beyond the standard config space and return all * ones on reads. */ if (coff >= PCI_REGMAX + 1) { if (in) { *eax = 0xffffffff; /* * Extended capabilities begin at offset 256 in config * space. Absence of extended capabilities is signaled * with all 0s in the extended capability header at * offset 256. 
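 * Reads that touch the dword at offset 0x100 therefore return zero,
 * while reads further out in the extended space return all ones.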
*/ if (coff <= PCI_REGMAX + 4) *eax = 0x00000000; } return; } pe = pi->pi_d; /* * Config read */ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, eax); } else { needcfg = 1; } if (needcfg) *eax = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) return; /* * Special handling for write to BAR registers */ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) return; idx = (coff - PCIR_BAR(0)) / 4; mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *eax & mask; addr &= 0xffff; bar = addr | PCIM_BAR_IO_SPACE; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *eax & mask; bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = ((uint64_t)*eax << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax); } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { pci_emul_cmdsts_write(pi, coff, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } } } static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; static int pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { uint32_t x; if (bytes != 4) { if (in) *eax = (bytes == 2) ? 0xffff : 0xff; return (0); } if (in) { x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; if (cfgenable) x |= CONF1_ENABLE; *eax = x; } else { x = *eax; cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; cfgoff = x & PCI_REGMAX; cfgfunc = (x >> 8) & PCI_FUNCMAX; cfgslot = (x >> 11) & PCI_SLOTMAX; cfgbus = (x >> 16) & PCI_BUSMAX; } return (0); } INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); static int pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { int coff; assert(bytes == 1 || bytes == 2 || bytes == 4); coff = cfgoff + (port - CONF1_DATA_PORT); if (cfgenable) { pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax); } else { /* Ignore accesses to cfgdata if not enabled by cfgaddr */ if (in) *eax = 0xffffffff; } return (0); } INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); +#ifdef BHYVE_SNAPSHOT +/* + * Saves/restores PCI device emulated state. 
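+ * The MSI and MSI-X state, the raw config space, the BAR layout and the
+ * MSI-X table entries are all captured with the SNAPSHOT_*_OR_LEAVE macros.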
Returns 0 on success. + */ +static int +pci_snapshot_pci_dev(struct vm_snapshot_meta *meta) +{ + struct pci_devinst *pi; + int i; + int ret; + + pi = meta->dev_data; + + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_page_offset, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata), + meta, ret, done); + + for (i = 0; i < nitems(pi->pi_bar); i++) { + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done); + } + + /* Restore MSI-X table. */ + for (i = 0; i < pi->pi_msix.table_count; i++) { + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control, + meta, ret, done); + } + +done: + return (ret); +} + +static int +pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde, + struct pci_devinst **pdi) +{ + struct businfo *bi; + struct slotinfo *si; + struct funcinfo *fi; + int bus, slot, func; + + assert(dev_name != NULL); + assert(pde != NULL); + assert(pdi != NULL); + + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_name == NULL) + continue; + if (strcmp(dev_name, fi->fi_name)) + continue; + + *pde = pci_emul_finddev(fi->fi_name); + assert(*pde != NULL); + + *pdi = fi->fi_devi; + return (0); + } + } + } + + return (EINVAL); +} + +int +pci_snapshot(struct vm_snapshot_meta *meta) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(meta->dev_name != NULL); + + ret = pci_find_slotted_dev(meta->dev_name, &pde, &pdi); + if (ret != 0) { + fprintf(stderr, "%s: no such name: %s\r\n", + __func__, meta->dev_name); + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + return (0); + } + + meta->dev_data = pdi; + + if (pde->pe_snapshot == NULL) { + fprintf(stderr, "%s: not implemented yet for: %s\r\n", + __func__, meta->dev_name); + return (-1); + } + + ret = pci_snapshot_pci_dev(meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to snapshot pci dev\r\n", + __func__); + return (-1); + } + + ret = (*pde->pe_snapshot)(meta); + + return (ret); +} + +int +pci_pause(struct vmctx *ctx, const char *dev_name) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(dev_name != NULL); + + ret = pci_find_slotted_dev(dev_name, &pde, &pdi); + if (ret != 0) { + /* + * It is possible to call this function without + * checking that the device is inserted first. 
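+ * In that case the pause request is treated as a no-op and 0 is returned.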
+ */ + fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); + return (0); + } + + if (pde->pe_pause == NULL) { + /* The pause/resume functionality is optional. */ + fprintf(stderr, "%s: not implemented for: %s\n", + __func__, dev_name); + return (0); + } + + return (*pde->pe_pause)(ctx, pdi); +} + +int +pci_resume(struct vmctx *ctx, const char *dev_name) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(dev_name != NULL); + + ret = pci_find_slotted_dev(dev_name, &pde, &pdi); + if (ret != 0) { + /* + * It is possible to call this function without + * checking that the device is inserted first. + */ + fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); + return (0); + } + + if (pde->pe_resume == NULL) { + /* The pause/resume functionality is optional. */ + fprintf(stderr, "%s: not implemented for: %s\n", + __func__, dev_name); + return (0); + } + + return (*pde->pe_resume)(ctx, pdi); +} +#endif + #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { - uint8_t ioregs[DIOSZ]; + uint8_t ioregs[DIOSZ]; uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } i = baridx - 1; /* 'memregs' index */ if (size == 1) { sc->memregs[i][offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } - + /* * magic interrupt ?? 
*/ } if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; int i; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } - + value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } - + i = baridx - 1; /* 'memregs' index */ if (size == 1) { value = sc->memregs[i][offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } +#ifdef BHYVE_SNAPSHOT +int +pci_emul_snapshot(struct vm_snapshot_meta *meta) +{ + + return (0); +} +#endif + struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, - .pe_barread = pci_emul_dior + .pe_barread = pci_emul_dior, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_emul_snapshot, +#endif }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index fba2e8845af8..1cefa5ed042d 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -1,292 +1,303 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _PCI_EMUL_H_ #define _PCI_EMUL_H_ #include #include #include #include #include #include #define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ struct vmctx; struct pci_devinst; struct memory_region; +struct vm_snapshot_meta; struct pci_devemu { char *pe_emu; /* Name of device emulation */ /* instance creation */ int (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts); /* ACPI DSDT enumeration */ void (*pe_write_dsdt)(struct pci_devinst *); /* config space read/write callbacks */ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t val); int (*pe_cfgread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int offset, int bytes, uint32_t *retval); /* BAR read/write callbacks */ void (*pe_barwrite)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); + + /* Save/restore device state */ + int (*pe_snapshot)(struct vm_snapshot_meta *meta); + int (*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi); + int (*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi); }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); enum pcibar_type { PCIBAR_NONE, PCIBAR_IO, PCIBAR_MEM32, PCIBAR_MEM64, PCIBAR_MEMHI64 }; struct pcibar { enum pcibar_type type; /* io or memory */ uint64_t size; uint64_t addr; }; #define PI_NAMESZ 40 struct msix_table_entry { uint64_t addr; uint32_t msg_data; uint32_t vector_control; } __packed; /* * In case the structure is modified to hold extra information, use a define * for the size that should be emulated. */ #define MSIX_TABLE_ENTRY_SIZE 16 #define MAX_MSIX_TABLE_ENTRIES 2048 #define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) enum lintr_stat { IDLE, ASSERTED, PENDING }; struct pci_devinst { struct pci_devemu *pi_d; struct vmctx *pi_vmctx; uint8_t pi_bus, pi_slot, pi_func; char pi_name[PI_NAMESZ]; int pi_bar_getsize; int pi_prevcap; int pi_capend; struct { int8_t pin; enum lintr_stat state; int pirq_pin; int ioapic_irq; pthread_mutex_t lock; } pi_lintr; struct { int enabled; uint64_t addr; uint64_t msg_data; int maxmsgnum; } pi_msi; struct { int enabled; int table_bar; int pba_bar; uint32_t table_offset; int table_count; uint32_t pba_offset; int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ void *pba_page; int pba_page_offset; } pi_msix; void *pi_arg; /* devemu-private data */ u_char pi_cfgdata[PCI_REGMAX + 1]; struct pcibar pi_bar[PCI_BARMAX + 1]; }; struct msicap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t addrlo; uint32_t addrhi; uint16_t msgdata; } __packed; static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed"); struct msixcap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t table_info; /* bar index and offset within it */ uint32_t pba_info; /* bar index and offset within it */ } __packed; static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed"); struct pciecap { uint8_t capid; uint8_t nextptr; uint16_t pcie_capabilities; uint32_t dev_capabilities; /* all devices */ uint16_t dev_control; uint16_t dev_status; uint32_t link_capabilities; /* devices with links */ uint16_t link_control; uint16_t link_status; uint32_t slot_capabilities; /* ports with slots */ uint16_t slot_control; uint16_t slot_status; uint16_t root_control; /* root ports */ uint16_t root_capabilities; uint32_t 
root_status; uint32_t dev_capabilities2; /* all devices */ uint16_t dev_control2; uint16_t dev_status2; uint32_t link_capabilities2; /* devices with links */ uint16_t link_control2; uint16_t link_status2; uint32_t slot_capabilities2; /* ports with slots */ uint16_t slot_control2; uint16_t slot_status2; } __packed; static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed"); typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, int ioapic_irq, void *arg); int init_pci(struct vmctx *ctx); void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val); void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val); void pci_callback(void); int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size); int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old); void pci_generate_msi(struct pci_devinst *pi, int msgnum); void pci_generate_msix(struct pci_devinst *pi, int msgnum); void pci_lintr_assert(struct pci_devinst *pi); void pci_lintr_deassert(struct pci_devinst *pi); void pci_lintr_request(struct pci_devinst *pi); int pci_msi_enabled(struct pci_devinst *pi); int pci_msix_enabled(struct pci_devinst *pi); int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); int pci_msi_maxmsgnum(struct pci_devinst *pi); int pci_parse_slot(char *opt); void pci_print_supported_devices(); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value); uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); int pci_count_lintr(int bus); void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); void pci_write_dsdt(void); uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); +#ifdef BHYVE_SNAPSHOT +int pci_snapshot(struct vm_snapshot_meta *meta); +int pci_pause(struct vmctx *ctx, const char *dev_name); +int pci_resume(struct vmctx *ctx, const char *dev_name); +#endif static __inline void pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) { assert(offset <= PCI_REGMAX); *(uint8_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); *(uint16_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); *(uint32_t *)(pi->pi_cfgdata + offset) = val; } static __inline uint8_t pci_get_cfgdata8(struct pci_devinst *pi, int offset) { assert(offset <= PCI_REGMAX); return (*(uint8_t *)(pi->pi_cfgdata + offset)); } static __inline uint16_t pci_get_cfgdata16(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); return (*(uint16_t *)(pi->pi_cfgdata + offset)); } static __inline uint32_t pci_get_cfgdata32(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); return (*(uint32_t *)(pi->pi_cfgdata + offset)); } #endif /* _PCI_EMUL_H_ */ diff --git 
a/usr.sbin/bhyve/pci_fbuf.c b/usr.sbin/bhyve/pci_fbuf.c index 8961875356da..0bd740a0908c 100644 --- a/usr.sbin/bhyve/pci_fbuf.c +++ b/usr.sbin/bhyve/pci_fbuf.c @@ -1,449 +1,466 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Nahanni Systems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include #include #include "bhyvegc.h" #include "bhyverun.h" #include "debug.h" #include "console.h" #include "inout.h" #include "pci_emul.h" #include "rfb.h" #include "vga.h" /* * bhyve Framebuffer device emulation. * BAR0 points to the current mode information. * BAR1 is the 32-bit framebuffer address. 
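 * BAR0 is a small (DMEMSZ byte) register window; BAR1 maps the 16 MB
 * framebuffer itself (FB_SIZE).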
* * -s ,fbuf,wait,vga=on|io|off,rfb=:port,w=width,h=height */ static int fbuf_debug = 1; #define DEBUG_INFO 1 #define DEBUG_VERBOSE 4 #define DPRINTF(level, params) if (level <= fbuf_debug) PRINTLN params #define KB (1024UL) #define MB (1024 * 1024UL) #define DMEMSZ 128 #define FB_SIZE (16*MB) #define COLS_MAX 1920 #define ROWS_MAX 1200 #define COLS_DEFAULT 1024 #define ROWS_DEFAULT 768 #define COLS_MIN 640 #define ROWS_MIN 480 struct pci_fbuf_softc { struct pci_devinst *fsc_pi; struct { uint32_t fbsize; uint16_t width; uint16_t height; uint16_t depth; uint16_t refreshrate; uint8_t reserved[116]; } __packed memregs; /* rfb server */ char *rfb_host; char *rfb_password; int rfb_port; int rfb_wait; int vga_enabled; int vga_full; uint32_t fbaddr; char *fb_base; uint16_t gc_width; uint16_t gc_height; void *vgasc; struct bhyvegc_image *gc_image; }; static struct pci_fbuf_softc *fbuf_sc; #define PCI_FBUF_MSI_MSGS 4 static void pci_fbuf_usage(char *opt) { EPRINTLN("Invalid fbuf emulation option \"%s\"", opt); EPRINTLN("fbuf: {wait,}{vga=on|io|off,}rfb=:port" "{,w=width}{,h=height}"); } static void pci_fbuf_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_fbuf_softc *sc; uint8_t *p; assert(baridx == 0); sc = pi->pi_arg; DPRINTF(DEBUG_VERBOSE, ("fbuf wr: offset 0x%lx, size: %d, value: 0x%lx", offset, size, value)); if (offset + size > DMEMSZ) { printf("fbuf: write too large, offset %ld size %d\n", offset, size); return; } p = (uint8_t *)&sc->memregs + offset; switch (size) { case 1: *p = value; break; case 2: *(uint16_t *)p = value; break; case 4: *(uint32_t *)p = value; break; case 8: *(uint64_t *)p = value; break; default: printf("fbuf: write unknown size %d\n", size); break; } if (!sc->gc_image->vgamode && sc->memregs.width == 0 && sc->memregs.height == 0) { DPRINTF(DEBUG_INFO, ("switching to VGA mode")); sc->gc_image->vgamode = 1; sc->gc_width = 0; sc->gc_height = 0; } else if (sc->gc_image->vgamode && sc->memregs.width != 0 && sc->memregs.height != 0) { DPRINTF(DEBUG_INFO, ("switching to VESA mode")); sc->gc_image->vgamode = 0; } } uint64_t pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_fbuf_softc *sc; uint8_t *p; uint64_t value; assert(baridx == 0); sc = pi->pi_arg; if (offset + size > DMEMSZ) { printf("fbuf: read too large, offset %ld size %d\n", offset, size); return (0); } p = (uint8_t *)&sc->memregs + offset; value = 0; switch (size) { case 1: value = *p; break; case 2: value = *(uint16_t *)p; break; case 4: value = *(uint32_t *)p; break; case 8: value = *(uint64_t *)p; break; default: printf("fbuf: read unknown size %d\n", size); break; } DPRINTF(DEBUG_VERBOSE, ("fbuf rd: offset 0x%lx, size: %d, value: 0x%lx", offset, size, value)); return (value); } static int pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) { char *uopts, *uoptsbak, *xopts, *config; char *tmpstr; int ret; ret = 0; uoptsbak = uopts = strdup(opts); while ((xopts = strsep(&uopts, ",")) != NULL) { if (strcmp(xopts, "wait") == 0) { sc->rfb_wait = 1; continue; } if ((config = strchr(xopts, '=')) == NULL) { pci_fbuf_usage(xopts); ret = -1; goto done; } *config++ = '\0'; DPRINTF(DEBUG_VERBOSE, ("pci_fbuf option %s = %s", xopts, config)); if (!strcmp(xopts, "tcp") || !strcmp(xopts, "rfb")) { /* * IPv4 -- host-ip:port * IPv6 -- [host-ip%zone]:port * XXX for now port is mandatory. 
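 * For example, option strings such as rfb=0.0.0.0:5900 or
 * rfb=[fe80::1%igb0]:5900 are accepted (addresses here are illustrative).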
*/ tmpstr = strsep(&config, "]"); if (config) { if (tmpstr[0] == '[') tmpstr++; sc->rfb_host = strdup(tmpstr); if (config[0] == ':') config++; else { pci_fbuf_usage(xopts); ret = -1; goto done; } sc->rfb_port = atoi(config); } else { config = tmpstr; tmpstr = strsep(&config, ":"); if (!config) sc->rfb_port = atoi(tmpstr); else { sc->rfb_port = atoi(config); sc->rfb_host = strdup(tmpstr); } } } else if (!strcmp(xopts, "vga")) { if (!strcmp(config, "off")) { sc->vga_enabled = 0; } else if (!strcmp(config, "io")) { sc->vga_enabled = 1; sc->vga_full = 0; } else if (!strcmp(config, "on")) { sc->vga_enabled = 1; sc->vga_full = 1; } else { pci_fbuf_usage(xopts); ret = -1; goto done; } } else if (!strcmp(xopts, "w")) { sc->memregs.width = atoi(config); if (sc->memregs.width > COLS_MAX) { pci_fbuf_usage(xopts); ret = -1; goto done; } else if (sc->memregs.width == 0) sc->memregs.width = 1920; } else if (!strcmp(xopts, "h")) { sc->memregs.height = atoi(config); if (sc->memregs.height > ROWS_MAX) { pci_fbuf_usage(xopts); ret = -1; goto done; } else if (sc->memregs.height == 0) sc->memregs.height = 1080; } else if (!strcmp(xopts, "password")) { sc->rfb_password = strdup(config); } else { pci_fbuf_usage(xopts); ret = -1; goto done; } } done: free(uoptsbak); return (ret); } extern void vga_render(struct bhyvegc *gc, void *arg); void pci_fbuf_render(struct bhyvegc *gc, void *arg) { struct pci_fbuf_softc *sc; sc = arg; if (sc->vga_full && sc->gc_image->vgamode) { /* TODO: mode switching to vga and vesa should use the special * EFI-bhyve protocol port. */ vga_render(gc, sc->vgasc); return; } if (sc->gc_width != sc->memregs.width || sc->gc_height != sc->memregs.height) { bhyvegc_resize(gc, sc->memregs.width, sc->memregs.height); sc->gc_width = sc->memregs.width; sc->gc_height = sc->memregs.height; } return; } static int pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { int error, prot; struct pci_fbuf_softc *sc; if (fbuf_sc != NULL) { EPRINTLN("Only one frame buffer device is allowed."); return (-1); } sc = calloc(1, sizeof(struct pci_fbuf_softc)); pi->pi_arg = sc; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE); assert(error == 0); error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS); assert(error == 0); sc->fbaddr = pi->pi_bar[1].addr; sc->memregs.fbsize = FB_SIZE; sc->memregs.width = COLS_DEFAULT; sc->memregs.height = ROWS_DEFAULT; sc->memregs.depth = 32; sc->vga_enabled = 1; sc->vga_full = 0; sc->fsc_pi = pi; error = pci_fbuf_parse_opts(sc, opts); if (error != 0) goto done; /* XXX until VGA rendering is enabled */ if (sc->vga_full != 0) { EPRINTLN("pci_fbuf: VGA rendering not enabled"); goto done; } sc->fb_base = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE); if (sc->fb_base == MAP_FAILED) { error = -1; goto done; } DPRINTF(DEBUG_INFO, ("fbuf frame buffer base: %p [sz %lu]", sc->fb_base, FB_SIZE)); /* * Map the framebuffer into the guest address space. * XXX This may fail if the BAR is different than a prior * run. In this case flag the error. This will be fixed * when a change_memseg api is available. 
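 * Until then the user is advised to destroy the VM and start again
 * (see the error message below).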
*/ prot = PROT_READ | PROT_WRITE; if (vm_mmap_memseg(ctx, sc->fbaddr, VM_FRAMEBUFFER, 0, FB_SIZE, prot) != 0) { EPRINTLN("pci_fbuf: mapseg failed - try deleting VM and restarting"); error = -1; goto done; } console_init(sc->memregs.width, sc->memregs.height, sc->fb_base); console_fb_register(pci_fbuf_render, sc); if (sc->vga_enabled) sc->vgasc = vga_init(!sc->vga_full); sc->gc_image = console_get_image(); fbuf_sc = sc; memset((void *)sc->fb_base, 0, FB_SIZE); error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, sc->rfb_password); done: if (error) free(sc); return (error); } +#ifdef BHYVE_SNAPSHOT +static int +pci_fbuf_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_BUF_OR_LEAVE(fbuf_sc->fb_base, FB_SIZE, meta, ret, err); + +err: + return (ret); +} +#endif + struct pci_devemu pci_fbuf = { .pe_emu = "fbuf", .pe_init = pci_fbuf_init, .pe_barwrite = pci_fbuf_write, - .pe_barread = pci_fbuf_read + .pe_barread = pci_fbuf_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_fbuf_snapshot, +#endif }; PCI_EMUL_SET(pci_fbuf); diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c index 1e4b513ec494..4ebdd7039cbc 100644 --- a/usr.sbin/bhyve/pci_lpc.c +++ b/usr.sbin/bhyve/pci_lpc.c @@ -1,463 +1,487 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Neel Natu * Copyright (c) 2013 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include #include "acpi.h" #include "debug.h" #include "bootrom.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "uart_emul.h" #define IO_ICU1 0x20 #define IO_ICU2 0xA0 SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt); SET_DECLARE(lpc_sysres_set, struct lpc_sysres); #define ELCR_PORT 0x4d0 SYSRES_IO(ELCR_PORT, 2); #define IO_TIMER1_PORT 0x40 #define NMISC_PORT 0x61 SYSRES_IO(NMISC_PORT, 1); static struct pci_devinst *lpc_bridge; static const char *romfile; #define LPC_UART_NUM 2 static struct lpc_uart_softc { struct uart_softc *uart_softc; const char *opts; int iobase; int irq; int enabled; } lpc_uart_softc[LPC_UART_NUM]; static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" }; /* * LPC device configuration is in the following form: * [,] * For e.g. 
"com1,stdio" or "bootrom,/var/romfile" */ int lpc_device_parse(const char *opts) { int unit, error; char *str, *cpy, *lpcdev; error = -1; str = cpy = strdup(opts); lpcdev = strsep(&str, ","); if (lpcdev != NULL) { if (strcasecmp(lpcdev, "bootrom") == 0) { romfile = str; error = 0; goto done; } for (unit = 0; unit < LPC_UART_NUM; unit++) { if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { lpc_uart_softc[unit].opts = str; error = 0; goto done; } } } done: if (error) free(cpy); return (error); } void lpc_print_supported_devices() { size_t i; printf("bootrom\n"); for (i = 0; i < LPC_UART_NUM; i++) printf("%s\n", lpc_uart_names[i]); } const char * lpc_bootrom(void) { return (romfile); } static void lpc_uart_intr_assert(void *arg) { struct lpc_uart_softc *sc = arg; assert(sc->irq >= 0); vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq); } static void lpc_uart_intr_deassert(void *arg) { /* * The COM devices on the LPC bus generate edge triggered interrupts, * so nothing more to do here. */ } static int lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { int offset; struct lpc_uart_softc *sc = arg; offset = port - sc->iobase; switch (bytes) { case 1: if (in) *eax = uart_read(sc->uart_softc, offset); else uart_write(sc->uart_softc, offset, *eax); break; case 2: if (in) { *eax = uart_read(sc->uart_softc, offset); *eax |= uart_read(sc->uart_softc, offset + 1) << 8; } else { uart_write(sc->uart_softc, offset, *eax); uart_write(sc->uart_softc, offset + 1, *eax >> 8); } break; default: return (-1); } return (0); } static int lpc_init(struct vmctx *ctx) { struct lpc_uart_softc *sc; struct inout_port iop; const char *name; int unit, error; if (romfile != NULL) { error = bootrom_loadrom(ctx, romfile); if (error) return (error); } /* COM1 and COM2 */ for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; name = lpc_uart_names[unit]; if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) { EPRINTLN("Unable to allocate resources for " "LPC device %s", name); return (-1); } pci_irq_reserve(sc->irq); sc->uart_softc = uart_init(lpc_uart_intr_assert, lpc_uart_intr_deassert, sc); if (uart_set_backend(sc->uart_softc, sc->opts) != 0) { EPRINTLN("Unable to initialize backend '%s' " "for LPC device %s", sc->opts, name); return (-1); } bzero(&iop, sizeof(struct inout_port)); iop.name = name; iop.port = sc->iobase; iop.size = UART_IO_BAR_SIZE; iop.flags = IOPORT_F_INOUT; iop.handler = lpc_uart_io_handler; iop.arg = sc; error = register_inout(&iop); assert(error == 0); sc->enabled = 1; } return (0); } static void pci_lpc_write_dsdt(struct pci_devinst *pi) { struct lpc_dsdt **ldpp, *ldp; dsdt_line(""); dsdt_line("Device (ISA)"); dsdt_line("{"); dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); dsdt_line(" {"); dsdt_line(" Offset (0x60),"); dsdt_line(" PIRA, 8,"); dsdt_line(" PIRB, 8,"); dsdt_line(" PIRC, 8,"); dsdt_line(" PIRD, 8,"); dsdt_line(" Offset (0x68),"); dsdt_line(" PIRE, 8,"); dsdt_line(" PIRF, 8,"); dsdt_line(" PIRG, 8,"); dsdt_line(" PIRH, 8"); dsdt_line(" }"); dsdt_line(""); dsdt_indent(1); SET_FOREACH(ldpp, lpc_dsdt_set) { ldp = *ldpp; ldp->handler(); } dsdt_line(""); dsdt_line("Device (PIC)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_ICU1, 2); dsdt_fixed_ioport(IO_ICU2, 
2); dsdt_fixed_irq(2); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_line(""); dsdt_line("Device (TIMR)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_TIMER1_PORT, 4); dsdt_fixed_irq(0); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_unindent(1); dsdt_line("}"); } static void pci_lpc_sysres_dsdt(void) { struct lpc_sysres **lspp, *lsp; dsdt_line(""); dsdt_line("Device (SIO)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); SET_FOREACH(lspp, lpc_sysres_set) { lsp = *lspp; switch (lsp->type) { case LPC_SYSRES_IO: dsdt_fixed_ioport(lsp->base, lsp->length); break; case LPC_SYSRES_MEM: dsdt_fixed_mem32(lsp->base, lsp->length); break; } } dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } LPC_DSDT(pci_lpc_sysres_dsdt); static void pci_lpc_uart_dsdt(void) { struct lpc_uart_softc *sc; int unit; for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; if (!sc->enabled) continue; dsdt_line(""); dsdt_line("Device (%s)", lpc_uart_names[unit]); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))"); dsdt_line(" Name (_UID, %d)", unit + 1); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE); dsdt_fixed_irq(sc->irq); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } } LPC_DSDT(pci_lpc_uart_dsdt); static int pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int pirq_pin; if (bytes == 1) { pirq_pin = 0; if (coff >= 0x60 && coff <= 0x63) pirq_pin = coff - 0x60 + 1; if (coff >= 0x68 && coff <= 0x6b) pirq_pin = coff - 0x68 + 5; if (pirq_pin != 0) { pirq_write(ctx, pirq_pin, val); pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); return (0); } } return (-1); } static void pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { } static uint64_t pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { return (0); } #define LPC_DEV 0x7000 #define LPC_VENDOR 0x8086 static int pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { /* * Do not allow more than one LPC bridge to be configured. */ if (lpc_bridge != NULL) { EPRINTLN("Only one LPC bridge is allowed."); return (-1); } /* * Enforce that the LPC can only be configured on bus 0. This * simplifies the ACPI DSDT because it can provide a decode for * all legacy i/o ports behind bus 0. 
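 * (pci_bus_write_dsdt() emits the legacy I/O port decode only for bus 0.)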
*/ if (pi->pi_bus != 0) { EPRINTLN("LPC bridge can be present only on bus 0."); return (-1); } if (lpc_init(ctx) != 0) return (-1); /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV); pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); lpc_bridge = pi; return (0); } char * lpc_pirq_name(int pin) { char *name; if (lpc_bridge == NULL) return (NULL); asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1); return (name); } void lpc_pirq_routed(void) { int pin; if (lpc_bridge == NULL) return; for (pin = 0; pin < 4; pin++) pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1)); for (pin = 0; pin < 4; pin++) pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); } +#ifdef BHYVE_SNAPSHOT +static int +pci_lpc_snapshot(struct vm_snapshot_meta *meta) +{ + int unit, ret; + struct uart_softc *sc; + + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = lpc_uart_softc[unit].uart_softc; + + ret = uart_snapshot(sc, meta); + if (ret != 0) + goto done; + } + +done: + return (ret); +} +#endif + struct pci_devemu pci_de_lpc = { .pe_emu = "lpc", .pe_init = pci_lpc_init, .pe_write_dsdt = pci_lpc_write_dsdt, .pe_cfgwrite = pci_lpc_cfgwrite, .pe_barwrite = pci_lpc_write, - .pe_barread = pci_lpc_read + .pe_barread = pci_lpc_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_lpc_snapshot, +#endif }; PCI_EMUL_SET(pci_de_lpc); diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c index 04ff7718c333..4fd8943efffa 100644 --- a/usr.sbin/bhyve/pci_virtio_block.c +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -1,528 +1,577 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include + #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "block_if.h" #define VTBLK_BSIZE 512 #define VTBLK_RINGSZ 128 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ #define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ #define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ #define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ #define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ #define VTBLK_F_RO (1 << 5) /* Disk is read-only */ #define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ #define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ #define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ #define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ #define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ #define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ #define VTBLK_F_MQ (1 << 12) /* Multi-Queue */ #define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ #define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ /* * Host capabilities */ #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* * The current blockif_delete() interface only allows a single delete * request at a time. */ #define VTBLK_MAX_DISCARD_SEG 1 /* * An arbitrary limit to prevent excessive latency due to large * delete requests. 
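 * With 512-byte sectors this works out to 32768 sectors per request.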
*/ #define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ /* * Config space "registers" */ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; struct { uint16_t cylinders; uint8_t heads; uint8_t sectors; } vbc_geometry; uint32_t vbc_blk_size; struct { uint8_t physical_block_exp; uint8_t alignment_offset; uint16_t min_io_size; uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; uint8_t unused0[1]; uint16_t num_queues; uint32_t max_discard_sectors; uint32_t max_discard_seg; uint32_t discard_sector_alignment; uint32_t max_write_zeroes_sectors; uint32_t max_write_zeroes_seg; uint8_t write_zeroes_may_unmap; uint8_t unused1[3]; } __packed; /* * Fixed-size block header */ struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 #define VBH_OP_SCSI_CMD 2 #define VBH_OP_SCSI_CMD_OUT 3 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 #define VBH_OP_DISCARD 11 #define VBH_OP_WRITE_ZEROES 13 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; /* * Debug printf */ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtblk_ioreq { struct blockif_req io_req; struct pci_vtblk_softc *io_sc; uint8_t *io_status; uint16_t io_idx; }; struct virtio_blk_discard_write_zeroes { uint64_t sector; uint32_t num_sectors; struct { uint32_t unmap:1; uint32_t reserved:31; } flags; }; /* * Per-device softc */ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; struct virtio_consts vbsc_consts; struct blockif_ctxt *bc; char vbsc_ident[VTBLK_BLK_ID_BYTES]; struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); +#ifdef BHYVE_SNAPSHOT +static void pci_vtblk_pause(void *); +static void pci_vtblk_resume(void *); +static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *); +#endif static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ 1, /* we support 1 virtqueue */ sizeof(struct vtblk_config), /* config reg size */ pci_vtblk_reset, /* reset */ pci_vtblk_notify, /* device-wide qnotify */ pci_vtblk_cfgread, /* read PCI config */ pci_vtblk_cfgwrite, /* write PCI config */ NULL, /* apply negotiated features */ VTBLK_S_HOSTCAPS, /* our capabilities */ +#ifdef BHYVE_SNAPSHOT + pci_vtblk_pause, /* pause blockif threads */ + pci_vtblk_resume, /* resume blockif threads */ + pci_vtblk_snapshot, /* save / restore device state */ +#endif }; static void pci_vtblk_reset(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device reset requested !")); vi_reset_dev(&sc->vbsc_vs); } static void pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) { struct pci_vtblk_softc *sc = io->io_sc; /* convert errno into a virtio block error return */ if (err == EOPNOTSUPP || err == ENOSYS) *io->io_status = VTBLK_S_UNSUPP; else if (err != 0) *io->io_status = VTBLK_S_IOERR; else *io->io_status = VTBLK_S_OK; /* * Return the descriptor back to the host. * We wrote 1 byte (our status) to host. 
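 * (hence the length of 1 passed to vq_relchain() below).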
*/ vq_relchain(&sc->vbsc_vq, io->io_idx, 1); vq_endchains(&sc->vbsc_vq, 0); } +#ifdef BHYVE_SNAPSHOT +static void +pci_vtblk_pause(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device pause requested !\n")); + blockif_pause(sc->bc); +} + +static void +pci_vtblk_resume(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device resume requested !\n")); + blockif_resume(sc->bc); +} + +static int +pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_vtblk_softc *sc = vsc; + + SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident), + meta, ret, done); + +done: + return (ret); +} +#endif + static void pci_vtblk_done(struct blockif_req *br, int err) { struct pci_vtblk_ioreq *io = br->br_param; struct pci_vtblk_softc *sc = io->io_sc; pthread_mutex_lock(&sc->vsc_mtx); pci_vtblk_done_locked(io, err); pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; struct pci_vtblk_ioreq *io; int i, n; int err; ssize_t iolen; int writeop, type; struct iovec iov[BLOCKIF_IOV_MAX + 2]; uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; struct virtio_blk_discard_write_zeroes *discard; n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); /* * The first descriptor will be the read-only fixed header, * and the last is for status (hence +2 above and below). * The remaining iov's are the actual data I/O vectors. * * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); io = &sc->vbsc_ios[idx]; assert((flags[0] & VRING_DESC_F_WRITE) == 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); vbh = (struct virtio_blk_hdr *)iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; io->io_status = (uint8_t *)iov[--n].iov_base; assert(iov[n].iov_len == 1); assert(flags[n] & VRING_DESC_F_WRITE); /* * XXX * The guest should not be setting the BARRIER flag because * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); iolen = 0; for (i = 1; i < n; i++) { /* * - write op implies read-only descriptor, * - read/ident op implies write-only descriptor, * therefore test the inverse of the descriptor bit * to the op. */ assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); iolen += iov[i].iov_len; } io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld", writeop ? "write/discard" : "read/ident", iolen, i - 1, io->io_req.br_offset)); switch (type) { case VBH_OP_READ: err = blockif_read(sc->bc, &io->io_req); break; case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; case VBH_OP_DISCARD: /* * We currently only support a single request, if the guest * has submitted a request that doesn't conform to the * requirements, we return a error. */ if (iov[1].iov_len != sizeof (*discard)) { pci_vtblk_done_locked(io, EINVAL); return; } /* The segments to discard are provided rather than data */ discard = (struct virtio_blk_discard_write_zeroes *) iov[1].iov_base; /* * virtio v1.1 5.2.6.2: * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP * for discard and write zeroes commands if any unknown flag is * set. 
Furthermore, the device MUST set the status byte to * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag * is set. * * Currently there are no known flags for a DISCARD request. */ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { pci_vtblk_done_locked(io, ENOTSUP); return; } /* Make sure the request doesn't exceed our size limit */ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { pci_vtblk_done_locked(io, EINVAL); return; } io->io_req.br_offset = discard->sector * VTBLK_BSIZE; io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; err = blockif_delete(sc->bc, &io->io_req); break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ /* S/n equal to buffer is not zero-terminated. */ memset(iov[1].iov_base, 0, iov[1].iov_len); strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); pci_vtblk_done_locked(io, 0); return; default: pci_vtblk_done_locked(io, EOPNOTSUPP); return; } assert(err == 0); } static void pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); } static int pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { char bident[sizeof("XX:X:X")]; struct blockif_ctxt *bctxt; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; off_t size; int i, sectsz, sts, sto; if (opts == NULL) { WPRINTF(("virtio-block: backing device required")); return (1); } /* * The supplied backing file has to exist */ snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); bctxt = blockif_open(opts, bident); if (bctxt == NULL) { perror("Could not open backing file"); return (1); } size = blockif_size(bctxt); sectsz = blockif_sectsz(bctxt); blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); sc->bc = bctxt; for (i = 0; i < VTBLK_RINGSZ; i++) { struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; io->io_req.br_callback = pci_vtblk_done; io->io_req.br_param = io; io->io_sc = sc; io->io_idx = i; } bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); if (blockif_candelete(sc->bc)) sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ /* * Create an identifier for the backing file. Use parts of the * md5 sum of the filename */ MD5Init(&mdctx); MD5Update(&mdctx, opts, strlen(opts)); MD5Final(digest, &mdctx); snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ /* * If Linux is presented with a seg_max greater than the virtio queue * size, it can stumble into situations where it violates its own * invariants and panics. For safety, we keep seg_max clamped, paying * heed to the two extra descriptors needed for the header and status * of a request. 
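* With VTBLK_RINGSZ at 128, the MIN() below advertises at most 126 data segments, and never more than the blockif backend accepts in one request (BLOCKIF_IOV_MAX).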
*/ sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ sc->vbsc_cfg.vbc_geometry.heads = 0; sc->vbsc_cfg.vbc_geometry.sectors = 0; sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_topology.physical_block_exp = (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; sc->vbsc_cfg.vbc_topology.alignment_offset = (sto != 0) ? ((sts - sto) / sectsz) : 0; sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could * have the device, class, and subdev_0 as fields in * the virtio constants structure. */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { blockif_close(sc->bc); free(sc); return (1); } vi_set_io_bar(&sc->vbsc_vs, 0); return (0); } static int pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) { DPRINTF(("vtblk: write to readonly reg %d", offset)); return (1); } static int pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtblk_softc *sc = vsc; void *ptr; /* our caller has already verified offset and size */ ptr = (uint8_t *)&sc->vbsc_cfg + offset; memcpy(retval, ptr, size); return (0); } struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, .pe_barwrite = vi_pci_write, - .pe_barread = vi_pci_read + .pe_barread = vi_pci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = vi_pci_snapshot, +#endif }; PCI_EMUL_SET(pci_de_vblk); diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c index adc273128585..a0fcd9055e65 100644 --- a/usr.sbin/bhyve/pci_virtio_net.c +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -1,749 +1,830 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #include /* IFNAMSIZ */ #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "mevent.h" #include "virtio.h" #include "net_utils.h" #include "net_backends.h" #include "iov.h" #define VTNET_RINGSZ 1024 #define VTNET_MAXSEGS 256 #define VTNET_MAX_PKT_LEN (65536 + 64) #define VTNET_MIN_MTU ETHERMIN #define VTNET_MAX_MTU 65535 #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \ VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) /* * PCI config-space "registers" */ struct virtio_net_config { uint8_t mac[6]; uint16_t status; uint16_t max_virtqueue_pairs; uint16_t mtu; } __packed; /* * Queue definitions. */ #define VTNET_RXQ 0 #define VTNET_TXQ 1 #define VTNET_CTLQ 2 /* NB: not yet supported */ #define VTNET_MAXQ 3 /* * Debug printf */ static int pci_vtnet_debug; #define DPRINTF(params) if (pci_vtnet_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* * Per-device softc */ struct pci_vtnet_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; pthread_mutex_t vsc_mtx; net_backend_t *vsc_be; int resetting; /* protected by tx_mtx */ uint64_t vsc_features; /* negotiated features */ pthread_mutex_t rx_mtx; int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; size_t vhdrlen; size_t be_vhdrlen; struct virtio_net_config vsc_config; struct virtio_consts vsc_consts; }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); static void pci_vtnet_neg_features(void *, uint64_t); +#ifdef BHYVE_SNAPSHOT +static void pci_vtnet_pause(void *); +static void pci_vtnet_resume(void *); +static int pci_vtnet_snapshot(void *, struct vm_snapshot_meta *); +#endif static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ sizeof(struct virtio_net_config), /* config reg size */ pci_vtnet_reset, /* reset */ NULL, /* device-wide qnotify -- not used */ pci_vtnet_cfgread, /* read PCI config */ pci_vtnet_cfgwrite, /* write PCI config */ pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ +#ifdef BHYVE_SNAPSHOT + pci_vtnet_pause, /* pause rx/tx threads */ + pci_vtnet_resume, /* resume rx/tx threads */ + pci_vtnet_snapshot, /* save / restore device state */ +#endif }; static void pci_vtnet_reset(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device reset requested !")); /* Acquire the RX lock to block RX processing. */ pthread_mutex_lock(&sc->rx_mtx); /* * Make sure receive operation is disabled at least until we * re-negotiate the features, since receive operation depends * on the value of sc->rx_merge and the header length, which * are both set in pci_vtnet_neg_features(). * Receive operation will be enabled again once the guest adds * the first receive buffers and kicks us. */ netbe_rx_disable(sc->vsc_be); /* Set sc->resetting and give a chance to the TX thread to stop. 
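* The polling loop below drops tx_mtx on every iteration so the TX thread can reacquire it, finish the chain it is working on and clear tx_in_progress.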
*/ pthread_mutex_lock(&sc->tx_mtx); sc->resetting = 1; while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } /* * Now reset rings, MSI-X vectors, and negotiated capabilities. * Do that with the TX lock held, since we need to reset * sc->resetting. */ vi_reset_dev(&sc->vsc_vs); sc->resetting = 0; pthread_mutex_unlock(&sc->tx_mtx); pthread_mutex_unlock(&sc->rx_mtx); } static __inline struct iovec * iov_trim_hdr(struct iovec *iov, int *iovcnt, unsigned int hlen) { struct iovec *riov; if (iov[0].iov_len < hlen) { /* * Not enough header space in the first fragment. * That's not ok for us. */ return NULL; } iov[0].iov_len -= hlen; if (iov[0].iov_len == 0) { *iovcnt -= 1; if (*iovcnt == 0) { /* * Only space for the header. That's not * enough for us. */ return NULL; } riov = &iov[1]; } else { iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + hlen); riov = &iov[0]; } return (riov); } struct virtio_mrg_rxbuf_info { uint16_t idx; uint16_t pad; uint32_t len; }; static void pci_vtnet_rx(struct pci_vtnet_softc *sc) { int prepend_hdr_len = sc->vhdrlen - sc->be_vhdrlen; struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS]; struct iovec iov[VTNET_MAXSEGS + 1]; struct vqueue_info *vq; vq = &sc->vsc_queues[VTNET_RXQ]; for (;;) { struct virtio_net_rxhdr *hdr; uint32_t riov_bytes; struct iovec *riov; uint32_t ulen; int riov_len; int n_chains; ssize_t rlen; ssize_t plen; plen = netbe_peek_recvlen(sc->vsc_be); if (plen <= 0) { /* * No more packets (plen == 0), or backend errored * (plen < 0). Interrupt if needed and stop. */ vq_endchains(vq, /*used_all_avail=*/0); return; } plen += prepend_hdr_len; /* * Get a descriptor chain to store the next ingress * packet. In case of mergeable rx buffers, get as * many chains as necessary in order to make room * for plen bytes. */ riov_bytes = 0; riov_len = 0; riov = iov; n_chains = 0; do { int n = vq_getchain(vq, &info[n_chains].idx, riov, VTNET_MAXSEGS - riov_len, NULL); if (n == 0) { /* * No rx buffers. Enable RX kicks and double * check. */ vq_kick_enable(vq); if (!vq_has_descs(vq)) { /* * Still no buffers. Return the unused * chains (if any), interrupt if needed * (including for NOTIFY_ON_EMPTY), and * disable the backend until the next * kick. */ vq_retchains(vq, n_chains); vq_endchains(vq, /*used_all_avail=*/1); netbe_rx_disable(sc->vsc_be); return; } /* More rx buffers found, so keep going. */ vq_kick_disable(vq); continue; } assert(n >= 1 && riov_len + n <= VTNET_MAXSEGS); riov_len += n; if (!sc->rx_merge) { n_chains = 1; break; } info[n_chains].len = (uint32_t)count_iov(riov, n); riov_bytes += info[n_chains].len; riov += n; n_chains++; } while (riov_bytes < plen && riov_len < VTNET_MAXSEGS); riov = iov; hdr = riov[0].iov_base; if (prepend_hdr_len > 0) { /* * The frontend uses a virtio-net header, but the * backend does not. We need to prepend a zeroed * header. */ riov = iov_trim_hdr(riov, &riov_len, prepend_hdr_len); if (riov == NULL) { /* * The first collected chain is nonsensical, * as it is not even enough to store the * virtio-net header. Just drop it. */ vq_relchain(vq, info[0].idx, 0); vq_retchains(vq, n_chains - 1); continue; } memset(hdr, 0, prepend_hdr_len); } rlen = netbe_recv(sc->vsc_be, riov, riov_len); if (rlen != plen - prepend_hdr_len) { /* * If this happens it means there is something * wrong with the backend (e.g., some other * process is stealing our packets). 
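* The chains gathered for this packet are handed back to the queue via vq_retchains() below instead of being published, and the receive loop simply moves on.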
*/ WPRINTF(("netbe_recv: expected %zd bytes, " "got %zd", plen - prepend_hdr_len, rlen)); vq_retchains(vq, n_chains); continue; } ulen = (uint32_t)plen; /* * Publish the used buffers to the guest, reporting the * number of bytes that we wrote. */ if (!sc->rx_merge) { vq_relchain(vq, info[0].idx, ulen); } else { uint32_t iolen; int i = 0; do { iolen = info[i].len; if (iolen > ulen) { iolen = ulen; } vq_relchain_prepare(vq, info[i].idx, iolen); ulen -= iolen; i++; } while (ulen > 0); hdr->vrh_bufs = i; vq_relchain_publish(vq); assert(i == n_chains); } } } /* * Called when there is read activity on the backend file descriptor. * Each buffer posted by the guest is assumed to be able to contain * an entire ethernet frame + rx header. */ static void pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); pci_vtnet_rx(sc); pthread_mutex_unlock(&sc->rx_mtx); } /* Called on RX kick. */ static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * A qnotify means that the rx process can now begin. */ pthread_mutex_lock(&sc->rx_mtx); vq_kick_disable(vq); netbe_rx_enable(sc->vsc_be); pthread_mutex_unlock(&sc->rx_mtx); } /* TX virtqueue processing, called by the TX thread. */ static void pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) { struct iovec iov[VTNET_MAXSEGS + 1]; struct iovec *siov = iov; uint16_t idx; ssize_t len; int n; /* * Obtain chain of descriptors. The first descriptor also * contains the virtio-net header. */ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); if (sc->vhdrlen != sc->be_vhdrlen) { /* * The frontend uses a virtio-net header, but the backend * does not. We simply strip the header and ignore it, as * it should be zero-filled. */ siov = iov_trim_hdr(siov, &n, sc->vhdrlen); } if (siov == NULL) { /* The chain is nonsensical. Just drop it. */ len = 0; } else { len = netbe_send(sc->vsc_be, siov, n); if (len < 0) { /* * If send failed, report that 0 bytes * were read. */ len = 0; } } /* * Return the processed chain to the guest, reporting * the number of bytes that we read. */ vq_relchain(vq, idx, len); } /* Called on TX kick. */ static void pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * Any ring entries to process? 
*/ if (!vq_has_descs(vq)) return; /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); vq_kick_disable(vq); if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); } /* * Thread which will handle processing of TX desc */ static void * pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; int error; vq = &sc->vsc_queues[VTNET_TXQ]; /* * Let us wait till the tx queue pointers get initialised & * first tx signaled */ pthread_mutex_lock(&sc->tx_mtx); error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); for (;;) { /* note - tx mutex is locked here */ while (sc->resetting || !vq_has_descs(vq)) { vq_kick_enable(vq); if (!sc->resetting && vq_has_descs(vq)) break; sc->tx_in_progress = 0; error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); } vq_kick_disable(vq); sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); do { /* * Run through entries, placing them into * iovecs and sending when an end-of-packet * is found */ pci_vtnet_proctx(sc, vq); } while (vq_has_descs(vq)); /* * Generate an interrupt if needed. */ vq_endchains(vq, /*used_all_avail=*/1); pthread_mutex_lock(&sc->tx_mtx); } } #ifdef notyet static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!")); } #endif static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { struct pci_vtnet_softc *sc; char tname[MAXCOMLEN + 1]; int mac_provided; int mtu_provided; unsigned long mtu = ETHERMTU; /* * Allocate data structures for further virtio initializations. * sc also contains a copy of vtnet_vi_consts, since capabilities * change depending on the backend. */ sc = calloc(1, sizeof(struct pci_vtnet_softc)); sc->vsc_consts = vtnet_vi_consts; pthread_mutex_init(&sc->vsc_mtx, NULL); sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; #ifdef notyet sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif /* * Attempt to open the backend device and read the MAC address * if specified. */ mac_provided = 0; mtu_provided = 0; if (opts != NULL) { char *devname; char *vtopts; int err = 0; /* Get the device name. */ devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); /* * Parse the list of options in the form * key1=value1,...,keyN=valueN. */ while (vtopts != NULL) { char *value = vtopts; char *key; key = strsep(&value, "="); if (value == NULL) break; vtopts = value; (void) strsep(&vtopts, ","); if (strcmp(key, "mac") == 0) { err = net_parsemac(value, sc->vsc_config.mac); if (err) break; mac_provided = 1; } else if (strcmp(key, "mtu") == 0) { err = net_parsemtu(value, &mtu); if (err) break; if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) { err = EINVAL; errno = EINVAL; break; } mtu_provided = 1; } } if (err) { free(devname); free(sc); return (err); } err = netbe_init(&sc->vsc_be, devname, pci_vtnet_rx_callback, sc); free(devname); if (err) { free(sc); return (err); } sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF | netbe_get_cap(sc->vsc_be); } if (!mac_provided) { net_genmac(pi, sc->vsc_config.mac); } sc->vsc_config.mtu = mtu; if (mtu_provided) { sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU; } /* * Since we do not actually support multiqueue, * set the maximum virtqueue pairs to 1. 
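* The field is only meaningful to a driver when VIRTIO_NET_F_MQ is offered, which VTNET_S_HOSTCAPS does not include.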
*/ sc->vsc_config.max_virtqueue_pairs = 1; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); /* Link is up if we managed to open backend device. */ sc->vsc_config.status = (opts == NULL || sc->vsc_be); vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) { free(sc); return (1); } /* use BAR 0 to map config regs in IO space */ vi_set_io_bar(&sc->vsc_vs, 0); sc->resetting = 0; sc->rx_merge = 0; sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; pthread_mutex_init(&sc->rx_mtx, NULL); /* * Initialize tx semaphore & spawn TX processing thread. * As of now, only one thread for TX desc processing is * spawned. */ sc->tx_in_progress = 0; pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->tx_tid, tname); return (0); } static int pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) { struct pci_vtnet_softc *sc = vsc; void *ptr; if (offset < (int)sizeof(sc->vsc_config.mac)) { assert(offset + size <= (int)sizeof(sc->vsc_config.mac)); /* * The driver is allowed to change the MAC address */ ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d", offset)); } return (0); } static int pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtnet_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static void pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtnet_softc *sc = vsc; sc->vsc_features = negotiated_features; if (negotiated_features & VIRTIO_NET_F_MRG_RXBUF) { sc->vhdrlen = sizeof(struct virtio_net_rxhdr); sc->rx_merge = 1; } else { /* * Without mergeable rx buffers, virtio-net header is 2 * bytes shorter than sizeof(struct virtio_net_rxhdr). */ sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; sc->rx_merge = 0; } /* Tell the backend to enable some capabilities it has advertised. */ netbe_set_cap(sc->vsc_be, negotiated_features, sc->vhdrlen); sc->be_vhdrlen = netbe_get_vnet_hdr_len(sc->vsc_be); assert(sc->be_vhdrlen == 0 || sc->be_vhdrlen == sc->vhdrlen); } +#ifdef BHYVE_SNAPSHOT +static void +pci_vtnet_pause(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device pause requested !\n")); + + /* Acquire the RX lock to block RX processing. */ + pthread_mutex_lock(&sc->rx_mtx); + + /* Wait for the transmit thread to finish its processing. */ + pthread_mutex_lock(&sc->tx_mtx); + while (sc->tx_in_progress) { + pthread_mutex_unlock(&sc->tx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->tx_mtx); + } +} + +static void +pci_vtnet_resume(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device resume requested !\n")); + + pthread_mutex_unlock(&sc->tx_mtx); + /* The RX lock should have been acquired in vtnet_pause. 
*/ + pthread_mutex_unlock(&sc->rx_mtx); +} + +static int +pci_vtnet_snapshot(void *vsc, struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device snapshot requested !\n")); + + /* + * Queues and consts should have been saved by the more generic + * vi_pci_snapshot function. We need to save only our features and + * config. + */ + + SNAPSHOT_VAR_OR_LEAVE(sc->vsc_features, meta, ret, done); + + /* Force a reapply of the negotiated features at restore time */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + pci_vtnet_neg_features(sc, sc->vsc_features); + netbe_rx_enable(sc->vsc_be); + } + + SNAPSHOT_VAR_OR_LEAVE(sc->vsc_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rx_merge, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->vhdrlen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->be_vhdrlen, meta, ret, done); + +done: + return (ret); +} +#endif + static struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, .pe_barwrite = vi_pci_write, - .pe_barread = vi_pci_read + .pe_barread = vi_pci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = vi_pci_snapshot, + .pe_pause = vi_pci_pause, + .pe_resume = vi_pci_resume, +#endif }; PCI_EMUL_SET(pci_de_vnet); diff --git a/usr.sbin/bhyve/pci_xhci.c b/usr.sbin/bhyve/pci_xhci.c index 672f35c91ef8..0847d5bb38b5 100644 --- a/usr.sbin/bhyve/pci_xhci.c +++ b/usr.sbin/bhyve/pci_xhci.c @@ -1,2866 +1,3124 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* XHCI options: -s ,xhci,{devices} devices: tablet USB tablet mouse */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include +#include + #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "pci_xhci.h" #include "usb_emul.h" static int xhci_debug = 0; #define DPRINTF(params) if (xhci_debug) PRINTLN params #define WPRINTF(params) PRINTLN params #define XHCI_NAME "xhci" #define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */ #define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */ /* * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping * to 4k to avoid going over the guest physical memory barrier.
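* Mappings are therefore clamped to the end of the 4 KiB page containing the guest address; see the XHCI_GADDR_SIZE() helper introduced below.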
*/ #define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */ #define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */ #define XHCI_CAPLEN (4*8) /* offset of op register space */ #define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */ #define XHCI_PORTREGS_START 0x400 #define XHCI_DOORBELL_MAX 256 #define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */ /* caplength and hci-version registers */ #define XHCI_SET_CAPLEN(x) ((x) & 0xFF) #define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16) #define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF) /* hcsparams1 register */ #define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF) #define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8) #define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24) /* hcsparams2 register */ #define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F) #define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4) #define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21) #define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27) /* hcsparams3 register */ #define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF) #define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16) /* hccparams1 register */ #define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01) #define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5) #define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6) #define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7) #define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8) #define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9) #define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10) #define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11) #define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12) #define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16) /* hccparams2 register */ #define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01) #define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP2_CIC(x) (((x) & 0x01) << 5) /* other registers */ #define XHCI_SET_DOORBELL(x) ((x) & ~0x03) #define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F) /* register masks */ #define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */ #define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */ #define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */ /* port register set */ #define XHCI_PORTREGS_BASE 0x400 /* base offset */ #define XHCI_PORTREGS_PORT0 0x3F0 #define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */ #define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL) #define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL) #define FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & (m)) << (s))) #define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & ((m) << (s))))) +#define SNAP_DEV_NAME_LEN 128 + struct pci_xhci_trb_ring { uint64_t ringaddr; /* current dequeue guest address */ uint32_t ccs; /* consumer cycle state */ }; /* device endpoint transfer/stream rings */ struct pci_xhci_dev_ep { union { struct xhci_trb *_epu_tr; struct xhci_stream_ctx *_epu_sctx; } _ep_trbsctx; #define ep_tr _ep_trbsctx._epu_tr #define ep_sctx _ep_trbsctx._epu_sctx union { struct pci_xhci_trb_ring _epu_trb; struct pci_xhci_trb_ring *_epu_sctx_trbs; } _ep_trb_rings; #define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr #define ep_ccs _ep_trb_rings._epu_trb.ccs #define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs struct usb_data_xfer *ep_xfer; /* 
transfer chain */ }; /* device context base address array: maps slot->device context */ struct xhci_dcbaa { uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */ }; /* port status registers */ struct pci_xhci_portregs { uint32_t portsc; /* port status and control */ uint32_t portpmsc; /* port pwr mgmt status & control */ uint32_t portli; /* port link info */ uint32_t porthlpmc; /* port hardware LPM control */ } __packed; #define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10) /* xHC operational registers */ struct pci_xhci_opregs { uint32_t usbcmd; /* usb command */ uint32_t usbsts; /* usb status */ uint32_t pgsz; /* page size */ uint32_t dnctrl; /* device notification control */ uint64_t crcr; /* command ring control */ uint64_t dcbaap; /* device ctx base addr array ptr */ uint32_t config; /* configure */ /* guest mapped addresses: */ struct xhci_trb *cr_p; /* crcr dequeue */ struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */ }; /* xHC runtime registers */ struct pci_xhci_rtsregs { uint32_t mfindex; /* microframe index */ struct { /* interrupter register set */ uint32_t iman; /* interrupter management */ uint32_t imod; /* interrupter moderation */ uint32_t erstsz; /* event ring segment table size */ uint32_t rsvd; uint64_t erstba; /* event ring seg-tbl base addr */ uint64_t erdp; /* event ring dequeue ptr */ } intrreg __packed; /* guest mapped addresses */ struct xhci_event_ring_seg *erstba_p; struct xhci_trb *erst_p; /* event ring segment tbl */ int er_deq_seg; /* event ring dequeue segment */ int er_enq_idx; /* event ring enqueue index - xHCI */ int er_enq_seg; /* event ring enqueue segment */ uint32_t er_events_cnt; /* number of events in ER */ uint32_t event_pcs; /* producer cycle state flag */ }; struct pci_xhci_softc; /* * USB device emulation container. * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each * emulated device instance. 
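* The xsc back-pointer lets per-endpoint code reach the controller softc, e.g. for guest memory access through XHCI_GADDR().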
*/ struct pci_xhci_dev_emu { struct pci_xhci_softc *xsc; /* XHCI contexts */ struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS]; int dev_slotstate; struct usb_devemu *dev_ue; /* USB emulated dev */ void *dev_sc; /* device's softc */ struct usb_hci hci; }; struct pci_xhci_softc { struct pci_devinst *xsc_pi; pthread_mutex_t mtx; uint32_t caplength; /* caplen & hciversion */ uint32_t hcsparams1; /* structural parameters 1 */ uint32_t hcsparams2; /* structural parameters 2 */ uint32_t hcsparams3; /* structural parameters 3 */ uint32_t hccparams1; /* capability parameters 1 */ uint32_t dboff; /* doorbell offset */ uint32_t rtsoff; /* runtime register space offset */ uint32_t hccparams2; /* capability parameters 2 */ uint32_t regsend; /* end of configuration registers */ struct pci_xhci_opregs opregs; struct pci_xhci_rtsregs rtsregs; struct pci_xhci_portregs *portregs; struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */ struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */ int ndevices; int usb2_port_start; int usb3_port_start; }; /* portregs and devices arrays are set up to start from idx=1 */ #define XHCI_PORTREG_PTR(x,n) &(x)->portregs[(n)] #define XHCI_DEVINST_PTR(x,n) (x)->devices[(n)] #define XHCI_SLOTDEV_PTR(x,n) (x)->slots[(n)] #define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) +#define XHCI_GADDR_SIZE(a) (XHCI_PADDR_SZ - \ + (((uint64_t) (a)) & (XHCI_PADDR_SZ - 1))) #define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ - (a), \ - XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1))) + (a), XHCI_GADDR_SIZE(a)) static int xhci_in_use; /* map USB errors to XHCI */ static const int xhci_usb_errors[USB_ERR_MAX] = { [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS, [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID, [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED, [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL, [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB, [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT, [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL, [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF, [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB, [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED, }; #define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? 
xhci_usb_errors[(e)] : \ XHCI_TRB_ERROR_INVALID) static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr); static void pci_xhci_dump_trb(struct xhci_trb *trb); static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc); static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot); static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm); static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs); static void pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode, uint32_t evtype) { evtrb->qwTrb0 = port << 24; evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode); evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype); } /* controller reset */ static void pci_xhci_reset(struct pci_xhci_softc *sc) { int i; sc->rtsregs.er_enq_idx = 0; sc->rtsregs.er_events_cnt = 0; sc->rtsregs.event_pcs = 1; for (i = 1; i <= XHCI_MAX_SLOTS; i++) { pci_xhci_reset_slot(sc, i); } } static uint32_t pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd) { int do_intr = 0; int i; if (cmd & XHCI_CMD_RS) { do_intr = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0; sc->opregs.usbcmd |= XHCI_CMD_RS; sc->opregs.usbsts &= ~XHCI_STS_HCH; sc->opregs.usbsts |= XHCI_STS_PCD; /* Queue port change event on controller run from stop */ if (do_intr) for (i = 1; i <= XHCI_MAX_DEVS; i++) { struct pci_xhci_dev_emu *dev; struct pci_xhci_portregs *port; struct xhci_trb evtrb; if ((dev = XHCI_DEVINST_PTR(sc, i)) == NULL) continue; port = XHCI_PORTREG_PTR(sc, i); port->portsc |= XHCI_PS_CSC | XHCI_PS_CCS; port->portsc &= ~XHCI_PS_PLS_MASK; /* * XHCI 4.19.3 USB2 RxDetect->Polling, * USB3 Polling->U0 */ if (dev->dev_ue->ue_usbver == 2) port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL); else port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0); pci_xhci_set_evtrb(&evtrb, i, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); if (pci_xhci_insert_event(sc, &evtrb, 0) != XHCI_TRB_ERROR_SUCCESS) break; } } else { sc->opregs.usbcmd &= ~XHCI_CMD_RS; sc->opregs.usbsts |= XHCI_STS_HCH; sc->opregs.usbsts &= ~XHCI_STS_PCD; } /* start execution of schedule; stop when set to 0 */ cmd |= sc->opregs.usbcmd & XHCI_CMD_RS; if (cmd & XHCI_CMD_HCRST) { /* reset controller */ pci_xhci_reset(sc); cmd &= ~XHCI_CMD_HCRST; } cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS); if (do_intr) pci_xhci_assert_interrupt(sc); return (cmd); } static void pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct xhci_trb evtrb; struct pci_xhci_portregs *p; int port; uint32_t oldpls, newpls; if (sc->portregs == NULL) return; port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx", offset, port, value)); assert(port >= 0); if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_write port %d > ndevices", port)); return; } if (XHCI_DEVINST_PTR(sc, port) == NULL) { DPRINTF(("pci_xhci: portregs_write to unattached port %d", port)); } p = XHCI_PORTREG_PTR(sc, port); switch (offset) { case 0: /* port reset or warm reset */ if (value & (XHCI_PS_PR | XHCI_PS_WPR)) { pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR); break; } if ((p->portsc & XHCI_PS_PP) == 0) { WPRINTF(("pci_xhci: portregs_write to unpowered " "port %d", port)); break; } /* Port status and control register */ oldpls = XHCI_PS_PLS_GET(p->portsc); 
newpls = XHCI_PS_PLS_GET(value); p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK | XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK; if (XHCI_DEVINST_PTR(sc, port)) p->portsc |= XHCI_PS_CCS; p->portsc |= (value & ~(XHCI_PS_OCA | XHCI_PS_PR | XHCI_PS_PED | XHCI_PS_PLS_MASK | /* link state */ XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK | /* port indicator */ XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR)); /* clear control bits */ p->portsc &= ~(value & (XHCI_PS_CSC | XHCI_PS_PEC | XHCI_PS_WRC | XHCI_PS_OCC | XHCI_PS_PRC | XHCI_PS_PLC | XHCI_PS_CEC | XHCI_PS_CAS)); /* port disable request; for USB3, don't care */ if (value & XHCI_PS_PED) DPRINTF(("Disable port %d request", port)); if (!(value & XHCI_PS_LWS)) break; DPRINTF(("Port new PLS: %d", newpls)); switch (newpls) { case 0: /* U0 */ case 3: /* U3 */ if (oldpls != newpls) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(newpls) | XHCI_PS_PLC; if (oldpls != 0 && newpls == 0) { pci_xhci_set_evtrb(&evtrb, port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); pci_xhci_insert_event(sc, &evtrb, 1); } } break; default: DPRINTF(("Unhandled change port %d PLS %u", port, newpls)); break; } break; case 4: /* Port power management status and control register */ p->portpmsc = value; break; case 8: /* Port link information register */ DPRINTF(("pci_xhci attempted write to PORTLI, port %d", port)); break; case 12: /* * Port hardware LPM control register. * For USB3, this register is reserved. */ p->porthlpmc = value; break; } } struct xhci_dev_ctx * pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot) { uint64_t devctx_addr; struct xhci_dev_ctx *devctx; assert(slot > 0 && slot <= sc->ndevices); assert(sc->opregs.dcbaa_p != NULL); devctx_addr = sc->opregs.dcbaa_p->dcba[slot]; if (devctx_addr == 0) { DPRINTF(("get_dev_ctx devctx_addr == 0")); return (NULL); } DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx", slot, devctx_addr)); devctx = XHCI_GADDR(sc, devctx_addr & ~0x3FUL); return (devctx); } struct xhci_trb * pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb, uint64_t *guestaddr) { struct xhci_trb *next; assert(curtrb != NULL); if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) == XHCI_TRB_TYPE_LINK) { if (guestaddr) *guestaddr = curtrb->qwTrb0 & ~0xFUL; next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL); } else { if (guestaddr) *guestaddr += sizeof(struct xhci_trb) & ~0xFUL; next = curtrb + 1; } return (next); } static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc) { sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY; sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND; sc->opregs.usbsts |= XHCI_STS_EINT; /* only trigger interrupt if permitted */ if ((sc->opregs.usbcmd & XHCI_CMD_INTE) && (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA)) { if (pci_msi_enabled(sc->xsc_pi)) pci_generate_msi(sc->xsc_pi, 0); else pci_lintr_assert(sc->xsc_pi); } } static void pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc) { if (!pci_msi_enabled(sc->xsc_pi)) pci_lintr_assert(sc->xsc_pi); } static void pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; uint32_t pstreams; int i; dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0); if (pstreams > 0) { DPRINTF(("init_ep %d with pstreams %d", epid, pstreams)); assert(devep->ep_sctx_trbs == NULL); devep->ep_sctx = XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK); devep->ep_sctx_trbs = 
calloc(pstreams, sizeof(struct pci_xhci_trb_ring)); for (i = 0; i < pstreams; i++) { devep->ep_sctx_trbs[i].ringaddr = devep->ep_sctx[i].qwSctx0 & XHCI_SCTX_0_TR_DQ_PTR_MASK; devep->ep_sctx_trbs[i].ccs = XHCI_SCTX_0_DCS_GET(devep->ep_sctx[i].qwSctx0); } } else { DPRINTF(("init_ep %d with no pstreams", epid)); devep->ep_ringaddr = ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK; devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2); devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr); DPRINTF(("init_ep tr DCS %x", devep->ep_ccs)); } if (devep->ep_xfer == NULL) { devep->ep_xfer = malloc(sizeof(struct usb_data_xfer)); USB_DATA_XFER_INIT(devep->ep_xfer); } } static void pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; DPRINTF(("pci_xhci disable_ep %d", epid)); dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED; devep = &dev->eps[epid]; if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0 && devep->ep_sctx_trbs != NULL) free(devep->ep_sctx_trbs); if (devep->ep_xfer != NULL) { free(devep->ep_xfer); devep->ep_xfer = NULL; } memset(devep, 0, sizeof(struct pci_xhci_dev_ep)); } /* reset device at slot and data structures related to it */ static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot) { struct pci_xhci_dev_emu *dev; dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev) { DPRINTF(("xhci reset unassigned slot (%d)?", slot)); } else { dev->dev_slotstate = XHCI_ST_DISABLED; } /* TODO: reset ring buffer pointers */ } static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr) { struct pci_xhci_rtsregs *rts; uint64_t erdp; int erdp_idx; int err; struct xhci_trb *evtrbptr; err = XHCI_TRB_ERROR_SUCCESS; rts = &sc->rtsregs; erdp = rts->intrreg.erdp & ~0xF; erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) / sizeof(struct xhci_trb); DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]", evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3)); DPRINTF(("\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u", erdp_idx, rts->er_deq_seg, rts->er_enq_idx, rts->er_enq_seg, rts->event_pcs)); DPRINTF(("\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)", erdp, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize, do_intr)); evtrbptr = &rts->erst_p[rts->er_enq_idx]; /* TODO: multi-segment table */ if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) { DPRINTF(("pci_xhci[%d] cannot insert event; ring full", __LINE__)); err = XHCI_TRB_ERROR_EV_RING_FULL; goto done; } if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) { struct xhci_trb errev; if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) { DPRINTF(("pci_xhci[%d] insert evt err: ring full", __LINE__)); errev.qwTrb0 = 0; errev.dwTrb2 = XHCI_TRB_2_ERROR_SET( XHCI_TRB_ERROR_EV_RING_FULL); errev.dwTrb3 = XHCI_TRB_3_TYPE_SET( XHCI_TRB_EVENT_HOST_CTRL) | rts->event_pcs; rts->er_events_cnt++; memcpy(&rts->erst_p[rts->er_enq_idx], &errev, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; err = XHCI_TRB_ERROR_EV_RING_FULL; do_intr = 1; goto done; } } else { rts->er_events_cnt++; } evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT; evtrb->dwTrb3 |= rts->event_pcs; memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; if (rts->er_enq_idx == 0) rts->event_pcs ^= 1; done: if (do_intr) 
pci_xhci_assert_interrupt(sc); return (err); } static uint32_t pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs != NULL) for (i = 1; i <= XHCI_MAX_SLOTS; i++) { dev = XHCI_SLOTDEV_PTR(sc, i); if (dev && dev->dev_slotstate == XHCI_ST_DISABLED) { *slot = i; dev->dev_slotstate = XHCI_ST_ENABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; dev->hci.hci_address = i; break; } } DPRINTF(("pci_xhci enable slot (error=%d) slot %u", cmderr != XHCI_TRB_ERROR_SUCCESS, *slot)); return (cmderr); } static uint32_t pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; DPRINTF(("pci_xhci disable slot %u", slot)); cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; if (slot > sc->ndevices) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } dev = XHCI_SLOTDEV_PTR(sc, slot); if (dev) { if (dev->dev_slotstate == XHCI_ST_DISABLED) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; } else { dev->dev_slotstate = XHCI_ST_DISABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; /* TODO: reset events and endpoints */ } } done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; DPRINTF(("pci_xhci reset device slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev || dev->dev_slotstate == XHCI_ST_DISABLED) cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; else { dev->dev_slotstate = XHCI_ST_DEFAULT; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT, 0x1F, 27); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* reset all eps other than ep-0 */ for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_DISABLED, 0x7, 0); } cmderr = XHCI_TRB_ERROR_SUCCESS; } pci_xhci_reset_slot(sc, slot); done: return (cmderr); } static uint32_t pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) { DPRINTF(("pci_xhci: address device, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot */ dev_ctx = pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: address device, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, 
dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); dev->hci.hci_address = slot; dev->dev_ctx = dev_ctx; if (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx)); dev_ctx->ctx_slot.dwSctx3 = XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) | XHCI_SCTX_3_DEV_ADDR_SET(slot); memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx)); ep0_ctx = &dev_ctx->ctx_ep[1]; ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) | XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING); pci_xhci_init_ep(dev, 1); dev->dev_slotstate = XHCI_ST_ADDRESSED; DPRINTF(("pci_xhci: address device, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static uint32_t pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx, *iep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci config_ep slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) { DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u", slot)); if (dev->dev_ue->ue_stop != NULL) dev->dev_ue->ue_stop(dev->dev_sc); dev->dev_slotstate = XHCI_ST_ADDRESSED; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED, 0x1F, 27); /* disable endpoints */ for (i = 2; i < 32; i++) pci_xhci_disable_ep(dev, i); cmderr = XHCI_TRB_ERROR_SUCCESS; goto done; } if (dev->dev_slotstate < XHCI_ST_ADDRESSED) { DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed", dev->dev_slotstate)); cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } /* In addressed/configured state; * for each drop endpoint ctx flag: * ep->state = DISABLED * for each add endpoint ctx flag: * cp(ep-in, ep-out) * ep->state = RUNNING * for each drop+add endpoint flag: * reset ep resources * cp(ep-in, ep-out) * ep->state = RUNNING * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled) * slot->state = configured */ input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); dev_ctx = dev->dev_ctx; DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, input_ctx->ctx_input.dwInCtx7)); for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; if (input_ctx->ctx_input.dwInCtx0 & XHCI_INCTX_0_DROP_MASK(i)) { DPRINTF((" config ep - dropping ep %d", i)); pci_xhci_disable_ep(dev, i); } if (input_ctx->ctx_input.dwInCtx1 & XHCI_INCTX_1_ADD_MASK(i)) { iep_ctx = &input_ctx->ctx_ep[i]; DPRINTF((" enable ep[%d] %08x %08x %016lx %08x", i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1, iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4)); memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx)); pci_xhci_init_ep(dev, i); /* ep state */ ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); } } /* slot state to configured */ 
dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27); dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27); dev->dev_slotstate = XHCI_ST_CONFIGURED; DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x " "[3]=0x%08x", slot, dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t type; epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); DPRINTF(("pci_xhci: reset ep %u: slot %u", epid, slot)); cmderr = XHCI_TRB_ERROR_SUCCESS; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if (type == XHCI_TRB_TYPE_STOP_EP && (trb->dwTrb3 & XHCI_TRB_3_SUSP_EP_BIT) != 0) { /* XXX suspend endpoint for 10ms */ } if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: reset ep: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } devep = &dev->eps[epid]; if (devep->ep_xfer != NULL) USB_DATA_XFER_RESET(devep->ep_xfer); dev_ctx = dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) == 0) ep_ctx->qwEpCtx2 = devep->ep_ringaddr | devep->ep_ccs; DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (type == XHCI_TRB_TYPE_RESET_EP && (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0)) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } done: return (cmderr); } static uint32_t pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep, uint32_t streamid, struct xhci_stream_ctx **osctx) { struct xhci_stream_ctx *sctx; uint32_t maxpstreams; maxpstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep->dwEpCtx0); if (maxpstreams == 0) return (XHCI_TRB_ERROR_TRB); if (maxpstreams > XHCI_STREAMS_MAX) return (XHCI_TRB_ERROR_INVALID_SID); if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) { DPRINTF(("pci_xhci: find_stream; LSA bit not set")); return (XHCI_TRB_ERROR_INVALID_SID); } /* only support primary stream */ if (streamid > maxpstreams) return (XHCI_TRB_ERROR_STREAM_TYPE); sctx = XHCI_GADDR(sc, ep->qwEpCtx2 & ~0xFUL) + streamid; if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0)) return (XHCI_TRB_ERROR_STREAM_TYPE); *osctx = sctx; return (XHCI_TRB_ERROR_SUCCESS); } static uint32_t pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t streamid; cmderr = XHCI_TRB_ERROR_SUCCESS; dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u", (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7), (uint32_t)(trb->qwTrb0 & 0x1))); DPRINTF((" stream-id %u, slot %u, epid %u, C %u", (trb->dwTrb2 >> 16) & 0xFFFF, XHCI_TRB_3_SLOT_GET(trb->dwTrb3), XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1)); epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } dev_ctx = 
dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) { case XHCI_ST_EPCTX_STOPPED: case XHCI_ST_EPCTX_ERROR: break; default: DPRINTF(("pci_xhci cmd set_tr invalid state %x", XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0))); cmderr = XHCI_TRB_ERROR_CONTEXT_STATE; goto done; } streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2); if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0) { struct xhci_stream_ctx *sctx; sctx = NULL; cmderr = pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); if (sctx != NULL) { assert(devep->ep_sctx != NULL); devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0; devep->ep_sctx_trbs[streamid].ringaddr = trb->qwTrb0 & ~0xF; devep->ep_sctx_trbs[streamid].ccs = XHCI_EPCTX_2_DCS_GET(trb->qwTrb0); } } else { if (streamid != 0) { DPRINTF(("pci_xhci cmd set_tr streamid %x != 0", streamid)); } ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL; devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL; devep->ep_ccs = trb->qwTrb0 & 0x1; devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr); DPRINTF(("pci_xhci set_tr first TRB:")); pci_xhci_dump_trb(devep->ep_tr); } ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; done: return (cmderr); } static uint32_t pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* this command expects drop-ctx=0 & add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) { DPRINTF(("pci_xhci: eval ctx, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot; in this emulation, slot_id = address */ dev_ctx = pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: eval ctx, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */ /* set max exit latency */ dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1, 0xFFFF, 0); /* set interrupter target */ dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2, 0x3FF, 22); } if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */ /* set max packet size */ dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY( dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1, 0xFFFF, 16); ep0_ctx = &dev_ctx->ctx_ep[1]; } DPRINTF(("pci_xhci: eval ctx, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static int pci_xhci_complete_commands(struct pci_xhci_softc *sc) { struct xhci_trb evtrb; struct xhci_trb 
*trb; uint64_t crcr; uint32_t ccs; /* cycle state (XHCI 4.9.2) */ uint32_t type; uint32_t slot; uint32_t cmderr; int error; error = 0; sc->opregs.crcr |= XHCI_CRCR_LO_CRR; trb = sc->opregs.cr_p; ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS; crcr = sc->opregs.crcr & ~0xF; while (1) { sc->opregs.cr_p = trb; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) break; DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x" " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u", type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs)); cmderr = XHCI_TRB_ERROR_SUCCESS; evtrb.dwTrb2 = 0; evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) | XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE); slot = 0; switch (type) { case XHCI_TRB_TYPE_LINK: /* 0x06 */ if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= XHCI_CRCR_LO_RCS; break; case XHCI_TRB_TYPE_ENABLE_SLOT: /* 0x09 */ cmderr = pci_xhci_cmd_enable_slot(sc, &slot); break; case XHCI_TRB_TYPE_DISABLE_SLOT: /* 0x0A */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_disable_slot(sc, slot); break; case XHCI_TRB_TYPE_ADDRESS_DEVICE: /* 0x0B */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_address_device(sc, slot, trb); break; case XHCI_TRB_TYPE_CONFIGURE_EP: /* 0x0C */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_config_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_EVALUATE_CTX: /* 0x0D */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_EP: /* 0x0E */ DPRINTF(("Reset Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_STOP_EP: /* 0x0F */ DPRINTF(("Stop Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_SET_TR_DEQUEUE: /* 0x10 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_set_tr(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_DEVICE: /* 0x11 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_device(sc, slot); break; case XHCI_TRB_TYPE_FORCE_EVENT: /* 0x12 */ /* TODO: */ break; case XHCI_TRB_TYPE_NEGOTIATE_BW: /* 0x13 */ break; case XHCI_TRB_TYPE_SET_LATENCY_TOL: /* 0x14 */ break; case XHCI_TRB_TYPE_GET_PORT_BW: /* 0x15 */ break; case XHCI_TRB_TYPE_FORCE_HEADER: /* 0x16 */ break; case XHCI_TRB_TYPE_NOOP_CMD: /* 0x17 */ break; default: DPRINTF(("pci_xhci: unsupported cmd %x", type)); break; } if (type != XHCI_TRB_TYPE_LINK) { /* * insert command completion event and assert intr */ evtrb.qwTrb0 = crcr; evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr); evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot); DPRINTF(("pci_xhci: command 0x%x result: 0x%x", type, cmderr)); pci_xhci_insert_event(sc, &evtrb, 1); } trb = pci_xhci_trb_next(sc, trb, &crcr); } sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs; sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR; return (error); } static void pci_xhci_dump_trb(struct xhci_trb *trb) { static const char *trbtypes[] = { "RESERVED", "NORMAL", "SETUP_STAGE", "DATA_STAGE", "STATUS_STAGE", "ISOCH", "LINK", "EVENT_DATA", "NOOP", "ENABLE_SLOT", "DISABLE_SLOT", "ADDRESS_DEVICE", "CONFIGURE_EP", "EVALUATE_CTX", "RESET_EP", "STOP_EP", "SET_TR_DEQUEUE", "RESET_DEVICE", "FORCE_EVENT", "NEGOTIATE_BW", "SET_LATENCY_TOL", "GET_PORT_BW", "FORCE_HEADER", "NOOP_CMD" }; uint32_t type; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); DPRINTF(("pci_xhci: trb[@%p] 
type x%02x %s 0:x%016lx 2:x%08x 3:x%08x", trb, type, type <= XHCI_TRB_TYPE_NOOP_CMD ? trbtypes[type] : "INVALID", trb->qwTrb0, trb->dwTrb2, trb->dwTrb3)); } static int pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer, uint32_t slot, uint32_t epid, int *do_intr) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct xhci_trb *trb; struct xhci_trb evtrb; uint32_t trbflags; uint32_t edtla; int i, err; dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; err = XHCI_TRB_ERROR_SUCCESS; *do_intr = 0; edtla = 0; /* go through list of TRBs and insert event(s) */ for (i = xfer->head; xfer->ndata > 0; ) { evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data; trb = XHCI_GADDR(sc, evtrb.qwTrb0); trbflags = trb->dwTrb3; DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x " "(err %d) IOC?%d", i, xfer->data[i].processed, xfer->data[i].blen, XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0, trbflags, err, trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0)); if (!xfer->data[i].processed) { xfer->head = i; break; } xfer->ndata--; edtla += xfer->data[i].bdone; trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs); pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx, xfer->data[i].streamid, xfer->data[i].trbnext, xfer->data[i].ccs); /* Only interrupt if IOC or short packet */ if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) && !((err == XHCI_TRB_ERROR_SHORT_PKT) && (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) { i = (i + 1) % USB_MAX_XFER_BLOCKS; continue; } evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) | XHCI_TRB_2_REM_SET(xfer->data[i].blen); evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) | XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid); if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) { DPRINTF(("pci_xhci EVENT_DATA edtla %u", edtla)); evtrb.qwTrb0 = trb->qwTrb0; evtrb.dwTrb2 = (edtla & 0xFFFFF) | XHCI_TRB_2_ERROR_SET(err); evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT; edtla = 0; } *do_intr = 1; err = pci_xhci_insert_event(sc, &evtrb, 0); if (err != XHCI_TRB_ERROR_SUCCESS) { break; } i = (i + 1) % USB_MAX_XFER_BLOCKS; } return (err); } static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs) { if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { devep->ep_sctx[streamid].qwSctx0 = (ringaddr & ~0xFUL) | (ccs & 0x1); devep->ep_sctx_trbs[streamid].ringaddr = ringaddr & ~0xFUL; devep->ep_sctx_trbs[streamid].ccs = ccs & 0x1; ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | (ccs & 0x1); DPRINTF(("xhci update ep-ring stream %d, addr %lx", streamid, devep->ep_sctx[streamid].qwSctx0)); } else { devep->ep_ringaddr = ringaddr & ~0xFUL; devep->ep_ccs = ccs & 0x1; devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL); ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1); DPRINTF(("xhci update ep-ring, addr %lx", (devep->ep_ringaddr | devep->ep_ccs))); } } /* * Outstanding transfer still in progress (device NAK'd earlier) so retry * the transfer again to see if it succeeds. 
*/ static int pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid) { struct usb_data_xfer *xfer; int err; int do_intr; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); err = 0; do_intr = 0; xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); /* outstanding requests queued up */ if (dev->dev_ue->ue_data != NULL) { err = dev->dev_ue->ue_data(dev->dev_sc, xfer, epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2); if (err == USB_ERR_CANCELLED) { if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) == USB_NAK) err = XHCI_TRB_ERROR_SUCCESS; } else { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) { pci_xhci_assert_interrupt(sc); } /* XXX should not do it if error? */ USB_DATA_XFER_RESET(xfer); } } USB_DATA_XFER_UNLOCK(xfer); return (err); } static int pci_xhci_handle_transfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot, uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid) { struct xhci_trb *setup_trb; struct usb_data_xfer *xfer; struct usb_data_xfer_block *xfer_block; uint64_t val; uint32_t trbflags; int do_intr, err; int do_retry; ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); DPRINTF(("pci_xhci handle_transfer slot %u", slot)); retry: err = 0; do_retry = 0; do_intr = 0; setup_trb = NULL; while (1) { pci_xhci_dump_trb(trb); trbflags = trb->dwTrb3; if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK && (trbflags & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) { DPRINTF(("Cycle-bit changed trbflags %x, ccs %x", trbflags & XHCI_TRB_3_CYCLE_BIT, ccs)); break; } xfer_block = NULL; switch (XHCI_TRB_3_TYPE_GET(trbflags)) { case XHCI_TRB_TYPE_LINK: if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= 0x1; xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_SETUP_STAGE: if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 || XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) { DPRINTF(("pci_xhci: invalid setup trb")); err = XHCI_TRB_ERROR_TRB; goto errout; } setup_trb = trb; val = trb->qwTrb0; if (!xfer->ureq) xfer->ureq = malloc( sizeof(struct usb_device_request)); memcpy(xfer->ureq, &val, sizeof(struct usb_device_request)); xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_NORMAL: case XHCI_TRB_TYPE_ISOCH: if (setup_trb != NULL) { DPRINTF(("pci_xhci: trb not supposed to be in " "ctl scope")); err = XHCI_TRB_ERROR_TRB; goto errout; } /* fall through */ case XHCI_TRB_TYPE_DATA_STAGE: xfer_block = usb_data_xfer_append(xfer, (void *)(trbflags & XHCI_TRB_3_IDT_BIT ? 
&trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)), trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs); break; case XHCI_TRB_TYPE_STATUS_STAGE: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); break; case XHCI_TRB_TYPE_NOOP: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_EVENT_DATA: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) { xfer_block->processed = 1; } break; default: DPRINTF(("pci_xhci: handle xfer unexpected trb type " "0x%x", XHCI_TRB_3_TYPE_GET(trbflags))); err = XHCI_TRB_ERROR_TRB; goto errout; } trb = pci_xhci_trb_next(sc, trb, &addr); DPRINTF(("pci_xhci: next trb: 0x%lx", (uint64_t)trb)); if (xfer_block) { xfer_block->trbnext = addr; xfer_block->streamid = streamid; } if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) && XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) { break; } /* handle current batch that requires interrupt on complete */ if (trbflags & XHCI_TRB_3_IOC_BIT) { DPRINTF(("pci_xhci: trb IOC bit set")); if (epid == 1) do_retry = 1; break; } } DPRINTF(("pci_xhci[%d]: xfer->ndata %u", __LINE__, xfer->ndata)); if (epid == 1) { err = USB_ERR_NOT_STARTED; if (dev->dev_ue->ue_request != NULL) err = dev->dev_ue->ue_request(dev->dev_sc, xfer); setup_trb = NULL; } else { /* handle data transfer */ pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); err = XHCI_TRB_ERROR_SUCCESS; goto errout; } err = USB_TO_XHCI_ERR(err); if ((err == XHCI_TRB_ERROR_SUCCESS) || (err == XHCI_TRB_ERROR_SHORT_PKT)) { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err != XHCI_TRB_ERROR_SUCCESS) do_retry = 0; } errout: if (err == XHCI_TRB_ERROR_EV_RING_FULL) DPRINTF(("pci_xhci[%d]: event ring full", __LINE__)); if (!do_retry) USB_DATA_XFER_UNLOCK(xfer); if (do_intr) pci_xhci_assert_interrupt(sc); if (do_retry) { USB_DATA_XFER_RESET(xfer); DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs", __LINE__)); goto retry; } if (epid == 1) USB_DATA_XFER_RESET(xfer); return (err); } static void pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, uint32_t epid, uint32_t streamid) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct pci_xhci_trb_ring *sctx_tr; struct xhci_trb *trb; uint64_t ringaddr; uint32_t ccs; DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u", slot, epid, streamid)); if (slot == 0 || slot > sc->ndevices) { DPRINTF(("pci_xhci: invalid doorbell slot %u", slot)); return; } if (epid == 0 || epid >= XHCI_MAX_ENDPOINTS) { DPRINTF(("pci_xhci: invalid endpoint %u", epid)); return; } dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); if (!dev_ctx) { return; } ep_ctx = &dev_ctx->ctx_ep[epid]; sctx_tr = NULL; DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (ep_ctx->qwEpCtx2 == 0) return; /* handle pending transfers */ if (devep->ep_xfer->ndata > 0) { pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); return; } /* get next trb work item */ if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { struct xhci_stream_ctx *sctx; /* * Stream IDs of 0, 65535 (any stream), and 65534 * (prime) are invalid. 
*/ if (streamid == 0 || streamid == 65534 || streamid == 65535) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } sctx = NULL; pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); if (sctx == NULL) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } sctx_tr = &devep->ep_sctx_trbs[streamid]; ringaddr = sctx_tr->ringaddr; ccs = sctx_tr->ccs; trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL); DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x", streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } else { if (streamid != 0) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } ringaddr = devep->ep_ringaddr; ccs = devep->ep_ccs; trb = devep->ep_tr; DPRINTF(("doorbell, ccs %lx, trb ccs %x", ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) { DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?", ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid)); return; } pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid, ringaddr, ccs, streamid); } static void pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset = (offset - sc->dboff) / sizeof(uint32_t); DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx", offset, value)); if (XHCI_HALTED(sc)) { DPRINTF(("pci_xhci: controller halted")); return; } if (offset == 0) pci_xhci_complete_commands(sc); else if (sc->portregs != NULL) pci_xhci_device_doorbell(sc, offset, XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value)); } static void pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct pci_xhci_rtsregs *rts; offset -= sc->rtsoff; if (offset == 0) { DPRINTF(("pci_xhci attempted write to MFINDEX")); return; } DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx", offset, value)); offset -= 0x20; /* start of intrreg */ rts = &sc->rtsregs; switch (offset) { case 0x00: if (value & XHCI_IMAN_INTR_PEND) rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) | (rts->intrreg.iman & XHCI_IMAN_INTR_PEND); if (!(value & XHCI_IMAN_INTR_ENA)) pci_xhci_deassert_interrupt(sc); break; case 0x04: rts->intrreg.imod = value; break; case 0x08: rts->intrreg.erstsz = value & 0xFFFF; break; case 0x10: /* ERSTBA low bits */ rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) | (value & ~0x3F); break; case 0x14: /* ERSTBA high bits */ rts->intrreg.erstba = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erstba); rts->erstba_p = XHCI_GADDR(sc, sc->rtsregs.intrreg.erstba & ~0x3FUL); rts->erst_p = XHCI_GADDR(sc, sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL); rts->er_enq_idx = 0; rts->er_events_cnt = 0; DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u", rts->erstba_p, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize)); break; case 0x18: /* ERDP low bits */ rts->intrreg.erdp = MASK_64_HI(sc->rtsregs.intrreg.erdp) | (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) | (value & ~0xF); if (value & XHCI_ERDP_LO_BUSY) { rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY; rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; } rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value); break; case 0x1C: /* ERDP high bits */ rts->intrreg.erdp = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erdp); if (rts->er_events_cnt > 0) { uint64_t erdp; uint32_t erdp_i; erdp = rts->intrreg.erdp & ~0xF; erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) / sizeof(struct xhci_trb); if (erdp_i <= rts->er_enq_idx) rts->er_events_cnt = rts->er_enq_idx - erdp_i; else 
rts->er_events_cnt = rts->erstba_p->dwEvrsTableSize - (erdp_i - rts->er_enq_idx); DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u", erdp, rts->er_events_cnt)); } break; default: DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx", offset)); break; } } static uint64_t pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset) { int port; uint32_t *p; if (sc->portregs == NULL) return (0); port = (offset - 0x3F0) / 0x10; if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_read port %d >= XHCI_MAX_DEVS", port)); /* return default value for unused port */ return (XHCI_PS_SPEED_SET(3)); } offset = (offset - 0x3F0) % 0x10; p = &sc->portregs[port].portsc; p += offset / sizeof(uint32_t); DPRINTF(("pci_xhci: portregs read offset 0x%lx port %u -> 0x%x", offset, port, *p)); return (*p); } static void pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset -= XHCI_CAPLEN; if (offset < 0x400) DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx", offset, value)); switch (offset) { case XHCI_USBCMD: sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F); break; case XHCI_USBSTS: /* clear bits on write */ sc->opregs.usbsts &= ~(value & (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS| XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR)); break; case XHCI_PAGESIZE: /* read only */ break; case XHCI_DNCTRL: sc->opregs.dnctrl = value & 0xFFFF; break; case XHCI_CRCR_LO: if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) { sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); sc->opregs.crcr |= value & (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); } else { sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) | (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS)); } break; case XHCI_CRCR_HI: if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) { sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) | (value << 32); sc->opregs.cr_p = XHCI_GADDR(sc, sc->opregs.crcr & ~0xF); } if (sc->opregs.crcr & XHCI_CRCR_LO_CS) { /* Stop operation of Command Ring */ } if (sc->opregs.crcr & XHCI_CRCR_LO_CA) { /* Abort command */ } break; case XHCI_DCBAAP_LO: sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) | (value & 0xFFFFFFC0); break; case XHCI_DCBAAP_HI: sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) | (value << 32); sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL); DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)", sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p)); break; case XHCI_CONFIG: sc->opregs.config = value & 0x03FF; break; default: if (offset >= 0x400) pci_xhci_portregs_write(sc, offset, value); break; } } static void pci_xhci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_xhci_softc *sc; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) /* read only registers */ WPRINTF(("pci_xhci: write RO-CAPs offset %ld", offset)); else if (offset < sc->dboff) pci_xhci_hostop_write(sc, offset, value); else if (offset < sc->rtsoff) pci_xhci_dbregs_write(sc, offset, value); else if (offset < sc->regsend) pci_xhci_rtsregs_write(sc, offset, value); else WPRINTF(("pci_xhci: write invalid offset %ld", offset)); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; switch (offset) { case XHCI_CAPLENGTH: /* 0x00 */ value = sc->caplength; break; case XHCI_HCSPARAMS1: /* 0x04 */ value = sc->hcsparams1; break; case XHCI_HCSPARAMS2: /* 0x08 */ value = sc->hcsparams2; break; case XHCI_HCSPARAMS3: /* 0x0C */ value = 
sc->hcsparams3; break; case XHCI_HCSPARAMS0: /* 0x10 */ value = sc->hccparams1; break; case XHCI_DBOFF: /* 0x14 */ value = sc->dboff; break; case XHCI_RTSOFF: /* 0x18 */ value = sc->rtsoff; break; case XHCI_HCCPRAMS2: /* 0x1C */ value = sc->hccparams2; break; default: value = 0; break; } DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; offset = (offset - XHCI_CAPLEN); switch (offset) { case XHCI_USBCMD: /* 0x00 */ value = sc->opregs.usbcmd; break; case XHCI_USBSTS: /* 0x04 */ value = sc->opregs.usbsts; break; case XHCI_PAGESIZE: /* 0x08 */ value = sc->opregs.pgsz; break; case XHCI_DNCTRL: /* 0x14 */ value = sc->opregs.dnctrl; break; case XHCI_CRCR_LO: /* 0x18 */ value = sc->opregs.crcr & XHCI_CRCR_LO_CRR; break; case XHCI_CRCR_HI: /* 0x1C */ value = 0; break; case XHCI_DCBAAP_LO: /* 0x30 */ value = sc->opregs.dcbaap & 0xFFFFFFFF; break; case XHCI_DCBAAP_HI: /* 0x34 */ value = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF; break; case XHCI_CONFIG: /* 0x38 */ value = sc->opregs.config; break; default: if (offset >= 0x400) value = pci_xhci_portregs_read(sc, offset); else value = 0; break; } if (offset < 0x400) DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_dbregs_read(struct pci_xhci_softc *sc, uint64_t offset) { /* read doorbell always returns 0 */ return (0); } static uint64_t pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->rtsoff; value = 0; if (offset == XHCI_MFINDEX) { value = sc->rtsregs.mfindex; } else if (offset >= 0x20) { int item; uint32_t *p; offset -= 0x20; item = offset % 32; assert(offset < sizeof(sc->rtsregs.intrreg)); p = &sc->rtsregs.intrreg.iman; p += item / sizeof(uint32_t); value = *p; } DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->regsend; value = 0; switch (offset) { case 0: /* rev major | rev minor | next-cap | cap-id */ value = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS; break; case 4: /* name string = "USB" */ value = 0x20425355; break; case 8: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start; break; case 12: break; case 16: /* rev major | rev minor | next-cap | cap-id */ value = (0x03 << 24) | XHCI_ID_PROTOCOLS; break; case 20: /* name string = "USB" */ value = 0x20425355; break; case 24: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start; break; case 28: break; default: DPRINTF(("pci_xhci: xecp invalid offset 0x%lx", offset)); break; } DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_xhci_softc *sc; uint32_t value; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) value = pci_xhci_hostcap_read(sc, offset); else if (offset < sc->dboff) value = pci_xhci_hostop_read(sc, offset); else if (offset < sc->rtsoff) value = pci_xhci_dbregs_read(sc, offset); else if (offset < sc->regsend) value = pci_xhci_rtsregs_read(sc, offset); else if (offset < (sc->regsend + 4*32)) value = pci_xhci_xecp_read(sc, offset); else { value = 0; 
WPRINTF(("pci_xhci: read invalid offset %ld", offset)); } pthread_mutex_unlock(&sc->mtx); switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } return (value); } static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; struct xhci_trb evtrb; int error; assert(portn <= XHCI_MAX_DEVS); DPRINTF(("xhci reset port %d", portn)); port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC); port->portsc |= XHCI_PS_PED | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); if (warm && dev->dev_ue->ue_usbver == 3) { port->portsc |= XHCI_PS_WRC; } if ((port->portsc & XHCI_PS_PRC) == 0) { port->portsc |= XHCI_PS_PRC; pci_xhci_set_evtrb(&evtrb, portn, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 1); if (error != XHCI_TRB_ERROR_SUCCESS) DPRINTF(("xhci reset port insert event " "failed")); } } } static void pci_xhci_init_port(struct pci_xhci_softc *sc, int portn) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc = XHCI_PS_CCS | /* connected */ XHCI_PS_PP; /* port power */ if (dev->dev_ue->ue_usbver == 2) { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL) | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } else { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) | XHCI_PS_PED | /* enabled */ XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } DPRINTF(("Init port %d 0x%x", portn, port->portsc)); } else { port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP; DPRINTF(("Init empty port %d 0x%x", portn, port->portsc)); } } static int pci_xhci_dev_intr(struct usb_hci *hci, int epctx) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_trb evtrb; struct pci_xhci_softc *sc; struct pci_xhci_portregs *p; struct xhci_endp_ctx *ep_ctx; int error = 0; int dir_in; int epid; dir_in = epctx & 0x80; epid = epctx & ~0x80; /* HW endpoint contexts are 0-15; convert to epid based on dir */ epid = (epid * 2) + (dir_in ? 
1 : 0); assert(epid >= 1 && epid <= 31); dev = hci->hci_sc; sc = dev->xsc; /* check if device is ready; OS has to initialise it */ if (sc->rtsregs.erstba_p == NULL || (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 || dev->dev_ctx == NULL) return (0); p = XHCI_PORTREG_PTR(sc, hci->hci_port); /* raise event if link U3 (suspended) state */ if (XHCI_PS_PLS_GET(p->portsc) == 3) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME); if ((p->portsc & XHCI_PS_PLC) != 0) return (0); p->portsc |= XHCI_PS_PLC; pci_xhci_set_evtrb(&evtrb, hci->hci_port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 0); if (error != XHCI_TRB_ERROR_SUCCESS) goto done; } dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) { DPRINTF(("xhci device interrupt on disabled endpoint %d", epid)); return (0); } DPRINTF(("xhci device interrupt on endpoint %d", epid)); pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0); done: return (error); } static int pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid, void *param) { DPRINTF(("xhci device event port %d", hci->hci_port)); return (0); } static void pci_xhci_device_usage(char *opt) { EPRINTLN("Invalid USB emulation \"%s\"", opt); } static int pci_xhci_parse_opts(struct pci_xhci_softc *sc, char *opts) { struct pci_xhci_dev_emu **devices; struct pci_xhci_dev_emu *dev; struct usb_devemu *ue; void *devsc; char *uopt, *xopts, *config; int usb3_port, usb2_port, i; uopt = NULL; usb3_port = sc->usb3_port_start - 1; usb2_port = sc->usb2_port_start - 1; devices = NULL; if (opts == NULL) goto portsfinal; devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); sc->devices = devices; sc->ndevices = 0; uopt = strdup(opts); for (xopts = strtok(uopt, ","); xopts != NULL; xopts = strtok(NULL, ",")) { if (usb2_port == ((sc->usb2_port_start-1) + XHCI_MAX_DEVS/2) || usb3_port == ((sc->usb3_port_start-1) + XHCI_MAX_DEVS/2)) { WPRINTF(("pci_xhci max number of USB 2 or 3 " "devices reached, max %d", XHCI_MAX_DEVS/2)); usb2_port = usb3_port = -1; goto done; } /* device[=] */ if ((config = strchr(xopts, '=')) == NULL) config = ""; /* no config */ else *config++ = '\0'; ue = usb_emu_finddev(xopts); if (ue == NULL) { pci_xhci_device_usage(xopts); DPRINTF(("pci_xhci device not found %s", xopts)); usb2_port = usb3_port = -1; goto done; } DPRINTF(("pci_xhci adding device %s, opts \"%s\"", xopts, config)); dev = calloc(1, sizeof(struct pci_xhci_dev_emu)); dev->xsc = sc; dev->hci.hci_sc = dev; dev->hci.hci_intr = pci_xhci_dev_intr; dev->hci.hci_event = pci_xhci_dev_event; if (ue->ue_usbver == 2) { dev->hci.hci_port = usb2_port + 1; devices[usb2_port] = dev; usb2_port++; } else { dev->hci.hci_port = usb3_port + 1; devices[usb3_port] = dev; usb3_port++; } dev->hci.hci_address = 0; devsc = ue->ue_init(&dev->hci, config); if (devsc == NULL) { pci_xhci_device_usage(xopts); usb2_port = usb3_port = -1; goto done; } dev->dev_ue = ue; dev->dev_sc = devsc; /* assign slot number to device */ sc->slots[sc->ndevices] = dev; sc->ndevices++; } portsfinal: sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs)); if (sc->ndevices > 0) { /* port and slot numbering start from 1 */ sc->devices--; sc->portregs--; sc->slots--; for (i = 1; i <= XHCI_MAX_DEVS; i++) { pci_xhci_init_port(sc, i); } } else { WPRINTF(("pci_xhci no USB devices configured")); sc->ndevices = 1; } done: if (devices != NULL) 
{ if (usb2_port <= 0 && usb3_port <= 0) { sc->devices = NULL; for (i = 0; devices[i] != NULL; i++) free(devices[i]); sc->ndevices = -1; free(devices); } } free(uopt); return (sc->ndevices); } static int pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { struct pci_xhci_softc *sc; int error; if (xhci_in_use) { WPRINTF(("pci_xhci controller already defined")); return (-1); } xhci_in_use = 1; sc = calloc(1, sizeof(struct pci_xhci_softc)); pi->pi_arg = sc; sc->xsc_pi = pi; sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1; sc->usb3_port_start = 1; /* discover devices */ error = pci_xhci_parse_opts(sc, opts); if (error < 0) goto done; else error = 0; sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) | XHCI_SET_HCIVERSION(0x0100); sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) | XHCI_SET_HCSP1_MAXINTR(1) | /* interrupters */ XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS); sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | XHCI_SET_HCSP2_IST(0x04); sc->hcsparams3 = 0; /* no latency */ sc->hccparams1 = XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ XHCI_SET_HCCP1_SPC(1) | /* short packet */ XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | XHCI_SET_HCCP2_U3C(1); sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START + XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs)); /* dboff must be 32-bit aligned */ if (sc->dboff & 0x3) sc->dboff = (sc->dboff + 0x3) & ~0x3; /* rtsoff must be 32-bytes aligned */ sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32); if (sc->rtsoff & 0x1F) sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F; DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x", sc->dboff, sc->rtsoff)); sc->opregs.usbsts = XHCI_STS_HCH; sc->opregs.pgsz = XHCI_PAGESIZE_4K; pci_xhci_reset(sc); sc->regsend = sc->rtsoff + 0x20 + 32; /* only 1 intrpter */ /* * Set extended capabilities pointer to be after regsend; * value of xecp field is 32-bit offset. 
*/ sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB); pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI); pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0); pci_emul_add_msicap(pi, 1); /* regsend + xecp registers */ pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32); DPRINTF(("pci_xhci pci_emu_alloc: %d", sc->regsend + 4*32)); pci_lintr_request(pi); pthread_mutex_init(&sc->mtx, NULL); done: if (error) { free(sc); } return (error); } +#ifdef BHYVE_SNAPSHOT +static void +pci_xhci_map_devs_slots(struct pci_xhci_softc *sc, int maps[]) +{ + int i, j; + struct pci_xhci_dev_emu *dev, *slot; + + memset(maps, 0, sizeof(maps[0]) * XHCI_MAX_SLOTS); + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + for (j = 1; j <= XHCI_MAX_DEVS; j++) { + slot = XHCI_SLOTDEV_PTR(sc, i); + dev = XHCI_DEVINST_PTR(sc, j); + + if (slot == dev) + maps[i] = j; + } + } +} +static int +pci_xhci_snapshot_ep(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, + int idx, struct vm_snapshot_meta *meta) +{ + int k; + int ret; + struct usb_data_xfer *xfer; + struct usb_data_xfer_block *xfer_block; + + /* some sanity checks */ + if (meta->op == VM_SNAPSHOT_SAVE) + xfer = dev->eps[idx].ep_xfer; + + SNAPSHOT_VAR_OR_LEAVE(xfer, meta, ret, done); + if (xfer == NULL) { + ret = 0; + goto done; + } + + if (meta->op == VM_SNAPSHOT_RESTORE) { + pci_xhci_init_ep(dev, idx); + xfer = dev->eps[idx].ep_xfer; + } + + /* save / restore proper */ + for (k = 0; k < USB_MAX_XFER_BLOCKS; k++) { + xfer_block = &xfer->data[k]; + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(xfer_block->buf, + XHCI_GADDR_SIZE(xfer_block->buf), true, meta, ret, + done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->blen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->bdone, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->processed, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->hci_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->ccs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->streamid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->trbnext, meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(xfer->ureq, meta, ret, done); + if (xfer->ureq) { + /* xfer->ureq is not allocated at restore time */ + if (meta->op == VM_SNAPSHOT_RESTORE) + xfer->ureq = malloc(sizeof(struct usb_device_request)); + + SNAPSHOT_BUF_OR_LEAVE(xfer->ureq, + sizeof(struct usb_device_request), + meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(xfer->ndata, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer->head, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer->tail, meta, ret, done); + +done: + return (ret); +} + +static int +pci_xhci_snapshot(struct vm_snapshot_meta *meta) +{ + int i, j; + int ret; + int restore_idx; + struct pci_devinst *pi; + struct pci_xhci_softc *sc; + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + char dname[SNAP_DEV_NAME_LEN]; + int maps[XHCI_MAX_SLOTS + 1]; + + pi = meta->dev_data; + sc = pi->pi_arg; + + SNAPSHOT_VAR_OR_LEAVE(sc->caplength, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hccparams1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->dboff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsoff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hccparams2, 
meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->regsend, meta, ret, done); + + /* opregs */ + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbsts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.pgsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dnctrl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.crcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dcbaap, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.config, meta, ret, done); + + /* opregs.cr_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.cr_p, + XHCI_GADDR_SIZE(sc->opregs.cr_p), false, meta, ret, done); + + /* opregs.dcbaa_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.dcbaa_p, + XHCI_GADDR_SIZE(sc->opregs.dcbaa_p), false, meta, ret, done); + + /* rtsregs */ + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.mfindex, meta, ret, done); + + /* rtsregs.intrreg */ + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.iman, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.imod, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.rsvd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erdp, meta, ret, done); + + /* rtsregs.erstba_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erstba_p, + XHCI_GADDR_SIZE(sc->rtsregs.erstba_p), false, meta, ret, done); + + /* rtsregs.erst_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erst_p, + XHCI_GADDR_SIZE(sc->rtsregs.erst_p), false, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_deq_seg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_idx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_seg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_events_cnt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.event_pcs, meta, ret, done); + + /* sanity checking */ + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + dev = XHCI_DEVINST_PTR(sc, i); + if (dev == NULL) + continue; + + if (meta->op == VM_SNAPSHOT_SAVE) + restore_idx = i; + SNAPSHOT_VAR_OR_LEAVE(restore_idx, meta, ret, done); + + /* check if the restored device (when restoring) is sane */ + if (restore_idx != i) { + fprintf(stderr, "%s: idx not matching: actual: %d, " + "expected: %d\r\n", __func__, restore_idx, i); + ret = EINVAL; + goto done; + } + + if (meta->op == VM_SNAPSHOT_SAVE) { + memset(dname, 0, sizeof(dname)); + strncpy(dname, dev->dev_ue->ue_emu, sizeof(dname) - 1); + } + + SNAPSHOT_BUF_OR_LEAVE(dname, sizeof(dname), meta, ret, done); + + if (meta->op == VM_SNAPSHOT_RESTORE) { + dname[sizeof(dname) - 1] = '\0'; + if (strcmp(dev->dev_ue->ue_emu, dname)) { + fprintf(stderr, "%s: device names mismatch: " + "actual: %s, expected: %s\r\n", + __func__, dname, dev->dev_ue->ue_emu); + + ret = EINVAL; + goto done; + } + } + } + + /* portregs */ + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + port = XHCI_PORTREG_PTR(sc, i); + dev = XHCI_DEVINST_PTR(sc, i); + + if (dev == NULL) + continue; + + SNAPSHOT_VAR_OR_LEAVE(port->portsc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->portpmsc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->portli, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->porthlpmc, meta, ret, done); + } + + /* slots */ + if (meta->op == VM_SNAPSHOT_SAVE) + pci_xhci_map_devs_slots(sc, maps); + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + SNAPSHOT_VAR_OR_LEAVE(maps[i], meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) { + dev = XHCI_SLOTDEV_PTR(sc, 
i); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + if (maps[i] != 0) + dev = XHCI_DEVINST_PTR(sc, maps[i]); + else + dev = NULL; + + XHCI_SLOTDEV_PTR(sc, i) = dev; + } else { + /* error */ + ret = EINVAL; + goto done; + } + + if (dev == NULL) + continue; + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(dev->dev_ctx, + XHCI_GADDR_SIZE(dev->dev_ctx), false, meta, ret, done); + + for (j = 1; j < XHCI_MAX_ENDPOINTS; j++) { + ret = pci_xhci_snapshot_ep(sc, dev, j, meta); + if (ret != 0) + goto done; + } + + SNAPSHOT_VAR_OR_LEAVE(dev->dev_slotstate, meta, ret, done); + + /* devices[i]->dev_sc */ + dev->dev_ue->ue_snapshot(dev->dev_sc, meta); + + /* devices[i]->hci */ + SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_address, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_port, meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(sc->ndevices, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->usb2_port_start, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->usb3_port_start, meta, ret, done); + +done: + return (ret); +} +#endif struct pci_devemu pci_de_xhci = { .pe_emu = "xhci", .pe_init = pci_xhci_init, .pe_barwrite = pci_xhci_write, - .pe_barread = pci_xhci_read + .pe_barread = pci_xhci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_xhci_snapshot, +#endif }; PCI_EMUL_SET(pci_de_xhci); diff --git a/usr.sbin/bhyve/ps2kbd.c b/usr.sbin/bhyve/ps2kbd.c index 3e6a1b67ca38..ef20fa47e0a9 100644 --- a/usr.sbin/bhyve/ps2kbd.c +++ b/usr.sbin/bhyve/ps2kbd.c @@ -1,384 +1,401 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include +#include + #include #include #include #include +#include #include #include #include #include "atkbdc.h" #include "debug.h" #include "console.h" /* keyboard device commands */ #define PS2KC_RESET_DEV 0xff #define PS2KC_DISABLE 0xf5 #define PS2KC_ENABLE 0xf4 #define PS2KC_SET_TYPEMATIC 0xf3 #define PS2KC_SEND_DEV_ID 0xf2 #define PS2KC_SET_SCANCODE_SET 0xf0 #define PS2KC_ECHO 0xee #define PS2KC_SET_LEDS 0xed #define PS2KC_BAT_SUCCESS 0xaa #define PS2KC_ACK 0xfa #define PS2KBD_FIFOSZ 16 struct fifo { uint8_t buf[PS2KBD_FIFOSZ]; int rindex; /* index to read from */ int windex; /* index to write to */ int num; /* number of bytes in the fifo */ int size; /* size of the fifo */ }; struct ps2kbd_softc { struct atkbdc_softc *atkbdc_sc; pthread_mutex_t mtx; bool enabled; struct fifo fifo; uint8_t curcmd; /* current command for next byte */ }; #define SCANCODE_E0_PREFIX 1 struct extended_translation { uint32_t keysym; uint8_t scancode; int flags; }; /* * FIXME: Pause/break and Print Screen/SysRq require special handling. */ static const struct extended_translation extended_translations[] = { {0xff08, 0x66}, /* Back space */ {0xff09, 0x0d}, /* Tab */ {0xff0d, 0x5a}, /* Return */ {0xff1b, 0x76}, /* Escape */ {0xff50, 0x6c, SCANCODE_E0_PREFIX}, /* Home */ {0xff51, 0x6b, SCANCODE_E0_PREFIX}, /* Left arrow */ {0xff52, 0x75, SCANCODE_E0_PREFIX}, /* Up arrow */ {0xff53, 0x74, SCANCODE_E0_PREFIX}, /* Right arrow */ {0xff54, 0x72, SCANCODE_E0_PREFIX}, /* Down arrow */ {0xff55, 0x7d, SCANCODE_E0_PREFIX}, /* PgUp */ {0xff56, 0x7a, SCANCODE_E0_PREFIX}, /* PgDown */ {0xff57, 0x69, SCANCODE_E0_PREFIX}, /* End */ {0xff63, 0x70, SCANCODE_E0_PREFIX}, /* Ins */ {0xff8d, 0x5a, SCANCODE_E0_PREFIX}, /* Keypad Enter */ {0xffe1, 0x12}, /* Left shift */ {0xffe2, 0x59}, /* Right shift */ {0xffe3, 0x14}, /* Left control */ {0xffe4, 0x14, SCANCODE_E0_PREFIX}, /* Right control */ /* {0xffe7, XXX}, Left meta */ /* {0xffe8, XXX}, Right meta */ {0xffe9, 0x11}, /* Left alt */ {0xfe03, 0x11, SCANCODE_E0_PREFIX}, /* AltGr */ {0xffea, 0x11, SCANCODE_E0_PREFIX}, /* Right alt */ {0xffeb, 0x1f, SCANCODE_E0_PREFIX}, /* Left Windows */ {0xffec, 0x27, SCANCODE_E0_PREFIX}, /* Right Windows */ {0xffbe, 0x05}, /* F1 */ {0xffbf, 0x06}, /* F2 */ {0xffc0, 0x04}, /* F3 */ {0xffc1, 0x0c}, /* F4 */ {0xffc2, 0x03}, /* F5 */ {0xffc3, 0x0b}, /* F6 */ {0xffc4, 0x83}, /* F7 */ {0xffc5, 0x0a}, /* F8 */ {0xffc6, 0x01}, /* F9 */ {0xffc7, 0x09}, /* F10 */ {0xffc8, 0x78}, /* F11 */ {0xffc9, 0x07}, /* F12 */ {0xffff, 0x71, SCANCODE_E0_PREFIX}, /* Del */ {0xff14, 0x7e}, /* ScrollLock */ /* NumLock and Keypads*/ {0xff7f, 0x77}, /* NumLock */ {0xffaf, 0x4a, SCANCODE_E0_PREFIX}, /* Keypad slash */ {0xffaa, 0x7c}, /* Keypad asterisk */ {0xffad, 0x7b}, /* Keypad minus */ {0xffab, 0x79}, /* Keypad plus */ {0xffb7, 0x6c}, /* Keypad 7 */ {0xff95, 0x6c}, /* Keypad home */ {0xffb8, 0x75}, /* Keypad 8 */ {0xff97, 0x75}, /* Keypad up arrow */ {0xffb9, 0x7d}, /* Keypad 9 */ {0xff9a, 0x7d}, /* Keypad PgUp */ {0xffb4, 0x6b}, /* Keypad 4 */ {0xff96, 0x6b}, /* Keypad left arrow */ {0xffb5, 0x73}, /* Keypad 5 */ {0xff9d, 0x73}, /* Keypad empty */ {0xffb6, 0x74}, /* Keypad 6 */ {0xff98, 0x74}, /* Keypad right arrow */ {0xffb1, 0x69}, /* Keypad 1 */ {0xff9c, 0x69}, /* Keypad end */ {0xffb2, 0x72}, /* Keypad 2 */ {0xff99, 0x72}, /* Keypad down arrow */ {0xffb3, 0x7a}, /* Keypad 3 */ {0xff9b, 0x7a}, /* Keypad PgDown */ {0xffb0, 0x70}, /* Keypad 0 */ {0xff9e, 0x70}, /* Keypad ins */ {0xffae, 0x71}, /* Keypad . 
*/ {0xff9f, 0x71}, /* Keypad del */ {0, 0, 0} /* Terminator */ }; /* ASCII to type 2 scancode lookup table */ static const uint8_t ascii_translations[128] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, }; static void fifo_init(struct ps2kbd_softc *sc) { struct fifo *fifo; fifo = &sc->fifo; fifo->size = sizeof(((struct fifo *)0)->buf); } static void fifo_reset(struct ps2kbd_softc *sc) { struct fifo *fifo; fifo = &sc->fifo; bzero(fifo, sizeof(struct fifo)); fifo->size = sizeof(((struct fifo *)0)->buf); } static void fifo_put(struct ps2kbd_softc *sc, uint8_t val) { struct fifo *fifo; fifo = &sc->fifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = val; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; } } static int fifo_get(struct ps2kbd_softc *sc, uint8_t *val) { struct fifo *fifo; fifo = &sc->fifo; if (fifo->num > 0) { *val = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; return (0); } return (-1); } int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val) { int retval; pthread_mutex_lock(&sc->mtx); retval = fifo_get(sc, val); pthread_mutex_unlock(&sc->mtx); return (retval); } void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val) { pthread_mutex_lock(&sc->mtx); if (sc->curcmd) { switch (sc->curcmd) { case PS2KC_SET_TYPEMATIC: fifo_put(sc, PS2KC_ACK); break; case PS2KC_SET_SCANCODE_SET: fifo_put(sc, PS2KC_ACK); break; case PS2KC_SET_LEDS: fifo_put(sc, PS2KC_ACK); break; default: EPRINTLN("Unhandled ps2 keyboard current " "command byte 0x%02x", val); break; } sc->curcmd = 0; } else { switch (val) { case 0x00: fifo_put(sc, PS2KC_ACK); break; case PS2KC_RESET_DEV: fifo_reset(sc); fifo_put(sc, PS2KC_ACK); fifo_put(sc, PS2KC_BAT_SUCCESS); break; case PS2KC_DISABLE: sc->enabled = false; fifo_put(sc, PS2KC_ACK); break; case PS2KC_ENABLE: sc->enabled = true; fifo_reset(sc); fifo_put(sc, PS2KC_ACK); break; case PS2KC_SET_TYPEMATIC: sc->curcmd = val; fifo_put(sc, PS2KC_ACK); break; case PS2KC_SEND_DEV_ID: fifo_put(sc, PS2KC_ACK); fifo_put(sc, 0xab); fifo_put(sc, 0x83); break; case PS2KC_SET_SCANCODE_SET: sc->curcmd = val; fifo_put(sc, PS2KC_ACK); break; case PS2KC_ECHO: fifo_put(sc, PS2KC_ECHO); break; case PS2KC_SET_LEDS: sc->curcmd = val; fifo_put(sc, PS2KC_ACK); break; default: EPRINTLN("Unhandled ps2 keyboard command " "0x%02x", val); break; } } pthread_mutex_unlock(&sc->mtx); } /* * Translate keysym to type 2 scancode and insert into keyboard buffer. 
*/ static void ps2kbd_keysym_queue(struct ps2kbd_softc *sc, int down, uint32_t keysym) { assert(pthread_mutex_isowned_np(&sc->mtx)); int e0_prefix, found; uint8_t code; const struct extended_translation *trans; found = 0; if (keysym < 0x80) { code = ascii_translations[keysym]; e0_prefix = 0; found = 1; } else { for (trans = &(extended_translations[0]); trans->keysym != 0; trans++) { if (keysym == trans->keysym) { code = trans->scancode; e0_prefix = trans->flags & SCANCODE_E0_PREFIX; found = 1; break; } } } if (!found) { EPRINTLN("Unhandled ps2 keyboard keysym 0x%x", keysym); return; } if (e0_prefix) fifo_put(sc, 0xe0); if (!down) fifo_put(sc, 0xf0); fifo_put(sc, code); } static void ps2kbd_event(int down, uint32_t keysym, void *arg) { struct ps2kbd_softc *sc = arg; int fifo_full; pthread_mutex_lock(&sc->mtx); if (!sc->enabled) { pthread_mutex_unlock(&sc->mtx); return; } fifo_full = sc->fifo.num == PS2KBD_FIFOSZ; ps2kbd_keysym_queue(sc, down, keysym); pthread_mutex_unlock(&sc->mtx); if (!fifo_full) atkbdc_event(sc->atkbdc_sc, 1); } struct ps2kbd_softc * ps2kbd_init(struct atkbdc_softc *atkbdc_sc) { struct ps2kbd_softc *sc; sc = calloc(1, sizeof (struct ps2kbd_softc)); pthread_mutex_init(&sc->mtx, NULL); fifo_init(sc); sc->atkbdc_sc = atkbdc_sc; console_kbd_register(ps2kbd_event, sc, 1); return (sc); } +#ifdef BHYVE_SNAPSHOT +int +ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done); + +done: + return (ret); +} +#endif + diff --git a/usr.sbin/bhyve/ps2kbd.h b/usr.sbin/bhyve/ps2kbd.h index 17be6d046673..3cf87be1b7f3 100644 --- a/usr.sbin/bhyve/ps2kbd.h +++ b/usr.sbin/bhyve/ps2kbd.h @@ -1,41 +1,46 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _PS2KBD_H_ #define _PS2KBD_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc); int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val); void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val); +#ifdef BHYVE_SNAPSHOT +int ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta); +#endif + #endif /* _PS2KBD_H_ */ diff --git a/usr.sbin/bhyve/ps2mouse.c b/usr.sbin/bhyve/ps2mouse.c index f42d2e726023..afe817710f30 100644 --- a/usr.sbin/bhyve/ps2mouse.c +++ b/usr.sbin/bhyve/ps2mouse.c @@ -1,419 +1,441 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include +#include + #include #include #include #include +#include #include #include #include #include "atkbdc.h" #include "debug.h" #include "console.h" /* mouse device commands */ #define PS2MC_RESET_DEV 0xff #define PS2MC_SET_DEFAULTS 0xf6 #define PS2MC_DISABLE 0xf5 #define PS2MC_ENABLE 0xf4 #define PS2MC_SET_SAMPLING_RATE 0xf3 #define PS2MC_SEND_DEV_ID 0xf2 #define PS2MC_SET_REMOTE_MODE 0xf0 #define PS2MC_SEND_DEV_DATA 0xeb #define PS2MC_SET_STREAM_MODE 0xea #define PS2MC_SEND_DEV_STATUS 0xe9 #define PS2MC_SET_RESOLUTION 0xe8 #define PS2MC_SET_SCALING1 0xe7 #define PS2MC_SET_SCALING2 0xe6 #define PS2MC_BAT_SUCCESS 0xaa #define PS2MC_ACK 0xfa /* mouse device id */ #define PS2MOUSE_DEV_ID 0x0 /* mouse data bits */ #define PS2M_DATA_Y_OFLOW 0x80 #define PS2M_DATA_X_OFLOW 0x40 #define PS2M_DATA_Y_SIGN 0x20 #define PS2M_DATA_X_SIGN 0x10 #define PS2M_DATA_AONE 0x08 #define PS2M_DATA_MID_BUTTON 0x04 #define PS2M_DATA_RIGHT_BUTTON 0x02 #define PS2M_DATA_LEFT_BUTTON 0x01 /* mouse status bits */ #define PS2M_STS_REMOTE_MODE 0x40 #define PS2M_STS_ENABLE_DEV 0x20 #define PS2M_STS_SCALING_21 0x10 #define PS2M_STS_MID_BUTTON 0x04 #define PS2M_STS_RIGHT_BUTTON 0x02 #define PS2M_STS_LEFT_BUTTON 0x01 #define PS2MOUSE_FIFOSZ 16 struct fifo { uint8_t buf[PS2MOUSE_FIFOSZ]; int rindex; /* index to read from */ int windex; /* index to write to */ int num; /* number of bytes in the fifo */ int size; /* size of the fifo */ }; struct ps2mouse_softc { struct atkbdc_softc *atkbdc_sc; pthread_mutex_t mtx; uint8_t status; uint8_t resolution; uint8_t sampling_rate; int ctrlenable; struct fifo fifo; uint8_t curcmd; /* current command for next byte */ int cur_x, cur_y; int delta_x, delta_y; }; static void fifo_init(struct ps2mouse_softc *sc) { struct fifo *fifo; fifo = &sc->fifo; fifo->size = sizeof(((struct fifo *)0)->buf); } static void fifo_reset(struct ps2mouse_softc *sc) { struct fifo *fifo; fifo = &sc->fifo; bzero(fifo, sizeof(struct fifo)); fifo->size = sizeof(((struct fifo *)0)->buf); } static void fifo_put(struct ps2mouse_softc *sc, uint8_t val) { struct fifo *fifo; fifo = &sc->fifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = val; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; } } static int fifo_get(struct ps2mouse_softc *sc, uint8_t *val) { struct fifo *fifo; fifo = &sc->fifo; if (fifo->num > 0) { *val = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; return (0); } return (-1); } static void movement_reset(struct ps2mouse_softc *sc) { assert(pthread_mutex_isowned_np(&sc->mtx)); sc->delta_x = 0; sc->delta_y = 0; } static void movement_update(struct ps2mouse_softc *sc, int x, int y) { sc->delta_x += x - sc->cur_x; sc->delta_y += sc->cur_y - y; sc->cur_x = x; sc->cur_y = y; } static void movement_get(struct ps2mouse_softc *sc) { uint8_t val0, val1, val2; assert(pthread_mutex_isowned_np(&sc->mtx)); val0 = PS2M_DATA_AONE; val0 |= sc->status & (PS2M_DATA_LEFT_BUTTON | PS2M_DATA_RIGHT_BUTTON | PS2M_DATA_MID_BUTTON); if (sc->delta_x >= 0) { if (sc->delta_x > 255) { val0 |= PS2M_DATA_X_OFLOW; val1 = 255; } else val1 = sc->delta_x; } else { val0 |= PS2M_DATA_X_SIGN; if (sc->delta_x < -255) { val0 |= PS2M_DATA_X_OFLOW; val1 = 255; } else val1 = sc->delta_x; } sc->delta_x = 0; if (sc->delta_y >= 0) { if (sc->delta_y > 255) { val0 |= PS2M_DATA_Y_OFLOW; val2 = 255; } else val2 = sc->delta_y; } else { val0 |= PS2M_DATA_Y_SIGN; if (sc->delta_y < -255) { val0 |= PS2M_DATA_Y_OFLOW; val2 = 255; } else val2 = sc->delta_y; } 
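/*
 * Illustrative note: the three bytes queued just below follow the standard
 * PS/2 stream-mode movement packet layout,
 *
 *	val0: Y-ovfl X-ovfl Y-sign X-sign 1 Middle Right Left
 *	val1: X movement, low 8 bits (sign/overflow carried in val0)
 *	val2: Y movement, low 8 bits (sign/overflow carried in val0)
 *
 * which is why the deltas computed above are clamped to the [-255, 255] range.
 */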
sc->delta_y = 0; if (sc->fifo.num < (sc->fifo.size - 3)) { fifo_put(sc, val0); fifo_put(sc, val1); fifo_put(sc, val2); } } static void ps2mouse_reset(struct ps2mouse_softc *sc) { assert(pthread_mutex_isowned_np(&sc->mtx)); fifo_reset(sc); movement_reset(sc); sc->status = PS2M_STS_ENABLE_DEV; sc->resolution = 4; sc->sampling_rate = 100; sc->cur_x = 0; sc->cur_y = 0; sc->delta_x = 0; sc->delta_y = 0; } int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val) { int retval; pthread_mutex_lock(&sc->mtx); retval = fifo_get(sc, val); pthread_mutex_unlock(&sc->mtx); return (retval); } int ps2mouse_fifocnt(struct ps2mouse_softc *sc) { return (sc->fifo.num); } void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable) { pthread_mutex_lock(&sc->mtx); if (enable) sc->ctrlenable = 1; else { sc->ctrlenable = 0; sc->fifo.rindex = 0; sc->fifo.windex = 0; sc->fifo.num = 0; } pthread_mutex_unlock(&sc->mtx); } void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert) { pthread_mutex_lock(&sc->mtx); fifo_reset(sc); if (sc->curcmd) { switch (sc->curcmd) { case PS2MC_SET_SAMPLING_RATE: sc->sampling_rate = val; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SET_RESOLUTION: sc->resolution = val; fifo_put(sc, PS2MC_ACK); break; default: EPRINTLN("Unhandled ps2 mouse current " "command byte 0x%02x", val); break; } sc->curcmd = 0; } else if (insert) { fifo_put(sc, val); } else { switch (val) { case 0x00: fifo_put(sc, PS2MC_ACK); break; case PS2MC_RESET_DEV: ps2mouse_reset(sc); fifo_put(sc, PS2MC_ACK); fifo_put(sc, PS2MC_BAT_SUCCESS); fifo_put(sc, PS2MOUSE_DEV_ID); break; case PS2MC_SET_DEFAULTS: ps2mouse_reset(sc); fifo_put(sc, PS2MC_ACK); break; case PS2MC_DISABLE: fifo_reset(sc); sc->status &= ~PS2M_STS_ENABLE_DEV; fifo_put(sc, PS2MC_ACK); break; case PS2MC_ENABLE: fifo_reset(sc); sc->status |= PS2M_STS_ENABLE_DEV; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SET_SAMPLING_RATE: sc->curcmd = val; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SEND_DEV_ID: fifo_put(sc, PS2MC_ACK); fifo_put(sc, PS2MOUSE_DEV_ID); break; case PS2MC_SET_REMOTE_MODE: sc->status |= PS2M_STS_REMOTE_MODE; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SEND_DEV_DATA: fifo_put(sc, PS2MC_ACK); movement_get(sc); break; case PS2MC_SET_STREAM_MODE: sc->status &= ~PS2M_STS_REMOTE_MODE; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SEND_DEV_STATUS: fifo_put(sc, PS2MC_ACK); fifo_put(sc, sc->status); fifo_put(sc, sc->resolution); fifo_put(sc, sc->sampling_rate); break; case PS2MC_SET_RESOLUTION: sc->curcmd = val; fifo_put(sc, PS2MC_ACK); break; case PS2MC_SET_SCALING1: case PS2MC_SET_SCALING2: fifo_put(sc, PS2MC_ACK); break; default: fifo_put(sc, PS2MC_ACK); EPRINTLN("Unhandled ps2 mouse command " "0x%02x", val); break; } } pthread_mutex_unlock(&sc->mtx); } static void ps2mouse_event(uint8_t button, int x, int y, void *arg) { struct ps2mouse_softc *sc = arg; pthread_mutex_lock(&sc->mtx); movement_update(sc, x, y); sc->status &= ~(PS2M_STS_LEFT_BUTTON | PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON); if (button & (1 << 0)) sc->status |= PS2M_STS_LEFT_BUTTON; if (button & (1 << 1)) sc->status |= PS2M_STS_MID_BUTTON; if (button & (1 << 2)) sc->status |= PS2M_STS_RIGHT_BUTTON; if ((sc->status & PS2M_STS_ENABLE_DEV) == 0 || !sc->ctrlenable) { /* no data reporting */ pthread_mutex_unlock(&sc->mtx); return; } movement_get(sc); pthread_mutex_unlock(&sc->mtx); if (sc->fifo.num > 0) atkbdc_event(sc->atkbdc_sc, 0); } struct ps2mouse_softc * ps2mouse_init(struct atkbdc_softc *atkbdc_sc) { struct ps2mouse_softc *sc; sc = calloc(1, sizeof (struct ps2mouse_softc)); 
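/*
 * Illustrative note, not part of the patch: the BHYVE_SNAPSHOT handler
 * added further below serializes the mouse state with
 * SNAPSHOT_VAR_OR_LEAVE().  As used throughout this change, that macro is
 * assumed to copy the named field into meta->buffer on VM_SNAPSHOT_SAVE
 * and back out of it on VM_SNAPSHOT_RESTORE (compare vm_snapshot_buf() in
 * snapshot.c), jumping to the 'done' label if the buffer runs out.
 */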
pthread_mutex_init(&sc->mtx, NULL); fifo_init(sc); sc->atkbdc_sc = atkbdc_sc; pthread_mutex_lock(&sc->mtx); ps2mouse_reset(sc); pthread_mutex_unlock(&sc->mtx); console_ptr_register(ps2mouse_event, sc, 1); return (sc); } - +#ifdef BHYVE_SNAPSHOT +int +ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->resolution, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->sampling_rate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ctrlenable, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cur_x, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cur_y, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->delta_x, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->delta_y, meta, ret, done); + +done: + return (ret); +} +#endif diff --git a/usr.sbin/bhyve/ps2mouse.h b/usr.sbin/bhyve/ps2mouse.h index 59430b01e2b1..4ae755ef4411 100644 --- a/usr.sbin/bhyve/ps2mouse.h +++ b/usr.sbin/bhyve/ps2mouse.h @@ -1,43 +1,48 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PS2MOUSE_H_ #define _PS2MOUSE_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc); int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val); void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert); void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable); int ps2mouse_fifocnt(struct ps2mouse_softc *sc); +#ifdef BHYVE_SNAPSHOT +int ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta); +#endif + #endif /* _PS2MOUSE_H_ */ diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c new file mode 100644 index 000000000000..22bfd8d28a61 --- /dev/null +++ b/usr.sbin/bhyve/snapshot.c @@ -0,0 +1,1742 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Flavius Anton + * Copyright (c) 2016 Mihai Tiganus + * Copyright (c) 2016-2019 Mihai Carabas + * Copyright (c) 2017-2019 Darius Mihai + * Copyright (c) 2017-2019 Elena Mihailescu + * Copyright (c) 2018-2019 Sergiu Weisz + * All rights reserved. 
+ * The bhyve-snapshot feature was developed under sponsorships + * from Matthew Grooms. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include + +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#include "bhyverun.h" +#include "acpi.h" +#include "atkbdc.h" +#include "inout.h" +#include "dbgport.h" +#include "fwctl.h" +#include "ioapic.h" +#include "mem.h" +#include "mevent.h" +#include "mptbl.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "smbiostbl.h" +#include "snapshot.h" +#include "xmsr.h" +#include "spinup_ap.h" +#include "rtc.h" + +#include +#include + +struct spinner_info { + const size_t *crtval; + const size_t maxval; + const size_t total; +}; + +extern int guest_ncpus; + +static struct winsize winsize; +static sig_t old_winch_handler; + +#define KB (1024UL) +#define MB (1024UL * KB) +#define GB (1024UL * MB) + +#define SNAPSHOT_CHUNK (4 * MB) +#define PROG_BUF_SZ (8192) + +#define BHYVE_RUN_DIR "/var/run/bhyve" +#define CHECKPOINT_RUN_DIR BHYVE_RUN_DIR "/checkpoint" +#define MAX_VMNAME 100 + +#define MAX_MSG_SIZE 1024 + +#define SNAPSHOT_BUFFER_SIZE (20 * MB) + +#define JSON_STRUCT_ARR_KEY "structs" +#define JSON_DEV_ARR_KEY "devices" +#define JSON_BASIC_METADATA_KEY "basic metadata" +#define JSON_SNAPSHOT_REQ_KEY "snapshot_req" +#define JSON_SIZE_KEY "size" +#define JSON_FILE_OFFSET_KEY "file_offset" + +#define JSON_NCPUS_KEY "ncpus" +#define JSON_VMNAME_KEY "vmname" +#define JSON_MEMSIZE_KEY "memsize" +#define JSON_MEMFLAGS_KEY "memflags" + +#define min(a,b) \ +({ \ + __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + _a < _b ? 
_a : _b; \ + }) + +const struct vm_snapshot_dev_info snapshot_devs[] = { + { "atkbdc", atkbdc_snapshot, NULL, NULL }, + { "virtio-net", pci_snapshot, pci_pause, pci_resume }, + { "virtio-blk", pci_snapshot, pci_pause, pci_resume }, + { "lpc", pci_snapshot, NULL, NULL }, + { "fbuf", pci_snapshot, NULL, NULL }, + { "xhci", pci_snapshot, NULL, NULL }, + { "e1000", pci_snapshot, NULL, NULL }, + { "ahci", pci_snapshot, pci_pause, pci_resume }, + { "ahci-hd", pci_snapshot, pci_pause, pci_resume }, + { "ahci-cd", pci_snapshot, NULL, NULL }, +}; + +const struct vm_snapshot_kern_info snapshot_kern_structs[] = { + { "vhpet", STRUCT_VHPET }, + { "vm", STRUCT_VM }, + { "vmx", STRUCT_VMX }, + { "vioapic", STRUCT_VIOAPIC }, + { "vlapic", STRUCT_VLAPIC }, + { "vmcx", STRUCT_VMCX }, + { "vatpit", STRUCT_VATPIT }, + { "vatpic", STRUCT_VATPIC }, + { "vpmtmr", STRUCT_VPMTMR }, + { "vrtc", STRUCT_VRTC }, +}; + +static cpuset_t vcpus_active, vcpus_suspended; +static pthread_mutex_t vcpu_lock; +static pthread_cond_t vcpus_idle, vcpus_can_run; +static bool checkpoint_active; + +/* + * TODO: Harden this function and all of its callers since 'base_str' is a user + * provided string. + */ +static char * +strcat_extension(const char *base_str, const char *ext) +{ + char *res; + size_t base_len, ext_len; + + base_len = strnlen(base_str, MAX_VMNAME); + ext_len = strnlen(ext, MAX_VMNAME); + + if (base_len + ext_len > MAX_VMNAME) { + fprintf(stderr, "Filename exceeds maximum length.\n"); + return (NULL); + } + + res = malloc(base_len + ext_len + 1); + if (res == NULL) { + perror("Failed to allocate memory."); + return (NULL); + } + + memcpy(res, base_str, base_len); + memcpy(res + base_len, ext, ext_len); + res[base_len + ext_len] = 0; + + return (res); +} + +void +destroy_restore_state(struct restore_state *rstate) +{ + if (rstate == NULL) { + fprintf(stderr, "Attempting to destroy NULL restore struct.\n"); + return; + } + + if (rstate->kdata_map != MAP_FAILED) + munmap(rstate->kdata_map, rstate->kdata_len); + + if (rstate->kdata_fd > 0) + close(rstate->kdata_fd); + if (rstate->vmmem_fd > 0) + close(rstate->vmmem_fd); + + if (rstate->meta_root_obj != NULL) + ucl_object_unref(rstate->meta_root_obj); + if (rstate->meta_parser != NULL) + ucl_parser_free(rstate->meta_parser); +} + +static int +load_vmmem_file(const char *filename, struct restore_state *rstate) +{ + struct stat sb; + int err; + + rstate->vmmem_fd = open(filename, O_RDONLY); + if (rstate->vmmem_fd < 0) { + perror("Failed to open restore file"); + return (-1); + } + + err = fstat(rstate->vmmem_fd, &sb); + if (err < 0) { + perror("Failed to stat restore file"); + goto err_load_vmmem; + } + + if (sb.st_size == 0) { + fprintf(stderr, "Restore file is empty.\n"); + goto err_load_vmmem; + } + + rstate->vmmem_len = sb.st_size; + + return (0); + +err_load_vmmem: + if (rstate->vmmem_fd > 0) + close(rstate->vmmem_fd); + return (-1); +} + +static int +load_kdata_file(const char *filename, struct restore_state *rstate) +{ + struct stat sb; + int err; + + rstate->kdata_fd = open(filename, O_RDONLY); + if (rstate->kdata_fd < 0) { + perror("Failed to open kernel data file"); + return (-1); + } + + err = fstat(rstate->kdata_fd, &sb); + if (err < 0) { + perror("Failed to stat kernel data file"); + goto err_load_kdata; + } + + if (sb.st_size == 0) { + fprintf(stderr, "Kernel data file is empty.\n"); + goto err_load_kdata; + } + + rstate->kdata_len = sb.st_size; + rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ, + MAP_SHARED, rstate->kdata_fd, 0); + if 
(rstate->kdata_map == MAP_FAILED) { + perror("Failed to map restore file"); + goto err_load_kdata; + } + + return (0); + +err_load_kdata: + if (rstate->kdata_fd > 0) + close(rstate->kdata_fd); + return (-1); +} + +static int +load_metadata_file(const char *filename, struct restore_state *rstate) +{ + const ucl_object_t *obj; + struct ucl_parser *parser; + int err; + + parser = ucl_parser_new(UCL_PARSER_DEFAULT); + if (parser == NULL) { + fprintf(stderr, "Failed to initialize UCL parser.\n"); + goto err_load_metadata; + } + + err = ucl_parser_add_file(parser, filename); + if (err == 0) { + fprintf(stderr, "Failed to parse metadata file: '%s'\n", + filename); + err = -1; + goto err_load_metadata; + } + + obj = ucl_parser_get_object(parser); + if (obj == NULL) { + fprintf(stderr, "Failed to parse object.\n"); + err = -1; + goto err_load_metadata; + } + + rstate->meta_parser = parser; + rstate->meta_root_obj = (ucl_object_t *)obj; + + return (0); + +err_load_metadata: + if (parser != NULL) + ucl_parser_free(parser); + return (err); +} + +int +load_restore_file(const char *filename, struct restore_state *rstate) +{ + int err = 0; + char *kdata_filename = NULL, *meta_filename = NULL; + + assert(filename != NULL); + assert(rstate != NULL); + + memset(rstate, 0, sizeof(*rstate)); + rstate->kdata_map = MAP_FAILED; + + err = load_vmmem_file(filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest RAM file.\n"); + goto err_restore; + } + + kdata_filename = strcat_extension(filename, ".kern"); + if (kdata_filename == NULL) { + fprintf(stderr, "Failed to construct kernel data filename.\n"); + goto err_restore; + } + + err = load_kdata_file(kdata_filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest kernel data file.\n"); + goto err_restore; + } + + meta_filename = strcat_extension(filename, ".meta"); + if (meta_filename == NULL) { + fprintf(stderr, "Failed to construct kernel metadata filename.\n"); + goto err_restore; + } + + err = load_metadata_file(meta_filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest metadata file.\n"); + goto err_restore; + } + + return (0); + +err_restore: + destroy_restore_state(rstate); + if (kdata_filename != NULL) + free(kdata_filename); + if (meta_filename != NULL) + free(meta_filename); + return (-1); +} + +#define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \ +do { \ + const ucl_object_t *obj__; \ + obj__ = ucl_object_lookup(obj, key); \ + if (obj__ == NULL) { \ + fprintf(stderr, "Missing key: '%s'", key); \ + return (ret); \ + } \ + if (!ucl_object_toint_safe(obj__, result_ptr)) { \ + fprintf(stderr, "Cannot convert '%s' value to int.", key); \ + return (ret); \ + } \ +} while(0) + +#define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \ +do { \ + const ucl_object_t *obj__; \ + obj__ = ucl_object_lookup(obj, key); \ + if (obj__ == NULL) { \ + fprintf(stderr, "Missing key: '%s'", key); \ + return (ret); \ + } \ + if (!ucl_object_tostring_safe(obj__, result_ptr)) { \ + fprintf(stderr, "Cannot convert '%s' value to string.", key); \ + return (ret); \ + } \ +} while(0) + +static void * +lookup_struct(enum snapshot_req struct_id, struct restore_state *rstate, + size_t *struct_size) +{ + const ucl_object_t *structs = NULL, *obj = NULL; + ucl_object_iter_t it = NULL; + int64_t snapshot_req, size, file_offset; + + structs = ucl_object_lookup(rstate->meta_root_obj, JSON_STRUCT_ARR_KEY); + if (structs == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_STRUCT_ARR_KEY); + return 
(NULL); + } + + if (ucl_object_type((ucl_object_t *)structs) != UCL_ARRAY) { + fprintf(stderr, "Object '%s' is not an array.\n", + JSON_STRUCT_ARR_KEY); + return (NULL); + } + + while ((obj = ucl_object_iterate(structs, &it, true)) != NULL) { + snapshot_req = -1; + JSON_GET_INT_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, + &snapshot_req, NULL); + assert(snapshot_req >= 0); + if ((enum snapshot_req) snapshot_req == struct_id) { + JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, + &size, NULL); + assert(size >= 0); + + JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, + &file_offset, NULL); + assert(file_offset >= 0); + assert(file_offset + size <= rstate->kdata_len); + + *struct_size = (size_t)size; + return (rstate->kdata_map + file_offset); + } + } + + return (NULL); +} + +static void * +lookup_check_dev(const char *dev_name, struct restore_state *rstate, + const ucl_object_t *obj, size_t *data_size) +{ + const char *snapshot_req; + int64_t size, file_offset; + + snapshot_req = NULL; + JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, + &snapshot_req, NULL); + assert(snapshot_req != NULL); + if (!strcmp(snapshot_req, dev_name)) { + JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, + &size, NULL); + assert(size >= 0); + + JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, + &file_offset, NULL); + assert(file_offset >= 0); + assert(file_offset + size <= rstate->kdata_len); + + *data_size = (size_t)size; + return (rstate->kdata_map + file_offset); + } + + return (NULL); +} + +static void* +lookup_dev(const char *dev_name, struct restore_state *rstate, + size_t *data_size) +{ + const ucl_object_t *devs = NULL, *obj = NULL; + ucl_object_iter_t it = NULL; + void *ret; + + devs = ucl_object_lookup(rstate->meta_root_obj, JSON_DEV_ARR_KEY); + if (devs == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_DEV_ARR_KEY); + return (NULL); + } + + if (ucl_object_type((ucl_object_t *)devs) != UCL_ARRAY) { + fprintf(stderr, "Object '%s' is not an array.\n", + JSON_DEV_ARR_KEY); + return (NULL); + } + + while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) { + ret = lookup_check_dev(dev_name, rstate, obj, data_size); + if (ret != NULL) + return (ret); + } + + return (NULL); +} + +static const ucl_object_t * +lookup_basic_metadata_object(struct restore_state *rstate) +{ + const ucl_object_t *basic_meta_obj = NULL; + + basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj, + JSON_BASIC_METADATA_KEY); + if (basic_meta_obj == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_BASIC_METADATA_KEY); + return (NULL); + } + + if (ucl_object_type((ucl_object_t *)basic_meta_obj) != UCL_OBJECT) { + fprintf(stderr, "Object '%s' is not a JSON object.\n", + JSON_BASIC_METADATA_KEY); + return (NULL); + } + + return (basic_meta_obj); +} + +const char * +lookup_vmname(struct restore_state *rstate) +{ + const char *vmname; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (NULL); + + JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL); + return (vmname); +} + +int +lookup_memflags(struct restore_state *rstate) +{ + int64_t memflags; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0); + + return ((int)memflags); +} + +size_t +lookup_memsize(struct restore_state *rstate) +{ + int64_t memsize; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + 
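/*
 * Illustrative sketch, not part of the patch: the ".meta" file parsed by
 * these lookup_* helpers is UCL/JSON.  Based only on the JSON_*_KEY macros
 * above, it is expected to look roughly like the following (numeric values
 * made up for illustration):
 *
 *	{
 *	  "basic metadata": { "ncpus": 2, "vmname": "vm0",
 *	                      "memsize": 1073741824, "memflags": 0 },
 *	  "structs": [ { "snapshot_req": 1, "size": 424, "file_offset": 0 } ],
 *	  "devices": [ { "snapshot_req": "atkbdc", "size": 80,
 *	                 "file_offset": 424 } ]
 *	}
 *
 * lookup_struct() matches "snapshot_req" as an integer request id, while
 * lookup_check_dev() matches it as a device-name string.
 */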
JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0); + if (memsize < 0) + memsize = 0; + + return ((size_t)memsize); +} + + +int +lookup_guest_ncpus(struct restore_state *rstate) +{ + int64_t ncpus; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0); + return ((int)ncpus); +} + +static void +winch_handler(int signal) +{ +#ifdef TIOCGWINSZ + ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); +#endif /* TIOCGWINSZ */ +} + +static int +print_progress(size_t crtval, const size_t maxval) +{ + size_t rc; + double crtval_gb, maxval_gb; + size_t i, win_width, prog_start, prog_done, prog_end; + int mval_len; + + static char prog_buf[PROG_BUF_SZ]; + static const size_t len = sizeof(prog_buf); + + static size_t div; + static char *div_str; + + static char wip_bar[] = { '/', '-', '\\', '|' }; + static int wip_idx = 0; + + if (maxval == 0) { + printf("[0B / 0B]\r\n"); + return (0); + } + + if (crtval > maxval) + crtval = maxval; + + if (maxval > 10 * GB) { + div = GB; + div_str = "GiB"; + } else if (maxval > 10 * MB) { + div = MB; + div_str = "MiB"; + } else { + div = KB; + div_str = "KiB"; + } + + crtval_gb = (double) crtval / div; + maxval_gb = (double) maxval / div; + + rc = snprintf(prog_buf, len, "%.03lf", maxval_gb); + if (rc == len) { + fprintf(stderr, "Maxval too big\n"); + return (-1); + } + mval_len = rc; + + rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |", + mval_len, crtval_gb, div_str, maxval_gb, div_str); + + if (rc == len) { + fprintf(stderr, "Buffer too small to print progress\n"); + return (-1); + } + + win_width = min(winsize.ws_col, len); + prog_start = rc; + + if (prog_start < (win_width - 2)) { + prog_end = win_width - prog_start - 2; + prog_done = prog_end * (crtval_gb / maxval_gb); + + for (i = prog_start; i < prog_start + prog_done; i++) + prog_buf[i] = '#'; + + if (crtval != maxval) { + prog_buf[i] = wip_bar[wip_idx]; + wip_idx = (wip_idx + 1) % sizeof(wip_bar); + i++; + } else { + prog_buf[i++] = '#'; + } + + for (; i < win_width - 2; i++) + prog_buf[i] = '_'; + + prog_buf[win_width - 2] = '|'; + } + + prog_buf[win_width - 1] = '\0'; + write(STDOUT_FILENO, prog_buf, win_width); + + return (0); +} + +static void * +snapshot_spinner_cb(void *arg) +{ + int rc; + size_t crtval, maxval, total; + struct spinner_info *si; + struct timespec ts; + + si = arg; + if (si == NULL) + pthread_exit(NULL); + + ts.tv_sec = 0; + ts.tv_nsec = 50 * 1000 * 1000; /* 50 ms sleep time */ + + do { + crtval = *si->crtval; + maxval = si->maxval; + total = si->total; + + rc = print_progress(crtval, total); + if (rc < 0) { + fprintf(stderr, "Failed to parse progress\n"); + break; + } + + nanosleep(&ts, NULL); + } while (crtval < maxval); + + pthread_exit(NULL); + return NULL; +} + +static int +vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src, + const size_t len, const size_t totalmem, const bool op_wr) +{ + int rc; + size_t part_done, todo, rem; + ssize_t done; + bool show_progress; + pthread_t spinner_th; + struct spinner_info *si; + + if (lseek(snapfd, foff, SEEK_SET) < 0) { + perror("Failed to change file offset"); + return (-1); + } + + show_progress = false; + if (isatty(STDIN_FILENO) && (winsize.ws_col != 0)) + show_progress = true; + + part_done = foff; + rem = len; + + if (show_progress) { + si = &(struct spinner_info) { + .crtval = &part_done, + .maxval = foff + len, + .total = totalmem + }; + + rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, 
si); + if (rc) { + perror("Unable to create spinner thread"); + show_progress = false; + } + } + + while (rem > 0) { + if (show_progress) + todo = min(SNAPSHOT_CHUNK, rem); + else + todo = rem; + + if (op_wr) + done = write(snapfd, src, todo); + else + done = read(snapfd, src, todo); + if (done < 0) { + perror("Failed to write in file"); + return (-1); + } + + src += done; + part_done += done; + rem -= done; + } + + if (show_progress) { + rc = pthread_join(spinner_th, NULL); + if (rc) + perror("Unable to end spinner thread"); + } + + return (0); +} + +static size_t +vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr) +{ + int ret; + size_t lowmem, highmem, totalmem; + char *baseaddr; + + ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem); + if (ret) { + fprintf(stderr, "%s: unable to retrieve guest memory size\r\n", + __func__); + return (0); + } + totalmem = lowmem + highmem; + + if ((op_wr == false) && (totalmem != memsz)) { + fprintf(stderr, "%s: mem size mismatch: %ld vs %ld\r\n", + __func__, totalmem, memsz); + return (0); + } + + winsize.ws_col = 80; +#ifdef TIOCGWINSZ + ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); +#endif /* TIOCGWINSZ */ + old_winch_handler = signal(SIGWINCH, winch_handler); + + ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem, + totalmem, op_wr); + if (ret) { + fprintf(stderr, "%s: Could not %s lowmem\r\n", + __func__, op_wr ? "write" : "read"); + totalmem = 0; + goto done; + } + + if (highmem == 0) + goto done; + + ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4*GB, + highmem, totalmem, op_wr); + if (ret) { + fprintf(stderr, "%s: Could not %s highmem\r\n", + __func__, op_wr ? "write" : "read"); + totalmem = 0; + goto done; + } + +done: + printf("\r\n"); + signal(SIGWINCH, old_winch_handler); + + return (totalmem); +} + +int +restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate) +{ + size_t restored; + + restored = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len, + false); + + if (restored != rstate->vmmem_len) + return (-1); + + return (0); +} + +static int +vm_restore_kern_struct(struct vmctx *ctx, struct restore_state *rstate, + const struct vm_snapshot_kern_info *info) +{ + void *struct_ptr; + size_t struct_size; + int ret; + struct vm_snapshot_meta *meta; + + struct_ptr = lookup_struct(info->req, rstate, &struct_size); + if (struct_ptr == NULL) { + fprintf(stderr, "%s: Failed to lookup struct %s\r\n", + __func__, info->struct_name); + ret = -1; + goto done; + } + + if (struct_size == 0) { + fprintf(stderr, "%s: Kernel struct size was 0 for: %s\r\n", + __func__, info->struct_name); + ret = -1; + goto done; + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + .dev_name = info->struct_name, + .dev_req = info->req, + + .buffer.buf_start = struct_ptr, + .buffer.buf_size = struct_size, + + .buffer.buf = struct_ptr, + .buffer.buf_rem = struct_size, + + .op = VM_SNAPSHOT_RESTORE, + }; + + ret = vm_snapshot_req(meta); + if (ret != 0) { + fprintf(stderr, "%s: Failed to restore struct: %s\r\n", + __func__, info->struct_name); + goto done; + } + +done: + return (ret); +} + +int +vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate) +{ + int ret; + int i; + + for (i = 0; i < nitems(snapshot_kern_structs); i++) { + ret = vm_restore_kern_struct(ctx, rstate, + &snapshot_kern_structs[i]); + if (ret != 0) + return (ret); + } + + return (0); +} + +int +vm_restore_user_dev(struct vmctx *ctx, struct restore_state *rstate, + const struct vm_snapshot_dev_info *info) +{ + void 
*dev_ptr; + size_t dev_size; + int ret; + struct vm_snapshot_meta *meta; + + dev_ptr = lookup_dev(info->dev_name, rstate, &dev_size); + if (dev_ptr == NULL) { + fprintf(stderr, "Failed to lookup dev: %s\r\n", info->dev_name); + fprintf(stderr, "Continuing the restore/migration process\r\n"); + return (0); + } + + if (dev_size == 0) { + fprintf(stderr, "%s: Device size is 0. " + "Assuming %s is not used\r\n", + __func__, info->dev_name); + return (0); + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + .dev_name = info->dev_name, + + .buffer.buf_start = dev_ptr, + .buffer.buf_size = dev_size, + + .buffer.buf = dev_ptr, + .buffer.buf_rem = dev_size, + + .op = VM_SNAPSHOT_RESTORE, + }; + + ret = (*info->snapshot_cb)(meta); + if (ret != 0) { + fprintf(stderr, "Failed to restore dev: %s\r\n", + info->dev_name); + return (-1); + } + + return (0); +} + + +int +vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate) +{ + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + ret = vm_restore_user_dev(ctx, rstate, &snapshot_devs[i]); + if (ret != 0) + return (ret); + } + + return 0; +} + +int +vm_pause_user_devs(struct vmctx *ctx) +{ + const struct vm_snapshot_dev_info *info; + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + info = &snapshot_devs[i]; + if (info->pause_cb == NULL) + continue; + + ret = info->pause_cb(ctx, info->dev_name); + if (ret != 0) + return (ret); + } + + return (0); +} + +int +vm_resume_user_devs(struct vmctx *ctx) +{ + const struct vm_snapshot_dev_info *info; + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + info = &snapshot_devs[i]; + if (info->resume_cb == NULL) + continue; + + ret = info->resume_cb(ctx, info->dev_name); + if (ret != 0) + return (ret); + } + + return (0); +} + +static int +vm_snapshot_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key, + struct vm_snapshot_meta *meta, off_t *offset) +{ + int ret; + size_t data_size; + ssize_t write_cnt; + + ret = vm_snapshot_req(meta); + if (ret != 0) { + fprintf(stderr, "%s: Failed to snapshot struct %s\r\n", + __func__, meta->dev_name); + ret = -1; + goto done; + } + + data_size = vm_get_snapshot_size(meta); + + write_cnt = write(data_fd, meta->buffer.buf_start, data_size); + if (write_cnt != data_size) { + perror("Failed to write all snapshotted data."); + ret = -1; + goto done; + } + + /* Write metadata. 
*/ + xo_open_instance_h(xop, array_key); + xo_emit_h(xop, "{:debug_name/%s}\n", meta->dev_name); + xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%d}\n", + meta->dev_req); + xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); + xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); + xo_close_instance_h(xop, JSON_STRUCT_ARR_KEY); + + *offset += data_size; + +done: + return (ret); +} + +static int +vm_snapshot_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) +{ + int ret, i, error; + size_t offset, buf_size; + char *buffer; + struct vm_snapshot_meta *meta; + + error = 0; + offset = 0; + buf_size = SNAPSHOT_BUFFER_SIZE; + + buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char)); + if (buffer == NULL) { + error = ENOMEM; + perror("Failed to allocate memory for snapshot buffer"); + goto err_vm_snapshot_kern_data; + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + + .buffer.buf_start = buffer, + .buffer.buf_size = buf_size, + + .op = VM_SNAPSHOT_SAVE, + }; + + xo_open_list_h(xop, JSON_STRUCT_ARR_KEY); + for (i = 0; i < nitems(snapshot_kern_structs); i++) { + meta->dev_name = snapshot_kern_structs[i].struct_name; + meta->dev_req = snapshot_kern_structs[i].req; + + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + meta->buffer.buf = meta->buffer.buf_start; + meta->buffer.buf_rem = meta->buffer.buf_size; + + ret = vm_snapshot_kern_struct(data_fd, xop, JSON_DEV_ARR_KEY, + meta, &offset); + if (ret != 0) { + error = -1; + goto err_vm_snapshot_kern_data; + } + } + xo_close_list_h(xop, JSON_STRUCT_ARR_KEY); + +err_vm_snapshot_kern_data: + if (buffer != NULL) + free(buffer); + return (error); +} + +static int +vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz) +{ + int error; + int memflags; + char vmname_buf[MAX_VMNAME]; + + memset(vmname_buf, 0, MAX_VMNAME); + error = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); + if (error != 0) { + perror("Failed to get VM name"); + goto err; + } + + memflags = vm_get_memflags(ctx); + + xo_open_container_h(xop, JSON_BASIC_METADATA_KEY); + xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus); + xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vmname_buf); + xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz); + xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", memflags); + xo_close_container_h(xop, JSON_BASIC_METADATA_KEY); + +err: + return (error); +} + +static int +vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key, + struct vm_snapshot_meta *meta, off_t *offset) +{ + int ret; + size_t data_size; + + data_size = vm_get_snapshot_size(meta); + + ret = write(data_fd, meta->buffer.buf_start, data_size); + if (ret != data_size) { + perror("Failed to write all snapshotted data."); + return (-1); + } + + /* Write metadata. 
*/ + xo_open_instance_h(xop, array_key); + xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name); + xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); + xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); + xo_close_instance_h(xop, array_key); + + *offset += data_size; + + return (0); +} + +static int +vm_snapshot_user_dev(const struct vm_snapshot_dev_info *info, + int data_fd, xo_handle_t *xop, + struct vm_snapshot_meta *meta, off_t *offset) +{ + int ret; + + ret = (*info->snapshot_cb)(meta); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot %s; ret=%d\r\n", + meta->dev_name, ret); + return (ret); + } + + ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta, + offset); + if (ret != 0) + return (ret); + + return (0); +} + +static int +vm_snapshot_user_devs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) +{ + int ret, i; + off_t offset; + void *buffer; + size_t buf_size; + struct vm_snapshot_meta *meta; + + buf_size = SNAPSHOT_BUFFER_SIZE; + + offset = lseek(data_fd, 0, SEEK_CUR); + if (offset < 0) { + perror("Failed to get data file current offset."); + return (-1); + } + + buffer = malloc(buf_size); + if (buffer == NULL) { + perror("Failed to allocate memory for snapshot buffer"); + ret = ENOSPC; + goto snapshot_err; + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + + .buffer.buf_start = buffer, + .buffer.buf_size = buf_size, + + .op = VM_SNAPSHOT_SAVE, + }; + + xo_open_list_h(xop, JSON_DEV_ARR_KEY); + + /* Restore other devices that support this feature */ + for (i = 0; i < nitems(snapshot_devs); i++) { + meta->dev_name = snapshot_devs[i].dev_name; + + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + meta->buffer.buf = meta->buffer.buf_start; + meta->buffer.buf_rem = meta->buffer.buf_size; + + ret = vm_snapshot_user_dev(&snapshot_devs[i], data_fd, xop, + meta, &offset); + if (ret != 0) + goto snapshot_err; + } + + xo_close_list_h(xop, JSON_DEV_ARR_KEY); + +snapshot_err: + if (buffer != NULL) + free(buffer); + return (ret); +} + +void +checkpoint_cpu_add(int vcpu) +{ + + pthread_mutex_lock(&vcpu_lock); + CPU_SET(vcpu, &vcpus_active); + + if (checkpoint_active) { + CPU_SET(vcpu, &vcpus_suspended); + while (checkpoint_active) + pthread_cond_wait(&vcpus_can_run, &vcpu_lock); + CPU_CLR(vcpu, &vcpus_suspended); + } + pthread_mutex_unlock(&vcpu_lock); +} + +/* + * When a vCPU is suspended for any reason, it calls + * checkpoint_cpu_suspend(). This records that the vCPU is idle. + * Before returning from suspension, checkpoint_cpu_resume() is + * called. In suspend we note that the vCPU is idle. In resume we + * pause the vCPU thread until the checkpoint is complete. The reason + * for the two-step process is that vCPUs might already be stopped in + * the debug server when a checkpoint is requested. This approach + * allows us to account for and handle those vCPUs. 
+ */ +void +checkpoint_cpu_suspend(int vcpu) +{ + + pthread_mutex_lock(&vcpu_lock); + CPU_SET(vcpu, &vcpus_suspended); + if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0) + pthread_cond_signal(&vcpus_idle); + pthread_mutex_unlock(&vcpu_lock); +} + +void +checkpoint_cpu_resume(int vcpu) +{ + + pthread_mutex_lock(&vcpu_lock); + while (checkpoint_active) + pthread_cond_wait(&vcpus_can_run, &vcpu_lock); + CPU_CLR(vcpu, &vcpus_suspended); + pthread_mutex_unlock(&vcpu_lock); +} + +static void +vm_vcpu_pause(struct vmctx *ctx) +{ + + pthread_mutex_lock(&vcpu_lock); + checkpoint_active = true; + vm_suspend_cpu(ctx, -1); + while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0) + pthread_cond_wait(&vcpus_idle, &vcpu_lock); + pthread_mutex_unlock(&vcpu_lock); +} + +static void +vm_vcpu_resume(struct vmctx *ctx) +{ + + pthread_mutex_lock(&vcpu_lock); + checkpoint_active = false; + pthread_mutex_unlock(&vcpu_lock); + vm_resume_cpu(ctx, -1); + pthread_cond_broadcast(&vcpus_can_run); +} + +static int +vm_checkpoint(struct vmctx *ctx, char *checkpoint_file, bool stop_vm) +{ + int fd_checkpoint = 0, kdata_fd = 0; + int ret = 0; + int error = 0; + size_t memsz; + xo_handle_t *xop = NULL; + char *meta_filename = NULL; + char *kdata_filename = NULL; + FILE *meta_file = NULL; + + kdata_filename = strcat_extension(checkpoint_file, ".kern"); + if (kdata_filename == NULL) { + fprintf(stderr, "Failed to construct kernel data filename.\n"); + return (-1); + } + + kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); + if (kdata_fd < 0) { + perror("Failed to open kernel data snapshot file."); + error = -1; + goto done; + } + + fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); + + if (fd_checkpoint < 0) { + perror("Failed to create checkpoint file"); + error = -1; + goto done; + } + + meta_filename = strcat_extension(checkpoint_file, ".meta"); + if (meta_filename == NULL) { + fprintf(stderr, "Failed to construct vm metadata filename.\n"); + goto done; + } + + meta_file = fopen(meta_filename, "w"); + if (meta_file == NULL) { + perror("Failed to open vm metadata snapshot file."); + goto done; + } + + xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY); + if (xop == NULL) { + perror("Failed to get libxo handle on metadata file."); + goto done; + } + + vm_vcpu_pause(ctx); + + ret = vm_pause_user_devs(ctx); + if (ret != 0) { + fprintf(stderr, "Could not pause devices\r\n"); + error = ret; + goto done; + } + + memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true); + if (memsz == 0) { + perror("Could not write guest memory to file"); + error = -1; + goto done; + } + + ret = vm_snapshot_basic_metadata(ctx, xop, memsz); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot vm basic metadata.\n"); + error = -1; + goto done; + } + + + ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot vm kernel data.\n"); + error = -1; + goto done; + } + + ret = vm_snapshot_user_devs(ctx, kdata_fd, xop); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot device state.\n"); + error = -1; + goto done; + } + + xo_finish_h(xop); + + if (stop_vm) { + vm_destroy(ctx); + exit(0); + } + +done: + ret = vm_resume_user_devs(ctx); + if (ret != 0) + fprintf(stderr, "Could not resume devices\r\n"); + vm_vcpu_resume(ctx); + if (fd_checkpoint > 0) + close(fd_checkpoint); + if (meta_filename != NULL) + free(meta_filename); + if (kdata_filename != NULL) + free(kdata_filename); + if (xop != NULL) + xo_destroy(xop); + if (meta_file != 
NULL) + fclose(meta_file); + if (kdata_fd > 0) + close(kdata_fd); + return (error); +} + +int +get_checkpoint_msg(int conn_fd, struct vmctx *ctx) +{ + unsigned char buf[MAX_MSG_SIZE]; + struct checkpoint_op *checkpoint_op; + int len, recv_len, total_recv = 0; + int err = 0; + + len = sizeof(struct checkpoint_op); /* expected length */ + while ((recv_len = recv(conn_fd, buf + total_recv, len - total_recv, 0)) > 0) { + total_recv += recv_len; + } + if (recv_len < 0) { + perror("Error while receiving data from bhyvectl"); + err = -1; + goto done; + } + + checkpoint_op = (struct checkpoint_op *)buf; + switch (checkpoint_op->op) { + case START_CHECKPOINT: + err = vm_checkpoint(ctx, checkpoint_op->snapshot_filename, false); + break; + case START_SUSPEND: + err = vm_checkpoint(ctx, checkpoint_op->snapshot_filename, true); + break; + default: + fprintf(stderr, "Unrecognized checkpoint operation.\n"); + err = -1; + } + +done: + close(conn_fd); + return (err); +} + +/* + * Listen for commands from bhyvectl + */ +void * +checkpoint_thread(void *param) +{ + struct checkpoint_thread_info *thread_info; + int conn_fd, ret; + + pthread_set_name_np(pthread_self(), "checkpoint thread"); + thread_info = (struct checkpoint_thread_info *)param; + + while ((conn_fd = accept(thread_info->socket_fd, NULL, NULL)) > -1) { + ret = get_checkpoint_msg(conn_fd, thread_info->ctx); + if (ret != 0) { + fprintf(stderr, "Failed to read message on checkpoint " + "socket. Retrying.\n"); + } + } + if (conn_fd < -1) { + perror("Failed to accept connection"); + } + + return (NULL); +} + +/* + * Create directory tree to store runtime specific information: + * i.e. UNIX sockets for IPC with bhyvectl. + */ +static int +make_checkpoint_dir(void) +{ + int err; + + err = mkdir(BHYVE_RUN_DIR, 0755); + if (err < 0 && errno != EEXIST) + return (err); + + err = mkdir(CHECKPOINT_RUN_DIR, 0755); + if (err < 0 && errno != EEXIST) + return (err); + + return 0; +} + +/* + * Create the listening socket for IPC with bhyvectl + */ +int +init_checkpoint_thread(struct vmctx *ctx) +{ + struct checkpoint_thread_info *checkpoint_info = NULL; + struct sockaddr_un addr; + int socket_fd; + pthread_t checkpoint_pthread; + char vmname_buf[MAX_VMNAME]; + int ret, err = 0; + + memset(&addr, 0, sizeof(addr)); + + err = pthread_mutex_init(&vcpu_lock, NULL); + if (err != 0) + errc(1, err, "checkpoint mutex init"); + err = pthread_cond_init(&vcpus_idle, NULL); + if (err == 0) + err = pthread_cond_init(&vcpus_can_run, NULL); + if (err != 0) + errc(1, err, "checkpoint cv init"); + + socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (socket_fd < 0) { + perror("Socket creation failed (IPC with bhyvectl"); + err = -1; + goto fail; + } + + err = make_checkpoint_dir(); + if (err < 0) { + perror("Failed to create checkpoint runtime directory"); + goto fail; + } + + addr.sun_family = AF_UNIX; + + err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); + if (err != 0) { + perror("Failed to get VM name"); + goto fail; + } + + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", + CHECKPOINT_RUN_DIR, vmname_buf); + addr.sun_len = SUN_LEN(&addr); + unlink(addr.sun_path); + + if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) { + perror("Failed to bind socket (IPC with bhyvectl)"); + err = -1; + goto fail; + } + + if (listen(socket_fd, 10) < 0) { + perror("Failed to listen on socket (IPC with bhyvectl)"); + err = -1; + goto fail; + } + + checkpoint_info = calloc(1, sizeof(*checkpoint_info)); + checkpoint_info->ctx = ctx; + checkpoint_info->socket_fd = socket_fd; 
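/*
 * Illustrative sketch, not part of the patch: a control utility such as
 * bhyvectl is expected to drive this thread by connecting to the per-VM
 * UNIX socket bound above and writing one struct checkpoint_op.  Field
 * names follow get_checkpoint_msg(); the exact structure layout lives
 * outside this file and is assumed here:
 *
 *	struct checkpoint_op op = { .op = START_CHECKPOINT };
 *	int fd = socket(PF_UNIX, SOCK_STREAM, 0);
 *	struct sockaddr_un addr = { .sun_family = AF_UNIX };
 *	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s",
 *	    CHECKPOINT_RUN_DIR, vmname);
 *	addr.sun_len = SUN_LEN(&addr);
 *	connect(fd, (struct sockaddr *)&addr, addr.sun_len);
 *	snprintf(op.snapshot_filename, sizeof(op.snapshot_filename),
 *	    "%s", "/tmp/vm0.snap");
 *	write(fd, &op, sizeof(op));
 *	close(fd);
 *
 * Sending START_SUSPEND instead makes vm_checkpoint() destroy the VM and
 * exit once the state has been written.
 */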
+ + ret = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread, + checkpoint_info); + if (ret < 0) { + err = ret; + goto fail; + } + + return (0); +fail: + free(checkpoint_info); + if (socket_fd > 0) + close(socket_fd); + unlink(addr.sun_path); + + return (err); +} + +void +vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) +{ + const char *__op; + + if (op == VM_SNAPSHOT_SAVE) + __op = "save"; + else if (op == VM_SNAPSHOT_RESTORE) + __op = "restore"; + else + __op = "unknown"; + + fprintf(stderr, "%s: snapshot-%s failed for %s\r\n", + __func__, __op, bufname); +} + +int +vm_snapshot_buf(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + fprintf(stderr, "%s: buffer too small\r\n", __func__); + return (E2BIG); + } + + if (op == VM_SNAPSHOT_SAVE) + memcpy(buffer->buf, (uint8_t *) data, data_size); + else if (op == VM_SNAPSHOT_RESTORE) + memcpy((uint8_t *) data, buffer->buf, data_size); + else + return (EINVAL); + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + + return (0); +} + +size_t +vm_get_snapshot_size(struct vm_snapshot_meta *meta) +{ + size_t length; + struct vm_snapshot_buffer *buffer; + + buffer = &meta->buffer; + + if (buffer->buf_size < buffer->buf_rem) { + fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n", + __func__, buffer->buf_size, buffer->buf_rem); + length = 0; + } else { + length = buffer->buf_size - buffer->buf_rem; + } + + return (length); +} + +int +vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null, + struct vm_snapshot_meta *meta) +{ + int ret; + vm_paddr_t gaddr; + + if (meta->op == VM_SNAPSHOT_SAVE) { + gaddr = paddr_host2guest(meta->ctx, *addrp); + if (gaddr == (vm_paddr_t) -1) { + if (!restore_null || + (restore_null && (*addrp != NULL))) { + ret = EFAULT; + goto done; + } + } + + SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); + if (gaddr == (vm_paddr_t) -1) { + if (!restore_null) { + ret = EFAULT; + goto done; + } + } + + *addrp = paddr_guest2host(meta->ctx, gaddr, len); + } else { + ret = EINVAL; + } + +done: + return (ret); +} + +int +vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + int ret; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + fprintf(stderr, "%s: buffer too small\r\n", __func__); + ret = E2BIG; + goto done; + } + + if (op == VM_SNAPSHOT_SAVE) { + ret = 0; + memcpy(buffer->buf, (uint8_t *) data, data_size); + } else if (op == VM_SNAPSHOT_RESTORE) { + ret = memcmp((uint8_t *) data, buffer->buf, data_size); + } else { + ret = EINVAL; + goto done; + } + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + +done: + return (ret); +} diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h new file mode 100644 index 000000000000..f9ea3d573089 --- /dev/null +++ b/usr.sbin/bhyve/snapshot.h @@ -0,0 +1,105 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Flavius Anton + * Copyright (c) 2016 Mihai Tiganus + * Copyright (c) 2016-2019 Mihai Carabas + * Copyright (c) 2017-2019 Darius Mihai + * Copyright (c) 2017-2019 Elena Mihailescu + * Copyright (c) 2018-2019 Sergiu Weisz + * All rights reserved. 
+ * The bhyve-snapshot feature was developed under sponsorships + * from Matthew Grooms. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _BHYVE_SNAPSHOT_ +#define _BHYVE_SNAPSHOT_ + +#include +#include +#include + +struct vmctx; + +struct restore_state { + int kdata_fd; + int vmmem_fd; + + void *kdata_map; + size_t kdata_len; + + size_t vmmem_len; + + struct ucl_parser *meta_parser; + ucl_object_t *meta_root_obj; +}; + +struct checkpoint_thread_info { + struct vmctx *ctx; + int socket_fd; +}; + +typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *); +typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *); +typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *); + +struct vm_snapshot_dev_info { + const char *dev_name; /* device name */ + vm_snapshot_dev_cb snapshot_cb; /* callback for device snapshot */ + vm_pause_dev_cb pause_cb; /* callback for device pause */ + vm_resume_dev_cb resume_cb; /* callback for device resume */ +}; + +struct vm_snapshot_kern_info { + const char *struct_name; /* kernel structure name*/ + enum snapshot_req req; /* request type */ +}; + +void destroy_restore_state(struct restore_state *rstate); + +const char *lookup_vmname(struct restore_state *rstate); +int lookup_memflags(struct restore_state *rstate); +size_t lookup_memsize(struct restore_state *rstate); +int lookup_guest_ncpus(struct restore_state *rstate); + +void checkpoint_cpu_add(int vcpu); +void checkpoint_cpu_resume(int vcpu); +void checkpoint_cpu_suspend(int vcpu); + +int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate); +int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate); + +int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate); +int vm_pause_user_devs(struct vmctx *ctx); +int vm_resume_user_devs(struct vmctx *ctx); + +int get_checkpoint_msg(int conn_fd, struct vmctx *ctx); +void *checkpoint_thread(void *param); +int init_checkpoint_thread(struct vmctx *ctx); + +int load_restore_file(const char *filename, struct restore_state *rstate); + +#endif diff --git a/usr.sbin/bhyve/uart_emul.c b/usr.sbin/bhyve/uart_emul.c index 930344a52935..a89974590a1f 100644 --- a/usr.sbin/bhyve/uart_emul.c +++ b/usr.sbin/bhyve/uart_emul.c @@ -1,721 +1,755 @@ /*- * SPDX-License-Identifier: 
BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #ifndef WITHOUT_CAPSICUM #include #include #endif +#include + #include #include #include #include #include #include #include #include #include #include #include #include #include "mevent.h" #include "uart_emul.h" #include "debug.h" #define COM1_BASE 0x3F8 #define COM1_IRQ 4 #define COM2_BASE 0x2F8 #define COM2_IRQ 3 #define DEFAULT_RCLK 1843200 #define DEFAULT_BAUD 9600 #define FCR_RX_MASK 0xC0 #define MCR_OUT1 0x04 #define MCR_OUT2 0x08 #define MSR_DELTA_MASK 0x0f #ifndef REG_SCR #define REG_SCR com_scr #endif #define FIFOSZ 16 static bool uart_stdio; /* stdio in use for i/o */ static struct termios tio_stdio_orig; static struct { int baseaddr; int irq; bool inuse; } uart_lres[] = { { COM1_BASE, COM1_IRQ, false}, { COM2_BASE, COM2_IRQ, false}, }; #define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0])) struct fifo { uint8_t buf[FIFOSZ]; int rindex; /* index to read from */ int windex; /* index to write to */ int num; /* number of characters in the fifo */ int size; /* size of the fifo */ }; struct ttyfd { bool opened; int rfd; /* fd for reading */ int wfd; /* fd for writing, may be == rfd */ }; struct uart_softc { pthread_mutex_t mtx; /* protects all softc elements */ uint8_t data; /* Data register (R/W) */ uint8_t ier; /* Interrupt enable register (R/W) */ uint8_t lcr; /* Line control register (R/W) */ uint8_t mcr; /* Modem control register (R/W) */ uint8_t lsr; /* Line status register (R/W) */ uint8_t msr; /* Modem status register (R/W) */ uint8_t fcr; /* FIFO control register (W) */ uint8_t scr; /* Scratch register (R/W) */ uint8_t dll; /* Baudrate divisor latch LSB */ uint8_t dlh; /* Baudrate divisor latch MSB */ struct fifo rxfifo; struct mevent *mev; struct ttyfd tty; bool thre_int_pending; /* THRE interrupt pending */ void *arg; uart_intr_func_t intr_assert; uart_intr_func_t intr_deassert; }; static void uart_drain(int fd, enum ev_type ev, void *arg); static void ttyclose(void) { tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); } static void ttyopen(struct ttyfd *tf) { struct termios orig, new; tcgetattr(tf->rfd, &orig); new = orig; cfmakeraw(&new); new.c_cflag |= CLOCAL; 
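/*
 * Descriptive comment (illustrative, not from the patch): cfmakeraw()
 * above turns off canonical input processing, echo and signal generation
 * so guest bytes pass through the host tty unmodified, and CLOCAL keeps
 * the line usable without modem-control signals.
 */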
tcsetattr(tf->rfd, TCSANOW, &new); if (uart_stdio) { tio_stdio_orig = orig; atexit(ttyclose); } raw_stdio = 1; } static int ttyread(struct ttyfd *tf) { unsigned char rb; if (read(tf->rfd, &rb, 1) == 1) return (rb); else return (-1); } static void ttywrite(struct ttyfd *tf, unsigned char wb) { (void)write(tf->wfd, &wb, 1); } static void rxfifo_reset(struct uart_softc *sc, int size) { char flushbuf[32]; struct fifo *fifo; ssize_t nread; int error; fifo = &sc->rxfifo; bzero(fifo, sizeof(struct fifo)); fifo->size = size; if (sc->tty.opened) { /* * Flush any unread input from the tty buffer. */ while (1) { nread = read(sc->tty.rfd, flushbuf, sizeof(flushbuf)); if (nread != sizeof(flushbuf)) break; } /* * Enable mevent to trigger when new characters are available * on the tty fd. */ error = mevent_enable(sc->mev); assert(error == 0); } } static int rxfifo_available(struct uart_softc *sc) { struct fifo *fifo; fifo = &sc->rxfifo; return (fifo->num < fifo->size); } static int rxfifo_putchar(struct uart_softc *sc, uint8_t ch) { struct fifo *fifo; int error; fifo = &sc->rxfifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = ch; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; if (!rxfifo_available(sc)) { if (sc->tty.opened) { /* * Disable mevent callback if the FIFO is full. */ error = mevent_disable(sc->mev); assert(error == 0); } } return (0); } else return (-1); } static int rxfifo_getchar(struct uart_softc *sc) { struct fifo *fifo; int c, error, wasfull; wasfull = 0; fifo = &sc->rxfifo; if (fifo->num > 0) { if (!rxfifo_available(sc)) wasfull = 1; c = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; if (wasfull) { if (sc->tty.opened) { error = mevent_enable(sc->mev); assert(error == 0); } } return (c); } else return (-1); } static int rxfifo_numchars(struct uart_softc *sc) { struct fifo *fifo = &sc->rxfifo; return (fifo->num); } static void uart_opentty(struct uart_softc *sc) { ttyopen(&sc->tty); sc->mev = mevent_add(sc->tty.rfd, EVF_READ, uart_drain, sc); assert(sc->mev != NULL); } static uint8_t modem_status(uint8_t mcr) { uint8_t msr; if (mcr & MCR_LOOPBACK) { /* * In the loopback mode certain bits from the MCR are * reflected back into MSR. */ msr = 0; if (mcr & MCR_RTS) msr |= MSR_CTS; if (mcr & MCR_DTR) msr |= MSR_DSR; if (mcr & MCR_OUT1) msr |= MSR_RI; if (mcr & MCR_OUT2) msr |= MSR_DCD; } else { /* * Always assert DCD and DSR so tty open doesn't block * even if CLOCAL is turned off. */ msr = MSR_DCD | MSR_DSR; } assert((msr & MSR_DELTA_MASK) == 0); return (msr); } /* * The IIR returns a prioritized interrupt reason: * - receive data available * - transmit holding register empty * - modem status change * * Return an interrupt reason if one is available. */ static int uart_intr_reason(struct uart_softc *sc) { if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) return (IIR_RLS); else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0) return (IIR_RXTOUT); else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) return (IIR_TXRDY); else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0) return (IIR_MLSC); else return (IIR_NOPEND); } static void uart_reset(struct uart_softc *sc) { uint16_t divisor; divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; sc->dll = divisor; sc->dlh = divisor >> 16; sc->msr = modem_status(sc->mcr); rxfifo_reset(sc, 1); /* no fifo until enabled by software */ } /* * Toggle the COM port's intr pin depending on whether or not we have an * interrupt condition to report to the processor. 
*/ static void uart_toggle_intr(struct uart_softc *sc) { uint8_t intr_reason; intr_reason = uart_intr_reason(sc); if (intr_reason == IIR_NOPEND) (*sc->intr_deassert)(sc->arg); else (*sc->intr_assert)(sc->arg); } static void uart_drain(int fd, enum ev_type ev, void *arg) { struct uart_softc *sc; int ch; sc = arg; assert(fd == sc->tty.rfd); assert(ev == EVF_READ); /* * This routine is called in the context of the mevent thread * to take out the softc lock to protect against concurrent * access from a vCPU i/o exit */ pthread_mutex_lock(&sc->mtx); if ((sc->mcr & MCR_LOOPBACK) != 0) { (void) ttyread(&sc->tty); } else { while (rxfifo_available(sc) && ((ch = ttyread(&sc->tty)) != -1)) { rxfifo_putchar(sc, ch); } uart_toggle_intr(sc); } pthread_mutex_unlock(&sc->mtx); } void uart_write(struct uart_softc *sc, int offset, uint8_t value) { int fifosz; uint8_t msr; pthread_mutex_lock(&sc->mtx); /* * Take care of the special case DLAB accesses first */ if ((sc->lcr & LCR_DLAB) != 0) { if (offset == REG_DLL) { sc->dll = value; goto done; } if (offset == REG_DLH) { sc->dlh = value; goto done; } } switch (offset) { case REG_DATA: if (sc->mcr & MCR_LOOPBACK) { if (rxfifo_putchar(sc, value) != 0) sc->lsr |= LSR_OE; } else if (sc->tty.opened) { ttywrite(&sc->tty, value); } /* else drop on floor */ sc->thre_int_pending = true; break; case REG_IER: /* Set pending when IER_ETXRDY is raised (edge-triggered). */ if ((sc->ier & IER_ETXRDY) == 0 && (value & IER_ETXRDY) != 0) sc->thre_int_pending = true; /* * Apply mask so that bits 4-7 are 0 * Also enables bits 0-3 only if they're 1 */ sc->ier = value & 0x0F; break; case REG_FCR: /* * When moving from FIFO and 16450 mode and vice versa, * the FIFO contents are reset. */ if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; rxfifo_reset(sc, fifosz); } /* * The FCR_ENABLE bit must be '1' for the programming * of other FCR bits to be effective. */ if ((value & FCR_ENABLE) == 0) { sc->fcr = 0; } else { if ((value & FCR_RCV_RST) != 0) rxfifo_reset(sc, FIFOSZ); sc->fcr = value & (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); } break; case REG_LCR: sc->lcr = value; break; case REG_MCR: /* Apply mask so that bits 5-7 are 0 */ sc->mcr = value & 0x1F; msr = modem_status(sc->mcr); /* * Detect if there has been any change between the * previous and the new value of MSR. If there is * then assert the appropriate MSR delta bit. */ if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) sc->msr |= MSR_DCTS; if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) sc->msr |= MSR_DDSR; if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) sc->msr |= MSR_DDCD; if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) sc->msr |= MSR_TERI; /* * Update the value of MSR while retaining the delta * bits. */ sc->msr &= MSR_DELTA_MASK; sc->msr |= msr; break; case REG_LSR: /* * Line status register is not meant to be written to * during normal operation. */ break; case REG_MSR: /* * As far as I can tell MSR is a read-only register. 
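 * Reads, however, have the side effect of clearing the delta bits (see the
 * REG_MSR case in uart_read() below).  A minimal sequence, assuming for
 * simplicity that only the CTS line changes state:
 *
 *	// sc->msr currently has MSR_CTS clear
 *	uart_write(sc, REG_MCR, MCR_LOOPBACK | MCR_RTS);
 *	// modem_status() now reports MSR_CTS, so the REG_MCR case above
 *	// latches MSR_DCTS into sc->msr
 *	(void) uart_read(sc, REG_MSR);	// returns CTS plus the delta bits,
 *					// then clears MSR_DELTA_MASK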
*/ break; case REG_SCR: sc->scr = value; break; default: break; } done: uart_toggle_intr(sc); pthread_mutex_unlock(&sc->mtx); } uint8_t uart_read(struct uart_softc *sc, int offset) { uint8_t iir, intr_reason, reg; pthread_mutex_lock(&sc->mtx); /* * Take care of the special case DLAB accesses first */ if ((sc->lcr & LCR_DLAB) != 0) { if (offset == REG_DLL) { reg = sc->dll; goto done; } if (offset == REG_DLH) { reg = sc->dlh; goto done; } } switch (offset) { case REG_DATA: reg = rxfifo_getchar(sc); break; case REG_IER: reg = sc->ier; break; case REG_IIR: iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0; intr_reason = uart_intr_reason(sc); /* * Deal with side effects of reading the IIR register */ if (intr_reason == IIR_TXRDY) sc->thre_int_pending = false; iir |= intr_reason; reg = iir; break; case REG_LCR: reg = sc->lcr; break; case REG_MCR: reg = sc->mcr; break; case REG_LSR: /* Transmitter is always ready for more data */ sc->lsr |= LSR_TEMT | LSR_THRE; /* Check for new receive data */ if (rxfifo_numchars(sc) > 0) sc->lsr |= LSR_RXRDY; else sc->lsr &= ~LSR_RXRDY; reg = sc->lsr; /* The LSR_OE bit is cleared on LSR read */ sc->lsr &= ~LSR_OE; break; case REG_MSR: /* * MSR delta bits are cleared on read */ reg = sc->msr; sc->msr &= ~MSR_DELTA_MASK; break; case REG_SCR: reg = sc->scr; break; default: reg = 0xFF; break; } done: uart_toggle_intr(sc); pthread_mutex_unlock(&sc->mtx); return (reg); } int uart_legacy_alloc(int which, int *baseaddr, int *irq) { if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse) return (-1); uart_lres[which].inuse = true; *baseaddr = uart_lres[which].baseaddr; *irq = uart_lres[which].irq; return (0); } struct uart_softc * uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, void *arg) { struct uart_softc *sc; sc = calloc(1, sizeof(struct uart_softc)); sc->arg = arg; sc->intr_assert = intr_assert; sc->intr_deassert = intr_deassert; pthread_mutex_init(&sc->mtx, NULL); uart_reset(sc); return (sc); } static int uart_stdio_backend(struct uart_softc *sc) { #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif if (uart_stdio) return (-1); sc->tty.rfd = STDIN_FILENO; sc->tty.wfd = STDOUT_FILENO; sc->tty.opened = true; if (fcntl(sc->tty.rfd, F_SETFL, O_NONBLOCK) != 0) return (-1); if (fcntl(sc->tty.wfd, F_SETFL, O_NONBLOCK) != 0) return (-1); #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ); if (caph_rights_limit(sc->tty.rfd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(sc->tty.rfd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif uart_stdio = true; return (0); } static int uart_tty_backend(struct uart_softc *sc, const char *opts) { #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif int fd; fd = open(opts, O_RDWR | O_NONBLOCK); if (fd < 0) return (-1); if (!isatty(fd)) { close(fd); return (-1); } sc->tty.rfd = sc->tty.wfd = fd; sc->tty.opened = true; #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif return (0); } int uart_set_backend(struct uart_softc *sc, const char *opts) { int retval; if (opts == NULL) return (0); if (strcmp("stdio", opts) == 0) retval = 
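/*
 * Illustrative only: this mirrors how a serial port option is usually
 * wired up.  uart_set_backend(sc, "stdio") puts the console on the bhyve
 * process's stdin/stdout, while a path argument such as "/dev/nmdm0A"
 * (an example null-modem device, assumed here) is handed to
 * uart_tty_backend() and must refer to a tty.  A NULL opts string leaves
 * the uart without any backend at all.
 */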
uart_stdio_backend(sc); else retval = uart_tty_backend(sc, opts); if (retval == 0) uart_opentty(sc); return (retval); } + +#ifdef BHYVE_SNAPSHOT +int +uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ier, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->lcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->mcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->lsr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->msr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->fcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->scr, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->dll, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->dlh, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.rindex, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.windex, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.num, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.size, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->rxfifo.buf, sizeof(sc->rxfifo.buf), + meta, ret, done); + + sc->thre_int_pending = 1; + +done: + return (ret); +} +#endif diff --git a/usr.sbin/bhyve/uart_emul.h b/usr.sbin/bhyve/uart_emul.h index a87202df1f96..5a53294da89e 100644 --- a/usr.sbin/bhyve/uart_emul.h +++ b/usr.sbin/bhyve/uart_emul.h @@ -1,47 +1,50 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _UART_EMUL_H_ #define _UART_EMUL_H_ - #define UART_IO_BAR_SIZE 8 struct uart_softc; +struct vm_snapshot_meta; typedef void (*uart_intr_func_t)(void *arg); struct uart_softc *uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, void *arg); int uart_legacy_alloc(int unit, int *ioaddr, int *irq); uint8_t uart_read(struct uart_softc *sc, int offset); void uart_write(struct uart_softc *sc, int offset, uint8_t value); int uart_set_backend(struct uart_softc *sc, const char *opt); +#ifdef BHYVE_SNAPSHOT +int uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta); +#endif #endif diff --git a/usr.sbin/bhyve/usb_emul.h b/usr.sbin/bhyve/usb_emul.h index c52411dd0650..d6e1e616cd82 100644 --- a/usr.sbin/bhyve/usb_emul.h +++ b/usr.sbin/bhyve/usb_emul.h @@ -1,158 +1,158 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _USB_EMUL_H_ #define _USB_EMUL_H_ #include #include #include #define USB_MAX_XFER_BLOCKS 8 #define USB_XFER_OUT 0 #define USB_XFER_IN 1 - struct usb_hci; struct usb_device_request; struct usb_data_xfer; +struct vm_snapshot_meta; /* Device emulation handlers */ struct usb_devemu { char *ue_emu; /* name of device emulation */ int ue_usbver; /* usb version: 2 or 3 */ int ue_usbspeed; /* usb device speed */ /* instance creation */ void *(*ue_init)(struct usb_hci *hci, char *opt); /* handlers */ int (*ue_request)(void *sc, struct usb_data_xfer *xfer); int (*ue_data)(void *sc, struct usb_data_xfer *xfer, int dir, int epctx); int (*ue_reset)(void *sc); int (*ue_remove)(void *sc); int (*ue_stop)(void *sc); + int (*ue_snapshot)(void *scarg, struct vm_snapshot_meta *meta); }; #define USB_EMUL_SET(x) DATA_SET(usb_emu_set, x); /* * USB device events to notify HCI when state changes */ enum hci_usbev { USBDEV_ATTACH, USBDEV_RESET, USBDEV_STOP, USBDEV_REMOVE, }; /* usb controller, ie xhci, ehci */ struct usb_hci { int (*hci_intr)(struct usb_hci *hci, int epctx); int (*hci_event)(struct usb_hci *hci, enum hci_usbev evid, void *param); void *hci_sc; /* private softc for hci */ /* controller managed fields */ int hci_address; int hci_port; }; /* * Each xfer block is mapped to the hci transfer block. 
* On input into the device handler, blen is set to the lenght of buf. * The device handler is to update blen to reflect on the residual size * of the buffer, i.e. len(buf) - len(consumed). */ struct usb_data_xfer_block { void *buf; /* IN or OUT pointer */ int blen; /* in:len(buf), out:len(remaining) */ int bdone; /* bytes transferred */ uint32_t processed; /* device processed this + errcode */ void *hci_data; /* HCI private reference */ int ccs; uint32_t streamid; uint64_t trbnext; /* next TRB guest address */ }; struct usb_data_xfer { struct usb_data_xfer_block data[USB_MAX_XFER_BLOCKS]; struct usb_device_request *ureq; /* setup ctl request */ int ndata; /* # of data items */ int head; int tail; pthread_mutex_t mtx; }; enum USB_ERRCODE { USB_ACK, USB_NAK, USB_STALL, USB_NYET, USB_ERR, USB_SHORT }; #define USB_DATA_GET_ERRCODE(x) (x)->processed >> 8 #define USB_DATA_SET_ERRCODE(x,e) do { \ (x)->processed = ((x)->processed & 0xFF) | (e << 8); \ } while (0) #define USB_DATA_OK(x,i) ((x)->data[(i)].buf != NULL) #define USB_DATA_XFER_INIT(x) do { \ memset((x), 0, sizeof(*(x))); \ pthread_mutex_init(&((x)->mtx), NULL); \ } while (0) #define USB_DATA_XFER_RESET(x) do { \ memset((x)->data, 0, sizeof((x)->data)); \ (x)->ndata = 0; \ (x)->head = (x)->tail = 0; \ } while (0) #define USB_DATA_XFER_LOCK(x) do { \ pthread_mutex_lock(&((x)->mtx)); \ } while (0) #define USB_DATA_XFER_UNLOCK(x) do { \ pthread_mutex_unlock(&((x)->mtx)); \ } while (0) - struct usb_devemu *usb_emu_finddev(char *name); struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer, void *buf, int blen, void *hci_data, int ccs); #endif /* _USB_EMUL_H_ */ diff --git a/usr.sbin/bhyve/usb_mouse.c b/usr.sbin/bhyve/usb_mouse.c index da3800c11fc2..5398da818c7f 100644 --- a/usr.sbin/bhyve/usb_mouse.c +++ b/usr.sbin/bhyve/usb_mouse.c @@ -1,803 +1,831 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include +#include + #include #include #include #include #include #include #include "usb_emul.h" #include "console.h" #include "bhyvegc.h" #include "debug.h" static int umouse_debug = 0; #define DPRINTF(params) if (umouse_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* USB endpoint context (1-15) for reporting mouse data events*/ #define UMOUSE_INTR_ENDPT 1 #define UMOUSE_REPORT_DESC_TYPE 0x22 #define UMOUSE_GET_REPORT 0x01 #define UMOUSE_GET_IDLE 0x02 #define UMOUSE_GET_PROTOCOL 0x03 #define UMOUSE_SET_REPORT 0x09 #define UMOUSE_SET_IDLE 0x0A #define UMOUSE_SET_PROTOCOL 0x0B #define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } enum { UMSTR_LANG, UMSTR_MANUFACTURER, UMSTR_PRODUCT, UMSTR_SERIAL, UMSTR_CONFIG, UMSTR_MAX }; static const char *umouse_desc_strings[] = { "\x04\x09", "BHYVE", "HID Tablet", "01", "HID Tablet Device", }; struct umouse_hid_descriptor { uint8_t bLength; uint8_t bDescriptorType; uint8_t bcdHID[2]; uint8_t bCountryCode; uint8_t bNumDescriptors; uint8_t bReportDescriptorType; uint8_t wItemLength[2]; } __packed; struct umouse_config_desc { struct usb_config_descriptor confd; struct usb_interface_descriptor ifcd; struct umouse_hid_descriptor hidd; struct usb_endpoint_descriptor endpd; struct usb_endpoint_ss_comp_descriptor sscompd; } __packed; #define MOUSE_MAX_X 0x8000 #define MOUSE_MAX_Y 0x8000 static const uint8_t umouse_report_desc[] = { 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ 0x09, 0x02, /* USAGE (Mouse) */ 0xa1, 0x01, /* COLLECTION (Application) */ 0x09, 0x01, /* USAGE (Pointer) */ 0xa1, 0x00, /* COLLECTION (Physical) */ 0x05, 0x09, /* USAGE_PAGE (Button) */ 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */ 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ 0x75, 0x01, /* REPORT_SIZE (1) */ 0x95, 0x03, /* REPORT_COUNT (3) */ 0x81, 0x02, /* INPUT (Data,Var,Abs); 3 buttons */ 0x75, 0x05, /* REPORT_SIZE (5) */ 0x95, 0x01, /* REPORT_COUNT (1) */ 0x81, 0x03, /* INPUT (Cnst,Var,Abs); padding */ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ 0x09, 0x30, /* USAGE (X) */ 0x09, 0x31, /* USAGE (Y) */ 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ 0x46, 0xff, 0x7f, /* PHYSICAL_MAXIMUM (0x7fff) */ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ 0x26, 0xff, 0x7f, /* LOGICAL_MAXIMUM (0x7fff) */ 0x75, 0x10, /* REPORT_SIZE (16) */ 0x95, 0x02, /* REPORT_COUNT (2) */ 0x81, 0x02, /* INPUT (Data,Var,Abs) */ 0x05, 0x01, /* USAGE Page (Generic Desktop) */ 0x09, 0x38, /* USAGE (Wheel) */ 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ 0x45, 0x00, /* PHYSICAL_MAXIMUM (0) */ 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ 0x75, 0x08, /* REPORT_SIZE (8) */ 0x95, 0x01, /* REPORT_COUNT (1) */ 0x81, 0x06, /* INPUT (Data,Var,Rel) */ 0xc0, /* END_COLLECTION */ 0xc0 /* END_COLLECTION */ }; struct umouse_report { uint8_t buttons; /* bits: 0 left, 1 right, 2 middle */ int16_t x; /* x position */ int16_t y; /* y position */ int8_t z; /* z wheel position */ } __packed; #define MSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } static struct usb_device_descriptor umouse_dev_desc = { .bLength = sizeof(umouse_dev_desc), .bDescriptorType = UDESC_DEVICE, MSETW(.bcdUSB, UD_USB_3_0), .bMaxPacketSize = 8, /* max packet size */ MSETW(.idVendor, 0xFB5D), /* vendor */ MSETW(.idProduct, 0x0001), /* product */ MSETW(.bcdDevice, 0), /* device version */ .iManufacturer = UMSTR_MANUFACTURER, .iProduct = UMSTR_PRODUCT, .iSerialNumber = UMSTR_SERIAL, 
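	/*
	 * MSETW/HSETW pack a 16-bit value into two bytes, least significant
	 * byte first, matching USB's little-endian wire format; e.g.
	 * MSETW(.idVendor, 0xFB5D) expands to .idVendor = { 0x5D, 0xFB }.
	 */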
.bNumConfigurations = 1, }; static struct umouse_config_desc umouse_confd = { .confd = { .bLength = sizeof(umouse_confd.confd), .bDescriptorType = UDESC_CONFIG, .wTotalLength[0] = sizeof(umouse_confd), .bNumInterface = 1, .bConfigurationValue = 1, .iConfiguration = UMSTR_CONFIG, .bmAttributes = UC_BUS_POWERED | UC_REMOTE_WAKEUP, .bMaxPower = 0, }, .ifcd = { .bLength = sizeof(umouse_confd.ifcd), .bDescriptorType = UDESC_INTERFACE, .bNumEndpoints = 1, .bInterfaceClass = UICLASS_HID, .bInterfaceSubClass = UISUBCLASS_BOOT, .bInterfaceProtocol = UIPROTO_MOUSE, }, .hidd = { .bLength = sizeof(umouse_confd.hidd), .bDescriptorType = 0x21, .bcdHID = { 0x01, 0x10 }, .bCountryCode = 0, .bNumDescriptors = 1, .bReportDescriptorType = UMOUSE_REPORT_DESC_TYPE, .wItemLength = { sizeof(umouse_report_desc), 0 }, }, .endpd = { .bLength = sizeof(umouse_confd.endpd), .bDescriptorType = UDESC_ENDPOINT, .bEndpointAddress = UE_DIR_IN | UMOUSE_INTR_ENDPT, .bmAttributes = UE_INTERRUPT, .wMaxPacketSize[0] = 8, .bInterval = 0xA, }, .sscompd = { .bLength = sizeof(umouse_confd.sscompd), .bDescriptorType = UDESC_ENDPOINT_SS_COMP, .bMaxBurst = 0, .bmAttributes = 0, MSETW(.wBytesPerInterval, 0), }, }; struct umouse_bos_desc { struct usb_bos_descriptor bosd; struct usb_devcap_ss_descriptor usbssd; } __packed; struct umouse_bos_desc umouse_bosd = { .bosd = { .bLength = sizeof(umouse_bosd.bosd), .bDescriptorType = UDESC_BOS, HSETW(.wTotalLength, sizeof(umouse_bosd)), .bNumDeviceCaps = 1, }, .usbssd = { .bLength = sizeof(umouse_bosd.usbssd), .bDescriptorType = UDESC_DEVICE_CAPABILITY, .bDevCapabilityType = 3, .bmAttributes = 0, HSETW(.wSpeedsSupported, 0x08), .bFunctionalitySupport = 3, .bU1DevExitLat = 0xa, /* dummy - not used */ .wU2DevExitLat = { 0x20, 0x00 }, } }; struct umouse_softc { struct usb_hci *hci; char *opt; struct umouse_report um_report; int newdata; struct { uint8_t idle; uint8_t protocol; uint8_t feature; } hid; pthread_mutex_t mtx; pthread_mutex_t ev_mtx; int polling; struct timeval prev_evt; }; static void umouse_event(uint8_t button, int x, int y, void *arg) { struct umouse_softc *sc; struct bhyvegc_image *gc; gc = console_get_image(); if (gc == NULL) { /* not ready */ return; } sc = arg; pthread_mutex_lock(&sc->mtx); sc->um_report.buttons = 0; sc->um_report.z = 0; if (button & 0x01) sc->um_report.buttons |= 0x01; /* left */ if (button & 0x02) sc->um_report.buttons |= 0x04; /* middle */ if (button & 0x04) sc->um_report.buttons |= 0x02; /* right */ if (button & 0x8) sc->um_report.z = 1; if (button & 0x10) sc->um_report.z = -1; /* scale coords to mouse resolution */ sc->um_report.x = MOUSE_MAX_X * x / gc->width; sc->um_report.y = MOUSE_MAX_Y * y / gc->height; sc->newdata = 1; pthread_mutex_unlock(&sc->mtx); pthread_mutex_lock(&sc->ev_mtx); sc->hci->hci_intr(sc->hci, UE_DIR_IN | UMOUSE_INTR_ENDPT); pthread_mutex_unlock(&sc->ev_mtx); } static void * umouse_init(struct usb_hci *hci, char *opt) { struct umouse_softc *sc; sc = calloc(1, sizeof(struct umouse_softc)); sc->hci = hci; sc->hid.protocol = 1; /* REPORT protocol */ sc->opt = strdup(opt); pthread_mutex_init(&sc->mtx, NULL); pthread_mutex_init(&sc->ev_mtx, NULL); console_ptr_register(umouse_event, sc, 10); return (sc); } #define UREQ(x,y) ((x) | ((y) << 8)) static int umouse_request(void *scarg, struct usb_data_xfer *xfer) { struct umouse_softc *sc; struct usb_data_xfer_block *data; const char *str; uint16_t value; uint16_t index; uint16_t len; uint16_t slen; uint8_t *udata; int err; int i, idx; int eshort; sc = scarg; data = NULL; udata = NULL; idx = 
xfer->head; for (i = 0; i < xfer->ndata; i++) { xfer->data[idx].bdone = 0; if (data == NULL && USB_DATA_OK(xfer,i)) { data = &xfer->data[idx]; udata = data->buf; } xfer->data[idx].processed = 1; idx = (idx + 1) % USB_MAX_XFER_BLOCKS; } err = USB_ERR_NORMAL_COMPLETION; eshort = 0; if (!xfer->ureq) { DPRINTF(("umouse_request: port %d", sc->hci->hci_port)); goto done; } value = UGETW(xfer->ureq->wValue); index = UGETW(xfer->ureq->wIndex); len = UGETW(xfer->ureq->wLength); DPRINTF(("umouse_request: port %d, type 0x%x, req 0x%x, val 0x%x, " "idx 0x%x, len %u", sc->hci->hci_port, xfer->ureq->bmRequestType, xfer->ureq->bRequest, value, index, len)); switch (UREQ(xfer->ureq->bRequest, xfer->ureq->bmRequestType)) { case UREQ(UR_GET_CONFIG, UT_READ_DEVICE): DPRINTF(("umouse: (UR_GET_CONFIG, UT_READ_DEVICE)")); if (!data) break; *udata = umouse_confd.confd.bConfigurationValue; data->blen = len > 0 ? len - 1 : 0; eshort = data->blen > 0; data->bdone += 1; break; case UREQ(UR_GET_DESCRIPTOR, UT_READ_DEVICE): DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_DEVICE) val %x", value >> 8)); if (!data) break; switch (value >> 8) { case UDESC_DEVICE: DPRINTF(("umouse: (->UDESC_DEVICE) len %u ?= " "sizeof(umouse_dev_desc) %lu", len, sizeof(umouse_dev_desc))); if ((value & 0xFF) != 0) { err = USB_ERR_IOERROR; goto done; } if (len > sizeof(umouse_dev_desc)) { data->blen = len - sizeof(umouse_dev_desc); len = sizeof(umouse_dev_desc); } else data->blen = 0; memcpy(data->buf, &umouse_dev_desc, len); data->bdone += len; break; case UDESC_CONFIG: DPRINTF(("umouse: (->UDESC_CONFIG)")); if ((value & 0xFF) != 0) { err = USB_ERR_IOERROR; goto done; } if (len > sizeof(umouse_confd)) { data->blen = len - sizeof(umouse_confd); len = sizeof(umouse_confd); } else data->blen = 0; memcpy(data->buf, &umouse_confd, len); data->bdone += len; break; case UDESC_STRING: DPRINTF(("umouse: (->UDESC_STRING)")); str = NULL; if ((value & 0xFF) < UMSTR_MAX) str = umouse_desc_strings[value & 0xFF]; else goto done; if ((value & 0xFF) == UMSTR_LANG) { udata[0] = 4; udata[1] = UDESC_STRING; data->blen = len - 2; len -= 2; data->bdone += 2; if (len >= 2) { udata[2] = str[0]; udata[3] = str[1]; data->blen -= 2; data->bdone += 2; } else data->blen = 0; goto done; } slen = 2 + strlen(str) * 2; udata[0] = slen; udata[1] = UDESC_STRING; if (len > slen) { data->blen = len - slen; len = slen; } else data->blen = 0; for (i = 2; i < len; i += 2) { udata[i] = *str++; udata[i+1] = '\0'; } data->bdone += slen; break; case UDESC_BOS: DPRINTF(("umouse: USB3 BOS")); if (len > sizeof(umouse_bosd)) { data->blen = len - sizeof(umouse_bosd); len = sizeof(umouse_bosd); } else data->blen = 0; memcpy(udata, &umouse_bosd, len); data->bdone += len; break; default: DPRINTF(("umouse: unknown(%d)->ERROR", value >> 8)); err = USB_ERR_IOERROR; goto done; } eshort = data->blen > 0; break; case UREQ(UR_GET_DESCRIPTOR, UT_READ_INTERFACE): DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_INTERFACE) " "0x%x", (value >> 8))); if (!data) break; switch (value >> 8) { case UMOUSE_REPORT_DESC_TYPE: if (len > sizeof(umouse_report_desc)) { data->blen = len - sizeof(umouse_report_desc); len = sizeof(umouse_report_desc); } else data->blen = 0; memcpy(data->buf, umouse_report_desc, len); data->bdone += len; break; default: DPRINTF(("umouse: IO ERROR")); err = USB_ERR_IOERROR; goto done; } eshort = data->blen > 0; break; case UREQ(UR_GET_INTERFACE, UT_READ_INTERFACE): DPRINTF(("umouse: (UR_GET_INTERFACE, UT_READ_INTERFACE)")); if (index != 0) { DPRINTF(("umouse get_interface, invalid index 
%d", index)); err = USB_ERR_IOERROR; goto done; } if (!data) break; if (len > 0) { *udata = 0; data->blen = len - 1; } eshort = data->blen > 0; data->bdone += 1; break; case UREQ(UR_GET_STATUS, UT_READ_DEVICE): DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_DEVICE)")); if (data != NULL && len > 1) { if (sc->hid.feature == UF_DEVICE_REMOTE_WAKEUP) USETW(udata, UDS_REMOTE_WAKEUP); else USETW(udata, 0); data->blen = len - 2; data->bdone += 2; } eshort = data->blen > 0; break; case UREQ(UR_GET_STATUS, UT_READ_INTERFACE): case UREQ(UR_GET_STATUS, UT_READ_ENDPOINT): DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_INTERFACE)")); if (data != NULL && len > 1) { USETW(udata, 0); data->blen = len - 2; data->bdone += 2; } eshort = data->blen > 0; break; case UREQ(UR_SET_ADDRESS, UT_WRITE_DEVICE): /* XXX Controller should've handled this */ DPRINTF(("umouse set address %u", value)); break; case UREQ(UR_SET_CONFIG, UT_WRITE_DEVICE): DPRINTF(("umouse set config %u", value)); break; case UREQ(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE): DPRINTF(("umouse set descriptor %u", value)); break; case UREQ(UR_CLEAR_FEATURE, UT_WRITE_DEVICE): DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x", value)); if (value == UF_DEVICE_REMOTE_WAKEUP) sc->hid.feature = 0; break; case UREQ(UR_SET_FEATURE, UT_WRITE_DEVICE): DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x", value)); if (value == UF_DEVICE_REMOTE_WAKEUP) sc->hid.feature = UF_DEVICE_REMOTE_WAKEUP; break; case UREQ(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE): case UREQ(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT): case UREQ(UR_SET_FEATURE, UT_WRITE_INTERFACE): case UREQ(UR_SET_FEATURE, UT_WRITE_ENDPOINT): DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_INTERFACE)")); err = USB_ERR_IOERROR; goto done; case UREQ(UR_SET_INTERFACE, UT_WRITE_INTERFACE): DPRINTF(("umouse set interface %u", value)); break; case UREQ(UR_ISOCH_DELAY, UT_WRITE_DEVICE): DPRINTF(("umouse set isoch delay %u", value)); break; case UREQ(UR_SET_SEL, 0): DPRINTF(("umouse set sel")); break; case UREQ(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT): DPRINTF(("umouse synch frame")); break; /* HID device requests */ case UREQ(UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE): DPRINTF(("umouse: (UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE) " "0x%x", (value >> 8))); if (!data) break; if ((value >> 8) == 0x01 && len >= sizeof(sc->um_report)) { /* TODO read from backend */ if (len > sizeof(sc->um_report)) { data->blen = len - sizeof(sc->um_report); len = sizeof(sc->um_report); } else data->blen = 0; memcpy(data->buf, &sc->um_report, len); data->bdone += len; } else { err = USB_ERR_IOERROR; goto done; } eshort = data->blen > 0; break; case UREQ(UMOUSE_GET_IDLE, UT_READ_CLASS_INTERFACE): if (data != NULL && len > 0) { *udata = sc->hid.idle; data->blen = len - 1; data->bdone += 1; } eshort = data->blen > 0; break; case UREQ(UMOUSE_GET_PROTOCOL, UT_READ_CLASS_INTERFACE): if (data != NULL && len > 0) { *udata = sc->hid.protocol; data->blen = len - 1; data->bdone += 1; } eshort = data->blen > 0; break; case UREQ(UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE): DPRINTF(("umouse: (UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE) ignored")); break; case UREQ(UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE): sc->hid.idle = UGETW(xfer->ureq->wValue) >> 8; DPRINTF(("umouse: (UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE) %x", sc->hid.idle)); break; case UREQ(UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE): sc->hid.protocol = UGETW(xfer->ureq->wValue) >> 8; DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_CLASS_INTERFACE) %x", sc->hid.protocol)); break; default: DPRINTF(("**** 
umouse request unhandled")); err = USB_ERR_IOERROR; break; } done: if (xfer->ureq && (xfer->ureq->bmRequestType & UT_WRITE) && (err == USB_ERR_NORMAL_COMPLETION) && (data != NULL)) data->blen = 0; else if (eshort) err = USB_ERR_SHORT_XFER; DPRINTF(("umouse request error code %d (0=ok), blen %u txlen %u", err, (data ? data->blen : 0), (data ? data->bdone : 0))); return (err); } static int umouse_data_handler(void *scarg, struct usb_data_xfer *xfer, int dir, int epctx) { struct umouse_softc *sc; struct usb_data_xfer_block *data; uint8_t *udata; int len, i, idx; int err; DPRINTF(("umouse handle data - DIR=%s|EP=%d, blen %d", dir ? "IN" : "OUT", epctx, xfer->data[0].blen)); /* find buffer to add data */ udata = NULL; err = USB_ERR_NORMAL_COMPLETION; /* handle xfer at first unprocessed item with buffer */ data = NULL; idx = xfer->head; for (i = 0; i < xfer->ndata; i++) { data = &xfer->data[idx]; if (data->buf != NULL && data->blen != 0) { break; } else { data->processed = 1; data = NULL; } idx = (idx + 1) % USB_MAX_XFER_BLOCKS; } if (!data) goto done; udata = data->buf; len = data->blen; if (udata == NULL) { DPRINTF(("umouse no buffer provided for input")); err = USB_ERR_NOMEM; goto done; } sc = scarg; if (dir) { pthread_mutex_lock(&sc->mtx); if (!sc->newdata) { err = USB_ERR_CANCELLED; USB_DATA_SET_ERRCODE(&xfer->data[xfer->head], USB_NAK); pthread_mutex_unlock(&sc->mtx); goto done; } if (sc->polling) { err = USB_ERR_STALLED; USB_DATA_SET_ERRCODE(data, USB_STALL); pthread_mutex_unlock(&sc->mtx); goto done; } sc->polling = 1; if (len > 0) { sc->newdata = 0; data->processed = 1; data->bdone += 6; memcpy(udata, &sc->um_report, 6); data->blen = len - 6; if (data->blen > 0) err = USB_ERR_SHORT_XFER; } sc->polling = 0; pthread_mutex_unlock(&sc->mtx); } else { USB_DATA_SET_ERRCODE(data, USB_STALL); err = USB_ERR_STALLED; } done: return (err); } static int umouse_reset(void *scarg) { struct umouse_softc *sc; sc = scarg; sc->newdata = 0; return (0); } static int umouse_remove(void *scarg) { return (0); } static int umouse_stop(void *scarg) { return (0); } +#ifdef BHYVE_SNAPSHOT +static int +umouse_snapshot(void *scarg, struct vm_snapshot_meta *meta) +{ + int ret; + struct umouse_softc *sc; + + sc = scarg; + + SNAPSHOT_VAR_OR_LEAVE(sc->um_report, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->newdata, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hid.idle, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hid.protocol, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hid.feature, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->polling, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_usec, meta, ret, done); + +done: + return (ret); +} +#endif struct usb_devemu ue_mouse = { .ue_emu = "tablet", .ue_usbver = 3, .ue_usbspeed = USB_SPEED_HIGH, .ue_init = umouse_init, .ue_request = umouse_request, .ue_data = umouse_data_handler, .ue_reset = umouse_reset, .ue_remove = umouse_remove, - .ue_stop = umouse_stop + .ue_stop = umouse_stop, +#ifdef BHYVE_SNAPSHOT + .ue_snapshot = umouse_snapshot, +#endif }; USB_EMUL_SET(ue_mouse); diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c index d899a5779570..f3deb72b081c 100644 --- a/usr.sbin/bhyve/virtio.c +++ b/usr.sbin/bhyve/virtio.c @@ -1,808 +1,956 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * Copyright (c) 2019 Joyent, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" /* * Functions for dealing with generalized "virtual devices" as * defined by */ /* * In case we decide to relax the "virtio softc comes at the * front of virtio-based device softc" constraint, let's use * this to convert. */ #define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and * the PCI emulation. */ void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues) { int i; /* vs and dev_softc addresses must match */ assert((void *)vs == dev_softc); vs->vs_vc = vc; vs->vs_pi = pi; pi->pi_arg = vs; vs->vs_queues = queues; for (i = 0; i < vc->vc_nvq; i++) { queues[i].vq_vs = vs; queues[i].vq_num = i; } } /* * Reset device (device-wide). This erases all queues, i.e., * all the queues become invalid (though we don't wipe out the * internal pointers, we just clear the VQ_ALLOC flag). * * It resets negotiated features to "none". * * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. */ void vi_reset_dev(struct virtio_softc *vs) { struct vqueue_info *vq; int i, nvq; if (vs->vs_mtx) assert(pthread_mutex_isowned_np(vs->vs_mtx)); nvq = vs->vs_vc->vc_nvq; for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } vs->vs_negotiated_caps = 0; vs->vs_curq = 0; /* vs->vs_status = 0; -- redundant */ if (vs->vs_isr) pci_lintr_deassert(vs->vs_pi); vs->vs_isr = 0; vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; } /* * Set I/O BAR (usually 0) to map PCI config registers. */ void vi_set_io_bar(struct virtio_softc *vs, int barnum) { size_t size; /* * ??? should we use CFG0 if MSI-X is disabled? * Existing code did not... */ size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); } /* * Initialize MSI-X vector capabilities if we're to use MSI-X, * or MSI capabilities if not. * * We assume we want one MSI-X vector per queue, here, plus one * for the config vec. 
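 * For example (numbers purely illustrative): a device exposing three
 * virtqueues has vc_nvq == 3 and therefore requests nvec = 3 + 1 = 4
 * MSI-X table entries, while a single-queue device requests 2.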
*/ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) { int nvec; if (use_msix) { vs->vs_flags |= VIRTIO_USE_MSIX; VS_LOCK(vs); vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ VS_UNLOCK(vs); nvec = vs->vs_vc->vc_nvq + 1; if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); /* Legacy interrupts are mandatory for virtio devices */ pci_lintr_request(vs->vs_pi); return (0); } /* * Initialize the currently-selected virtio queue (vs->vs_curq). * The guest just gave us a page frame number, from which we can * calculate the addresses of the queue. */ void vi_vq_init(struct virtio_softc *vs, uint32_t pfn) { struct vqueue_info *vq; uint64_t phys; size_t size; char *base; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_pfn = pfn; phys = (uint64_t)pfn << VRING_PFN; size = vring_size(vq->vq_qsize); base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); /* First page(s) are descriptors... */ vq->vq_desc = (struct virtio_desc *)base; base += vq->vq_qsize * sizeof(struct virtio_desc); /* ... immediately followed by "avail" ring (entirely uint16_t's) */ vq->vq_avail = (struct vring_avail *)base; base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); /* Then it's rounded up to the next page... */ base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); /* ... and the last page(s) are the used ring. */ vq->vq_used = (struct vring_used *)base; /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; } /* * Helper inline for vq_getchain(): record the i'th "real" * descriptor. */ static inline void _vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, struct iovec *iov, int n_iov, uint16_t *flags) { if (i >= n_iov) return; iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); iov[i].iov_len = vd->vd_len; if (flags != NULL) flags[i] = vd->vd_flags; } #define VQ_MAX_DESCRIPTORS 512 /* see below */ /* * Examine the chain of descriptors starting at the "next one" to * make sure that they describe a sensible request. If so, return * the number of "real" descriptors that would be needed/used in * acting on this request. This may be smaller than the number of * available descriptors, e.g., if there are two available but * they are two separate requests, this just returns 1. Or, it * may be larger: if there are indirect descriptors involved, * there may only be one descriptor available but it may be an * indirect pointing to eight more. We return 8 in this case, * i.e., we do not count the indirect descriptors, only the "real" * ones. * * Basically, this vets the vd_flags and vd_next field of each * descriptor and tells you how many are involved. Since some may * be indirect, this also needs the vmctx (in the pci_devinst * at vs->vs_pi) so that it can find indirect descriptors. * * As we process each descriptor, we copy and adjust it (guest to * host address wise, also using the vmtctx) into the given iov[] * array (of the given size). If the array overflows, we stop * placing values into the array but keep processing descriptors, * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. * So you, the caller, must not assume that iov[] is as big as the * return value (you can process the same thing twice to allocate * a larger iov array if needed, or supply a zero length to find * out how much space is needed). 
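 *
 * A typical processing loop looks roughly like this (sketch only; the iov
 * array size, flag handling and completion length are device-specific):
 *
 *	struct iovec iov[8];
 *	uint16_t flags[8], idx;
 *	int n;
 *
 *	while (vq_has_descs(vq)) {
 *		n = vq_getchain(vq, &idx, iov, 8, flags);
 *		if (n <= 0)
 *			break;		// 0: ring empty, -1: malformed chain
 *		// only the first MIN(n, 8) iov entries were filled in
 *		// ... consume or fill those entries ...
 *		vq_relchain(vq, idx, bytes_written);
 *	}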
* * If you want to verify the WRITE flag on each descriptor, pass a * non-NULL "flags" pointer to an array of "uint16_t" of the same size * as n_iov and we'll copy each vd_flags field after unwinding any * indirects. * * If some descriptor(s) are invalid, this prints a diagnostic message * and returns -1. If no descriptors are ready now it simply returns 0. * * You are assumed to have done a vq_ring_ready() if needed (note * that vq_has_descs() does one). */ int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags) { int i; u_int ndesc, n_indir; u_int idx, next; volatile struct virtio_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; const char *name; vs = vq->vq_vs; name = vs->vs_vc->vc_name; /* * Note: it's the responsibility of the guest not to * update vq->vq_avail->va_idx until all of the descriptors * the guest has written are valid (including all their * vd_next fields and vd_flags). * * Compute (va_idx - last_avail) in integers mod 2**16. This is * the number of descriptors the device has made available * since the last time we updated vq->vq_last_avail. * * We just need to do the subtraction as an unsigned int, * then trim off excess bits. */ idx = vq->vq_last_avail; ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); if (ndesc == 0) return (0); if (ndesc > vq->vq_qsize) { /* XXX need better way to diagnose issues */ EPRINTLN( "%s: ndesc (%u) out of range, driver confused?", name, (u_int)ndesc); return (-1); } /* * Now count/parse "involved" descriptors starting from * the head of the chain. * * To prevent loops, we could be more complicated and * check whether we're re-visiting a previously visited * index, but we just abort if the count gets excessive. */ ctx = vs->vs_pi->pi_vmctx; *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { if (next >= vq->vq_qsize) { EPRINTLN( "%s: descriptor index %u out of range, " "driver confused?", name, next); return (-1); } vdir = &vq->vq_desc[next]; if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { _vq_record(i, vdir, ctx, iov, n_iov, flags); i++; } else if ((vs->vs_vc->vc_hv_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { EPRINTLN( "%s: descriptor has forbidden INDIRECT flag, " "driver confused?", name); return (-1); } else { n_indir = vdir->vd_len / 16; if ((vdir->vd_len & 0xf) || n_indir == 0) { EPRINTLN( "%s: invalid indir len 0x%x, " "driver confused?", name, (u_int)vdir->vd_len); return (-1); } vindir = paddr_guest2host(ctx, vdir->vd_addr, vdir->vd_len); /* * Indirects start at the 0th, then follow * their own embedded "next"s until those run * out. Each one's indirect flag must be off * (we don't really have to check, could just * ignore errors...). */ next = 0; for (;;) { vp = &vindir[next]; if (vp->vd_flags & VRING_DESC_F_INDIRECT) { EPRINTLN( "%s: indirect desc has INDIR flag," " driver confused?", name); return (-1); } _vq_record(i, vp, ctx, iov, n_iov, flags); if (++i > VQ_MAX_DESCRIPTORS) goto loopy; if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) break; next = vp->vd_next; if (next >= n_indir) { EPRINTLN( "%s: invalid next %u > %u, " "driver confused?", name, (u_int)next, n_indir); return (-1); } } } if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) return (i); } loopy: EPRINTLN( "%s: descriptor loop? count > %d - driver confused?", name, i); return (-1); } /* * Return the first n_chain request chains back to the available queue. 
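 * A device does this when it has pulled chains off the ring but cannot
 * service them yet (backend buffer exhaustion and the like); after
 * vq_retchains(vq, 1) the same chain is handed out again by the next
 * vq_getchain() call.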
* * (These chains are the ones you handled when you called vq_getchain() * and used its positive return value.) */ void vq_retchains(struct vqueue_info *vq, uint16_t n_chains) { vq->vq_last_avail -= n_chains; } void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { volatile struct vring_used *vuh; volatile struct virtio_used *vue; uint16_t mask; /* * Notes: * - mask is N-1 where N is a power of 2 so computes x % N * - vuh points to the "used" data shared with guest * - vue points to the "used" ring entry we want to update * * (I apologize for the two fields named vu_idx; the * virtio spec calls the one that vue points to, "id"...) */ mask = vq->vq_qsize - 1; vuh = vq->vq_used; vue = &vuh->vu_ring[vq->vq_next_used++ & mask]; vue->vu_idx = idx; vue->vu_tlen = iolen; } void vq_relchain_publish(struct vqueue_info *vq) { /* * Ensure the used descriptor is visible before updating the index. * This is necessary on ISAs with memory ordering less strict than x86 * (and even on x86 to act as a compiler barrier). */ atomic_thread_fence_rel(); vq->vq_used->vu_idx = vq->vq_next_used; } /* * Return specified request chain to the guest, setting its I/O length * to the provided value. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { vq_relchain_prepare(vq, idx, iolen); vq_relchain_publish(vq); } /* * Driver has finished processing "available" chains and calling * vq_relchain on each one. If driver used all the available * chains, used_all should be set. * * If the "used" index moved we may need to inform the guest, i.e., * deliver an interrupt. Even if the used index did NOT move we * may need to deliver an interrupt, if the avail ring is empty and * we are supposed to interrupt on empty. * * Note that used_all_avail is provided by the caller because it's * a snapshot of the ring state when he decided to finish interrupt * processing -- it's possible that descriptors became available after * that point. (It's also typically a constant 1/True as well.) */ void vq_endchains(struct vqueue_info *vq, int used_all_avail) { struct virtio_softc *vs; uint16_t event_idx, new_idx, old_idx; int intr; /* * Interrupt generation: if we're using EVENT_IDX, * interrupt if we've crossed the event threshold. * Otherwise interrupt is generated if we added "used" entries, * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. * * In any case, though, if NOTIFY_ON_EMPTY is set and the * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; old_idx = vq->vq_save_used; vq->vq_save_used = new_idx = vq->vq_used->vu_idx; /* * Use full memory barrier between vu_idx store from preceding * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or * va_flags below. */ atomic_thread_fence_seq_cst(); if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { event_idx = VQ_USED_EVENT_IDX(vq); /* * This calculation is per docs and the kernel * (see src/sys/dev/virtio/virtio_ring.h). 
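 * Worked example, all arithmetic mod 2^16: with old_idx = 5 and
 * new_idx = 8, an event_idx of 6 gives (8 - 6 - 1) = 1 < (8 - 5) = 3,
 * i.e. the used index moved past the guest's event threshold, so we
 * interrupt; an event_idx of 9 gives (8 - 9 - 1) = 0xfffe, which is not
 * < 3, so the interrupt is suppressed.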
*/ intr = (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx); } else { intr = new_idx != old_idx && !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); } if (intr) vq_interrupt(vs, vq); } /* Note: these are in sorted order to make for a fast search */ static struct config_reg { uint16_t cr_offset; /* register offset */ uint8_t cr_size; /* size (bytes) */ uint8_t cr_ro; /* true => reg is read only */ const char *cr_name; /* name of reg */ } config_regs[] = { { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" }, { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" }, { VTCFG_R_PFN, 4, 0, "PFN" }, { VTCFG_R_QNUM, 2, 1, "QNUM" }, { VTCFG_R_QSEL, 2, 0, "QSEL" }, { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" }, { VTCFG_R_STATUS, 1, 0, "STATUS" }, { VTCFG_R_ISR, 1, 0, "ISR" }, { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" }, { VTCFG_R_QVEC, 2, 0, "QVEC" }, }; static inline struct config_reg * vi_find_cr(int offset) { u_int hi, lo, mid; struct config_reg *cr; lo = 0; hi = sizeof(config_regs) / sizeof(*config_regs) - 1; while (hi >= lo) { mid = (hi + lo) >> 1; cr = &config_regs[mid]; if (cr->cr_offset == offset) return (cr); if (cr->cr_offset < offset) lo = mid + 1; else hi = mid - 1; } return (NULL); } /* * Handle pci config space reads. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct virtio_softc *vs = pi->pi_arg; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; uint32_t value; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { return (pci_emul_msix_tread(pi, offset, size)); } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; if (size != 1 && size != 2 && size != 4) goto bad; if (pci_msix_enabled(pi)) virtio_config_size = VTCFG_R_CFG1; else virtio_config_size = VTCFG_R_CFG0; if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. * If that fails, fall into general code. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size) { if (cr != NULL) { /* offset must be OK, so size must be bad */ EPRINTLN( "%s: read from %s: bad size %d", name, cr->cr_name, size); } else { EPRINTLN( "%s: read from bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VTCFG_R_HOSTCAP: value = vc->vc_hv_caps; break; case VTCFG_R_GUESTCAP: value = vs->vs_negotiated_caps; break; case VTCFG_R_PFN: if (vs->vs_curq < vc->vc_nvq) value = vs->vs_queues[vs->vs_curq].vq_pfn; break; case VTCFG_R_QNUM: value = vs->vs_curq < vc->vc_nvq ? 
vs->vs_queues[vs->vs_curq].vq_qsize : 0; break; case VTCFG_R_QSEL: value = vs->vs_curq; break; case VTCFG_R_QNOTIFY: value = 0; /* XXX */ break; case VTCFG_R_STATUS: value = vs->vs_status; break; case VTCFG_R_ISR: value = vs->vs_isr; vs->vs_isr = 0; /* a read clears this flag */ if (value) pci_lintr_deassert(pi); break; case VTCFG_R_CFGVEC: value = vs->vs_msix_cfg_idx; break; case VTCFG_R_QVEC: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_msix_idx : VIRTIO_MSI_NO_VECTOR; break; } done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); return (value); } /* * Handle pci config space writes. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct virtio_softc *vs = pi->pi_arg; struct vqueue_info *vq; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { pci_emul_msix_twrite(pi, offset, size, value); return; } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; if (size != 1 && size != 2 && size != 4) goto bad; if (pci_msix_enabled(pi)) virtio_config_size = VTCFG_R_CFG1; else virtio_config_size = VTCFG_R_CFG0; if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size || cr->cr_ro) { if (cr != NULL) { /* offset must be OK, wrong size and/or reg is R/O */ if (cr->cr_size != size) EPRINTLN( "%s: write to %s: bad size %d", name, cr->cr_name, size); if (cr->cr_ro) EPRINTLN( "%s: write to read-only reg %s", name, cr->cr_name); } else { EPRINTLN( "%s: write to bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VTCFG_R_GUESTCAP: vs->vs_negotiated_caps = value & vc->vc_hv_caps; if (vc->vc_apply_features) (*vc->vc_apply_features)(DEV_SOFTC(vs), vs->vs_negotiated_caps); break; case VTCFG_R_PFN: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vi_vq_init(vs, value); break; case VTCFG_R_QSEL: /* * Note that the guest is allowed to select an * invalid queue; we just need to return a QNUM * of 0 while the bad queue is selected. 
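 * For example, a guest probing for queues on a two-queue device may write
 * QSEL = 5; the write is stored as-is, a subsequent QNUM read (see
 * vi_pci_read() above) returns 0 because vs_curq >= vc_nvq, and writes to
 * PFN or QVEC while the bad queue is selected are rejected via bad_qindex.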
*/ vs->vs_curq = value; break; case VTCFG_R_QNOTIFY: if (value >= vc->vc_nvq) { EPRINTLN("%s: queue %d notify out of range", name, (int)value); goto done; } vq = &vs->vs_queues[value]; if (vq->vq_notify) (*vq->vq_notify)(DEV_SOFTC(vs), vq); else if (vc->vc_qnotify) (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); else EPRINTLN( "%s: qnotify queue %d: missing vq/vc notify", name, (int)value); break; case VTCFG_R_STATUS: vs->vs_status = value; if (value == 0) (*vc->vc_reset)(DEV_SOFTC(vs)); break; case VTCFG_R_CFGVEC: vs->vs_msix_cfg_idx = value; break; case VTCFG_R_QVEC: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_msix_idx = value; break; } goto done; bad_qindex: EPRINTLN( "%s: write config reg %s: curq %d >= max %d", name, cr->cr_name, vs->vs_curq, vc->vc_nvq); done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); } + +#ifdef BHYVE_SNAPSHOT +int +vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct virtio_softc *vs; + struct virtio_consts *vc; + + vs = pi->pi_arg; + vc = vs->vs_vc; + + vc = vs->vs_vc; + assert(vc->vc_pause != NULL); + (*vc->vc_pause)(DEV_SOFTC(vs)); + + return (0); +} + +int +vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct virtio_softc *vs; + struct virtio_consts *vc; + + vs = pi->pi_arg; + vc = vs->vs_vc; + + vc = vs->vs_vc; + assert(vc->vc_resume != NULL); + (*vc->vc_resume)(DEV_SOFTC(vs)); + + return (0); +} + +static int +vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, ret, done); + +done: + return (ret); +} + +static int +vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, ret, done); + SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, ret, done); + SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, ret, done); + +done: + return (ret); +} + +static int +vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta) +{ + int i; + int ret; + struct virtio_consts *vc; + struct vqueue_info *vq; + uint64_t addr_size; + + vc = vs->vs_vc; + + /* Save virtio queue info */ + for (i = 0; i < vc->vc_nvq; i++) { + vq = &vs->vs_queues[i]; + + SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done); + SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_next_used, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done); + + addr_size = vq->vq_qsize * sizeof(struct virtio_desc); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_desc, addr_size, + false, meta, ret, done); + + addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_avail, addr_size, + false, meta, ret, done); + + addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_used, addr_size, + false, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc, vring_size(vq->vq_qsize), + meta, ret, done); + } + +done: 
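+	/*
+	 * Note on the device hook (sketch of intent, not extra code): a
+	 * virtio device with state beyond the generic softc and queues sets
+	 * .vc_snapshot in its virtio_consts, and vi_pci_snapshot() below
+	 * calls it after the queue state above has been saved; devices wired
+	 * into vi_pci_pause()/vi_pci_resume() must likewise provide
+	 * vc_pause/vc_resume, since those entry points assert they are set.
+	 */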
+ return (ret); +} + +int +vi_pci_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_devinst *pi; + struct virtio_softc *vs; + struct virtio_consts *vc; + + pi = meta->dev_data; + vs = pi->pi_arg; + vc = vs->vs_vc; + + /* Save virtio softc */ + ret = vi_pci_snapshot_softc(vs, meta); + if (ret != 0) + goto done; + + /* Save virtio consts */ + ret = vi_pci_snapshot_consts(vc, meta); + if (ret != 0) + goto done; + + /* Save virtio queue info */ + ret = vi_pci_snapshot_queues(vs, meta); + if (ret != 0) + goto done; + + /* Save device softc, if needed */ + if (vc->vc_snapshot != NULL) { + ret = (*vc->vc_snapshot)(DEV_SOFTC(vs), meta); + if (ret != 0) + goto done; + } + +done: + return (ret); +} +#endif diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index ab95d9d213e3..e9432e012b27 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -1,494 +1,504 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VIRTIO_H_ #define _VIRTIO_H_ #include /* * These are derived from several virtio specifications. * * Some useful links: * https://github.com/rustyrussell/virtio-spec * http://people.redhat.com/pbonzini/virtio-spec.pdf */ /* * A virtual device has zero or more "virtual queues" (virtqueue). * Each virtqueue uses at least two 4096-byte pages, laid out thus: * * +-----------------------------------------------+ * | "desc": descriptors, 16 bytes each | * | ----------------------------------------- | * | "avail": 2 uint16; uint16; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * | "used": 2 x uint16; elems; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * * The number that appears here is always a power of two and is * limited to no more than 32768 (as it must fit in a 16-bit field). * If is sufficiently large, the above will occupy more than * two pages. In any case, all pages must be physically contiguous * within the guest's physical address space. 
* * The 16-byte "desc" descriptors consist of a 64-bit guest * physical address , a 32-bit length , a 16-bit * , and a 16-bit field (all in guest byte order). * * There are three flags that may be set : * NEXT descriptor is chained, so use its "next" field * WRITE descriptor is for host to write into guest RAM * (else host is to read from guest RAM) * INDIRECT descriptor address field is (guest physical) * address of a linear array of descriptors * * Unless INDIRECT is set, is the number of bytes that may * be read/written from guest physical address . If * INDIRECT is set, WRITE is ignored and provides the length * of the indirect descriptors (and must be a multiple of * 16). Note that NEXT may still be set in the main descriptor * pointing to the indirect, and should be set in each indirect * descriptor that uses the next descriptor (these should generally * be numbered sequentially). However, INDIRECT must not be set * in the indirect descriptors. Upon reaching an indirect descriptor * without a NEXT bit, control returns to the direct descriptors. * * Except inside an indirect, each value must be in the * range [0 .. N) (i.e., the half-open interval). (Inside an * indirect, each must be in the range [0 .. /16).) * * The "avail" data structures reside in the same pages as the * "desc" structures since both together are used by the device to * pass information to the hypervisor's virtual driver. These * begin with a 16-bit field and 16-bit index , then * have 16-bit values, followed by one final 16-bit * field . The entries are simply indices * indices into the descriptor ring (and thus must meet the same * constraints as each value). However, is counted * up from 0 (initially) and simply wraps around after 65535; it * is taken mod to find the next available entry. * * The "used" ring occupies a separate page or pages, and contains * values written from the virtual driver back to the guest OS. * This begins with a 16-bit and 16-bit , then there * are "vring_used" elements, followed by a 16-bit . * The "vring_used" elements consist of a 32-bit and a * 32-bit (vu_tlen below). The is simply the index of * the head of a descriptor chain the guest made available * earlier, and the is the number of bytes actually written, * e.g., in the case of a network driver that provided a large * receive buffer but received only a small amount of data. * * The two event fields, and , in the * avail and used rings (respectively -- note the reversal!), are * always provided, but are used only if the virtual device * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature * negotiation. Similarly, both rings provide a flag -- * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in * their field, indicating that the guest does not need an * interrupt, or that the hypervisor driver does not need a * notify, when descriptors are added to the corresponding ring. * (These are provided only for interrupt optimization and need * not be implemented.) 
*/ #define VRING_ALIGN 4096 #define VRING_DESC_F_NEXT (1 << 0) #define VRING_DESC_F_WRITE (1 << 1) #define VRING_DESC_F_INDIRECT (1 << 2) struct virtio_desc { /* AKA vring_desc */ uint64_t vd_addr; /* guest physical address */ uint32_t vd_len; /* length of scatter/gather seg */ uint16_t vd_flags; /* VRING_F_DESC_* */ uint16_t vd_next; /* next desc if F_NEXT */ } __packed; struct virtio_used { /* AKA vring_used_elem */ uint32_t vu_idx; /* head of used descriptor chain */ uint32_t vu_tlen; /* length written-to */ } __packed; #define VRING_AVAIL_F_NO_INTERRUPT 1 struct vring_avail { uint16_t va_flags; /* VRING_AVAIL_F_* */ uint16_t va_idx; /* counts to 65535, then cycles */ uint16_t va_ring[]; /* size N, reported in QNUM value */ /* uint16_t va_used_event; -- after N ring entries */ } __packed; #define VRING_USED_F_NO_NOTIFY 1 struct vring_used { uint16_t vu_flags; /* VRING_USED_F_* */ uint16_t vu_idx; /* counts to 65535, then cycles */ struct virtio_used vu_ring[]; /* size N */ /* uint16_t vu_avail_event; -- after N ring entries */ } __packed; /* * The address of any given virtual queue is determined by a single * Page Frame Number register. The guest writes the PFN into the * PCI config space. However, a device that has two or more * virtqueues can have a different PFN, and size, for each queue. * The number of queues is determinable via the PCI config space * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means * queue #0, 1 means queue#1, etc. Once a queue is selected, the * remaining PFN and QNUM registers refer to that queue. * * QNUM is a read-only register containing a nonzero power of two * that indicates the (hypervisor's) queue size. Or, if reading it * produces zero, the hypervisor does not have a corresponding * queue. (The number of possible queues depends on the virtual * device. The block device has just one; the network device * provides either two -- 0 = receive, 1 = transmit -- or three, * with 2 = control.) * * PFN is a read/write register giving the physical page address of * the virtqueue in guest memory (the guest must allocate enough space * based on the hypervisor's provided QNUM). * * QNOTIFY is effectively write-only: when the guest writes a queue * number to the register, the hypervisor should scan the specified * virtqueue. (Reading QNOTIFY currently always gets 0). */ /* * PFN register shift amount */ #define VRING_PFN 12 /* * Virtio device types * * XXX Should really be merged with defines */ #define VIRTIO_TYPE_NET 1 #define VIRTIO_TYPE_BLOCK 2 #define VIRTIO_TYPE_CONSOLE 3 #define VIRTIO_TYPE_ENTROPY 4 #define VIRTIO_TYPE_BALLOON 5 #define VIRTIO_TYPE_IOMEMORY 6 #define VIRTIO_TYPE_RPMSG 7 #define VIRTIO_TYPE_SCSI 8 #define VIRTIO_TYPE_9P 9 /* experimental IDs start at 65535 and work down */ /* * PCI vendor/device IDs */ #define VIRTIO_VENDOR 0x1AF4 #define VIRTIO_DEV_NET 0x1000 #define VIRTIO_DEV_BLOCK 0x1001 #define VIRTIO_DEV_CONSOLE 0x1003 #define VIRTIO_DEV_RANDOM 0x1005 #define VIRTIO_DEV_SCSI 0x1008 /* * PCI config space constants. * * If MSI-X is enabled, the ISR register is generally not used, * and the configuration vector and queue vector appear at offsets * 20 and 22 with the remaining configuration registers at 24. * If MSI-X is not enabled, those two registers disappear and * the remaining configuration registers start at offset 20. 
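 *
 * A minimal sketch of the queue-setup sequence implied by the registers
 * described above (editorial illustration only; "bar0", "gpa" and the
 * port I/O helpers outw/inw/outl are placeholders for whatever the guest
 * driver uses, while the offsets are the VTCFG_R_* constants and
 * VRING_PFN shift defined below):
 *
 *	outw(bar0 + VTCFG_R_QSEL, 0);		// select queue 0
 *	qsz = inw(bar0 + VTCFG_R_QNUM);		// hypervisor queue size (0 = no queue)
 *	outl(bar0 + VTCFG_R_PFN, gpa >> VRING_PFN);	// publish ring page frame
 *	// ...fill descriptors and the avail ring...
 *	outw(bar0 + VTCFG_R_QNOTIFY, 0);	// kick the hypervisor to scan queue 0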
*/ #define VTCFG_R_HOSTCAP 0 #define VTCFG_R_GUESTCAP 4 #define VTCFG_R_PFN 8 #define VTCFG_R_QNUM 12 #define VTCFG_R_QSEL 14 #define VTCFG_R_QNOTIFY 16 #define VTCFG_R_STATUS 18 #define VTCFG_R_ISR 19 #define VTCFG_R_CFGVEC 20 #define VTCFG_R_QVEC 22 #define VTCFG_R_CFG0 20 /* No MSI-X */ #define VTCFG_R_CFG1 24 /* With MSI-X */ #define VTCFG_R_MSIX 20 /* * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, * but a guest writing 0 to this register means "please reset". */ #define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */ #define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */ #define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */ #define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */ /* * Bits in VTCFG_R_ISR. These apply only if not using MSI-X. * * (We don't [yet?] ever use CONF_CHANGED.) */ #define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ #define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ #define VIRTIO_MSI_NO_VECTOR 0xFFFF /* * Feature flags. * Note: bits 0 through 23 are reserved to each device type. */ #define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24) #define VIRTIO_RING_F_INDIRECT_DESC (1 << 28) #define VIRTIO_RING_F_EVENT_IDX (1 << 29) /* From section 2.3, "Virtqueue Configuration", of the virtio specification */ static inline size_t vring_size(u_int qsz) { size_t size; /* constant 3 below = va_flags, va_idx, va_used_event */ size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz); size = roundup2(size, VRING_ALIGN); /* constant 3 below = vu_flags, vu_idx, vu_avail_event */ size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz; size = roundup2(size, VRING_ALIGN); return (size); } struct vmctx; struct pci_devinst; struct vqueue_info; +struct vm_snapshot_meta; /* * A virtual device, with some number (possibly 0) of virtual * queues and some size (possibly 0) of configuration-space * registers private to the device. The virtio_softc should come * at the front of each "derived class", so that a pointer to the * virtio_softc is also a pointer to the more specific, derived- * from-virtio driver's softc. * * Note: inside each hypervisor virtio driver, changes to these * data structures must be locked against other threads, if any. * Except for PCI config space register read/write, we assume each * driver does the required locking, but we need a pointer to the * lock (if there is one) for PCI config space read/write ops. * * When the guest reads or writes the device's config space, the * generic layer checks for operations on the special registers * described above. If the offset of the register(s) being read * or written is past the CFG area (CFG0 or CFG1), the request is * passed on to the virtual device, after subtracting off the * generic-layer size. (So, drivers can just use the offset as * an offset into "struct config", for instance.) * * (The virtio layer also makes sure that the read or write is to/ * from a "good" config offset, hence vc_cfgsize, and on BAR #0. * However, the driver must verify the read or write size and offset * and that no one is writing a readonly register.) * * The BROKED flag ("this thing done gone and broked") is for future * use. */ #define VIRTIO_USE_MSIX 0x01 #define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ #define VIRTIO_BROKED 0x08 /* ??? 
*/ struct virtio_softc { struct virtio_consts *vs_vc; /* constants (see below) */ int vs_flags; /* VIRTIO_* flags from above */ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ struct pci_devinst *vs_pi; /* PCI device instance */ uint32_t vs_negotiated_caps; /* negotiated capabilities */ struct vqueue_info *vs_queues; /* one per vc_nvq */ int vs_curq; /* current queue */ uint8_t vs_status; /* value from last status write */ uint8_t vs_isr; /* ISR flags, if not MSI-X */ uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ }; #define VS_LOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_lock(vs->vs_mtx); \ } while (0) #define VS_UNLOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_unlock(vs->vs_mtx); \ } while (0) struct virtio_consts { const char *vc_name; /* name of driver (for diagnostics) */ int vc_nvq; /* number of virtual queues */ size_t vc_cfgsize; /* size of dev-specific config regs */ void (*vc_reset)(void *); /* called on virtual device reset */ void (*vc_qnotify)(void *, struct vqueue_info *); /* called on QNOTIFY if no VQ notify */ int (*vc_cfgread)(void *, int, int, uint32_t *); /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ void (*vc_apply_features)(void *, uint64_t); /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ + void (*vc_pause)(void *); /* called to pause device activity */ + void (*vc_resume)(void *); /* called to resume device activity */ + int (*vc_snapshot)(void *, struct vm_snapshot_meta *); + /* called to save / restore device state */ }; /* * Data structure allocated (statically) per virtual queue. * * Drivers may change vq_qsize after a reset. When the guest OS * requests a device reset, the hypervisor first calls * vs->vs_vc->vc_reset(); then the data structure below is * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). * * The remaining fields should only be fussed-with by the generic * code. * * Note: the addresses of vq_desc, vq_avail, and vq_used are all * computable from each other, but it's a lot simpler if we just * keep a pointer to each one. The event indices are similarly * (but more easily) computable, and this time we'll compute them: * they're just XX_ring[N]. */ #define VQ_ALLOC 0x01 /* set once we have a pfn */ #define VQ_BROKED 0x02 /* ??? */ struct vqueue_info { uint16_t vq_qsize; /* size of this queue (a power of 2) */ void (*vq_notify)(void *, struct vqueue_info *); /* called instead of vc_notify, if not NULL */ struct virtio_softc *vq_vs; /* backpointer to softc */ uint16_t vq_num; /* we're the num'th queue in the softc */ uint16_t vq_flags; /* flags (see above) */ uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */ uint16_t vq_next_used; /* index of the next used slot to be filled */ uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */ uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ volatile struct virtio_desc *vq_desc; /* descriptor array */ volatile struct vring_avail *vq_avail; /* the "avail" ring */ volatile struct vring_used *vq_used; /* the "used" ring */ }; /* as noted above, these are sort of backwards, name-wise */ #define VQ_AVAIL_EVENT_IDX(vq) \ (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize]) #define VQ_USED_EVENT_IDX(vq) \ ((vq)->vq_avail->va_ring[(vq)->vq_qsize]) /* * Is this ring ready for I/O? 
*/ static inline int vq_ring_ready(struct vqueue_info *vq) { return (vq->vq_flags & VQ_ALLOC); } /* * Are there "available" descriptors? (This does not count * how many, just returns True if there are some.) */ static inline int vq_has_descs(struct vqueue_info *vq) { return (vq_ring_ready(vq) && vq->vq_last_avail != vq->vq_avail->va_idx); } /* * Deliver an interrupt to guest on the given virtual queue * (if possible, or a generic MSI interrupt if not using MSI-X). */ static inline void vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) { if (pci_msix_enabled(vs->vs_pi)) pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); else { VS_LOCK(vs); vs->vs_isr |= VTCFG_ISR_QUEUES; pci_generate_msi(vs->vs_pi, 0); pci_lintr_assert(vs->vs_pi); VS_UNLOCK(vs); } } static inline void vq_kick_enable(struct vqueue_info *vq) { vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; /* * Full memory barrier to make sure the store to vu_flags * happens before the load from va_idx, which results from * a subsequent call to vq_has_descs(). */ atomic_thread_fence_seq_cst(); } static inline void vq_kick_disable(struct vqueue_info *vq) { vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; } struct iovec; void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues); int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); void vi_reset_dev(struct virtio_softc *); void vi_set_io_bar(struct virtio_softc *, int); int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags); void vq_retchains(struct vqueue_info *vq, uint16_t n_chains); void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_relchain_publish(struct vqueue_info *vq); void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_endchains(struct vqueue_info *vq, int used_all_avail); uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); +#ifdef BHYVE_SNAPSHOT +int vi_pci_snapshot(struct vm_snapshot_meta *meta); +int vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi); +int vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi); +#endif #endif /* _VIRTIO_H_ */ diff --git a/usr.sbin/bhyvectl/Makefile b/usr.sbin/bhyvectl/Makefile index 0ffca5675cb9..58eaf49dae3a 100644 --- a/usr.sbin/bhyvectl/Makefile +++ b/usr.sbin/bhyvectl/Makefile @@ -1,17 +1,23 @@ # # $FreeBSD$ # +.include + PROG= bhyvectl SRCS= bhyvectl.c PACKAGE= bhyve MAN= bhyvectl.8 LIBADD= vmmapi util WARNS?= 3 CFLAGS+= -I${SRCTOP}/sys/amd64/vmm +.if ${MK_BHYVE_SNAPSHOT} != "no" +CFLAGS+= -DBHYVE_SNAPSHOT +.endif + .include diff --git a/usr.sbin/bhyvectl/bhyvectl.8 b/usr.sbin/bhyvectl/bhyvectl.8 index 035f9f6c7586..6adf87ca4537 100644 --- a/usr.sbin/bhyvectl/bhyvectl.8 +++ b/usr.sbin/bhyvectl/bhyvectl.8 @@ -1,97 +1,114 @@ .\" Copyright (c) 2015 Christian Brueffer .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. 
Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd November 13, 2016 +.Dd May 04, 2020 .Dt BHYVECTL 8 .Os .Sh NAME .Nm bhyvectl .Nd "control utility for bhyve instances" .Sh SYNOPSIS .Nm .Fl -vm= Ns Ar .Op Fl -create .Op Fl -destroy .Op Fl -get-stats .Op Fl -inject-nmi .Op Fl -force-reset .Op Fl -force-poweroff +.Op Fl -checkpoint= Ns Ar +.Op Fl -suspend= Ns Ar .Sh DESCRIPTION The .Nm command is a control utility for active .Xr bhyve 8 virtual machine instances. .Pp .Em Note : Most .Nm flags are intended for querying and setting the state of an active instance. These commands are intended for development purposes, and are not documented here. A complete list can be obtained by executing .Nm without any arguments. .Pp The user-facing options are as follows: .Bl -tag -width ".Fl d Ar argument" .It Fl -vm= Ns Ar Operate on the virtual machine .Ar . .It Fl -create Create the specified VM. .It Fl -destroy Destroy the specified VM. .It Fl -get-stats Retrieve statistics for the specified VM. .It Fl -inject-nmi Inject a non-maskable interrupt (NMI) into the VM. .It Fl -force-reset Force the VM to reset. .It Fl -force-poweroff Force the VM to power off. +.It Fl -checkpoint= Ns Ar +Save a snapshot of a virtual machine. +The guest memory contents are saved in the file given in +.Ar . +The guest device and vCPU state are saved in the file +.Ar .kern . +.It Fl -suspend= Ns Ar +Save a snapshot of a virtual machine similar to +.Fl -checkpoint . +The virtual machine will terminate after the snapshot has been +saved. .El .Sh EXIT STATUS .Ex -std .Sh EXAMPLES Destroy the VM called fbsd10: .Pp .Dl "bhyvectl --vm=fbsd10 --destroy" +.Sh COMPATIBILITY +The snapshot file format is not yet stable and is subject to future changes. +Backwards compatibility support for the current snapshot file format is not +guaranteed when future changes are made. .Sh SEE ALSO .Xr bhyve 8 , .Xr bhyveload 8 .Sh HISTORY The .Nm command first appeared in .Fx 10.1 . .Sh AUTHORS .An -nosplit The .Nm utility was written by .An Peter Grehan and .An Neel Natu . diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c index 8274e6eafccb..d2c4a1488fe8 100644 --- a/usr.sbin/bhyvectl/bhyvectl.c +++ b/usr.sbin/bhyvectl/bhyvectl.c @@ -1,2350 +1,2469 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include + #include "amd/vmcb.h" #include "intel/vmcs.h" #define MB (1UL << 20) #define GB (1UL << 30) #define REQ_ARG required_argument #define NO_ARG no_argument #define OPT_ARG optional_argument +#define CHECKPOINT_RUN_DIR "/var/run/bhyve/checkpoint" +#define MAX_VMNAME 100 + static const char *progname; static void usage(bool cpu_intel) { (void)fprintf(stderr, "Usage: %s --vm=\n" " [--cpu=]\n" " [--create]\n" " [--destroy]\n" +#ifdef BHYVE_SNAPSHOT + " [--checkpoint=]\n" + " [--suspend=]\n" +#endif " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" " [--get-desc-ds]\n" " [--set-desc-es]\n" " [--get-desc-es]\n" " [--set-desc-gs]\n" " [--get-desc-gs]\n" " [--set-desc-fs]\n" " [--get-desc-fs]\n" " [--set-desc-cs]\n" " [--get-desc-cs]\n" " [--set-desc-ss]\n" " [--get-desc-ss]\n" " [--set-desc-tr]\n" " [--get-desc-tr]\n" " [--set-desc-ldtr]\n" " [--get-desc-ldtr]\n" " [--set-desc-gdtr]\n" " [--get-desc-gdtr]\n" " [--set-desc-idtr]\n" " [--get-desc-idtr]\n" " [--run]\n" " [--capname=]\n" " [--getcap]\n" " [--setcap=<0|1>]\n" " [--desc-base=]\n" " [--desc-limit=]\n" " [--desc-access=]\n" " [--set-cr0=]\n" " [--get-cr0]\n" " [--set-cr2=]\n" " [--get-cr2]\n" " [--set-cr3=]\n" " [--get-cr3]\n" " [--set-cr4=]\n" " [--get-cr4]\n" " [--set-dr0=]\n" " [--get-dr0]\n" " [--set-dr1=]\n" " [--get-dr1]\n" " [--set-dr2=]\n" " [--get-dr2]\n" " [--set-dr3=]\n" " [--get-dr3]\n" " [--set-dr6=]\n" " [--get-dr6]\n" " [--set-dr7=]\n" " [--get-dr7]\n" " [--set-rsp=]\n" " [--get-rsp]\n" " [--set-rip=]\n" " [--get-rip]\n" " [--get-rax]\n" " [--set-rax=]\n" " [--get-rbx]\n" " [--get-rcx]\n" " [--get-rdx]\n" " [--get-rsi]\n" " [--get-rdi]\n" " [--get-rbp]\n" " [--get-r8]\n" " [--get-r9]\n" " [--get-r10]\n" " [--get-r11]\n" " [--get-r12]\n" " [--get-r13]\n" " [--get-r14]\n" " [--get-r15]\n" " [--set-rflags=]\n" " [--get-rflags]\n" " [--set-cs]\n" " [--get-cs]\n" " [--set-ds]\n" " [--get-ds]\n" " [--set-es]\n" " [--get-es]\n" " [--set-fs]\n" " [--get-fs]\n" " [--set-gs]\n" " [--get-gs]\n" " [--set-ss]\n" " [--get-ss]\n" " [--get-tr]\n" " [--get-ldtr]\n" " [--set-x2apic-state=]\n" " [--get-x2apic-state]\n" " [--unassign-pptdev=]\n" " [--set-mem=]\n" " [--get-lowmem]\n" " 
[--get-highmem]\n" " [--get-gpa-pmap]\n" " [--assert-lapic-lvt=]\n" " [--inject-nmi]\n" " [--force-reset]\n" " [--force-poweroff]\n" " [--get-rtc-time]\n" " [--set-rtc-time=]\n" " [--get-rtc-nvram]\n" " [--set-rtc-nvram=]\n" " [--rtc-nvram-offset=]\n" " [--get-active-cpus]\n" " [--get-suspended-cpus]\n" " [--get-intinfo]\n" " [--get-eptp]\n" " [--set-exception-bitmap]\n" " [--get-exception-bitmap]\n" " [--get-tsc-offset]\n" " [--get-guest-pat]\n" " [--get-io-bitmap-address]\n" " [--get-msr-bitmap]\n" " [--get-msr-bitmap-address]\n" " [--get-guest-sysenter]\n" " [--get-exit-reason]\n" " [--get-cpu-topology]\n", progname); if (cpu_intel) { (void)fprintf(stderr, " [--get-vmcs-pinbased-ctls]\n" " [--get-vmcs-procbased-ctls]\n" " [--get-vmcs-procbased-ctls2]\n" " [--get-vmcs-entry-interruption-info]\n" " [--set-vmcs-entry-interruption-info=]\n" " [--get-vmcs-guest-physical-address\n" " [--get-vmcs-guest-linear-address\n" " [--get-vmcs-host-pat]\n" " [--get-vmcs-host-cr0]\n" " [--get-vmcs-host-cr3]\n" " [--get-vmcs-host-cr4]\n" " [--get-vmcs-host-rip]\n" " [--get-vmcs-host-rsp]\n" " [--get-vmcs-cr0-mask]\n" " [--get-vmcs-cr0-shadow]\n" " [--get-vmcs-cr4-mask]\n" " [--get-vmcs-cr4-shadow]\n" " [--get-vmcs-cr3-targets]\n" " [--get-vmcs-apic-access-address]\n" " [--get-vmcs-virtual-apic-address]\n" " [--get-vmcs-tpr-threshold]\n" " [--get-vmcs-vpid]\n" " [--get-vmcs-instruction-error]\n" " [--get-vmcs-exit-ctls]\n" " [--get-vmcs-entry-ctls]\n" " [--get-vmcs-link]\n" " [--get-vmcs-exit-qualification]\n" " [--get-vmcs-exit-interruption-info]\n" " [--get-vmcs-exit-interruption-error]\n" " [--get-vmcs-interruptibility]\n" ); } else { (void)fprintf(stderr, " [--get-vmcb-intercepts]\n" " [--get-vmcb-asid]\n" " [--get-vmcb-exit-details]\n" " [--get-vmcb-tlb-ctrl]\n" " [--get-vmcb-virq]\n" " [--get-avic-apic-bar]\n" " [--get-avic-backing-page]\n" " [--get-avic-table]\n" ); } exit(1); } static int get_rtc_time, set_rtc_time; static int get_rtc_nvram, set_rtc_nvram; static int rtc_nvram_offset; static uint8_t rtc_nvram_value; static time_t rtc_secs; static int get_stats, getcap, setcap, capval, get_gpa_pmap; static int inject_nmi, assert_lapic_lvt; static int force_reset, force_poweroff; static const char *capname; static int create, destroy, get_memmap, get_memseg; static int get_intinfo; static int get_active_cpus, get_suspended_cpus; static uint64_t memsize; static int set_cr0, get_cr0, set_cr2, get_cr2, set_cr3, get_cr3; static int set_cr4, get_cr4; static int set_efer, get_efer; static int set_dr0, get_dr0; static int set_dr1, get_dr1; static int set_dr2, get_dr2; static int set_dr3, get_dr3; static int set_dr6, get_dr6; static int set_dr7, get_dr7; static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; static int set_rax, get_rax; static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp; static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15; static int set_desc_ds, get_desc_ds; static int set_desc_es, get_desc_es; static int set_desc_fs, get_desc_fs; static int set_desc_gs, get_desc_gs; static int set_desc_cs, get_desc_cs; static int set_desc_ss, get_desc_ss; static int set_desc_gdtr, get_desc_gdtr; static int set_desc_idtr, get_desc_idtr; static int set_desc_tr, get_desc_tr; static int set_desc_ldtr, get_desc_ldtr; static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; static int set_x2apic_state, get_x2apic_state; enum x2apic_state x2apic_state; static 
int unassign_pptdev, bus, slot, func; static int run; static int get_cpu_topology; +#ifdef BHYVE_SNAPSHOT +static int vm_checkpoint_opt; +static int vm_suspend_opt; +#endif /* * VMCB specific. */ static int get_vmcb_intercept, get_vmcb_exit_details, get_vmcb_tlb_ctrl; static int get_vmcb_virq, get_avic_table; /* * VMCS-specific fields */ static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2; static int get_eptp, get_io_bitmap, get_tsc_offset; static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info; static int get_vmcs_interruptibility; uint32_t vmcs_entry_interruption_info; static int get_vmcs_gpa, get_vmcs_gla; static int get_exception_bitmap, set_exception_bitmap, exception_bitmap; static int get_cr0_mask, get_cr0_shadow; static int get_cr4_mask, get_cr4_shadow; static int get_cr3_targets; static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; static int get_msr_bitmap, get_msr_bitmap_address; static int get_vpid_asid; static int get_inst_err, get_exit_ctls, get_entry_ctls; static int get_host_cr0, get_host_cr3, get_host_cr4; static int get_host_rip, get_host_rsp; static int get_guest_pat, get_host_pat; static int get_guest_sysenter, get_vmcs_link; static int get_exit_reason, get_vmcs_exit_qualification; static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; static int get_vmcs_exit_inst_length; static uint64_t desc_base; static uint32_t desc_limit, desc_access; static int get_all; static void dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) { printf("vm exit[%d]\n", vcpu); printf("\trip\t\t0x%016lx\n", vmexit->rip); printf("\tinst_length\t%d\n", vmexit->inst_length); switch (vmexit->exitcode) { case VM_EXITCODE_INOUT: printf("\treason\t\tINOUT\n"); printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT"); printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes); printf("\tflags\t\t%s%s\n", vmexit->u.inout.string ? "STRING " : "", vmexit->u.inout.rep ? 
"REP " : ""); printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); break; case VM_EXITCODE_VMX: printf("\treason\t\tVMX\n"); printf("\tstatus\t\t%d\n", vmexit->u.vmx.status); printf("\texit_reason\t0x%08x (%u)\n", vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason); printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); break; case VM_EXITCODE_SVM: printf("\treason\t\tSVM\n"); printf("\texit_reason\t\t%#lx\n", vmexit->u.svm.exitcode); printf("\texitinfo1\t\t%#lx\n", vmexit->u.svm.exitinfo1); printf("\texitinfo2\t\t%#lx\n", vmexit->u.svm.exitinfo2); break; default: printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); break; } } /* AMD 6th generation and Intel compatible MSRs */ #define MSR_AMD6TH_START 0xC0000000 #define MSR_AMD6TH_END 0xC0001FFF /* AMD 7th and 8th generation compatible MSRs */ #define MSR_AMD7TH_START 0xC0010000 #define MSR_AMD7TH_END 0xC0011FFF static const char * msr_name(uint32_t msr) { static char buf[32]; switch(msr) { case MSR_TSC: return ("MSR_TSC"); case MSR_EFER: return ("MSR_EFER"); case MSR_STAR: return ("MSR_STAR"); case MSR_LSTAR: return ("MSR_LSTAR"); case MSR_CSTAR: return ("MSR_CSTAR"); case MSR_SF_MASK: return ("MSR_SF_MASK"); case MSR_FSBASE: return ("MSR_FSBASE"); case MSR_GSBASE: return ("MSR_GSBASE"); case MSR_KGSBASE: return ("MSR_KGSBASE"); case MSR_SYSENTER_CS_MSR: return ("MSR_SYSENTER_CS_MSR"); case MSR_SYSENTER_ESP_MSR: return ("MSR_SYSENTER_ESP_MSR"); case MSR_SYSENTER_EIP_MSR: return ("MSR_SYSENTER_EIP_MSR"); case MSR_PAT: return ("MSR_PAT"); } snprintf(buf, sizeof(buf), "MSR %#08x", msr); return (buf); } static inline void print_msr_pm(uint64_t msr, int vcpu, int readable, int writeable) { if (readable || writeable) { printf("%-20s[%d]\t\t%c%c\n", msr_name(msr), vcpu, readable ? 'R' : '-', writeable ? 'W' : '-'); } } /* * Reference APM vol2, section 15.11 MSR Intercepts. */ static void dump_amd_msr_pm(const char *bitmap, int vcpu) { int byte, bit, readable, writeable; uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 4; bit = (msr % 4) * 2; /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 2048; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, writeable); /* MSR 0xC0010000 to 0xC0011FF is only for AMD */ byte += 4096; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; print_msr_pm(msr + MSR_AMD7TH_START, vcpu, readable, writeable); } } /* * Reference Intel SDM Vol3 Section 24.6.9 MSR-Bitmap Address */ static void dump_intel_msr_pm(const char *bitmap, int vcpu) { int byte, bit, readable, writeable; uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 8; bit = msr & 0x7; /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 1024; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; writeable = (bitmap[2048 + byte] & (1 << bit)) ? 
0 : 1; print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, writeable); } } static int dump_msr_bitmap(int vcpu, uint64_t addr, bool cpu_intel) { int error, fd, map_size; const char *bitmap; error = -1; bitmap = MAP_FAILED; fd = open("/dev/mem", O_RDONLY, 0); if (fd < 0) { perror("Couldn't open /dev/mem"); goto done; } if (cpu_intel) map_size = PAGE_SIZE; else map_size = 2 * PAGE_SIZE; bitmap = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, addr); if (bitmap == MAP_FAILED) { perror("mmap failed"); goto done; } if (cpu_intel) dump_intel_msr_pm(bitmap, vcpu); else dump_amd_msr_pm(bitmap, vcpu); error = 0; done: if (bitmap != MAP_FAILED) munmap((void *)bitmap, map_size); if (fd >= 0) close(fd); return (error); } static int vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) { return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val)); } static int vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) { return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); } static int vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, uint64_t *ret_val) { return (vm_get_register(ctx, vcpu, VMCB_ACCESS(off, bytes), ret_val)); } static int vm_set_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, uint64_t val) { return (vm_set_register(ctx, vcpu, VMCB_ACCESS(off, bytes), val)); } enum { VMNAME = 1000, /* avoid collision with return values from getopt */ VCPU, SET_MEM, SET_EFER, SET_CR0, SET_CR2, SET_CR3, SET_CR4, SET_DR0, SET_DR1, SET_DR2, SET_DR3, SET_DR6, SET_DR7, SET_RSP, SET_RIP, SET_RAX, SET_RFLAGS, DESC_BASE, DESC_LIMIT, DESC_ACCESS, SET_CS, SET_DS, SET_ES, SET_FS, SET_GS, SET_SS, SET_TR, SET_LDTR, SET_X2APIC_STATE, SET_EXCEPTION_BITMAP, SET_VMCS_ENTRY_INTERRUPTION_INFO, SET_CAP, CAPNAME, UNASSIGN_PPTDEV, GET_GPA_PMAP, ASSERT_LAPIC_LVT, SET_RTC_TIME, SET_RTC_NVRAM, RTC_NVRAM_OFFSET, +#ifdef BHYVE_SNAPSHOT + SET_CHECKPOINT_FILE, + SET_SUSPEND_FILE, +#endif }; static void print_cpus(const char *banner, const cpuset_t *cpus) { int i, first; first = 1; printf("%s:\t", banner); if (!CPU_EMPTY(cpus)) { for (i = 0; i < CPU_SETSIZE; i++) { if (CPU_ISSET(i, cpus)) { printf("%s%d", first ? 
" " : ", ", i); first = 0; } } } else printf(" (none)"); printf("\n"); } static void print_intinfo(const char *banner, uint64_t info) { int type; printf("%s:\t", banner); if (info & VM_INTINFO_VALID) { type = info & VM_INTINFO_TYPE; switch (type) { case VM_INTINFO_HWINTR: printf("extint"); break; case VM_INTINFO_NMI: printf("nmi"); break; case VM_INTINFO_SWINTR: printf("swint"); break; default: printf("exception"); break; } printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); if (info & VM_INTINFO_DEL_ERRCODE) printf(" errcode %#x", (u_int)(info >> 32)); } else { printf("n/a"); } printf("\n"); } static bool cpu_vendor_intel(void) { u_int regs[4]; char cpu_vendor[13]; do_cpuid(0, regs); ((u_int *)&cpu_vendor)[0] = regs[1]; ((u_int *)&cpu_vendor)[1] = regs[3]; ((u_int *)&cpu_vendor)[2] = regs[2]; cpu_vendor[12] = '\0'; if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { return (false); } else if (strcmp(cpu_vendor, "HygonGenuine") == 0) { return (false); } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { return (true); } else { fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); exit(1); } } static int get_all_registers(struct vmctx *ctx, int vcpu) { uint64_t cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; uint64_t rsp, rip, rflags, efer; uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; uint64_t r8, r9, r10, r11, r12, r13, r14, r15; int error = 0; if (!error && (get_efer || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); if (error == 0) printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); } if (!error && (get_cr0 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); if (error == 0) printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); } if (!error && (get_cr2 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR2, &cr2); if (error == 0) printf("cr2[%d]\t\t0x%016lx\n", vcpu, cr2); } if (!error && (get_cr3 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); if (error == 0) printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); } if (!error && (get_cr4 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); if (error == 0) printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); } if (!error && (get_dr0 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR0, &dr0); if (error == 0) printf("dr0[%d]\t\t0x%016lx\n", vcpu, dr0); } if (!error && (get_dr1 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR1, &dr1); if (error == 0) printf("dr1[%d]\t\t0x%016lx\n", vcpu, dr1); } if (!error && (get_dr2 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR2, &dr2); if (error == 0) printf("dr2[%d]\t\t0x%016lx\n", vcpu, dr2); } if (!error && (get_dr3 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR3, &dr3); if (error == 0) printf("dr3[%d]\t\t0x%016lx\n", vcpu, dr3); } if (!error && (get_dr6 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR6, &dr6); if (error == 0) printf("dr6[%d]\t\t0x%016lx\n", vcpu, dr6); } if (!error && (get_dr7 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7); if (error == 0) printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7); } if (!error && (get_rsp || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp); if (error == 0) printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp); } if (!error && (get_rip || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); if (error == 0) printf("rip[%d]\t\t0x%016lx\n", vcpu, rip); } if (!error && (get_rax || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax); 
if (error == 0) printf("rax[%d]\t\t0x%016lx\n", vcpu, rax); } if (!error && (get_rbx || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx); if (error == 0) printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx); } if (!error && (get_rcx || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx); if (error == 0) printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx); } if (!error && (get_rdx || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx); if (error == 0) printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx); } if (!error && (get_rsi || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi); if (error == 0) printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi); } if (!error && (get_rdi || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi); if (error == 0) printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi); } if (!error && (get_rbp || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp); if (error == 0) printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp); } if (!error && (get_r8 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8); if (error == 0) printf("r8[%d]\t\t0x%016lx\n", vcpu, r8); } if (!error && (get_r9 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9); if (error == 0) printf("r9[%d]\t\t0x%016lx\n", vcpu, r9); } if (!error && (get_r10 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10); if (error == 0) printf("r10[%d]\t\t0x%016lx\n", vcpu, r10); } if (!error && (get_r11 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11); if (error == 0) printf("r11[%d]\t\t0x%016lx\n", vcpu, r11); } if (!error && (get_r12 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12); if (error == 0) printf("r12[%d]\t\t0x%016lx\n", vcpu, r12); } if (!error && (get_r13 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13); if (error == 0) printf("r13[%d]\t\t0x%016lx\n", vcpu, r13); } if (!error && (get_r14 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14); if (error == 0) printf("r14[%d]\t\t0x%016lx\n", vcpu, r14); } if (!error && (get_r15 || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15); if (error == 0) printf("r15[%d]\t\t0x%016lx\n", vcpu, r15); } if (!error && (get_rflags || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, &rflags); if (error == 0) printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); } return (error); } static int get_all_segments(struct vmctx *ctx, int vcpu) { uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; int error = 0; if (!error && (get_desc_ds || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_es || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_fs || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_gs || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, 
desc_access); } } if (!error && (get_desc_ss || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_cs || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_tr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_ldtr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", vcpu, desc_base, desc_limit, desc_access); } } if (!error && (get_desc_gdtr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", vcpu, desc_base, desc_limit); } } if (!error && (get_desc_idtr || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR, &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("idtr[%d]\t\t0x%016lx/0x%08x\n", vcpu, desc_base, desc_limit); } } if (!error && (get_cs || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs); if (error == 0) printf("cs[%d]\t\t0x%04lx\n", vcpu, cs); } if (!error && (get_ds || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds); if (error == 0) printf("ds[%d]\t\t0x%04lx\n", vcpu, ds); } if (!error && (get_es || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es); if (error == 0) printf("es[%d]\t\t0x%04lx\n", vcpu, es); } if (!error && (get_fs || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs); if (error == 0) printf("fs[%d]\t\t0x%04lx\n", vcpu, fs); } if (!error && (get_gs || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs); if (error == 0) printf("gs[%d]\t\t0x%04lx\n", vcpu, gs); } if (!error && (get_ss || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss); if (error == 0) printf("ss[%d]\t\t0x%04lx\n", vcpu, ss); } if (!error && (get_tr || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr); if (error == 0) printf("tr[%d]\t\t0x%04lx\n", vcpu, tr); } if (!error && (get_ldtr || get_all)) { error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr); if (error == 0) printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); } return (error); } static int get_misc_vmcs(struct vmctx *ctx, int vcpu) { uint64_t ctl, cr0, cr3, cr4, rsp, rip, pat, addr, u64; int error = 0; if (!error && (get_cr0_mask || get_all)) { uint64_t cr0mask; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask); if (error == 0) printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask); } if (!error && (get_cr0_shadow || get_all)) { uint64_t cr0shadow; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW, &cr0shadow); if (error == 0) printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow); } if (!error && (get_cr4_mask || get_all)) { uint64_t cr4mask; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask); if (error == 0) printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask); } if (!error && (get_cr4_shadow || get_all)) { uint64_t cr4shadow; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW, 
&cr4shadow); if (error == 0) printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow); } if (!error && (get_cr3_targets || get_all)) { uint64_t target_count, target_addr; error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, &target_count); if (error == 0) { printf("cr3_target_count[%d]\t0x%016lx\n", vcpu, target_count); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0, &target_addr); if (error == 0) { printf("cr3_target0[%d]\t\t0x%016lx\n", vcpu, target_addr); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1, &target_addr); if (error == 0) { printf("cr3_target1[%d]\t\t0x%016lx\n", vcpu, target_addr); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2, &target_addr); if (error == 0) { printf("cr3_target2[%d]\t\t0x%016lx\n", vcpu, target_addr); } error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3, &target_addr); if (error == 0) { printf("cr3_target3[%d]\t\t0x%016lx\n", vcpu, target_addr); } } if (!error && (get_pinbased_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); if (error == 0) printf("pinbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_procbased_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_PRI_PROC_BASED_CTLS, &ctl); if (error == 0) printf("procbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_procbased_ctls2 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_SEC_PROC_BASED_CTLS, &ctl); if (error == 0) printf("procbased_ctls2[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_vmcs_gla || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_LINEAR_ADDRESS, &u64); if (error == 0) printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); } if (!error && (get_vmcs_gpa || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_PHYSICAL_ADDRESS, &u64); if (error == 0) printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); } if (!error && (get_vmcs_entry_interruption_info || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); if (error == 0) { printf("entry_interruption_info[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_tpr_threshold || get_all)) { uint64_t threshold; error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, &threshold); if (error == 0) printf("tpr_threshold[%d]\t0x%016lx\n", vcpu, threshold); } if (!error && (get_inst_err || get_all)) { uint64_t insterr; error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, &insterr); if (error == 0) { printf("instruction_error[%d]\t0x%016lx\n", vcpu, insterr); } } if (!error && (get_exit_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); if (error == 0) printf("exit_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_entry_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); if (error == 0) printf("entry_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_host_pat || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat); if (error == 0) printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); } if (!error && (get_host_cr0 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); if (error == 0) printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0); } if (!error && (get_host_cr3 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3); if (error == 0) printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3); } if (!error && (get_host_cr4 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4); if (error == 0) printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4); } if (!error && (get_host_rip || 
get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip); if (error == 0) printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip); } if (!error && (get_host_rsp || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp); if (error == 0) printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); } if (!error && (get_vmcs_link || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); if (error == 0) printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); } if (!error && (get_vmcs_exit_interruption_info || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64); if (error == 0) { printf("vmcs_exit_interruption_info[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_exit_interruption_error || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE, &u64); if (error == 0) { printf("vmcs_exit_interruption_error[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_interruptibility || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_INTERRUPTIBILITY, &u64); if (error == 0) { printf("vmcs_guest_interruptibility[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_exit_inst_length || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INSTRUCTION_LENGTH, &u64); if (error == 0) printf("vmcs_exit_inst_length[%d]\t0x%08x\n", vcpu, (uint32_t)u64); } if (!error && (get_vmcs_exit_qualification || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, &u64); if (error == 0) printf("vmcs_exit_qualification[%d]\t0x%016lx\n", vcpu, u64); } return (error); } static int get_misc_vmcb(struct vmctx *ctx, int vcpu) { uint64_t ctl, addr; int error = 0; if (!error && (get_vmcb_intercept || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_CR_INTERCEPT, 4, &ctl); if (error == 0) printf("cr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_DR_INTERCEPT, 4, &ctl); if (error == 0) printf("dr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, &ctl); if (error == 0) printf("exc_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST1_INTERCEPT, 4, &ctl); if (error == 0) printf("inst1_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST2_INTERCEPT, 4, &ctl); if (error == 0) printf("inst2_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); } if (!error && (get_vmcb_tlb_ctrl || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TLB_CTRL, 4, &ctl); if (error == 0) printf("TLB ctrl[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_vmcb_exit_details || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO1, 8, &ctl); if (error == 0) printf("exitinfo1[%d]\t0x%016lx\n", vcpu, ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO2, 8, &ctl); if (error == 0) printf("exitinfo2[%d]\t0x%016lx\n", vcpu, ctl); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINTINFO, 8, &ctl); if (error == 0) printf("exitintinfo[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_vmcb_virq || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_VIRQ, 8, &ctl); if (error == 0) printf("v_irq/tpr[%d]\t0x%016lx\n", vcpu, ctl); } if (!error && (get_apic_access_addr || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_BAR, 8, &addr); if (error == 0) printf("AVIC apic_bar[%d]\t0x%016lx\n", vcpu, addr); } if (!error && (get_virtual_apic_addr || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PAGE, 8, &addr); if (error 
== 0) printf("AVIC backing page[%d]\t0x%016lx\n", vcpu, addr); } if (!error && (get_avic_table || get_all)) { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_LT, 8, &addr); if (error == 0) printf("AVIC logical table[%d]\t0x%016lx\n", vcpu, addr); error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PT, 8, &addr); if (error == 0) printf("AVIC physical table[%d]\t0x%016lx\n", vcpu, addr); } return (error); } static struct option * setup_options(bool cpu_intel) { const struct option common_opts[] = { { "vm", REQ_ARG, 0, VMNAME }, { "cpu", REQ_ARG, 0, VCPU }, { "set-mem", REQ_ARG, 0, SET_MEM }, { "set-efer", REQ_ARG, 0, SET_EFER }, { "set-cr0", REQ_ARG, 0, SET_CR0 }, { "set-cr2", REQ_ARG, 0, SET_CR2 }, { "set-cr3", REQ_ARG, 0, SET_CR3 }, { "set-cr4", REQ_ARG, 0, SET_CR4 }, { "set-dr0", REQ_ARG, 0, SET_DR0 }, { "set-dr1", REQ_ARG, 0, SET_DR1 }, { "set-dr2", REQ_ARG, 0, SET_DR2 }, { "set-dr3", REQ_ARG, 0, SET_DR3 }, { "set-dr6", REQ_ARG, 0, SET_DR6 }, { "set-dr7", REQ_ARG, 0, SET_DR7 }, { "set-rsp", REQ_ARG, 0, SET_RSP }, { "set-rip", REQ_ARG, 0, SET_RIP }, { "set-rax", REQ_ARG, 0, SET_RAX }, { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, { "desc-base", REQ_ARG, 0, DESC_BASE }, { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, { "desc-access",REQ_ARG, 0, DESC_ACCESS }, { "set-cs", REQ_ARG, 0, SET_CS }, { "set-ds", REQ_ARG, 0, SET_DS }, { "set-es", REQ_ARG, 0, SET_ES }, { "set-fs", REQ_ARG, 0, SET_FS }, { "set-gs", REQ_ARG, 0, SET_GS }, { "set-ss", REQ_ARG, 0, SET_SS }, { "set-tr", REQ_ARG, 0, SET_TR }, { "set-ldtr", REQ_ARG, 0, SET_LDTR }, { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, { "set-exception-bitmap", REQ_ARG, 0, SET_EXCEPTION_BITMAP }, { "capname", REQ_ARG, 0, CAPNAME }, { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, { "setcap", REQ_ARG, 0, SET_CAP }, { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, { "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT }, { "get-rtc-time", NO_ARG, &get_rtc_time, 1 }, { "set-rtc-time", REQ_ARG, 0, SET_RTC_TIME }, { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET }, { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 }, { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM }, { "getcap", NO_ARG, &getcap, 1 }, { "get-stats", NO_ARG, &get_stats, 1 }, { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, { "get-desc-es",NO_ARG, &get_desc_es, 1 }, { "set-desc-es",NO_ARG, &set_desc_es, 1 }, { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, { "get-memmap", NO_ARG, &get_memmap, 1 }, { "get-memseg", NO_ARG, &get_memseg, 1 }, { "get-efer", NO_ARG, &get_efer, 1 }, { "get-cr0", NO_ARG, &get_cr0, 1 }, { "get-cr2", NO_ARG, &get_cr2, 1 }, { "get-cr3", NO_ARG, &get_cr3, 1 }, { "get-cr4", NO_ARG, &get_cr4, 1 }, { "get-dr0", NO_ARG, &get_dr0, 1 }, { "get-dr1", NO_ARG, &get_dr1, 1 }, { "get-dr2", NO_ARG, &get_dr2, 1 }, { "get-dr3", NO_ARG, &get_dr3, 1 }, { "get-dr6", NO_ARG, &get_dr6, 1 }, { "get-dr7", 
NO_ARG, &get_dr7, 1 }, { "get-rsp", NO_ARG, &get_rsp, 1 }, { "get-rip", NO_ARG, &get_rip, 1 }, { "get-rax", NO_ARG, &get_rax, 1 }, { "get-rbx", NO_ARG, &get_rbx, 1 }, { "get-rcx", NO_ARG, &get_rcx, 1 }, { "get-rdx", NO_ARG, &get_rdx, 1 }, { "get-rsi", NO_ARG, &get_rsi, 1 }, { "get-rdi", NO_ARG, &get_rdi, 1 }, { "get-rbp", NO_ARG, &get_rbp, 1 }, { "get-r8", NO_ARG, &get_r8, 1 }, { "get-r9", NO_ARG, &get_r9, 1 }, { "get-r10", NO_ARG, &get_r10, 1 }, { "get-r11", NO_ARG, &get_r11, 1 }, { "get-r12", NO_ARG, &get_r12, 1 }, { "get-r13", NO_ARG, &get_r13, 1 }, { "get-r14", NO_ARG, &get_r14, 1 }, { "get-r15", NO_ARG, &get_r15, 1 }, { "get-rflags", NO_ARG, &get_rflags, 1 }, { "get-cs", NO_ARG, &get_cs, 1 }, { "get-ds", NO_ARG, &get_ds, 1 }, { "get-es", NO_ARG, &get_es, 1 }, { "get-fs", NO_ARG, &get_fs, 1 }, { "get-gs", NO_ARG, &get_gs, 1 }, { "get-ss", NO_ARG, &get_ss, 1 }, { "get-tr", NO_ARG, &get_tr, 1 }, { "get-ldtr", NO_ARG, &get_ldtr, 1 }, { "get-eptp", NO_ARG, &get_eptp, 1 }, { "get-exception-bitmap", NO_ARG, &get_exception_bitmap, 1 }, { "get-io-bitmap-address", NO_ARG, &get_io_bitmap, 1 }, { "get-tsc-offset", NO_ARG, &get_tsc_offset, 1 }, { "get-msr-bitmap", NO_ARG, &get_msr_bitmap, 1 }, { "get-msr-bitmap-address", NO_ARG, &get_msr_bitmap_address, 1 }, { "get-guest-pat", NO_ARG, &get_guest_pat, 1 }, { "get-guest-sysenter", NO_ARG, &get_guest_sysenter, 1 }, { "get-exit-reason", NO_ARG, &get_exit_reason, 1 }, { "get-x2apic-state", NO_ARG, &get_x2apic_state, 1 }, { "get-all", NO_ARG, &get_all, 1 }, { "run", NO_ARG, &run, 1 }, { "create", NO_ARG, &create, 1 }, { "destroy", NO_ARG, &destroy, 1 }, { "inject-nmi", NO_ARG, &inject_nmi, 1 }, { "force-reset", NO_ARG, &force_reset, 1 }, { "force-poweroff", NO_ARG, &force_poweroff, 1 }, { "get-active-cpus", NO_ARG, &get_active_cpus, 1 }, { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, { "get-intinfo", NO_ARG, &get_intinfo, 1 }, { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, +#ifdef BHYVE_SNAPSHOT + { "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE}, + { "suspend", REQ_ARG, 0, SET_SUSPEND_FILE}, +#endif }; const struct option intel_opts[] = { { "get-vmcs-pinbased-ctls", NO_ARG, &get_pinbased_ctls, 1 }, { "get-vmcs-procbased-ctls", NO_ARG, &get_procbased_ctls, 1 }, { "get-vmcs-procbased-ctls2", NO_ARG, &get_procbased_ctls2, 1 }, { "get-vmcs-guest-linear-address", NO_ARG, &get_vmcs_gla, 1 }, { "get-vmcs-guest-physical-address", NO_ARG, &get_vmcs_gpa, 1 }, { "get-vmcs-entry-interruption-info", NO_ARG, &get_vmcs_entry_interruption_info, 1}, { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, { "get-vmcs-cr4-shadow", NO_ARG, &get_cr4_shadow, 1 }, { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1 }, { "get-vmcs-tpr-threshold", NO_ARG, &get_tpr_threshold, 1 }, { "get-vmcs-vpid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, { "get-vmcs-entry-ctls", NO_ARG, &get_entry_ctls, 1 }, { "get-vmcs-instruction-error", NO_ARG, &get_inst_err, 1 }, { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, { "get-vmcs-host-cr0", NO_ARG, &get_host_cr0, 1 }, { "set-vmcs-entry-interruption-info", REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, { "get-vmcs-exit-qualification", NO_ARG, &get_vmcs_exit_qualification, 1 }, { "get-vmcs-exit-inst-length", NO_ARG, &get_vmcs_exit_inst_length, 1 }, { "get-vmcs-interruptibility", NO_ARG, &get_vmcs_interruptibility, 1 }, { "get-vmcs-exit-interruption-error", NO_ARG, 
&get_vmcs_exit_interruption_error, 1 }, { "get-vmcs-exit-interruption-info", NO_ARG, &get_vmcs_exit_interruption_info, 1 }, { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, { "get-vmcs-host-cr3", NO_ARG, &get_host_cr3, 1 }, { "get-vmcs-host-cr4", NO_ARG, &get_host_cr4, 1 }, { "get-vmcs-host-rip", NO_ARG, &get_host_rip, 1 }, { "get-vmcs-host-rsp", NO_ARG, &get_host_rsp, 1 }, { "get-apic-access-address", NO_ARG, &get_apic_access_addr, 1}, { "get-virtual-apic-address", NO_ARG, &get_virtual_apic_addr, 1} }; const struct option amd_opts[] = { { "get-vmcb-intercepts", NO_ARG, &get_vmcb_intercept, 1 }, { "get-vmcb-asid", NO_ARG, &get_vpid_asid, 1 }, { "get-vmcb-exit-details", NO_ARG, &get_vmcb_exit_details, 1 }, { "get-vmcb-tlb-ctrl", NO_ARG, &get_vmcb_tlb_ctrl, 1 }, { "get-vmcb-virq", NO_ARG, &get_vmcb_virq, 1 }, { "get-avic-apic-bar", NO_ARG, &get_apic_access_addr, 1 }, { "get-avic-backing-page", NO_ARG, &get_virtual_apic_addr, 1 }, { "get-avic-table", NO_ARG, &get_avic_table, 1 } }; const struct option null_opt = { NULL, 0, NULL, 0 }; struct option *all_opts; char *cp; int optlen; optlen = sizeof(common_opts); if (cpu_intel) optlen += sizeof(intel_opts); else optlen += sizeof(amd_opts); optlen += sizeof(null_opt); all_opts = malloc(optlen); cp = (char *)all_opts; memcpy(cp, common_opts, sizeof(common_opts)); cp += sizeof(common_opts); if (cpu_intel) { memcpy(cp, intel_opts, sizeof(intel_opts)); cp += sizeof(intel_opts); } else { memcpy(cp, amd_opts, sizeof(amd_opts)); cp += sizeof(amd_opts); } memcpy(cp, &null_opt, sizeof(null_opt)); cp += sizeof(null_opt); return (all_opts); } static const char * wday_str(int idx) { static const char *weekdays[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }; if (idx >= 0 && idx < 7) return (weekdays[idx]); else return ("UNK"); } static const char * mon_str(int idx) { static const char *months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; if (idx >= 0 && idx < 12) return (months[idx]); else return ("UNK"); } static int show_memmap(struct vmctx *ctx) { char name[SPECNAMELEN + 1], numbuf[8]; vm_ooffset_t segoff; vm_paddr_t gpa; size_t maplen, seglen; int error, flags, prot, segid, delim; printf("Address Length Segment Offset "); printf("Prot Flags\n"); gpa = 0; while (1) { error = vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen, &prot, &flags); if (error) return (errno == ENOENT ? 0 : error); error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); if (error) return (error); printf("%-12lX", gpa); humanize_number(numbuf, sizeof(numbuf), maplen, "B", HN_AUTOSCALE, HN_NOSPACE); printf("%-12s", numbuf); printf("%-12s", name[0] ? name : "sysmem"); printf("%-12lX", segoff); printf("%c%c%c ", prot & PROT_READ ? 'R' : '-', prot & PROT_WRITE ? 'W' : '-', prot & PROT_EXEC ? 'X' : '-'); delim = '\0'; if (flags & VM_MEMMAP_F_WIRED) { printf("%cwired", delim); delim = '/'; } if (flags & VM_MEMMAP_F_IOMMU) { printf("%ciommu", delim); delim = '/'; } printf("\n"); gpa += maplen; } } static int show_memseg(struct vmctx *ctx) { char name[SPECNAMELEN + 1], numbuf[8]; size_t seglen; int error, segid; printf("ID Length Name\n"); segid = 0; while (1) { error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); if (error) return (errno == EINVAL ? 0 : error); if (seglen) { printf("%-4d", segid); humanize_number(numbuf, sizeof(numbuf), seglen, "B", HN_AUTOSCALE, HN_NOSPACE); printf("%-12s", numbuf); printf("%s", name[0] ? 
name : "sysmem"); printf("\n"); } segid++; } } +#ifdef BHYVE_SNAPSHOT +static int +send_checkpoint_op_req(struct vmctx *ctx, struct checkpoint_op *op) +{ + struct sockaddr_un addr; + int socket_fd, len, len_sent, total_sent; + int err = 0; + char vmname_buf[MAX_VMNAME]; + + socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (socket_fd < 0) { + perror("Error creating bhyvectl socket"); + err = -1; + goto done; + } + + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + + err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); + if (err != 0) { + perror("Failed to get VM name"); + goto done; + } + + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", CHECKPOINT_RUN_DIR, vmname_buf); + + if (connect(socket_fd, (struct sockaddr *)&addr, + sizeof(struct sockaddr_un)) != 0) { + perror("Connect to VM socket failed"); + err = -1; + goto done; + } + + len = sizeof(*op); + total_sent = 0; + while ((len_sent = send(socket_fd, (char *)op + total_sent, len - total_sent, 0)) > 0) { + total_sent += len_sent; + } + + if (len_sent < 0) { + perror("Failed to send checkpoint operation request"); + err = -1; + } + +done: + if (socket_fd > 0) + close(socket_fd); + return (err); +} + +static int +send_start_checkpoint(struct vmctx *ctx, const char *checkpoint_file) +{ + struct checkpoint_op op; + + op.op = START_CHECKPOINT; + strncpy(op.snapshot_filename, checkpoint_file, MAX_SNAPSHOT_VMNAME); + op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0; + + return (send_checkpoint_op_req(ctx, &op)); +} + +static int +send_start_suspend(struct vmctx *ctx, const char *suspend_file) +{ + struct checkpoint_op op; + + op.op = START_SUSPEND; + strncpy(op.snapshot_filename, suspend_file, MAX_SNAPSHOT_VMNAME); + op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0; + + return (send_checkpoint_op_req(ctx, &op)); +} +#endif + int main(int argc, char *argv[]) { char *vmname; int error, ch, vcpu, ptenum; vm_paddr_t gpa_pmap; struct vm_exit vmexit; uint64_t rax, cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; uint64_t rsp, rip, rflags, efer, pat; uint64_t eptp, bm, addr, u64, pteval[4], *pte, info[2]; struct vmctx *ctx; cpuset_t cpus; bool cpu_intel; uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; struct tm tm; struct option *opts; +#ifdef BHYVE_SNAPSHOT + char *checkpoint_file, *suspend_file; +#endif cpu_intel = cpu_vendor_intel(); opts = setup_options(cpu_intel); vcpu = 0; vmname = NULL; assert_lapic_lvt = -1; progname = basename(argv[0]); while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { switch (ch) { case 0: break; case VMNAME: vmname = optarg; break; case VCPU: vcpu = atoi(optarg); break; case SET_MEM: memsize = atoi(optarg) * MB; memsize = roundup(memsize, 2 * MB); break; case SET_EFER: efer = strtoul(optarg, NULL, 0); set_efer = 1; break; case SET_CR0: cr0 = strtoul(optarg, NULL, 0); set_cr0 = 1; break; case SET_CR2: cr2 = strtoul(optarg, NULL, 0); set_cr2 = 1; break; case SET_CR3: cr3 = strtoul(optarg, NULL, 0); set_cr3 = 1; break; case SET_CR4: cr4 = strtoul(optarg, NULL, 0); set_cr4 = 1; break; case SET_DR0: dr0 = strtoul(optarg, NULL, 0); set_dr0 = 1; break; case SET_DR1: dr1 = strtoul(optarg, NULL, 0); set_dr1 = 1; break; case SET_DR2: dr2 = strtoul(optarg, NULL, 0); set_dr2 = 1; break; case SET_DR3: dr3 = strtoul(optarg, NULL, 0); set_dr3 = 1; break; case SET_DR6: dr6 = strtoul(optarg, NULL, 0); set_dr6 = 1; break; case SET_DR7: dr7 = strtoul(optarg, NULL, 0); set_dr7 = 1; break; case SET_RSP: rsp = strtoul(optarg, NULL, 0); set_rsp = 1; break; case SET_RIP: rip = strtoul(optarg, NULL, 
0); set_rip = 1; break; case SET_RAX: rax = strtoul(optarg, NULL, 0); set_rax = 1; break; case SET_RFLAGS: rflags = strtoul(optarg, NULL, 0); set_rflags = 1; break; case DESC_BASE: desc_base = strtoul(optarg, NULL, 0); break; case DESC_LIMIT: desc_limit = strtoul(optarg, NULL, 0); break; case DESC_ACCESS: desc_access = strtoul(optarg, NULL, 0); break; case SET_CS: cs = strtoul(optarg, NULL, 0); set_cs = 1; break; case SET_DS: ds = strtoul(optarg, NULL, 0); set_ds = 1; break; case SET_ES: es = strtoul(optarg, NULL, 0); set_es = 1; break; case SET_FS: fs = strtoul(optarg, NULL, 0); set_fs = 1; break; case SET_GS: gs = strtoul(optarg, NULL, 0); set_gs = 1; break; case SET_SS: ss = strtoul(optarg, NULL, 0); set_ss = 1; break; case SET_TR: tr = strtoul(optarg, NULL, 0); set_tr = 1; break; case SET_LDTR: ldtr = strtoul(optarg, NULL, 0); set_ldtr = 1; break; case SET_X2APIC_STATE: x2apic_state = strtol(optarg, NULL, 0); set_x2apic_state = 1; break; case SET_EXCEPTION_BITMAP: exception_bitmap = strtoul(optarg, NULL, 0); set_exception_bitmap = 1; break; case SET_VMCS_ENTRY_INTERRUPTION_INFO: vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); set_vmcs_entry_interruption_info = 1; break; case SET_CAP: capval = strtoul(optarg, NULL, 0); setcap = 1; break; case SET_RTC_TIME: rtc_secs = strtoul(optarg, NULL, 0); set_rtc_time = 1; break; case SET_RTC_NVRAM: rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0); set_rtc_nvram = 1; break; case RTC_NVRAM_OFFSET: rtc_nvram_offset = strtoul(optarg, NULL, 0); break; case GET_GPA_PMAP: gpa_pmap = strtoul(optarg, NULL, 0); get_gpa_pmap = 1; break; case CAPNAME: capname = optarg; break; case UNASSIGN_PPTDEV: unassign_pptdev = 1; if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) usage(cpu_intel); break; case ASSERT_LAPIC_LVT: assert_lapic_lvt = atoi(optarg); break; +#ifdef BHYVE_SNAPSHOT + case SET_CHECKPOINT_FILE: + vm_checkpoint_opt = 1; + checkpoint_file = optarg; + break; + case SET_SUSPEND_FILE: + vm_suspend_opt = 1; + suspend_file = optarg; + break; +#endif default: usage(cpu_intel); } } argc -= optind; argv += optind; if (vmname == NULL) usage(cpu_intel); error = 0; if (!error && create) error = vm_create(vmname); if (!error) { ctx = vm_open(vmname); if (ctx == NULL) { printf("VM:%s is not created.\n", vmname); exit (1); } } if (!error && memsize) error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (!error && set_efer) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); if (!error && set_cr0) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); if (!error && set_cr2) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR2, cr2); if (!error && set_cr3) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); if (!error && set_cr4) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); if (!error && set_dr0) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR0, dr0); if (!error && set_dr1) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR1, dr1); if (!error && set_dr2) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR2, dr2); if (!error && set_dr3) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR3, dr3); if (!error && set_dr6) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR6, dr6); if (!error && set_dr7) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); if (!error && set_rsp) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); if (!error && set_rip) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); if (!error && set_rax) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); if (!error && 
set_rflags) { error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); } if (!error && set_desc_ds) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); } if (!error && set_desc_es) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); } if (!error && set_desc_ss) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); } if (!error && set_desc_cs) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); } if (!error && set_desc_fs) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); } if (!error && set_desc_gs) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); } if (!error && set_desc_tr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, desc_base, desc_limit, desc_access); } if (!error && set_desc_ldtr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); } if (!error && set_desc_gdtr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, 0); } if (!error && set_desc_idtr) { error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, 0); } if (!error && set_cs) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); if (!error && set_ds) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); if (!error && set_es) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); if (!error && set_fs) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); if (!error && set_gs) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); if (!error && set_ss) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); if (!error && set_tr) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); if (!error && set_ldtr) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); if (!error && set_x2apic_state) error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); if (!error && unassign_pptdev) error = vm_unassign_pptdev(ctx, bus, slot, func); if (!error && set_exception_bitmap) { if (cpu_intel) error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, exception_bitmap); else error = vm_set_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, exception_bitmap); } if (!error && cpu_intel && set_vmcs_entry_interruption_info) { error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, vmcs_entry_interruption_info); } if (!error && inject_nmi) { error = vm_inject_nmi(ctx, vcpu); } if (!error && assert_lapic_lvt != -1) { error = vm_lapic_local_irq(ctx, vcpu, assert_lapic_lvt); } if (!error && (get_memseg || get_all)) error = show_memseg(ctx); if (!error && (get_memmap || get_all)) error = show_memmap(ctx); if (!error) error = get_all_registers(ctx, vcpu); if (!error) error = get_all_segments(ctx, vcpu); if (!error) { if (cpu_intel) error = get_misc_vmcs(ctx, vcpu); else error = get_misc_vmcb(ctx, vcpu); } if (!error && (get_x2apic_state || get_all)) { error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); if (error == 0) printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); } if (!error && (get_eptp || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_NPT_BASE, 8, &eptp); if (error == 0) printf("%s[%d]\t\t0x%016lx\n", cpu_intel ? 
"eptp" : "rvi/npt", vcpu, eptp); } if (!error && (get_exception_bitmap || get_all)) { if(cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, &bm); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, &bm); if (error == 0) printf("exception_bitmap[%d]\t%#lx\n", vcpu, bm); } if (!error && (get_io_bitmap || get_all)) { if (cpu_intel) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm); if (error == 0) printf("io_bitmap_a[%d]\t%#lx\n", vcpu, bm); error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm); if (error == 0) printf("io_bitmap_b[%d]\t%#lx\n", vcpu, bm); } else { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_IO_PERM, 8, &bm); if (error == 0) printf("io_bitmap[%d]\t%#lx\n", vcpu, bm); } } if (!error && (get_tsc_offset || get_all)) { uint64_t tscoff; if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TSC_OFFSET, 8, &tscoff); if (error == 0) printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); } if (!error && (get_msr_bitmap_address || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_MSR_PERM, 8, &addr); if (error == 0) printf("msr_bitmap[%d]\t\t%#lx\n", vcpu, addr); } if (!error && (get_msr_bitmap || get_all)) { if (cpu_intel) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); } else { error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_MSR_PERM, 8, &addr); } if (error == 0) error = dump_msr_bitmap(vcpu, addr, cpu_intel); } if (!error && (get_vpid_asid || get_all)) { uint64_t vpid; if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, 4, &vpid); if (error == 0) printf("%s[%d]\t\t0x%04lx\n", cpu_intel ? 
"vpid" : "asid", vcpu, vpid); } if (!error && (get_guest_pat || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_GUEST_PAT, 8, &pat); if (error == 0) printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); } if (!error && (get_guest_sysenter || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_SYSENTER_CS, &cs); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_SYSENTER_CS, 8, &cs); if (error == 0) printf("guest_sysenter_cs[%d]\t%#lx\n", vcpu, cs); if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_SYSENTER_ESP, 8, &rsp); if (error == 0) printf("guest_sysenter_sp[%d]\t%#lx\n", vcpu, rsp); if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_SYSENTER_EIP, &rip); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_SYSENTER_EIP, 8, &rip); if (error == 0) printf("guest_sysenter_ip[%d]\t%#lx\n", vcpu, rip); } if (!error && (get_exit_reason || get_all)) { if (cpu_intel) error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64); else error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXIT_REASON, 8, &u64); if (error == 0) printf("exit_reason[%d]\t%#lx\n", vcpu, u64); } if (!error && setcap) { int captype; captype = vm_capability_name2type(capname); error = vm_set_capability(ctx, vcpu, captype, capval); if (error != 0 && errno == ENOENT) printf("Capability \"%s\" is not available\n", capname); } if (!error && get_gpa_pmap) { error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum); if (error == 0) { printf("gpa %#lx:", gpa_pmap); pte = &pteval[0]; while (ptenum-- > 0) printf(" %#lx", *pte++); printf("\n"); } } if (!error && set_rtc_nvram) error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value); if (!error && (get_rtc_nvram || get_all)) { error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value); if (error == 0) { printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset, rtc_nvram_value); } } if (!error && set_rtc_time) error = vm_rtc_settime(ctx, rtc_secs); if (!error && (get_rtc_time || get_all)) { error = vm_rtc_gettime(ctx, &rtc_secs); if (error == 0) { gmtime_r(&rtc_secs, &tm); printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n", rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon), tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, 1900 + tm.tm_year); } } if (!error && (getcap || get_all)) { int captype, val, getcaptype; if (getcap && capname) getcaptype = vm_capability_name2type(capname); else getcaptype = -1; for (captype = 0; captype < VM_CAP_MAX; captype++) { if (getcaptype >= 0 && captype != getcaptype) continue; error = vm_get_capability(ctx, vcpu, captype, &val); if (error == 0) { printf("Capability \"%s\" is %s on vcpu %d\n", vm_capability_type2name(captype), val ? 
"set" : "not set", vcpu); } else if (errno == ENOENT) { error = 0; printf("Capability \"%s\" is not available\n", vm_capability_type2name(captype)); } else { break; } } } if (!error && (get_active_cpus || get_all)) { error = vm_active_cpus(ctx, &cpus); if (!error) print_cpus("active cpus", &cpus); } if (!error && (get_suspended_cpus || get_all)) { error = vm_suspended_cpus(ctx, &cpus); if (!error) print_cpus("suspended cpus", &cpus); } if (!error && (get_intinfo || get_all)) { error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); if (!error) { print_intinfo("pending", info[0]); print_intinfo("current", info[1]); } } if (!error && (get_stats || get_all)) { int i, num_stats; uint64_t *stats; struct timeval tv; const char *desc; stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); if (stats != NULL) { printf("vcpu%d stats:\n", vcpu); for (i = 0; i < num_stats; i++) { desc = vm_get_stat_desc(ctx, i); printf("%-40s\t%ld\n", desc, stats[i]); } } } if (!error && (get_cpu_topology || get_all)) { uint16_t sockets, cores, threads, maxcpus; vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); printf("cpu_topology:\tsockets=%hu, cores=%hu, threads=%hu, " "maxcpus=%hu\n", sockets, cores, threads, maxcpus); } if (!error && run) { error = vm_run(ctx, vcpu, &vmexit); if (error == 0) dump_vm_run_exitcode(&vmexit, vcpu); else printf("vm_run error %d\n", error); } if (!error && force_reset) error = vm_suspend(ctx, VM_SUSPEND_RESET); if (!error && force_poweroff) error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); if (error) printf("errno = %d\n", errno); if (!error && destroy) vm_destroy(ctx); +#ifdef BHYVE_SNAPSHOT + if (!error && vm_checkpoint_opt) + error = send_start_checkpoint(ctx, checkpoint_file); + + if (!error && vm_suspend_opt) + error = send_start_suspend(ctx, suspend_file); +#endif + free (opts); exit(error); }