diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 2d0cc21aeb25..3cbc885b82db 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -1,1836 +1,1836 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmmapi.h" #include "internal.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) /* * Size of the guard region before and after the virtual address space * mapping the guest physical memory. This must be a multiple of the * superpage size for performance reasons. */ #define VM_MMAP_GUARD_SIZE (4 * MB) #define PROT_RW (PROT_READ | PROT_WRITE) #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) struct vmctx { int fd; uint32_t lowmem_limit; int memflags; size_t lowmem; size_t highmem; char *baseaddr; char *name; }; #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) static int vm_device_open(const char *name) { int fd, len; char *vmfile; len = strlen("/dev/vmm/") + strlen(name) + 1; vmfile = malloc(len); assert(vmfile != NULL); snprintf(vmfile, len, "/dev/vmm/%s", name); /* Open the device file */ fd = open(vmfile, O_RDWR, 0); free(vmfile); return (fd); } int vm_create(const char *name) { /* Try to load vmm(4) module before creating a guest. */ if (modfind("vmm") < 0) kldload("vmm"); return (CREATE(name)); } struct vmctx * vm_open(const char *name) { struct vmctx *vm; int saved_errno; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); vm->fd = -1; vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); if ((vm->fd = vm_device_open(vm->name)) < 0) goto err; return (vm); err: saved_errno = errno; free(vm); errno = saved_errno; return (NULL); } void vm_close(struct vmctx *vm) { assert(vm != NULL); close(vm->fd); free(vm); } void vm_destroy(struct vmctx *vm) { assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); DESTROY(vm->name); free(vm); } struct vcpu * vm_vcpu_open(struct vmctx *ctx, int vcpuid) { struct vcpu *vcpu; vcpu = malloc(sizeof(*vcpu)); vcpu->ctx = ctx; vcpu->vcpuid = vcpuid; return (vcpu); } void vm_vcpu_close(struct vcpu *vcpu) { free(vcpu); } int vcpu_id(struct vcpu *vcpu) { return (vcpu->vcpuid); } int vm_parse_memsize(const char *opt, size_t *ret_memsize) { char *endptr; size_t optval; int error; optval = strtoul(opt, &endptr, 0); if (*opt != '\0' && *endptr == '\0') { /* * For the sake of backward compatibility if the memory size * specified on the command line is less than a megabyte then * it is interpreted as being in units of MB. */ if (optval < MB) optval *= MB; *ret_memsize = optval; error = 0; } else error = expand_number(opt, ret_memsize); return (error); } uint32_t vm_get_lowmem_limit(struct vmctx *ctx) { return (ctx->lowmem_limit); } void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { ctx->lowmem_limit = limit; } void vm_set_memflags(struct vmctx *ctx, int flags) { ctx->memflags = flags; } int vm_get_memflags(struct vmctx *ctx) { return (ctx->memflags); } /* * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot) { struct vm_memmap memmap; int error, flags; memmap.gpa = gpa; memmap.segid = segid; memmap.segoff = off; memmap.len = len; memmap.prot = prot; memmap.flags = 0; if (ctx->memflags & VM_MEM_F_WIRED) memmap.flags |= VM_MEMMAP_F_WIRED; /* * If this mapping already exists then don't create it again. This * is the common case for SYSMEM mappings created by bhyveload(8). */ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); if (error == 0 && gpa == memmap.gpa) { if (segid != memmap.segid || off != memmap.segoff || prot != memmap.prot || flags != memmap.flags) { errno = EEXIST; return (-1); } else { return (0); } } error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size) { *guest_baseaddr = ctx->baseaddr; *lowmem_size = ctx->lowmem; *highmem_size = ctx->highmem; return (0); } int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len) { struct vm_munmap munmap; int error; munmap.gpa = gpa; munmap.len = len; error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap); return (error); } int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct vm_memmap memmap; int error; bzero(&memmap, sizeof(struct vm_memmap)); memmap.gpa = *gpa; error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); if (error == 0) { *gpa = memmap.gpa; *segid = memmap.segid; *segoff = memmap.segoff; *len = memmap.len; *prot = memmap.prot; *flags = memmap.flags; } return (error); } /* * Return 0 if the segments are identical and non-zero otherwise. * * This is slightly complicated by the fact that only device memory segments * are named. */ static int cmpseg(size_t len, const char *str, size_t len2, const char *str2) { if (len == len2) { if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) return (0); } return (-1); } static int vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { struct vm_memseg memseg; size_t n; int error; /* * If the memory segment has already been created then just return. * This is the usual case for the SYSMEM segment created by userspace * loaders like bhyveload(8). */ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, sizeof(memseg.name)); if (error) return (error); if (memseg.len != 0) { if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { errno = EINVAL; return (-1); } else { return (0); } } bzero(&memseg, sizeof(struct vm_memseg)); memseg.segid = segid; memseg.len = len; if (name != NULL) { n = strlcpy(memseg.name, name, sizeof(memseg.name)); if (n >= sizeof(memseg.name)) { errno = ENAMETOOLONG; return (-1); } } error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); return (error); } int vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, size_t bufsize) { struct vm_memseg memseg; size_t n; int error; memseg.segid = segid; error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); if (error == 0) { *lenp = memseg.len; n = strlcpy(namebuf, memseg.name, bufsize); if (n >= bufsize) { errno = ENAMETOOLONG; error = -1; } } return (error); } static int setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) { char *ptr; int error, flags; /* Map 'len' bytes starting at 'gpa' in the guest address space */ error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); if (error) return (error); flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap into the process address space on the host */ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); if (ptr == MAP_FAILED) return (-1); return (0); } int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { size_t objsize, len; vm_paddr_t gpa; char *baseaddr, *ptr; int error; assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then * create another 'highmem' segment above 4GB for the remainder. */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; ctx->highmem = memsize - ctx->lowmem_limit; objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; objsize = ctx->lowmem; } error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); if (error) return (error); /* * Stake out a contiguous region covering the guest physical memory * and the adjoining guard regions. */ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (ptr == MAP_FAILED) return (-1); baseaddr = ptr + VM_MMAP_GUARD_SIZE; if (ctx->highmem > 0) { gpa = 4*GB; len = ctx->highmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } if (ctx->lowmem > 0) { gpa = 0; len = ctx->lowmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } ctx->baseaddr = baseaddr; return (0); } /* * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in * the lowmem or highmem regions. * * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. * The instruction emulation code depends on this behavior. */ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { if (ctx->lowmem > 0) { if (gaddr < ctx->lowmem && len <= ctx->lowmem && gaddr + len <= ctx->lowmem) return (ctx->baseaddr + gaddr); } if (ctx->highmem > 0) { if (gaddr >= 4*GB) { if (gaddr < 4*GB + ctx->highmem && len <= ctx->highmem && gaddr + len <= 4*GB + ctx->highmem) return (ctx->baseaddr + gaddr); } } return (NULL); } vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr) { vm_paddr_t offaddr; offaddr = (char *)addr - ctx->baseaddr; if (ctx->lowmem > 0) if (offaddr <= ctx->lowmem) return (offaddr); if (ctx->highmem > 0) if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem) return (offaddr); return ((vm_paddr_t)-1); } const char * vm_get_name(struct vmctx *ctx) { return (ctx->name); } size_t vm_get_lowmem_size(struct vmctx *ctx) { return (ctx->lowmem); } size_t vm_get_highmem_size(struct vmctx *ctx) { return (ctx->highmem); } void * vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) { char pathname[MAXPATHLEN]; size_t len2; char *base, *ptr; int fd, error, flags; fd = -1; ptr = MAP_FAILED; if (name == NULL || strlen(name) == 0) { errno = EINVAL; goto done; } error = vm_alloc_memseg(ctx, segid, len, name); if (error) goto done; strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); strlcat(pathname, ctx->name, sizeof(pathname)); strlcat(pathname, ".", sizeof(pathname)); strlcat(pathname, name, sizeof(pathname)); fd = open(pathname, O_RDWR); if (fd < 0) goto done; /* * Stake out a contiguous region covering the device memory and the * adjoining guard regions. */ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (base == MAP_FAILED) goto done; flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap the devmem region in the host address space */ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); done: if (fd >= 0) close(fd); return (ptr); } static int vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg) { /* * XXX: fragile, handle with care * Assumes that the first field of the ioctl data * is the vcpuid. */ *(int *)arg = vcpu->vcpuid; return (ioctl(vcpu->ctx->fd, cmd, arg)); } int vm_set_desc(struct vcpu *vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.regnum = reg; vmsegdesc.desc.base = base; vmsegdesc.desc.limit = limit; vmsegdesc.desc.access = access; error = vcpu_ioctl(vcpu, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); return (error); } int vm_get_desc(struct vcpu *vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.regnum = reg; error = vcpu_ioctl(vcpu, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); if (error == 0) { *base = vmsegdesc.desc.base; *limit = vmsegdesc.desc.limit; *access = vmsegdesc.desc.access; } return (error); } int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *seg_desc) { int error; error = vm_get_desc(vcpu, reg, &seg_desc->base, &seg_desc->limit, &seg_desc->access); return (error); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.regnum = reg; vmreg.regval = val; error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg); return (error); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.regnum = reg; error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int vm_set_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset); return (error); } int vm_get_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals) { int error; struct vm_register_set vmregset; bzero(&vmregset, sizeof(vmregset)); vmregset.count = count; vmregset.regnums = regnums; vmregset.regvals = regvals; error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset); return (error); } int vm_run(struct vcpu *vcpu, struct vm_exit *vmexit) { int error; struct vm_run vmrun; bzero(&vmrun, sizeof(vmrun)); error = vcpu_ioctl(vcpu, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); return (error); } int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { struct vm_suspend vmsuspend; bzero(&vmsuspend, sizeof(vmsuspend)); vmsuspend.how = how; return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); } int vm_reinit(struct vmctx *ctx) { return (ioctl(ctx->fd, VM_REINIT, 0)); } int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vm_exception exc; exc.vector = vector; exc.error_code = errcode; exc.error_code_valid = errcode_valid; exc.restart_instruction = restart_instruction; return (vcpu_ioctl(vcpu, VM_INJECT_EXCEPTION, &exc)); } int vm_apicid2vcpu(struct vmctx *ctx __unused, int apicid) { /* * The apic id associated with the 'vcpu' has the same numerical value * as the 'vcpu' itself. */ return (apicid); } int vm_lapic_irq(struct vcpu *vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.vector = vector; return (vcpu_ioctl(vcpu, VM_LAPIC_IRQ, &vmirq)); } int vm_lapic_local_irq(struct vcpu *vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.vector = vector; return (vcpu_ioctl(vcpu, VM_LAPIC_LOCAL_IRQ, &vmirq)); } int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) { struct vm_lapic_msi vmmsi; bzero(&vmmsi, sizeof(vmmsi)); vmmsi.addr = addr; vmmsi.msg = msg; return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); } int vm_ioapic_assert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); } int vm_ioapic_pincount(struct vmctx *ctx, int *pincount) { return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); } int vm_readwrite_kernemu_device(struct vcpu *vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value) { struct vm_readwrite_kernemu_device irp = { .access_width = fls(size) - 1, .gpa = gpa, .value = write ? *value : ~0ul, }; long cmd = (write ? VM_SET_KERNEMU_DEV : VM_GET_KERNEMU_DEV); int rc; rc = vcpu_ioctl(vcpu, cmd, &irp); if (rc == 0 && !write) *value = irp.value; return (rc); } int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); } int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); } int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); } int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger) { struct vm_isa_irq_trigger isa_irq_trigger; bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); isa_irq_trigger.atpic_irq = atpic_irq; isa_irq_trigger.trigger = trigger; return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); } int vm_inject_nmi(struct vcpu *vcpu) { struct vm_nmi vmnmi; bzero(&vmnmi, sizeof(vmnmi)); return (vcpu_ioctl(vcpu, VM_INJECT_NMI, &vmnmi)); } static const char *capstrmap[] = { [VM_CAP_HALT_EXIT] = "hlt_exit", [VM_CAP_MTRAP_EXIT] = "mtrap_exit", [VM_CAP_PAUSE_EXIT] = "pause_exit", [VM_CAP_UNRESTRICTED_GUEST] = "unrestricted_guest", [VM_CAP_ENABLE_INVPCID] = "enable_invpcid", [VM_CAP_BPT_EXIT] = "bpt_exit", }; int vm_capability_name2type(const char *capname) { int i; for (i = 0; i < (int)nitems(capstrmap); i++) { if (strcmp(capstrmap[i], capname) == 0) return (i); } return (-1); } const char * vm_capability_type2name(int type) { if (type >= 0 && type < (int)nitems(capstrmap)) return (capstrmap[type]); return (NULL); } int vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.captype = cap; error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.captype = cap; vmcap.capval = val; return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap)); } int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); } int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); } int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; pptmmio.hpa = hpa; return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); } int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; return (ioctl(ctx->fd, VM_UNMAP_PPTDEV_MMIO, &pptmmio)); } int vm_setup_pptdev_msi(struct vmctx *ctx, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { struct vm_pptdev_msi pptmsi; bzero(&pptmsi, sizeof(pptmsi)); pptmsi.bus = bus; pptmsi.slot = slot; pptmsi.func = func; pptmsi.msg = msg; pptmsi.addr = addr; pptmsi.numvec = numvec; return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } int vm_setup_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct vm_pptdev_msix pptmsix; bzero(&pptmsix, sizeof(pptmsix)); pptmsix.bus = bus; pptmsix.slot = slot; pptmsix.func = func; pptmsix.idx = idx; pptmsix.msg = msg; pptmsix.addr = addr; pptmsix.vector_control = vector_control; return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev ppt; bzero(&ppt, sizeof(ppt)); ppt.bus = bus; ppt.slot = slot; ppt.func = func; return ioctl(ctx->fd, VM_PPTDEV_DISABLE_MSIX, &ppt); } uint64_t * vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv, int *ret_entries) { static _Thread_local uint64_t *stats_buf; static _Thread_local u_int stats_count; uint64_t *new_stats; struct vm_stats vmstats; u_int count, index; bool have_stats; have_stats = false; count = 0; for (index = 0;; index += nitems(vmstats.statbuf)) { vmstats.index = index; if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0) break; if (stats_count < index + vmstats.num_entries) { new_stats = realloc(stats_buf, (index + vmstats.num_entries) * sizeof(uint64_t)); if (new_stats == NULL) { errno = ENOMEM; return (NULL); } stats_count = index + vmstats.num_entries; stats_buf = new_stats; } memcpy(stats_buf + index, vmstats.statbuf, vmstats.num_entries * sizeof(uint64_t)); count += vmstats.num_entries; have_stats = true; if (vmstats.num_entries != nitems(vmstats.statbuf)) break; } if (have_stats) { if (ret_entries) *ret_entries = count; if (ret_tv) *ret_tv = vmstats.tv; return (stats_buf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); error = vcpu_ioctl(vcpu, VM_GET_X2APIC_STATE, &x2apic); *state = x2apic.state; return (error); } int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.state = state; error = vcpu_ioctl(vcpu, VM_SET_X2APIC_STATE, &x2apic); return (error); } /* * From Intel Vol 3a: * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT */ int vcpu_reset(struct vcpu *vcpu) { int error; uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; uint32_t desc_access, desc_limit; uint16_t sel; zero = 0; rflags = 0x2; error = vm_set_register(vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; rip = 0xfff0; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; /* * According to Intels Software Developer Manual CR0 should be * initialized with CR0_ET | CR0_NW | CR0_CD but that crashes some * guests like Windows. */ cr0 = CR0_NE; if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR2, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR3, zero)) != 0) goto done; cr4 = 0; if ((error = vm_set_register(vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; /* * CS: present, r/w, accessed, 16-bit, byte granularity, usable */ desc_base = 0xffff0000; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0xf000; if ((error = vm_set_register(vcpu, VM_REG_GUEST_CS, sel)) != 0) goto done; /* * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vcpu, VM_REG_GUEST_SS, sel)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_DS, sel)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_ES, sel)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_FS, sel)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_GS, sel)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_EFER, zero)) != 0) goto done; /* General purpose registers */ rdx = 0xf00; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RAX, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RBX, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RCX, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RDX, rdx)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RSI, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RDI, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RBP, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_RSP, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R8, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R9, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R10, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R11, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R12, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R13, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R14, zero)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_R15, zero)) != 0) goto done; /* GDTR, IDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0; error = vm_set_desc(vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; error = vm_set_desc(vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; /* TR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0000008b; error = vm_set_desc(vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vcpu, VM_REG_GUEST_TR, sel)) != 0) goto done; /* LDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x00000082; error = vm_set_desc(vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_DR6, 0xffff0ff0)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_DR7, 0x400)) != 0) goto done; if ((error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, zero)) != 0) goto done; error = 0; done: return (error); } int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) { int error, i; struct vm_gpa_pte gpapte; bzero(&gpapte, sizeof(gpapte)); gpapte.gpa = gpa; error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); if (error == 0) { *num = gpapte.ptenum; for (i = 0; i < gpapte.ptenum; i++) pte[i] = gpapte.pte[i]; } return (error); } int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) { int error; struct vm_hpet_cap cap; bzero(&cap, sizeof(struct vm_hpet_cap)); error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); if (capabilities != NULL) *capabilities = cap.capabilities; return (error); } int vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } #ifndef min #define min(a,b) (((a) < (b)) ? (a) : (b)) #endif int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault) { void *va; uint64_t gpa, off; int error, i, n; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; iov[i].iov_len = 0; } while (len) { assert(iovcnt > 0); error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = MIN(len, PAGE_SIZE - off); va = vm_map_gpa(vcpu->ctx, gpa, n); if (va == NULL) return (EFAULT); iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; gla += n; len -= n; } return (0); } void vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused) { /* * Intentionally empty. This is used by the instruction * emulation code shared with the kernel. The in-kernel * version of this is non-empty. */ } void vm_copyin(struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; size_t n; dst = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); src = iov->iov_base; bcopy(src, dst, n); iov++; dst += n; len -= n; } } void vm_copyout(const void *vp, struct iovec *iov, size_t len) { const char *src; char *dst; size_t n; src = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); dst = iov->iov_base; bcopy(src, dst, n); iov++; src += n; len -= n; } } static int vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) { struct vm_cpuset vm_cpuset; int error; bzero(&vm_cpuset, sizeof(struct vm_cpuset)); vm_cpuset.which = which; vm_cpuset.cpusetsize = sizeof(cpuset_t); vm_cpuset.cpus = cpus; error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); return (error); } int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); } int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); } int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); } int vm_activate_cpu(struct vcpu *vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac); return (error); } int vm_suspend_all_cpus(struct vmctx *ctx) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = -1; error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); return (error); } int vm_suspend_cpu(struct vcpu *vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac); return (error); } int vm_resume_cpu(struct vcpu *vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac); return (error); } int vm_resume_all_cpus(struct vmctx *ctx) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = -1; error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); return (error); } int vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii); if (error == 0) { *info1 = vmii.info1; *info2 = vmii.info2; } return (error); } int vm_set_intinfo(struct vcpu *vcpu, uint64_t info1) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.info1 = info1; error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii); return (error); } int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; rtcdata.value = value; error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); return (error); } int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); if (error == 0) *retval = rtcdata.value; return (error); } int vm_rtc_settime(struct vmctx *ctx, time_t secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); rtctime.secs = secs; error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); return (error); } int vm_rtc_gettime(struct vmctx *ctx, time_t *secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); if (error == 0) *secs = rtctime.secs; return (error); } int vm_restart_instruction(struct vcpu *vcpu) { int arg; return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg)); } int -vm_snapshot_req(struct vm_snapshot_meta *meta) +vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta) { - if (ioctl(meta->ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { + if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { #ifdef SNAPSHOT_DEBUG fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", __func__, meta->dev_name, errno); #endif return (-1); } return (0); } int vm_restore_time(struct vmctx *ctx) { int dummy; dummy = 0; return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); } int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { struct vm_cpu_topology topology; bzero(&topology, sizeof (struct vm_cpu_topology)); topology.sockets = sockets; topology.cores = cores; topology.threads = threads; topology.maxcpus = maxcpus; return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); } int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { struct vm_cpu_topology topology; int error; bzero(&topology, sizeof (struct vm_cpu_topology)); error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); if (error == 0) { *sockets = topology.sockets; *cores = topology.cores; *threads = topology.threads; *maxcpus = topology.maxcpus; } return (error); } /* Keep in sync with machine/vmm_dev.h. */ static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_GETNEXT, VM_MUNMAP_MEMSEG, VM_SET_REGISTER, VM_GET_REGISTER, VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR, VM_SET_REGISTER_SET, VM_GET_REGISTER_SET, VM_SET_KERNEMU_DEV, VM_GET_KERNEMU_DEV, VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, VM_PPTDEV_MSIX, VM_UNMAP_PPTDEV_MMIO, VM_PPTDEV_DISABLE_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, VM_GLA2GPA_NOFAULT, VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, VM_SET_INTINFO, VM_GET_INTINFO, VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY, VM_SNAPSHOT_REQ, VM_RESTORE_TIME }; int vm_limit_rights(struct vmctx *ctx) { cap_rights_t rights; size_t ncmds; cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); if (caph_rights_limit(ctx->fd, &rights) != 0) return (-1); ncmds = nitems(vm_ioctl_cmds); if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, ncmds) != 0) return (-1); return (0); } /* * Avoid using in new code. Operations on the fd should be wrapped here so that * capability rights can be kept in sync. */ int vm_get_device_fd(struct vmctx *ctx) { return (ctx->fd); } /* Legacy interface, do not use. */ const cap_ioctl_t * vm_get_ioctls(size_t *len) { cap_ioctl_t *cmds; if (len == NULL) { cmds = malloc(sizeof(vm_ioctl_cmds)); if (cmds == NULL) return (NULL); bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); return (cmds); } *len = nitems(vm_ioctl_cmds); return (NULL); } diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index 322b47e4ee0e..459aad0fe1a3 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -1,279 +1,279 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include #include #include #include #include #include /* * API version for out-of-tree consumers like grub-bhyve for making compile * time decisions. */ #define VMMAPI_VERSION 0200 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vcpu; struct vmctx; struct vm_snapshot_meta; enum x2apic_state; /* * Different styles of mapping the memory assigned to a VM into the address * space of the controlling process. */ enum vm_mmap_style { VM_MMAP_NONE, /* no mapping */ VM_MMAP_ALL, /* fully and statically mapped */ VM_MMAP_SPARSE, /* mappings created on-demand */ }; /* * 'flags' value passed to 'vm_set_memflags()'. */ #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ /* * Identifiers for memory segments: * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. * - the remaining identifiers can be used to create devmem segments. */ enum { VM_SYSMEM, VM_BOOTROM, VM_FRAMEBUFFER, VM_PCIROM, }; __BEGIN_DECLS /* * Get the length and name of the memory segment identified by 'segid'. * Note that system memory segments are identified with a nul name. * * Returns 0 on success and non-zero otherwise. */ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, size_t namesiz); /* * Iterate over the guest address space. This function finds an address range * that starts at an address >= *gpa. * * Returns 0 if the next address range was found and non-zero otherwise. */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, size_t *lowmem_size, size_t *highmem_size); /* * Create a device memory segment identified by 'segid'. * * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. */ void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len); /* * Map the memory segment identified by 'segid' into the guest address space * at [gpa,gpa+len) with protection 'prot'. */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t segoff, size_t len, int prot); int vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len); int vm_create(const char *name); struct vmctx *vm_open(const char *name); void vm_close(struct vmctx *ctx); void vm_destroy(struct vmctx *ctx); int vm_limit_rights(struct vmctx *ctx); struct vcpu *vm_vcpu_open(struct vmctx *ctx, int vcpuid); void vm_vcpu_close(struct vcpu *vcpu); int vcpu_id(struct vcpu *vcpu); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); /* inverse operation to vm_map_gpa - extract guest address from host pointer */ vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); const char *vm_get_name(struct vmctx *ctx); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vcpu *vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); int vm_get_desc(struct vcpu *vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); int vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); int vm_set_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_get_register_set(struct vcpu *vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_run(struct vcpu *vcpu, struct vm_exit *ret_vmexit); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); int vm_lapic_irq(struct vcpu *vcpu, int vector); int vm_lapic_local_irq(struct vcpu *vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); int vm_readwrite_kernemu_device(struct vcpu *vcpu, vm_paddr_t gpa, bool write, int size, uint64_t *value); int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger); int vm_inject_nmi(struct vcpu *vcpu); int vm_capability_name2type(const char *capname); const char *vm_capability_type2name(int type); int vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval); int vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val); int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len); int vm_setup_pptdev_msi(struct vmctx *ctx, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func); int vm_get_intinfo(struct vcpu *vcpu, uint64_t *i1, uint64_t *i2); int vm_set_intinfo(struct vcpu *vcpu, uint64_t exit_intinfo); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ uint64_t *vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv, int *ret_entries); const char *vm_get_stat_desc(struct vmctx *ctx, int index); int vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *s); int vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. * The 'iovcnt' should be big enough to accommodate all GPA segments. * * retval fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Error */ int vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault); void vm_copyin(struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(const void *host_src, struct iovec *guest_iov, size_t len); void vm_copy_teardown(struct iovec *iov, int iovcnt); /* RTC */ int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); int vm_rtc_settime(struct vmctx *ctx, time_t secs); int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); /* Reset vcpu register state */ int vcpu_reset(struct vcpu *vcpu); int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vcpu *vcpu); int vm_suspend_all_cpus(struct vmctx *ctx); int vm_suspend_cpu(struct vcpu *vcpu); int vm_resume_all_cpus(struct vmctx *ctx); int vm_resume_cpu(struct vcpu *vcpu); int vm_restart_instruction(struct vcpu *vcpu); /* CPU topology */ int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus); int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); /* * FreeBSD specific APIs */ int vm_setup_freebsd_registers(struct vcpu *vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp); int vm_setup_freebsd_registers_i386(struct vcpu *vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); /* * Save and restore */ -int vm_snapshot_req(struct vm_snapshot_meta *meta); +int vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta); int vm_restore_time(struct vmctx *ctx); /* * Deprecated interfaces, do not use them in new code. */ int vm_get_device_fd(struct vmctx *ctx); const cap_ioctl_t *vm_get_ioctls(size_t *len); __END_DECLS #endif /* _VMMAPI_H_ */ diff --git a/sys/amd64/include/vmm_snapshot.h b/sys/amd64/include/vmm_snapshot.h index c4d7fc4d4371..a34684e9b671 100644 --- a/sys/amd64/include/vmm_snapshot.h +++ b/sys/amd64/include/vmm_snapshot.h @@ -1,154 +1,131 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Flavius Anton * Copyright (c) 2016 Mihai Tiganus * Copyright (c) 2016-2019 Mihai Carabas * Copyright (c) 2017-2019 Darius Mihai * Copyright (c) 2017-2019 Elena Mihailescu * Copyright (c) 2018-2019 Sergiu Weisz * All rights reserved. * The bhyve-snapshot feature was developed under sponsorships * from Matthew Grooms. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMM_SNAPSHOT_ #define _VMM_SNAPSHOT_ #include #include #ifndef _KERNEL #include #endif -struct vmctx; - enum snapshot_req { STRUCT_VIOAPIC = 1, STRUCT_VM, STRUCT_VLAPIC, VM_MEM, STRUCT_VHPET, STRUCT_VMCX, STRUCT_VATPIC, STRUCT_VATPIT, STRUCT_VPMTMR, STRUCT_VRTC, }; struct vm_snapshot_buffer { /* * R/O for device-specific functions; * written by generic snapshot functions. */ uint8_t *const buf_start; const size_t buf_size; /* * R/W for device-specific functions used to keep track of buffer * current position and remaining size. */ uint8_t *buf; size_t buf_rem; /* * Length of the snapshot is either determined as (buf_size - buf_rem) * or (buf - buf_start) -- the second variation returns a signed value * so it may not be appropriate. * * Use vm_get_snapshot_size(meta). */ }; enum vm_snapshot_op { VM_SNAPSHOT_SAVE, VM_SNAPSHOT_RESTORE, }; struct vm_snapshot_meta { - struct vmctx *ctx; void *dev_data; const char *dev_name; /* identify userspace devices */ enum snapshot_req dev_req; /* identify kernel structs */ struct vm_snapshot_buffer buffer; enum vm_snapshot_op op; }; void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op); int vm_snapshot_buf(void *data, size_t data_size, struct vm_snapshot_meta *meta); size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta); -int vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null, - struct vm_snapshot_meta *meta); int vm_snapshot_buf_cmp(void *data, size_t data_size, struct vm_snapshot_meta *meta); #define SNAPSHOT_BUF_OR_LEAVE(DATA, LEN, META, RES, LABEL) \ do { \ (RES) = vm_snapshot_buf((DATA), (LEN), (META)); \ if ((RES) != 0) { \ vm_snapshot_buf_err(#DATA, (META)->op); \ goto LABEL; \ } \ } while (0) #define SNAPSHOT_VAR_OR_LEAVE(DATA, META, RES, LABEL) \ SNAPSHOT_BUF_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL) -/* - * Address variables are pointers to guest memory. - * - * When RNULL != 0, do not enforce invalid address checks; instead, make the - * pointer NULL at restore time. - */ -#define SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ADDR, LEN, RNULL, META, RES, LABEL) \ -do { \ - (RES) = vm_snapshot_guest2host_addr((void **)&(ADDR), (LEN), (RNULL), \ - (META)); \ - if ((RES) != 0) { \ - if ((RES) == EFAULT) \ - fprintf(stderr, "%s: invalid address: %s\r\n", \ - __func__, #ADDR); \ - goto LABEL; \ - } \ -} while (0) - /* compare the value in the meta buffer with the data */ #define SNAPSHOT_BUF_CMP_OR_LEAVE(DATA, LEN, META, RES, LABEL) \ do { \ (RES) = vm_snapshot_buf_cmp((DATA), (LEN), (META)); \ if ((RES) != 0) { \ vm_snapshot_buf_err(#DATA, (META)->op); \ goto LABEL; \ } \ } while (0) #define SNAPSHOT_VAR_CMP_OR_LEAVE(DATA, META, RES, LABEL) \ SNAPSHOT_BUF_CMP_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL) #endif diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 2d8ff43a74dd..e7f7d24cecd2 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -1,1331 +1,1363 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_bhyve_snapshot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" #include "io/ppt.h" #include "io/vatpic.h" #include "io/vioapic.h" #include "io/vhpet.h" #include "io/vrtc.h" #ifdef COMPAT_FREEBSD13 struct vm_stats_old { int cpuid; /* in */ int num_entries; /* out */ struct timeval tv; uint64_t statbuf[MAX_VM_STATS]; }; #define VM_STATS_OLD \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old) + +struct vm_snapshot_meta_old { + void *ctx; /* unused */ + void *dev_data; + const char *dev_name; /* identify userspace devices */ + enum snapshot_req dev_req; /* identify kernel structs */ + + struct vm_snapshot_buffer buffer; + + enum vm_snapshot_op op; +}; + +#define VM_SNAPSHOT_REQ_OLD \ + _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta_old) #endif struct devmem_softc { int segid; char *name; struct cdev *cdev; struct vmmdev_softc *sc; SLIST_ENTRY(devmem_softc) link; }; struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; struct ucred *ucred; SLIST_ENTRY(vmmdev_softc) link; SLIST_HEAD(, devmem_softc) devmem; int flags; }; #define VSC_LINKED 0x01 static SLIST_HEAD(, vmmdev_softc) head; static unsigned pr_allow_flag; static struct mtx vmmdev_mtx; MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF); static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); static int vmm_priv_check(struct ucred *ucred); static int devmem_create_cdev(const char *vmname, int id, char *devmem); static void devmem_destroy(void *arg); static int vmm_priv_check(struct ucred *ucred) { if (jailed(ucred) && !(ucred->cr_prison->pr_allow & pr_allow_flag)) return (EPERM); return (0); } static int vcpu_lock_one(struct vcpu *vcpu) { return (vcpu_set_state(vcpu, VCPU_FROZEN, true)); } static void vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid, struct vcpu *vcpu) { enum vcpu_state state; state = vcpu_get_state(vcpu, NULL); if (state != VCPU_FROZEN) { panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm), vcpuid, state); } vcpu_set_state(vcpu, VCPU_IDLE, false); } static int vcpu_lock_all(struct vmmdev_softc *sc) { struct vcpu *vcpu; int error; uint16_t i, j, maxcpus; vm_slock_vcpus(sc->vm); maxcpus = vm_get_maxcpus(sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = vm_vcpu(sc->vm, i); if (vcpu == NULL) continue; error = vcpu_lock_one(vcpu); if (error) break; } if (error) { for (j = 0; j < i; j++) { vcpu = vm_vcpu(sc->vm, j); if (vcpu == NULL) continue; vcpu_unlock_one(sc, j, vcpu); } vm_unlock_vcpus(sc->vm); } return (error); } static void vcpu_unlock_all(struct vmmdev_softc *sc) { struct vcpu *vcpu; uint16_t i, maxcpus; maxcpus = vm_get_maxcpus(sc->vm); for (i = 0; i < maxcpus; i++) { vcpu = vm_vcpu(sc->vm, i); if (vcpu == NULL) continue; vcpu_unlock_one(sc, i, vcpu); } vm_unlock_vcpus(sc->vm); } static struct vmmdev_softc * vmmdev_lookup(const char *name) { struct vmmdev_softc *sc; #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } if (sc == NULL) return (NULL); if (cr_cansee(curthread->td_ucred, sc->ucred)) return (NULL); return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { return (cdev->si_drv1); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c, prot; vm_paddr_t gpa, maxaddr; void *hpa, *cookie; struct vmmdev_softc *sc; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); /* * Get a read lock on the guest memory map. */ vm_slock_memsegs(sc->vm); prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); maxaddr = vmm_sysmem_maxaddr(sc->vm); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. * * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ && gpa < maxaddr) error = uiomove(__DECONST(void *, zero_region), c, uio); else error = EFAULT; } else { error = uiomove(hpa, c, uio); vm_gpa_release(cookie); } } vm_unlock_memsegs(sc->vm); return (error); } CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1); static int get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { struct devmem_softc *dsc; int error; bool sysmem; error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); if (error || mseg->len == 0) return (error); if (!sysmem) { SLIST_FOREACH(dsc, &sc->devmem, link) { if (dsc->segid == mseg->segid) break; } KASSERT(dsc != NULL, ("%s: devmem segment %d not found", __func__, mseg->segid)); error = copystr(dsc->name, mseg->name, len, NULL); } else { bzero(mseg->name, len); } return (error); } static int alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len) { char *name; int error; bool sysmem; error = 0; name = NULL; sysmem = true; /* * The allocation is lengthened by 1 to hold a terminating NUL. It'll * by stripped off when devfs processes the full string. */ if (VM_MEMSEG_NAME(mseg)) { sysmem = false; name = malloc(len, M_VMMDEV, M_WAITOK); error = copystr(mseg->name, name, len, NULL); if (error) goto done; } error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); if (error) goto done; if (VM_MEMSEG_NAME(mseg)) { error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); if (error) vm_free_memseg(sc->vm, mseg->segid); else name = NULL; /* freed when 'cdev' is destroyed */ } done: free(name, M_VMMDEV); return (error); } static int vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_get_register(vcpu, regnum[i], ®val[i]); if (error) break; } return (error); } static int vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, uint64_t *regval) { int error, i; error = 0; for (i = 0; i < count; i++) { error = vm_set_register(vcpu, regnum[i], regval[i]); if (error) break; } return (error); } static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error, vcpuid, size; cpuset_t *cpuset; struct vmmdev_softc *sc; struct vcpu *vcpu; struct vm_register *vmreg; struct vm_seg_desc *vmsegdesc; struct vm_register_set *vmregset; struct vm_run *vmrun; struct vm_exception *vmexc; struct vm_lapic_irq *vmirq; struct vm_lapic_msi *vmmsi; struct vm_ioapic_irq *ioapic_irq; struct vm_isa_irq *isa_irq; struct vm_isa_irq_trigger *isa_irq_trigger; struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; #ifdef COMPAT_FREEBSD13 struct vm_stats_old *vmstats_old; #endif struct vm_stats *vmstats; struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; struct vm_gpa_pte *gpapte; struct vm_suspend *vmsuspend; struct vm_gla2gpa *gg; struct vm_cpuset *vm_cpuset; struct vm_intinfo *vmii; struct vm_rtc_time *rtctime; struct vm_rtc_data *rtcdata; struct vm_memmap *mm; struct vm_munmap *mu; struct vm_cpu_topology *topology; struct vm_readwrite_kernemu_device *kernemu; uint64_t *regvals; int *regnums; enum { NONE, SINGLE, ALL } vcpus_locked; bool memsegs_locked; #ifdef BHYVE_SNAPSHOT struct vm_snapshot_meta *snapshot_meta; +#ifdef COMPAT_FREEBSD13 + struct vm_snapshot_meta_old *snapshot_old; +#endif #endif error = vmm_priv_check(curthread->td_ucred); if (error) return (error); sc = vmmdev_lookup2(cdev); if (sc == NULL) return (ENXIO); vcpuid = -1; vcpu = NULL; vcpus_locked = NONE; memsegs_locked = false; /* * For VMM ioctls that operate on a single vCPU, lookup the * vcpu. For VMM ioctls which require one or more vCPUs to * not be running, lock necessary vCPUs. * * XXX fragile, handle with care * Most of these assume that the first field of the ioctl data * is the vcpuid. */ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: case VM_GET_REGISTER_SET: case VM_SET_REGISTER_SET: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: case VM_GLA2GPA_NOFAULT: case VM_ACTIVATE_CPU: case VM_SET_INTINFO: case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: case VM_GET_KERNEMU_DEV: case VM_SET_KERNEMU_DEV: /* * ioctls that can operate only on vcpus that are not running. */ vcpuid = *(int *)data; vcpu = vm_alloc_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto done; } error = vcpu_lock_one(vcpu); if (error) goto done; vcpus_locked = SINGLE; break; #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_FBSD12: #endif case VM_ALLOC_MEMSEG: case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: case VM_MMAP_MEMSEG: case VM_MUNMAP_MEMSEG: case VM_REINIT: /* * ioctls that modify the memory map must lock memory * segments exclusively. */ vm_xlock_memsegs(sc->vm); memsegs_locked = true; /* FALLTHROUGH */ case VM_MAP_PPTDEV_MMIO: case VM_UNMAP_PPTDEV_MMIO: #ifdef BHYVE_SNAPSHOT case VM_SNAPSHOT_REQ: +#ifdef COMPAT_FREEBSD13 + case VM_SNAPSHOT_REQ_OLD: +#endif case VM_RESTORE_TIME: #endif /* * ioctls that operate on the entire virtual machine must * prevent all vcpus from running. */ error = vcpu_lock_all(sc); if (error) goto done; vcpus_locked = ALL; break; #ifdef COMPAT_FREEBSD12 case VM_GET_MEMSEG_FBSD12: #endif case VM_GET_MEMSEG: case VM_MMAP_GETNEXT: /* * Lock the memory map while it is being inspected. */ vm_slock_memsegs(sc->vm); memsegs_locked = true; break; #ifdef COMPAT_FREEBSD13 case VM_STATS_OLD: #endif case VM_STATS: case VM_INJECT_NMI: case VM_LAPIC_IRQ: case VM_GET_X2APIC_STATE: /* * These do not need the vCPU locked but do operate on * a specific vCPU. */ vcpuid = *(int *)data; vcpu = vm_alloc_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto done; } break; case VM_LAPIC_LOCAL_IRQ: case VM_SUSPEND_CPU: case VM_RESUME_CPU: /* * These can either operate on all CPUs via a vcpuid of * -1 or on a specific vCPU. */ vcpuid = *(int *)data; if (vcpuid == -1) break; vcpu = vm_alloc_vcpu(sc->vm, vcpuid); if (vcpu == NULL) { error = EINVAL; goto done; } break; default: break; } switch(cmd) { case VM_RUN: vmrun = (struct vm_run *)data; error = vm_run(vcpu, &vmrun->vm_exit); break; case VM_SUSPEND: vmsuspend = (struct vm_suspend *)data; error = vm_suspend(sc->vm, vmsuspend->how); break; case VM_REINIT: error = vm_reinit(sc->vm); break; case VM_STAT_DESC: { statdesc = (struct vm_stat_desc *)data; error = vmm_stat_desc_copy(statdesc->index, statdesc->desc, sizeof(statdesc->desc)); break; } #ifdef COMPAT_FREEBSD13 case VM_STATS_OLD: vmstats_old = (struct vm_stats_old *)data; getmicrotime(&vmstats_old->tv); error = vmm_stat_copy(vcpu, 0, nitems(vmstats_old->statbuf), &vmstats_old->num_entries, vmstats_old->statbuf); break; #endif case VM_STATS: { vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(vcpu, vmstats->index, nitems(vmstats->statbuf), &vmstats->num_entries, vmstats->statbuf); break; } case VM_PPTDEV_MSI: pptmsi = (struct vm_pptdev_msi *)data; error = ppt_setup_msi(sc->vm, pptmsi->bus, pptmsi->slot, pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec); break; case VM_PPTDEV_MSIX: pptmsix = (struct vm_pptdev_msix *)data; error = ppt_setup_msix(sc->vm, pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg, pptmsix->vector_control); break; case VM_PPTDEV_DISABLE_MSIX: pptdev = (struct vm_pptdev *)data; error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); break; case VM_UNMAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len); break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_UNBIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_INJECT_EXCEPTION: vmexc = (struct vm_exception *)data; error = vm_inject_exception(vcpu, vmexc->vector, vmexc->error_code_valid, vmexc->error_code, vmexc->restart_instruction); break; case VM_INJECT_NMI: error = vm_inject_nmi(vcpu); break; case VM_LAPIC_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_intr_edge(vcpu, vmirq->vector); break; case VM_LAPIC_LOCAL_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_set_local_intr(sc->vm, vcpu, vmirq->vector); break; case VM_LAPIC_MSI: vmmsi = (struct vm_lapic_msi *)data; error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); break; case VM_IOAPIC_ASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_DEASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PULSE_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); break; case VM_IOAPIC_PINCOUNT: *(int *)data = vioapic_pincount(sc->vm); break; case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: { mem_region_write_t mwrite; mem_region_read_t mread; bool arg; kernemu = (void *)data; if (kernemu->access_width > 0) size = (1u << kernemu->access_width); else size = 1; if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { mread = lapic_mmio_read; mwrite = lapic_mmio_write; } else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { mread = vioapic_mmio_read; mwrite = vioapic_mmio_write; } else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) { mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { error = EINVAL; break; } if (cmd == VM_SET_KERNEMU_DEV) error = mwrite(vcpu, kernemu->gpa, kernemu->value, size, &arg); else error = mread(vcpu, kernemu->gpa, &kernemu->value, size, &arg); break; } case VM_ISA_ASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_assert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_DEASSERT_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_deassert_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_PULSE_IRQ: isa_irq = (struct vm_isa_irq *)data; error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); break; case VM_ISA_SET_IRQ_TRIGGER: isa_irq_trigger = (struct vm_isa_irq_trigger *)data; error = vatpic_set_irq_trigger(sc->vm, isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); break; case VM_MMAP_GETNEXT: mm = (struct vm_memmap *)data; error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; case VM_MMAP_MEMSEG: mm = (struct vm_memmap *)data; error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, mm->len, mm->prot, mm->flags); break; case VM_MUNMAP_MEMSEG: mu = (struct vm_munmap *)data; error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); break; #ifdef COMPAT_FREEBSD12 case VM_ALLOC_MEMSEG_FBSD12: error = alloc_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg_fbsd12 *)0)->name)); break; #endif case VM_ALLOC_MEMSEG: error = alloc_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; #ifdef COMPAT_FREEBSD12 case VM_GET_MEMSEG_FBSD12: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg_fbsd12 *)0)->name)); break; #endif case VM_GET_MEMSEG: error = get_memseg(sc, (struct vm_memseg *)data, sizeof(((struct vm_memseg *)0)->name)); break; case VM_GET_REGISTER: vmreg = (struct vm_register *)data; error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval); break; case VM_SET_REGISTER: vmreg = (struct vm_register *)data; error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval); break; case VM_SET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_set_seg_desc(vcpu, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_get_seg_desc(vcpu, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = vm_get_register_set(vcpu, vmregset->count, regnums, regvals); if (error == 0) error = copyout(regvals, vmregset->regvals, sizeof(regvals[0]) * vmregset->count); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_SET_REGISTER_SET: vmregset = (struct vm_register_set *)data; if (vmregset->count > VM_REG_LAST) { error = EINVAL; break; } regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, M_WAITOK); regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, M_WAITOK); error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * vmregset->count); if (error == 0) error = copyin(vmregset->regvals, regvals, sizeof(regvals[0]) * vmregset->count); if (error == 0) error = vm_set_register_set(vcpu, vmregset->count, regnums, regvals); free(regvals, M_VMMDEV); free(regnums, M_VMMDEV); break; case VM_GET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_get_capability(vcpu, vmcap->captype, &vmcap->capval); break; case VM_SET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_set_capability(vcpu, vmcap->captype, vmcap->capval); break; case VM_SET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_set_x2apic_state(vcpu, x2apic->state); break; case VM_GET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_get_x2apic_state(vcpu, &x2apic->state); break; case VM_GET_GPA_PMAP: gpapte = (struct vm_gpa_pte *)data; pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), gpapte->gpa, gpapte->pte, &gpapte->ptenum); error = 0; break; case VM_GET_HPET_CAPABILITIES: error = vhpet_getcap((struct vm_hpet_cap *)data); break; case VM_GLA2GPA: { CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa(vcpu, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; } case VM_GLA2GPA_NOFAULT: gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; case VM_ACTIVATE_CPU: error = vm_activate_cpu(vcpu); break; case VM_GET_CPUS: error = 0; vm_cpuset = (struct vm_cpuset *)data; size = vm_cpuset->cpusetsize; if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { error = ERANGE; break; } cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); if (vm_cpuset->which == VM_ACTIVE_CPUS) *cpuset = vm_active_cpus(sc->vm); else if (vm_cpuset->which == VM_SUSPENDED_CPUS) *cpuset = vm_suspended_cpus(sc->vm); else if (vm_cpuset->which == VM_DEBUG_CPUS) *cpuset = vm_debug_cpus(sc->vm); else error = EINVAL; if (error == 0) error = copyout(cpuset, vm_cpuset->cpus, size); free(cpuset, M_TEMP); break; case VM_SUSPEND_CPU: error = vm_suspend_cpu(sc->vm, vcpu); break; case VM_RESUME_CPU: error = vm_resume_cpu(sc->vm, vcpu); break; case VM_SET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_exit_intinfo(vcpu, vmii->info1); break; case VM_GET_INTINFO: vmii = (struct vm_intinfo *)data; error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2); break; case VM_RTC_WRITE: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_write(sc->vm, rtcdata->offset, rtcdata->value); break; case VM_RTC_READ: rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_read(sc->vm, rtcdata->offset, &rtcdata->value); break; case VM_RTC_SETTIME: rtctime = (struct vm_rtc_time *)data; error = vrtc_set_time(sc->vm, rtctime->secs); break; case VM_RTC_GETTIME: error = 0; rtctime = (struct vm_rtc_time *)data; rtctime->secs = vrtc_get_time(sc->vm); break; case VM_RESTART_INSTRUCTION: error = vm_restart_instruction(vcpu); break; case VM_SET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; error = vm_set_topology(sc->vm, topology->sockets, topology->cores, topology->threads, topology->maxcpus); break; case VM_GET_TOPOLOGY: topology = (struct vm_cpu_topology *)data; vm_get_topology(sc->vm, &topology->sockets, &topology->cores, &topology->threads, &topology->maxcpus); error = 0; break; #ifdef BHYVE_SNAPSHOT case VM_SNAPSHOT_REQ: snapshot_meta = (struct vm_snapshot_meta *)data; error = vm_snapshot_req(sc->vm, snapshot_meta); break; +#ifdef COMPAT_FREEBSD13 + case VM_SNAPSHOT_REQ_OLD: + /* + * The old structure just has an additional pointer at + * the start that is ignored. + */ + snapshot_old = (struct vm_snapshot_meta_old *)data; + snapshot_meta = + (struct vm_snapshot_meta *)&snapshot_old->dev_data; + error = vm_snapshot_req(sc->vm, snapshot_meta); + break; +#endif case VM_RESTORE_TIME: error = vm_restore_time(sc->vm); break; #endif default: error = ENOTTY; break; } if (vcpus_locked == SINGLE) vcpu_unlock_one(sc, vcpuid, vcpu); else if (vcpus_locked == ALL) vcpu_unlock_all(sc); if (memsegs_locked) vm_unlock_memsegs(sc->vm); done: /* * Make sure that no handler returns a kernel-internal * error value to userspace. */ KASSERT(error == ERESTART || error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); return (error); } static int vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, struct vm_object **objp, int nprot) { struct vmmdev_softc *sc; vm_paddr_t gpa; size_t len; vm_ooffset_t segoff, first, last; int error, found, segid; bool sysmem; error = vmm_priv_check(curthread->td_ucred); if (error) return (error); first = *offset; last = first + mapsize; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); sc = vmmdev_lookup2(cdev); if (sc == NULL) { /* virtual machine is in the process of being created */ return (EINVAL); } /* * Get a read lock on the guest memory map. */ vm_slock_memsegs(sc->vm); gpa = 0; found = 0; while (!found) { error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, NULL, NULL); if (error) break; if (first >= gpa && last <= gpa + len) found = 1; else gpa += len; } if (found) { error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); KASSERT(error == 0 && *objp != NULL, ("%s: invalid memory segment %d", __func__, segid)); if (sysmem) { vm_object_reference(*objp); *offset = segoff + (first - gpa); } else { error = EINVAL; } } vm_unlock_memsegs(sc->vm); return (error); } static void vmmdev_destroy(void *arg) { struct vmmdev_softc *sc = arg; struct devmem_softc *dsc; int error __diagused; vm_disable_vcpu_creation(sc->vm); error = vcpu_lock_all(sc); KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); vm_unlock_vcpus(sc->vm); while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); SLIST_REMOVE_HEAD(&sc->devmem, link); free(dsc->name, M_VMMDEV); free(dsc, M_VMMDEV); } if (sc->cdev != NULL) destroy_dev(sc->cdev); if (sc->vm != NULL) vm_destroy(sc->vm); if (sc->ucred != NULL) crfree(sc->ucred); if ((sc->flags & VSC_LINKED) != 0) { mtx_lock(&vmmdev_mtx); SLIST_REMOVE(&head, sc, vmmdev_softc, link); mtx_unlock(&vmmdev_mtx); } free(sc, M_VMMDEV); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL || sc->cdev == NULL) { mtx_unlock(&vmmdev_mtx); error = EINVAL; goto out; } /* * Setting 'sc->cdev' to NULL is used to indicate that the VM * is scheduled for destruction. */ cdev = sc->cdev; sc->cdev = NULL; mtx_unlock(&vmmdev_mtx); /* * Destroy all cdevs: * * - any new operations on the 'cdev' will return an error (ENXIO). * * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ SLIST_FOREACH(dsc, &sc->devmem, link) { KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); destroy_dev(dsc->cdev); devmem_destroy(dsc); } destroy_dev(cdev); vmmdev_destroy(sc); error = 0; out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_destroy, "A", NULL); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_ioctl = vmmdev_ioctl, .d_mmap_single = vmmdev_mmap_single, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { struct vm *vm; struct cdev *cdev; struct vmmdev_softc *sc, *sc2; char *buf; int error, buflen; error = vmm_priv_check(req->td->td_ucred); if (error) return (error); buflen = VM_MAX_NAMELEN + 1; buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); strlcpy(buf, "beavis", buflen); error = sysctl_handle_string(oidp, buf, buflen, req); if (error != 0 || req->newptr == NULL) goto out; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); mtx_unlock(&vmmdev_mtx); if (sc != NULL) { error = EEXIST; goto out; } error = vm_create(buf, &vm); if (error != 0) goto out; sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->ucred = crhold(curthread->td_ucred); sc->vm = vm; SLIST_INIT(&sc->devmem); /* * Lookup the name again just in case somebody sneaked in when we * dropped the lock. */ mtx_lock(&vmmdev_mtx); sc2 = vmmdev_lookup(buf); if (sc2 == NULL) { SLIST_INSERT_HEAD(&head, sc, link); sc->flags |= VSC_LINKED; } mtx_unlock(&vmmdev_mtx); if (sc2 != NULL) { vmmdev_destroy(sc); error = EEXIST; goto out; } error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); if (error != 0) { vmmdev_destroy(sc); goto out; } mtx_lock(&vmmdev_mtx); sc->cdev = cdev; sc->cdev->si_drv1 = sc; mtx_unlock(&vmmdev_mtx); out: free(buf, M_VMMDEV); return (error); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, NULL, 0, sysctl_vmm_create, "A", NULL); void vmmdev_init(void) { pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, "Allow use of vmm in a jail."); } int vmmdev_cleanup(void) { int error; if (SLIST_EMPTY(&head)) error = 0; else error = EBUSY; return (error); } static int devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, struct vm_object **objp, int nprot) { struct devmem_softc *dsc; vm_ooffset_t first, last; size_t seglen; int error; bool sysmem; dsc = cdev->si_drv1; if (dsc == NULL) { /* 'cdev' has been created but is not ready for use */ return (ENXIO); } first = *offset; last = *offset + len; if ((nprot & PROT_EXEC) || first < 0 || first >= last) return (EINVAL); vm_slock_memsegs(dsc->sc->vm); error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); KASSERT(error == 0 && !sysmem && *objp != NULL, ("%s: invalid devmem segment %d", __func__, dsc->segid)); if (seglen >= last) vm_object_reference(*objp); else error = EINVAL; vm_unlock_memsegs(dsc->sc->vm); return (error); } static struct cdevsw devmemsw = { .d_name = "devmem", .d_version = D_VERSION, .d_mmap_single = devmem_mmap_single, }; static int devmem_create_cdev(const char *vmname, int segid, char *devname) { struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; int error; error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); if (error) return (error); dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(vmname); KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); if (sc->cdev == NULL) { /* virtual machine is being created or destroyed */ mtx_unlock(&vmmdev_mtx); free(dsc, M_VMMDEV); destroy_dev_sched_cb(cdev, NULL, 0); return (ENODEV); } dsc->segid = segid; dsc->name = devname; dsc->cdev = cdev; dsc->sc = sc; SLIST_INSERT_HEAD(&sc->devmem, dsc, link); mtx_unlock(&vmmdev_mtx); /* The 'cdev' is ready for use after 'si_drv1' is initialized */ cdev->si_drv1 = dsc; return (0); } static void devmem_destroy(void *arg) { struct devmem_softc *dsc = arg; KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); dsc->cdev = NULL; dsc->sc = NULL; } diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 57d8dd7aea60..d447a9ee60e4 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1627 +1,1627 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "bootrom.h" #include "config.h" #include "inout.h" #include "debug.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "kernemu_dev.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "qemu_fwcfg.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", [EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_exit *); int guest_ncpus; uint16_t cpu_cores, cpu_sockets, cpu_threads; int raw_stdio = 0; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); static struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; static struct vcpu_info { struct vmctx *ctx; struct vcpu *vcpu; int vcpuid; } *vcpu_info; static cpuset_t **vcpumap; static void usage(int code) { fprintf(stderr, "Usage: %s [-AaCDeHhPSuWwxY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" " -A: create ACPI tables\n" " -a: local apic is in xAPIC mode (deprecated)\n" " -C: include guest memory in core file\n" " -c: number of CPUs and/or topology specification\n" " -D: destroy on power-off\n" " -e: exit on unhandled I/O access\n" " -G: start a debug server\n" " -H: vmexit from the guest on HLT\n" " -h: help\n" " -k: key=value flat config file\n" " -K: PS2 keyboard layout\n" " -l: LPC device configuration\n" " -m: memory size\n" " -o: set config 'var' to 'value'\n" " -P: vmexit from the guest on pause\n" " -p: pin 'vcpu' to 'hostcpu'\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" #endif " -S: guest memory cannot be swapped\n" " -s: PCI slot config\n" " -U: UUID\n" " -u: RTC keeps UTC time\n" " -W: force virtio to use single-vector MSI\n" " -w: ignore unimplemented MSRs\n" " -x: local APIC is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. */ static int topology_parse(const char *opt) { char *cp, *str, *tofree; if (*opt == '\0') { set_config_value("sockets", "1"); set_config_value("cores", "1"); set_config_value("threads", "1"); set_config_value("cpus", "1"); return (0); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) set_config_value("cpus", cp + strlen("cpus=")); else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) set_config_value("sockets", cp + strlen("sockets=")); else if (strncmp(cp, "cores=", strlen("cores=")) == 0) set_config_value("cores", cp + strlen("cores=")); else if (strncmp(cp, "threads=", strlen("threads=")) == 0) set_config_value("threads", cp + strlen("threads=")); #ifdef notyet /* Do not expose this until vmm.ko implements it */ else if (strncmp(cp, "maxcpus=", strlen("maxcpus=")) == 0) set_config_value("maxcpus", cp + strlen("maxcpus=")); #endif else if (strchr(cp, '=') != NULL) goto out; else set_config_value("cpus", cp); } free(tofree); return (0); out: free(tofree); return (-1); } static int parse_int_value(const char *key, const char *value, int minval, int maxval) { char *cp; long lval; errno = 0; lval = strtol(value, &cp, 0); if (errno != 0 || *cp != '\0' || cp == value || lval < minval || lval > maxval) errx(4, "Invalid value for %s: '%s'", key, value); return (lval); } /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. * * The limits of UINT16_MAX are due to the types passed to * vm_set_topology(). vmm.ko may enforce tighter limits. */ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. */ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } static int pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", value != NULL ? "," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { return (get_config_bool_default("virtio_msix", true)); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct vcpu_info *vi = param; int error; snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); pthread_set_name_np(pthread_self(), tname); if (vcpumap[vi->vcpuid] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vi->vcpuid]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vi->vcpuid); #endif gdb_cpu_add(vi->vcpu); vm_loop(vi->ctx, vi->vcpu); /* not reached */ exit(1); return (NULL); } static void fbsdrun_addcpu(struct vcpu_info *vi) { pthread_t thr; int error; error = vm_activate_cpu(vi->vcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); CPU_SET_ATOMIC(vi->vcpuid, &cpumask); vm_suspend_cpu(vi->vcpu); error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); assert(error == 0); } static int fbsdrun_deletecpu(int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_exit *vme __unused, uint32_t eax __unused) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme) { int error; int bytes, port, in, out; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vcpu, vme, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vme->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(vcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) { int error; error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme) { fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason, vmexit_vmx_desc(vme->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vme->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(vcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) { fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_exit *vme) { assert(vme->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_exit *vme) { assert(vme->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_exit *vme __unused) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_exit *vme __unused) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) { assert(vme->inst_length == 0); stats.vmexit_mtrap++; #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_mtrap(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) { int err, i, cs_d; struct vie *vie; enum vm_cpu_mode mode; stats.vmexit_inst_emul++; vie = &vme->u.inst_emul.vie; if (!vie->decoded) { /* * Attempt to decode in userspace as a fallback. This allows * updating instruction decode in bhyve without rebooting the * kernel (rapid prototyping), albeit with much slower * emulation. */ vie_restart(vie); mode = vme->u.inst_emul.paging.cpu_mode; cs_d = vme->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; if (vm_set_register(vcpu, VM_REG_GUEST_RIP, vme->rip + vie->num_processed) != 0) goto fail; } err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vme->rip); return (VMEXIT_ABORT); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vme) { enum vm_suspend_how how; int vcpuid = vcpu_id(vcpu); how = vme->u.suspended.how; fbsdrun_deletecpu(vcpuid); if (vcpuid != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme __unused) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_suspend(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif /* * XXX-MJ sleep for a short period to avoid chewing up the CPU in the * window between activation of the vCPU thread and the STARTUP IPI. */ usleep(1000); return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_exit *vme) { gdb_cpu_breakpoint(vcpu, vme); return (VMEXIT_CONTINUE); } static int vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_exit *vme) { int error = -1; int i; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) { error = vm_suspend_cpu(vcpu_info[i].vcpu); if (error) { warnx("%s: failed to suspend cpu %d\n", __func__, i); break; } } break; case APIC_DELMODE_STARTUP: CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) { spinup_ap(vcpu_info[i].vcpu, vme->u.ipi.vector << PAGE_SHIFT); } error = 0; break; default: break; } return (error); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, [VM_EXITCODE_IPI] = vmexit_ipi, }; static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu) { struct vm_exit vme; int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); while (1) { error = vm_run(vcpu, &vme); if (error != 0) break; exitcode = vme.exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } rc = (*handler[exitcode])(ctx, vcpu, &vme); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static void fbsdrun_set_capabilities(struct vcpu *vcpu, bool bsp) { int err, tmp; if (get_config_bool_default("x86.vmexit_on_hlt", false)) { err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1); if (bsp) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1); if (bsp) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (get_config_bool_default("x86.x2apic", false)) err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1); err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1); assert(err == 0); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0 /* maxcpus, unimplemented */); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } static void spinup_vcpu(struct vcpu_info *vi, bool bsp) { int error; if (!bsp) { fbsdrun_set_capabilities(vi->vcpu, false); /* * Enable the 'unrestricted guest' mode for APs. * * APs startup in power-on 16-bit mode. */ error = vm_set_capability(vi->vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); } fbsdrun_addcpu(vi); } static bool parse_config_option(const char *option) { const char *value; char *path; value = strchr(option, '='); if (value == NULL || value[1] == '\0') return (false); path = strndup(option, value - option); if (path == NULL) err(4, "Failed to allocate memory"); set_config_value(path, value + 1); return (true); } static void parse_simple_config_file(const char *path) { FILE *fp; char *line, *cp; size_t linecap; unsigned int lineno; fp = fopen(path, "r"); if (fp == NULL) err(4, "Failed to open configuration file %s", path); line = NULL; linecap = 0; lineno = 1; for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { if (*line == '#' || *line == '\n') continue; cp = strchr(line, '\n'); if (cp != NULL) *cp = '\0'; if (!parse_config_option(line)) errx(4, "%s line %u: invalid config option '%s'", path, lineno, line); } free(line); fclose(fp); } static void parse_gdb_options(const char *opt) { const char *sport; char *colon; if (opt[0] == 'w') { set_config_bool("gdb.wait", true); opt++; } colon = strrchr(opt, ':'); if (colon == NULL) { sport = opt; } else { *colon = '\0'; colon++; sport = colon; set_config_value("gdb.address", opt); } set_config_value("gdb.port", sport); } static void set_defaults(void) { set_config_bool("acpi_tables", false); set_config_value("memory.size", "256M"); set_config_bool("x86.strictmsr", true); set_config_value("lpc.fwcfg", "bhyve"); } int main(int argc, char *argv[]) { int c, error; int max_vcpus, memflags; struct vcpu *bsp; struct vmctx *ctx; uint64_t rip; size_t memsize; const char *optstr, *value, *vmname; #ifdef BHYVE_SNAPSHOT char *restore_file; struct restore_state rstate; restore_file = NULL; #endif init_config(); set_defaults(); progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:K:U:r:"; #else optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:K:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': set_config_bool("x86.x2apic", false); break; case 'A': set_config_bool("acpi_tables", true); break; case 'D': set_config_bool("destroy_on_poweroff", true); break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': set_config_bool("memory.guest_in_core", true); break; case 'G': parse_gdb_options(optarg); break; case 'k': parse_simple_config_file(optarg); break; case 'K': set_config_value("keyboard.layout", optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': set_config_bool("memory.wired", true); break; case 'm': set_config_value("memory.size", optarg); break; case 'o': if (!parse_config_option(optarg)) errx(EX_USAGE, "invalid configuration option '%s'", optarg); break; case 'H': set_config_bool("x86.vmexit_on_hlt", true); break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': set_config_bool("x86.vmexit_on_pause", true); break; case 'e': set_config_bool("x86.strictio", true); break; case 'u': set_config_bool("rtc.use_localtime", false); break; case 'U': set_config_value("uuid", optarg); break; case 'w': set_config_bool("x86.strictmsr", false); break; case 'W': set_config_bool("virtio_msix", false); break; case 'x': set_config_bool("x86.x2apic", true); break; case 'Y': set_config_bool("x86.mptable", false); break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc > 1) usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif bsp = vm_vcpu_open(ctx, BSP); max_vcpus = num_vcpus_allowed(ctx, bsp); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(bsp, true); /* Allocate per-VCPU resources. */ vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { vcpu_info[vcpuid].ctx = ctx; vcpu_info[vcpuid].vcpuid = vcpuid; if (vcpuid == BSP) vcpu_info[vcpuid].vcpu = bsp; else vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); } memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(guest_ncpus); init_inout(); kernemu_dev_init(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx); sci_init(ctx); if (qemu_fwcfg_init(ctx) != 0) { fprintf(stderr, "qemu fwcfg initialization error"); exit(4); } if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), &guest_ncpus) != 0) { fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu"); exit(4); } /* * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); init_gdb(ctx); if (lpc_bootrom()) { if (vm_set_capability(bsp, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(bsp); assert(error == 0); } /* * Add all vCPUs. */ for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) spinup_vcpu(&vcpu_info[vcpuid], vcpuid == BSP); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { fprintf(stdout, "Pausing pci devs...\r\n"); if (vm_pause_user_devs() != 0) { fprintf(stderr, "Failed to pause PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring vm mem...\r\n"); if (restore_vm_mem(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore VM memory.\n"); exit(1); } fprintf(stdout, "Restoring pci devs...\r\n"); - if (vm_restore_user_devs(ctx, &rstate) != 0) { + if (vm_restore_user_devs(&rstate) != 0) { fprintf(stderr, "Failed to restore PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring kernel structs...\r\n"); if (vm_restore_kern_structs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore kernel structs.\n"); exit(1); } fprintf(stdout, "Resuming pci devs...\r\n"); if (vm_resume_user_devs() != 0) { fprintf(stderr, "Failed to resume PCI device state.\n"); exit(1); } } #endif error = vm_get_register(bsp, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); if (error != 0) exit(4); if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) { fwctl_init(); } /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifdef BHYVE_SNAPSHOT /* initialize mutex/cond variables */ init_snapshot(); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) != 0) errx(EX_OSERR, "Failed to start checkpoint thread"); #endif #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { destroy_restore_state(&rstate); if (vm_restore_time(ctx) < 0) err(EX_OSERR, "Unable to restore time"); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) vm_resume_cpu(vcpu_info[vcpuid].vcpu); } else #endif vm_resume_cpu(bsp); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 9c023d93cab9..6b6a91325f72 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -1,2738 +1,2739 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Zhixiang Yu * Copyright (c) 2015-2016 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include -#include - #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" +#ifdef BHYVE_SNAPSHOT +#include "snapshot.h" +#endif #include "ahci.h" #include "block_if.h" #define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ enum sata_fis_type { FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ }; /* * SCSI opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define INQUIRY 0x12 #define START_STOP_UNIT 0x1B #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define POSITION_TO_ELEMENT 0x2B #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A #define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE /* * SCSI mode page codes */ #define MODEPAGE_RW_ERROR_RECOVERY 0x01 #define MODEPAGE_CD_CAPABILITIES 0x2A /* * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 #define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* * Debug printf */ #ifdef AHCI_DEBUG static FILE *dbg; #define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) #else #define DPRINTF(format, arg...) #endif #define WPRINTF(format, arg...) printf(format, ##arg) #define AHCI_PORT_IDENT 20 + 1 struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; STAILQ_ENTRY(ahci_ioreq) io_flist; TAILQ_ENTRY(ahci_ioreq) io_blist; uint8_t *cfis; uint32_t len; uint32_t done; int slot; int more; int readop; }; struct ahci_port { struct blockif_ctxt *bctx; struct pci_ahci_softc *pr_sc; struct ata_params ata_ident; uint8_t *cmd_lst; uint8_t *rfis; int port; int atapi; int reset; int waitforclear; int mult_sectors; uint8_t xfermode; uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; u_int ccs; uint32_t pending; uint32_t clb; uint32_t clbu; uint32_t fb; uint32_t fbu; uint32_t is; uint32_t ie; uint32_t cmd; uint32_t unused0; uint32_t tfd; uint32_t sig; uint32_t ssts; uint32_t sctl; uint32_t serr; uint32_t sact; uint32_t ci; uint32_t sntf; uint32_t fbs; /* * i/o request info */ struct ahci_ioreq *ioreq; int ioqsz; STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; }; struct ahci_cmd_hdr { uint16_t flags; uint16_t prdtl; uint32_t prdbc; uint64_t ctba; uint32_t reserved[4]; }; struct ahci_prdt_entry { uint64_t dba; uint32_t reserved; #define DBCMASK 0x3fffff uint32_t dbc; }; struct pci_ahci_softc { struct pci_devinst *asc_pi; pthread_mutex_t mtx; int ports; uint32_t cap; uint32_t ghc; uint32_t is; uint32_t pi; uint32_t vs; uint32_t ccc_ctl; uint32_t ccc_pts; uint32_t em_loc; uint32_t em_ctl; uint32_t cap2; uint32_t bohc; uint32_t lintr; struct ahci_port port[MAX_PORTS]; }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) static void ahci_handle_port(struct ahci_port *p); static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; buf[0] = (lba / 75) / 60; buf[1] = (lba / 75) % 60; buf[2] = lba % 75; } /* * Generate HBA interrupts on global IS register write. */ static void ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { struct pci_devinst *pi = sc->asc_pi; struct ahci_port *p; int i, nmsg; uint32_t mmask; /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { p = &sc->port[i]; if (p->is & p->ie) sc->is |= (1 << i); } DPRINTF("%s(%08x) %08x", __func__, mask, sc->is); /* If there is nothing enabled -- clear legacy interrupt and exit. */ if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { if (sc->lintr) { pci_lintr_deassert(pi); sc->lintr = 0; } return; } /* If there is anything and no MSI -- assert legacy interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (nmsg == 0) { if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } return; } /* Assert respective MSIs for ports that were touched. */ for (i = 0; i < nmsg; i++) { if (sc->ports <= nmsg || i < nmsg - 1) mmask = 1 << i; else mmask = 0xffffffff << i; if (sc->is & mask && mmask & mask) pci_generate_msi(pi, i); } } /* * Generate HBA interrupt on specific port event. */ static void ahci_port_intr(struct ahci_port *p) { struct pci_ahci_softc *sc = p->pr_sc; struct pci_devinst *pi = sc->asc_pi; int nmsg; DPRINTF("%s(%d) %08x/%08x %08x", __func__, p->port, p->is, p->ie, sc->is); /* If there is nothing enabled -- we are done. */ if ((p->is & p->ie) == 0) return; /* In case of non-shared MSI always generate interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (sc->ports <= nmsg || p->port < nmsg - 1) { sc->is |= (1 << p->port); if ((sc->ghc & AHCI_GHC_IE) == 0) return; pci_generate_msi(pi, p->port); return; } /* If IS for this port is already set -- do nothing. */ if (sc->is & (1 << p->port)) return; sc->is |= (1 << p->port); /* If interrupts are enabled -- generate one. */ if ((sc->ghc & AHCI_GHC_IE) == 0) return; if (nmsg > 0) { pci_generate_msi(pi, nmsg - 1); } else if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } } static void ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) { int offset, len, irq; if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) return; switch (ft) { case FIS_TYPE_REGD2H: offset = 0x40; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d", ft); return; } if (fis[2] & ATA_S_ERROR) { p->waitforclear = 1; irq |= AHCI_P_IX_TFE; } memcpy(p->rfis + offset, fis, len); if (irq) { if (~p->is & irq) { p->is |= irq; ahci_port_intr(p); } } } static void ahci_write_fis_piosetup(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_PIOSETUP; ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); } static void ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; tfd &= 0x77; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_SETDEVBITS; fis[1] = (1 << 6); fis[2] = tfd; fis[3] = error; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = slot; p->err_cfis[2] = tfd; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else { *(uint32_t *)(fis + 4) = (1 << slot); p->sact &= ~(1 << slot); } p->tfd &= ~0x77; p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } static void ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[20]; uint8_t error; error = (tfd >> 8) & 0xff; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = (1 << 6); fis[2] = tfd & 0xff; fis[3] = error; fis[4] = cfis[4]; fis[5] = cfis[5]; fis[6] = cfis[6]; fis[7] = cfis[7]; fis[8] = cfis[8]; fis[9] = cfis[9]; fis[10] = cfis[10]; fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = 0x80; p->err_cfis[2] = tfd & 0xff; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) { uint8_t fis[20]; p->tfd = ATA_S_READY | ATA_S_DSC; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = 0; /* No interrupt */ fis[2] = p->tfd; /* Status */ fis[3] = 0; /* No error */ p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[3] = 1; fis[4] = 1; if (p->atapi) { fis[5] = 0x14; fis[6] = 0xeb; } fis[12] = 1; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_check_stopped(struct ahci_port *p) { /* * If we are no longer processing the command list and nothing * is in-flight, clear the running bit, the current command * slot, the command issue and active bits. */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; p->waitforclear = 0; } } } static void ahci_port_stop(struct ahci_port *p) { struct ahci_ioreq *aior; uint8_t *cfis; int slot; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); TAILQ_FOREACH(aior, &p->iobhd, io_blist) { /* * Try to cancel the outstanding blockif request. */ error = blockif_cancel(p->bctx, &aior->io_req); if (error != 0) continue; slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); /* * This command is now done. */ p->pending &= ~(1 << slot); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); } ahci_check_stopped(p); } static void ahci_port_reset(struct ahci_port *pr) { pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; pr->mult_sectors = 128; if (!pr->bctx) { pr->ssts = ATA_SS_DET_NO_DEVICE; pr->sig = 0xFFFFFFFF; pr->tfd = 0x7F; return; } pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; if (pr->sctl & ATA_SC_SPD_MASK) pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); else pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; pr->tfd |= ATA_S_READY; } else pr->sig = PxSIG_ATAPI; ahci_write_reset_fis_d2h(pr); } static void ahci_reset(struct pci_ahci_softc *sc) { int i; sc->ghc = AHCI_GHC_AE; sc->is = 0; if (sc->lintr) { pci_lintr_deassert(sc->asc_pi); sc->lintr = 0; } for (i = 0; i < sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); if (sc->port[i].bctx) sc->port[i].cmd |= AHCI_P_CMD_CPS; sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } static void ata_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i ^ 1] = *src++; else dest[i ^ 1] = ' '; } } static void atapi_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i] = *src++; else dest[i] = ' '; } } /* * Build up the iovec based on the PRDT, 'done' and 'len'. */ static void ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, struct ahci_prdt_entry *prdt, uint16_t prdtl) { struct blockif_req *breq = &aior->io_req; uint32_t dbcsz, extra, left, skip, todo; int i, j; assert(aior->len >= aior->done); /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ skip = aior->done; left = aior->len - aior->done; todo = 0; for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; i++, prdt++) { dbcsz = (prdt->dbc & DBCMASK) + 1; /* Skip already done part of the PRDT */ if (dbcsz <= skip) { skip -= dbcsz; continue; } dbcsz -= skip; if (dbcsz > left) dbcsz = left; breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba + skip, dbcsz); breq->br_iov[j].iov_len = dbcsz; todo += dbcsz; left -= dbcsz; skip = 0; j++; } /* If we got limited by IOV length, round I/O down to sector size. */ if (j == BLOCKIF_IOV_MAX) { extra = todo % blockif_sectsz(p->bctx); todo -= extra; assert(todo > 0); while (extra > 0) { if (breq->br_iov[j - 1].iov_len > extra) { breq->br_iov[j - 1].iov_len -= extra; break; } extra -= breq->br_iov[j - 1].iov_len; j--; } } breq->br_iovcnt = j; breq->br_resid = todo; aior->done += todo; aior->more = (aior->done < aior->len && i < prdtl); } static void ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; int err, first, ncq, readop; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; first = (done == 0); if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[11] << 8 | cfis[3]; if (!len) len = 65536; ncq = 1; } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[13] << 8 | cfis[12]; if (!len) len = 65536; } else { lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | (cfis[5] << 8) | cfis[4]; len = cfis[12]; if (!len) len = 256; } lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); /* Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = readop; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); } static void ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_ioreq *aior; struct blockif_req *breq; int err; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = 0; aior->done = 0; aior->more = 0; breq = &aior->io_req; /* * Mark this command in-flight. */ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_flush(p->bctx, breq); assert(err == 0); } static inline void read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, unsigned int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; uint8_t *to; unsigned int len; int i; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; to = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; unsigned int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(to, ptr, sublen); len -= sublen; to += sublen; prdt++; } } static void ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; uint8_t *entry; uint64_t elba; uint32_t len, elen; int err, first, ncq; uint8_t buf[512]; first = (done == 0); if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { len = (uint16_t)cfis[13] << 8 | cfis[12]; len *= 512; ncq = 0; } else { /* ATA_SEND_FPDMA_QUEUED */ len = (uint16_t)cfis[11] << 8 | cfis[3]; len *= 512; ncq = 1; } read_prdt(p, slot, cfis, buf, sizeof(buf)); next: entry = &buf[done]; elba = ((uint64_t)entry[5] << 40) | ((uint64_t)entry[4] << 32) | ((uint64_t)entry[3] << 24) | ((uint64_t)entry[2] << 16) | ((uint64_t)entry[1] << 8) | entry[0]; elen = (uint16_t)entry[7] << 8 | entry[6]; done += 8; if (elen == 0) { if (done >= len) { if (ncq) { if (first) ahci_write_fis_d2h_ncq(p, slot); ahci_write_fis_sdb(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } else { ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } p->pending &= ~(1 << slot); ahci_check_stopped(p); if (!first) ahci_handle_port(p); return; } goto next; } /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->more = (len != done); breq = &aior->io_req; breq->br_offset = elba * blockif_sectsz(p->bctx); breq->br_resid = elen * blockif_sectsz(p->bctx); /* * Mark this command in-flight. */ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); err = blockif_delete(p->bctx, breq); assert(err == 0); } static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, unsigned int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; uint8_t *from; unsigned int len; int i; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; from = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; prdt++; } hdr->prdbc = size - len; } static void ahci_checksum(uint8_t *buf, int size) { int i; uint8_t sum = 0; for (i = 0; i < size - 1; i++) sum += buf[i]; buf[size - 1] = 0x100 - sum; } static void ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; uint32_t buf[128]; uint8_t *buf8 = (uint8_t *)buf; uint16_t *buf16 = (uint16_t *)buf; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } memset(buf, 0, sizeof(buf)); if (cfis[4] == 0x00) { /* Log directory */ buf16[0x00] = 1; /* Version -- 1 */ buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); ahci_checksum(buf8, sizeof(buf)); } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { buf[0x00] = 1; /* SFQ DSM supported */ buf[0x01] = 1; /* SFQ DSM TRIM supported */ } } else { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } if (cfis[2] == ATA_READ_LOG_EXT) ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void*)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void ata_identify_init(struct ahci_port* p, int atapi) { struct ata_params* ata_ident = &p->ata_ident; if (atapi) { ata_ident->config = ATA_PROTO_ATAPI | ATA_ATAPI_TYPE_CDROM | ATA_ATAPI_REMOVABLE | ATA_DRQ_FAST; ata_ident->capabilities1 = ATA_SUPPORT_LBA | ATA_SUPPORT_DMA; ata_ident->capabilities2 = (1 << 14 | 1); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; ata_ident->obsolete62 = 0x3f; ata_ident->mwdmamodes = 7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); ata_ident->satacapabilities2 = ((p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->satasupport = ATA_SUPPORT_NCQ_STREAM; ata_ident->version_major = 0x3f0; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->support.command2 = (1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } else { uint64_t sectors; int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; ro = blockif_is_ro(p->bctx); candelete = blockif_candelete(p->bctx); sectsz = blockif_sectsz(p->bctx); sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); blockif_psectsz(p->bctx, &psectsz, &psectoff); ata_ident->config = ATA_DRQ_FAST; ata_ident->cylinders = cyl; ata_ident->heads = heads; ata_ident->sectors = sech; ata_ident->sectors_intr = (0x8000 | 128); ata_ident->tcg = 0; ata_ident->capabilities1 = ATA_SUPPORT_DMA | ATA_SUPPORT_LBA | ATA_SUPPORT_IORDY; ata_ident->capabilities2 = (1 << 14); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; if (p->mult_sectors) ata_ident->multi = (ATA_MULTI_VALID | p->mult_sectors); if (sectors <= 0x0fffffff) { ata_ident->lba_size_1 = sectors; ata_ident->lba_size_2 = (sectors >> 16); } else { ata_ident->lba_size_1 = 0xffff; ata_ident->lba_size_2 = 0x0fff; } ata_ident->mwdmamodes = 0x7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 0x3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->support3 = 0; ata_ident->queue = 31; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | ATA_SUPPORT_NCQ); ata_ident->satacapabilities2 = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | (p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->version_major = 0x3f0; ata_ident->version_minor = 0x28; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->support.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->enabled.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->lba_size48_1 = sectors; ata_ident->lba_size48_2 = (sectors >> 16); ata_ident->lba_size48_3 = (sectors >> 32); ata_ident->lba_size48_4 = (sectors >> 48); if (candelete && !ro) { ata_ident->support3 |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; ata_ident->max_dsm_blocks = 1; ata_ident->support_dsm = ATA_SUPPORT_DSM_TRIM; } ata_ident->pss = ATA_PSS_VALID_VALUE; ata_ident->lsalign = 0x4000; if (psectsz > sectsz) { ata_ident->pss |= ATA_PSS_MULTLS; ata_ident->pss |= ffsl(psectsz / sectsz) - 1; ata_ident->lsalign |= (psectoff / sectsz); } if (sectsz > 512) { ata_ident->pss |= ATA_PSS_LSSABOVE512; ata_ident->lss_1 = sectsz / 2; ata_ident->lss_2 = ((sectsz / 2) >> 16); } ata_ident->support2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->enabled2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } ahci_checksum((uint8_t*)ata_ident, sizeof(struct ata_params)); } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[36]; uint8_t *acmd; unsigned int len; uint32_t tfd; acmd = cfis + 0x40; if (acmd[1] & 1) { /* VPD */ if (acmd[2] == 0) { /* Supported VPD pages */ buf[0] = 0x05; buf[1] = 0; buf[2] = 0; buf[3] = 1; buf[4] = 0; len = 4 + buf[3]; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } } else { buf[0] = 0x05; buf[1] = 0x80; buf[2] = 0x00; buf[3] = 0x21; buf[4] = 31; buf[5] = 0; buf[6] = 0; buf[7] = 0; atapi_string(buf + 8, "BHYVE", 8); atapi_string(buf + 16, "BHYVE DVD-ROM", 16); atapi_string(buf + 32, "001", 4); len = sizeof(buf); } if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, len); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[8]; uint64_t sectors; sectors = blockif_size(p->bctx) / 2048; be32enc(buf, sectors - 1); be32enc(buf + 4, 2048); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint8_t format; unsigned int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); format = acmd[9] >> 6; switch (format) { case 0: { size_t size; int msf; uint64_t sectors; uint8_t start_track, buf[20], *bp; msf = (acmd[1] >> 1) & 1; start_track = acmd[6]; if (start_track > 1 && start_track != 0xaa) { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } bp = buf + 2; *bp++ = 1; *bp++ = 1; if (start_track <= 1) { *bp++ = 0; *bp++ = 0x14; *bp++ = 1; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } } *bp++ = 0; *bp++ = 0x14; *bp++ = 0xaa; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 1: { uint8_t buf[12]; memset(buf, 0, sizeof(buf)); buf[1] = 0xa; buf[2] = 0x1; buf[3] = 0x1; if (len > sizeof(buf)) len = sizeof(buf); write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 2: { size_t size; int msf; uint64_t sectors; uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) & 1; bp = buf + 2; *bp++ = 1; *bp++ = 1; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa1; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa2; *bp++ = 0; *bp++ = 0; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } default: { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); break; } } } static void atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[16]; memset(buf, 0, sizeof(buf)); buf[3] = 8; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; uint8_t *acmd; uint64_t lba; uint32_t len; int err; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); else len = be32dec(acmd + 6); if (len == 0) { cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } lba *= 2048; len *= 2048; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = 1; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_read(p->bctx, breq); assert(err == 0); } static void atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[64]; uint8_t *acmd; unsigned int len; acmd = cfis + 0x40; len = acmd[4]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, len); buf[0] = 0x70 | (1 << 7); buf[2] = p->sense_key; buf[7] = 10; buf[12] = p->asc; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd = cfis + 0x40; uint32_t tfd; switch (acmd[4] & 3) { case 0: case 1: case 3: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; tfd = ATA_S_READY | ATA_S_DSC; break; case 2: /* TODO eject media */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x53; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; uint8_t pc, code; unsigned int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); pc = acmd[2] >> 6; code = acmd[2] & 0x3f; switch (pc) { case 0: switch (code) { case MODEPAGE_RW_ERROR_RECOVERY: { uint8_t buf[16]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 16 - 2); buf[2] = 0x70; buf[8] = 0x01; buf[9] = 16 - 10; buf[11] = 0x05; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } case MODEPAGE_CD_CAPABILITIES: { uint8_t buf[30]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 30 - 2); buf[2] = 0x70; buf[8] = 0x2A; buf[9] = 30 - 10; buf[10] = 0x08; buf[12] = 0x71; be16enc(&buf[18], 2); be16enc(&buf[20], 512); write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } default: goto error; break; } break; case 3: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x39; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; error: case 1: case 2: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_get_event_status_notification(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; acmd = cfis + 0x40; /* we don't support asynchronous operation */ if (!(acmd[1] & 1)) { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } else { uint8_t buf[8]; unsigned int len; len = be16dec(acmd + 7); if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 8 - 2); buf[2] = 0x04; buf[3] = 0x10; buf[5] = 0x02; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; acmd = cfis + 0x40; #ifdef AHCI_DEBUG { int i; DPRINTF("ACMD:"); for (i = 0; i < 16; i++) DPRINTF("%02x ", acmd[i]); DPRINTF(""); } #endif switch (acmd[0]) { case TEST_UNIT_READY: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case INQUIRY: atapi_inquiry(p, slot, cfis); break; case READ_CAPACITY: atapi_read_capacity(p, slot, cfis); break; case PREVENT_ALLOW: /* TODO */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case READ_TOC: atapi_read_toc(p, slot, cfis); break; case REPORT_LUNS: atapi_report_luns(p, slot, cfis); break; case READ_10: case READ_12: atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); break; case START_STOP_UNIT: atapi_start_stop_unit(p, slot, cfis); break; case MODE_SENSE_10: atapi_mode_sense(p, slot, cfis); break; case GET_EVENT_STATUS_NOTIFICATION: atapi_get_event_status_notification(p, slot, cfis); break; default: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x20; ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); break; case ATA_SETFEATURES: { switch (cfis[3]) { case ATA_SF_ENAB_SATA_SF: switch (cfis[12]) { case ATA_SATA_SF_AN: p->tfd = ATA_S_DSC | ATA_S_READY; break; default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } break; case ATA_SF_ENAB_WCACHE: case ATA_SF_DIS_WCACHE: case ATA_SF_ENAB_RCACHE: case ATA_SF_DIS_RCACHE: p->tfd = ATA_S_DSC | ATA_S_READY; break; case ATA_SF_SETXFER: { switch (cfis[12] & 0xf8) { case ATA_PIO: case ATA_PIO0: break; case ATA_WDMA0: case ATA_UDMA0: p->xfermode = (cfis[12] & 0x7); break; } p->tfd = ATA_S_DSC | ATA_S_READY; break; } default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; } case ATA_SET_MULTI: if (cfis[12] != 0 && (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); } else { p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; case ATA_READ: case ATA_WRITE: case ATA_READ48: case ATA_WRITE48: case ATA_READ_MUL: case ATA_WRITE_MUL: case ATA_READ_MUL48: case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; case ATA_DATA_SET_MANAGEMENT: if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && cfis[13] == 0 && cfis[12] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_SEND_FPDMA_QUEUED: if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && cfis[11] == 0 && cfis[3] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_READ_LOG_EXT: case ATA_READ_LOG_DMA_EXT: ahci_handle_read_log(p, slot, cfis); break; case ATA_SECURITY_FREEZE_LOCK: case ATA_SMART_CMD: case ATA_NOP: ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_CHECK_POWER_MODE: cfis[12] = 0xff; /* always on */ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: case ATA_READ_VERIFY: case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_ATAPI_IDENTIFY: handle_atapi_identify(p, slot, cfis); break; case ATA_PACKET_CMD: if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: WPRINTF("Unsupported cmd:%02x", cfis[2]); ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; #ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; #endif struct pci_ahci_softc *sc; uint8_t *cfis; #ifdef AHCI_DEBUG int cfl, i; #endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); #ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; #endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); #ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); DPRINTF("cfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) DPRINTF(""); DPRINTF("%02x ", cfis[i]); } DPRINTF(""); for (i = 0; i < hdr->prdtl; i++) { DPRINTF("%d@%08"PRIx64"", prdt->dbc & 0x3fffff, prdt->dba); prdt++; } #endif if (cfis[0] != FIS_TYPE_REGH2D) { WPRINTF("Not a H2D FIS:%02x", cfis[0]); return; } if (cfis[1] & 0x80) { ahci_handle_cmd(p, slot, cfis); } else { if (cfis[15] & (1 << 2)) p->reset = 1; else if (p->reset) { p->reset = 0; ahci_port_reset(p); } p->ci &= ~(1 << slot); } } static void ahci_handle_port(struct ahci_port *p) { if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that * are already in-flight. Stop if device is busy or in error. */ for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) break; if (p->waitforclear) break; if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; ahci_handle_slot(p, p->ccs); } } } /* * blockif callback routine - this runs in the context of the blockif * i/o thread, so the mutex needs to be acquired. */ static void ata_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; int slot, ncq, dsm; DPRINTF("%s %d", __func__, err); ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; if (cfis[2] == ATA_DATA_SET_MANAGEMENT || (cfis[2] == ATA_SEND_FPDMA_QUEUED && (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) dsm = 1; pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { if (dsm) ahci_handle_dsm_trim(p, slot, cfis, aior->done); else ahci_handle_rw(p, slot, cfis, aior->done); goto out; } if (!err) tfd = ATA_S_READY | ATA_S_DSC; else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; if (ncq) ahci_write_fis_sdb(p, slot, cfis, tfd); else ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void atapi_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; int slot; DPRINTF("%s %d", __func__, err); aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { atapi_read(p, slot, cfis, aior->done); goto out; } if (!err) { tfd = ATA_S_READY | ATA_S_DSC; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void pci_ahci_ioreq_init(struct ahci_port *pr) { struct ahci_ioreq *vr; int i; pr->ioqsz = blockif_queuesz(pr->bctx); pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); STAILQ_INIT(&pr->iofhd); /* * Add all i/o request entries to the free queue */ for (i = 0; i < pr->ioqsz; i++) { vr = &pr->ioreq[i]; vr->io_pr = pr; if (!pr->atapi) vr->io_req.br_callback = ata_ioreq_cb; else vr->io_req.br_callback = atapi_ioreq_cb; vr->io_req.br_param = vr; STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); } TAILQ_INIT(&pr->iobhd); } static void pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; struct ahci_port *p = &sc->port[port]; DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"", port, offset, value); switch (offset) { case AHCI_P_CLB: p->clb = value; break; case AHCI_P_CLBU: p->clbu = value; break; case AHCI_P_FB: p->fb = value; break; case AHCI_P_FBU: p->fbu = value; break; case AHCI_P_IS: p->is &= ~value; ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; ahci_port_intr(p); break; case AHCI_P_CMD: { p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { uint64_t clb; p->cmd |= AHCI_P_CMD_CR; clb = (uint64_t)p->clbu << 32 | p->clb; p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, AHCI_CL_SIZE * AHCI_MAX_SLOTS); } if (value & AHCI_P_CMD_FRE) { uint64_t fb; p->cmd |= AHCI_P_CMD_FR; fb = (uint64_t)p->fbu << 32 | p->fb; /* we don't support FBSCP, so rfis size is 256Bytes */ p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); } else { p->cmd &= ~AHCI_P_CMD_FR; } if (value & AHCI_P_CMD_CLO) { p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } if (value & AHCI_P_CMD_ICC_MASK) { p->cmd &= ~AHCI_P_CMD_ICC_MASK; } ahci_handle_port(p); break; } case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"", offset); break; case AHCI_P_SCTL: p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); } break; case AHCI_P_SERR: p->serr &= ~value; break; case AHCI_P_SACT: p->sact |= value; break; case AHCI_P_CI: p->ci |= value; ahci_handle_port(p); break; case AHCI_P_SNTF: case AHCI_P_FBS: default: break; } } static void pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"", offset, value); switch (offset) { case AHCI_CAP: case AHCI_PI: case AHCI_VS: case AHCI_CAP2: DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"", offset); break; case AHCI_GHC: if (value & AHCI_GHC_HR) { ahci_reset(sc); break; } if (value & AHCI_GHC_IE) sc->ghc |= AHCI_GHC_IE; else sc->ghc &= ~AHCI_GHC_IE; ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; ahci_generate_intr(sc, value); break; default: break; } } static void pci_ahci_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); if (offset < AHCI_OFFSET) pci_ahci_host_write(sc, offset, value); else if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) pci_ahci_port_write(sc, offset, value); else WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"", offset); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; switch (offset) { case AHCI_CAP: case AHCI_GHC: case AHCI_IS: case AHCI_PI: case AHCI_VS: case AHCI_CCCC: case AHCI_CCCP: case AHCI_EM_LOC: case AHCI_EM_CTL: case AHCI_CAP2: { uint32_t *p = &sc->cap; p += (offset - AHCI_CAP) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x", offset, value); return (value); } static uint64_t pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; switch (offset) { case AHCI_P_CLB: case AHCI_P_CLBU: case AHCI_P_FB: case AHCI_P_FBU: case AHCI_P_IS: case AHCI_P_IE: case AHCI_P_CMD: case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: case AHCI_P_SCTL: case AHCI_P_SERR: case AHCI_P_SACT: case AHCI_P_CI: case AHCI_P_SNTF: case AHCI_P_FBS: { uint32_t *p= &sc->port[port].clb; p += (offset - AHCI_P_CLB) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x", port, offset, value); return value; } static uint64_t pci_ahci_read(struct pci_devinst *pi, int baridx, uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; uint64_t offset; uint32_t value; assert(baridx == 5); assert(size == 1 || size == 2 || size == 4); assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"", regoff); } value >>= 8 * (regoff & 0x3); pthread_mutex_unlock(&sc->mtx); return (value); } /* * Each AHCI controller has a "port" node which contains nodes for * each port named after the decimal number of the port (no leading * zeroes). Port nodes contain a "type" ("hd" or "cd"), as well as * options for blockif. For example: * * pci.0.1.0 * .device="ahci" * .port * .0 * .type="hd" * .path="/path/to/image" */ static int pci_ahci_legacy_config_port(nvlist_t *nvl, int port, const char *type, const char *opts) { char node_name[sizeof("XX")]; nvlist_t *port_nvl; snprintf(node_name, sizeof(node_name), "%d", port); port_nvl = create_relative_config_node(nvl, node_name); set_config_value_node(port_nvl, "type", type); return (blockif_legacy_config(port_nvl, opts)); } static int pci_ahci_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; const char *type; char *next, *next2, *str, *tofree; int p, ret; if (opts == NULL) return (0); ports_nvl = create_relative_config_node(nvl, "port"); ret = 1; tofree = str = strdup(opts); for (p = 0; p < MAX_PORTS && str != NULL; p++, str = next) { /* Identify and cut off type of present port. */ if (strncmp(str, "hd:", 3) == 0) { type = "hd"; str += 3; } else if (strncmp(str, "cd:", 3) == 0) { type = "cd"; str += 3; } else type = NULL; /* Find and cut off the next port options. */ next = strstr(str, ",hd:"); next2 = strstr(str, ",cd:"); if (next == NULL || (next2 != NULL && next2 < next)) next = next2; if (next != NULL) { next[0] = 0; next++; } if (str[0] == 0) continue; if (type == NULL) { EPRINTLN("Missing or invalid type for port %d: \"%s\"", p, str); goto out; } if (pci_ahci_legacy_config_port(ports_nvl, p, type, str) != 0) goto out; } ret = 0; out: free(tofree); return (ret); } static int pci_ahci_cd_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; ports_nvl = create_relative_config_node(nvl, "port"); return (pci_ahci_legacy_config_port(ports_nvl, 0, "cd", opts)); } static int pci_ahci_hd_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; ports_nvl = create_relative_config_node(nvl, "port"); return (pci_ahci_legacy_config_port(ports_nvl, 0, "hd", opts)); } static int pci_ahci_init(struct pci_devinst *pi, nvlist_t *nvl) { char bident[sizeof("XXX:XXX:XXX")]; char node_name[sizeof("XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; int atapi, ret, slots, p; MD5_CTX mdctx; u_char digest[16]; const char *path, *type, *value; nvlist_t *ports_nvl, *port_nvl; ret = 0; #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; pthread_mutex_init(&sc->mtx, NULL); sc->ports = 0; sc->pi = 0; slots = 32; ports_nvl = find_relative_config_node(nvl, "port"); for (p = 0; ports_nvl != NULL && p < MAX_PORTS; p++) { struct ata_params *ata_ident = &sc->port[p].ata_ident; char ident[AHCI_PORT_IDENT]; snprintf(node_name, sizeof(node_name), "%d", p); port_nvl = find_relative_config_node(ports_nvl, node_name); if (port_nvl == NULL) continue; type = get_config_value_node(port_nvl, "type"); if (type == NULL) continue; if (strcmp(type, "hd") == 0) atapi = 0; else atapi = 1; /* * Attempt to open the backing image. Use the PCI slot/func * and the port number for the identifier string. */ snprintf(bident, sizeof(bident), "%u:%u:%u", pi->pi_slot, pi->pi_func, p); bctxt = blockif_open(port_nvl, bident); if (bctxt == NULL) { sc->ports = p; ret = 1; goto open_fail; } sc->port[p].bctx = bctxt; sc->port[p].pr_sc = sc; sc->port[p].port = p; sc->port[p].atapi = atapi; /* * Create an identifier for the backing file. * Use parts of the md5 sum of the filename */ path = get_config_value_node(port_nvl, "path"); MD5Init(&mdctx); MD5Update(&mdctx, path, strlen(path)); MD5Final(digest, &mdctx); snprintf(ident, AHCI_PORT_IDENT, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); memset(ata_ident, 0, sizeof(struct ata_params)); ata_string((uint8_t*)&ata_ident->serial, ident, 20); ata_string((uint8_t*)&ata_ident->revision, "001", 8); if (atapi) ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DVD ROM", 40); else ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DISK", 40); value = get_config_value_node(port_nvl, "nmrr"); if (value != NULL) ata_ident->media_rotation_rate = atoi(value); value = get_config_value_node(port_nvl, "ser"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->serial), value, 20); value = get_config_value_node(port_nvl, "rev"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->revision), value, 8); value = get_config_value_node(port_nvl, "model"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->model), value, 40); ata_identify_init(&sc->port[p], atapi); /* * Allocate blockif request structures and add them * to the free list */ pci_ahci_ioreq_init(&sc->port[p]); sc->pi |= (1 << p); if (sc->port[p].ioqsz < slots) slots = sc->port[p].ioqsz; } sc->ports = p; /* Intel ICH8 AHCI */ --slots; if (sc->ports < DEF_PORTS) sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); p = MIN(sc->ports, 16); p = flsl(p) - ((p & (p - 1)) ? 0 : 1); pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); pci_lintr_request(pi); open_fail: if (ret) { for (p = 0; p < sc->ports; p++) { if (sc->port[p].bctx != NULL) blockif_close(sc->port[p].bctx); } free(sc); } return (ret); } #ifdef BHYVE_SNAPSHOT static int pci_ahci_snapshot(struct vm_snapshot_meta *meta) { int i, ret; void *bctx; struct pci_devinst *pi; struct pci_ahci_softc *sc; struct ahci_port *port; pi = meta->dev_data; sc = pi->pi_arg; /* TODO: add mtx lock/unlock */ SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done); for (i = 0; i < MAX_PORTS; i++) { port = &sc->port[i]; if (meta->op == VM_SNAPSHOT_SAVE) bctx = port->bctx; SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done); /* Mostly for restore; save is ensured by the lines above. */ if (((bctx == NULL) && (port->bctx != NULL)) || ((bctx != NULL) && (port->bctx == NULL))) { fprintf(stderr, "%s: ports not matching\r\n", __func__); ret = EINVAL; goto done; } if (port->bctx == NULL) continue; if (port->port != i) { fprintf(stderr, "%s: ports not matching: " "actual: %d expected: %d\r\n", __func__, port->port, i); ret = EINVAL; goto done; } - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->cmd_lst, + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, port->cmd_lst, AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done); - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->rfis, 256, false, meta, - ret, done); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, port->rfis, 256, + false, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ata_ident, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done); assert(TAILQ_EMPTY(&port->iobhd)); } done: return (ret); } static int pci_ahci_pause(struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_pause(bctxt); } return (0); } static int pci_ahci_resume(struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_resume(bctxt); } return (0); } #endif /* BHYVE_SNAPSHOT */ /* * Use separate emulation names to distinguish drive and atapi devices */ static const struct pci_devemu pci_de_ahci = { .pe_emu = "ahci", .pe_init = pci_ahci_init, .pe_legacy_config = pci_ahci_legacy_config, .pe_barwrite = pci_ahci_write, .pe_barread = pci_ahci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_ahci_snapshot, .pe_pause = pci_ahci_pause, .pe_resume = pci_ahci_resume, #endif }; PCI_EMUL_SET(pci_de_ahci); static const struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", .pe_legacy_config = pci_ahci_hd_legacy_config, .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_hd); static const struct pci_devemu pci_de_ahci_cd = { .pe_emu = "ahci-cd", .pe_legacy_config = pci_ahci_cd_legacy_config, .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr.sbin/bhyve/pci_e82545.c b/usr.sbin/bhyve/pci_e82545.c index bec8edb17caf..646eb0ca0118 100644 --- a/usr.sbin/bhyve/pci_e82545.c +++ b/usr.sbin/bhyve/pci_e82545.c @@ -1,2539 +1,2541 @@ /* * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Alexander Motin * Copyright (c) 2015 Peter Grehan * Copyright (c) 2013 Jeremiah Lott, Avere Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif -#include #include #include #include #include #include #include #include #include #include #include #include #include "e1000_regs.h" #include "e1000_defines.h" #include "mii.h" #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" +#ifdef BHYVE_SNAPSHOT +#include "snapshot.h" +#endif #include "mevent.h" #include "net_utils.h" #include "net_backends.h" /* Hardware/register definitions XXX: move some to common code. */ #define E82545_VENDOR_ID_INTEL 0x8086 #define E82545_DEV_ID_82545EM_COPPER 0x100F #define E82545_SUBDEV_ID 0x1008 #define E82545_REVISION_4 4 #define E82545_MDIC_DATA_MASK 0x0000FFFF #define E82545_MDIC_OP_MASK 0x0c000000 #define E82545_MDIC_IE 0x20000000 #define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */ #define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */ #define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */ #define E82545_BAR_REGISTER 0 #define E82545_BAR_REGISTER_LEN (128*1024) #define E82545_BAR_FLASH 1 #define E82545_BAR_FLASH_LEN (64*1024) #define E82545_BAR_IO 2 #define E82545_BAR_IO_LEN 8 #define E82545_IOADDR 0x00000000 #define E82545_IODATA 0x00000004 #define E82545_IO_REGISTER_MAX 0x0001FFFF #define E82545_IO_FLASH_BASE 0x00080000 #define E82545_IO_FLASH_MAX 0x000FFFFF #define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2)) #define E82545_RAR_MAX 15 #define E82545_MTA_MAX 127 #define E82545_VFTA_MAX 127 /* Slightly modified from the driver versions, hardcoded for 3 opcode bits, * followed by 6 address bits. * TODO: make opcode bits and addr bits configurable? * NVM Commands - Microwire */ #define E82545_NVM_OPCODE_BITS 3 #define E82545_NVM_ADDR_BITS 6 #define E82545_NVM_DATA_BITS 16 #define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS) #define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1) #define E82545_NVM_OPCODE_MASK \ (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS) #define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */ #define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */ #define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */ #define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */ #define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128K */ #define E1000_ICR_SRPD 0x00010000 /* This is an arbitrary number. There is no hard limit on the chip. */ #define I82545_MAX_TXSEGS 64 /* Legacy receive descriptor */ struct e1000_rx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ uint16_t length; /* Length of data DMAed into data buffer */ uint16_t csum; /* Packet checksum */ uint8_t status; /* Descriptor status */ uint8_t errors; /* Descriptor Errors */ uint16_t special; }; /* Transmit descriptor types */ #define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000) #define E1000_TXD_TYP_L (0) #define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C) #define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D) /* Legacy transmit descriptor */ struct e1000_tx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t cso; /* Checksum offset */ uint8_t cmd; /* Descriptor control */ } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t css; /* Checksum start */ uint16_t special; } fields; } upper; }; /* Context descriptor */ struct e1000_context_desc { union { uint32_t ip_config; struct { uint8_t ipcss; /* IP checksum start */ uint8_t ipcso; /* IP checksum offset */ uint16_t ipcse; /* IP checksum end */ } ip_fields; } lower_setup; union { uint32_t tcp_config; struct { uint8_t tucss; /* TCP checksum start */ uint8_t tucso; /* TCP checksum offset */ uint16_t tucse; /* TCP checksum end */ } tcp_fields; } upper_setup; uint32_t cmd_and_length; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t hdr_len; /* Header length */ uint16_t mss; /* Maximum segment size */ } fields; } tcp_seg_setup; }; /* Data descriptor */ struct e1000_data_desc { uint64_t buffer_addr; /* Address of the descriptor's buffer address */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t typ_len_ext; uint8_t cmd; } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t popts; /* Packet Options */ uint16_t special; } fields; } upper; }; union e1000_tx_udesc { struct e1000_tx_desc td; struct e1000_context_desc cd; struct e1000_data_desc dd; }; /* Tx checksum info for a packet. */ struct ck_info { int ck_valid; /* ck_info is valid */ uint8_t ck_start; /* start byte of cksum calcuation */ uint8_t ck_off; /* offset of cksum insertion */ uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */ }; /* * Debug printf */ static int e82545_debug = 0; #define WPRINTF(msg,params...) PRINTLN("e82545: " msg, ##params) #define DPRINTF(msg,params...) if (e82545_debug) WPRINTF(msg, params) #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) /* s/w representation of the RAL/RAH regs */ struct eth_uni { int eu_valid; int eu_addrsel; struct ether_addr eu_eth; }; struct e82545_softc { struct pci_devinst *esc_pi; struct vmctx *esc_ctx; struct mevent *esc_mevpitr; pthread_mutex_t esc_mtx; struct ether_addr esc_mac; net_backend_t *esc_be; /* General */ uint32_t esc_CTRL; /* x0000 device ctl */ uint32_t esc_FCAL; /* x0028 flow ctl addr lo */ uint32_t esc_FCAH; /* x002C flow ctl addr hi */ uint32_t esc_FCT; /* x0030 flow ctl type */ uint32_t esc_VET; /* x0038 VLAN eth type */ uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */ uint32_t esc_LEDCTL; /* x0E00 LED control */ uint32_t esc_PBA; /* x1000 pkt buffer allocation */ /* Interrupt control */ int esc_irq_asserted; uint32_t esc_ICR; /* x00C0 cause read/clear */ uint32_t esc_ITR; /* x00C4 intr throttling */ uint32_t esc_ICS; /* x00C8 cause set */ uint32_t esc_IMS; /* x00D0 mask set/read */ uint32_t esc_IMC; /* x00D8 mask clear */ /* Transmit */ union e1000_tx_udesc *esc_txdesc; struct e1000_context_desc esc_txctx; pthread_t esc_tx_tid; pthread_cond_t esc_tx_cond; int esc_tx_enabled; int esc_tx_active; uint32_t esc_TXCW; /* x0178 transmit config */ uint32_t esc_TCTL; /* x0400 transmit ctl */ uint32_t esc_TIPG; /* x0410 inter-packet gap */ uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */ uint64_t esc_tdba; /* verified 64-bit desc table addr */ uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */ uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */ uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */ uint16_t esc_TDH; /* x3810 desc table head idx */ uint16_t esc_TDHr; /* internal read version of TDH */ uint16_t esc_TDT; /* x3818 desc table tail idx */ uint32_t esc_TIDV; /* x3820 intr delay */ uint32_t esc_TXDCTL; /* x3828 desc control */ uint32_t esc_TADV; /* x382C intr absolute delay */ /* L2 frame acceptance */ struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */ uint32_t esc_fmcast[128]; /* Multicast filter bit-match */ uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */ /* Receive */ struct e1000_rx_desc *esc_rxdesc; pthread_cond_t esc_rx_cond; int esc_rx_enabled; int esc_rx_active; int esc_rx_loopback; uint32_t esc_RCTL; /* x0100 receive ctl */ uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */ uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */ uint64_t esc_rdba; /* verified 64-bit desc table addr */ uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */ uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/ uint32_t esc_RDLEN; /* x2808 #descriptors */ uint16_t esc_RDH; /* x2810 desc table head idx */ uint16_t esc_RDT; /* x2818 desc table tail idx */ uint32_t esc_RDTR; /* x2820 intr delay */ uint32_t esc_RXDCTL; /* x2828 desc control */ uint32_t esc_RADV; /* x282C intr absolute delay */ uint32_t esc_RSRPD; /* x2C00 recv small packet detect */ uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */ /* IO Port register access */ uint32_t io_addr; /* Shadow copy of MDIC */ uint32_t mdi_control; /* Shadow copy of EECD */ uint32_t eeprom_control; /* Latest NVM in/out */ uint16_t nvm_data; uint16_t nvm_opaddr; /* stats */ uint32_t missed_pkt_count; /* dropped for no room in rx queue */ uint32_t pkt_rx_by_size[6]; uint32_t pkt_tx_by_size[6]; uint32_t good_pkt_rx_count; uint32_t bcast_pkt_rx_count; uint32_t mcast_pkt_rx_count; uint32_t good_pkt_tx_count; uint32_t bcast_pkt_tx_count; uint32_t mcast_pkt_tx_count; uint32_t oversize_rx_count; uint32_t tso_tx_count; uint64_t good_octets_rx; uint64_t good_octets_tx; uint64_t missed_octets; /* counts missed and oversized */ uint8_t nvm_bits:6; /* number of bits remaining in/out */ uint8_t nvm_mode:2; #define E82545_NVM_MODE_OPADDR 0x0 #define E82545_NVM_MODE_DATAIN 0x1 #define E82545_NVM_MODE_DATAOUT 0x2 /* EEPROM data */ uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; }; static void e82545_reset(struct e82545_softc *sc, int dev); static void e82545_rx_enable(struct e82545_softc *sc); static void e82545_rx_disable(struct e82545_softc *sc); static void e82545_rx_callback(int fd, enum ev_type type, void *param); static void e82545_tx_start(struct e82545_softc *sc); static void e82545_tx_enable(struct e82545_softc *sc); static void e82545_tx_disable(struct e82545_softc *sc); static inline int __unused e82545_size_stat_index(uint32_t size) { if (size <= 64) { return 0; } else if (size >= 1024) { return 5; } else { /* should be 1-4 */ return (ffs(size) - 6); } } static void e82545_init_eeprom(struct e82545_softc *sc) { uint16_t checksum, i; /* mac addr */ sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) | (((uint16_t)sc->esc_mac.octet[1]) << 8); sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) | (((uint16_t)sc->esc_mac.octet[3]) << 8); sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) | (((uint16_t)sc->esc_mac.octet[5]) << 8); /* pci ids */ sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID; sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL; sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER; sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL; /* fill in the checksum */ checksum = 0; for (i = 0; i < NVM_CHECKSUM_REG; i++) { checksum += sc->eeprom_data[i]; } checksum = NVM_SUM - checksum; sc->eeprom_data[NVM_CHECKSUM_REG] = checksum; DPRINTF("eeprom checksum: 0x%x", checksum); } static void e82545_write_mdi(struct e82545_softc *sc __unused, uint8_t reg_addr, uint8_t phy_addr, uint32_t data) { DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x", reg_addr, phy_addr, data); } static uint32_t e82545_read_mdi(struct e82545_softc *sc __unused, uint8_t reg_addr, uint8_t phy_addr) { //DPRINTF("Read mdi reg:0x%x phy:0x%x", reg_addr, phy_addr); switch (reg_addr) { case PHY_STATUS: return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | MII_SR_AUTONEG_COMPLETE); case PHY_AUTONEG_ADV: return NWAY_AR_SELECTOR_FIELD; case PHY_LP_ABILITY: return 0; case PHY_1000T_STATUS: return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS | SR_1000T_LOCAL_RX_STATUS); case PHY_ID1: return (M88E1011_I_PHY_ID >> 16) & 0xFFFF; case PHY_ID2: return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF; default: DPRINTF("Unknown mdi read reg:0x%x phy:0x%x", reg_addr, phy_addr); return 0; } /* not reached */ } static void e82545_eecd_strobe(struct e82545_softc *sc) { /* Microwire state machine */ /* DPRINTF("eeprom state machine srtobe " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data);*/ if (sc->nvm_bits == 0) { DPRINTF("eeprom state machine not expecting data! " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); return; } sc->nvm_bits--; if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) { /* shifting out */ if (sc->nvm_data & 0x8000) { sc->eeprom_control |= E1000_EECD_DO; } else { sc->eeprom_control &= ~E1000_EECD_DO; } sc->nvm_data <<= 1; if (sc->nvm_bits == 0) { /* read done, back to opcode mode. */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) { /* shifting in */ sc->nvm_data <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_data |= 1; } if (sc->nvm_bits == 0) { /* eeprom write */ uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; if (op != E82545_NVM_OPCODE_WRITE) { DPRINTF("Illegal eeprom write op 0x%x", sc->nvm_opaddr); } else if (addr >= E82545_NVM_EEPROM_SIZE) { DPRINTF("Illegal eeprom write addr 0x%x", sc->nvm_opaddr); } else { DPRINTF("eeprom write eeprom[0x%x] = 0x%x", addr, sc->nvm_data); sc->eeprom_data[addr] = sc->nvm_data; } /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) { sc->nvm_opaddr <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_opaddr |= 1; } if (sc->nvm_bits == 0) { uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; switch (op) { case E82545_NVM_OPCODE_EWEN: DPRINTF("eeprom write enable: 0x%x", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; break; case E82545_NVM_OPCODE_READ: { uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; sc->nvm_mode = E82545_NVM_MODE_DATAOUT; sc->nvm_bits = E82545_NVM_DATA_BITS; if (addr < E82545_NVM_EEPROM_SIZE) { sc->nvm_data = sc->eeprom_data[addr]; DPRINTF("eeprom read: eeprom[0x%x] = 0x%x", addr, sc->nvm_data); } else { DPRINTF("eeprom illegal read: 0x%x", sc->nvm_opaddr); sc->nvm_data = 0; } break; } case E82545_NVM_OPCODE_WRITE: sc->nvm_mode = E82545_NVM_MODE_DATAIN; sc->nvm_bits = E82545_NVM_DATA_BITS; sc->nvm_data = 0; break; default: DPRINTF("eeprom unknown op: 0x%x", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } } else { DPRINTF("eeprom state machine wrong state! " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); } } static void e82545_itr_callback(int fd __unused, enum ev_type type __unused, void *param) { uint32_t new; struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); new = sc->esc_ICR & sc->esc_IMS; if (new && !sc->esc_irq_asserted) { DPRINTF("itr callback: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); } else { mevent_delete(sc->esc_mevpitr); sc->esc_mevpitr = NULL; } pthread_mutex_unlock(&sc->esc_mtx); } static void e82545_icr_assert(struct e82545_softc *sc, uint32_t bits) { uint32_t new; DPRINTF("icr assert: 0x%x", bits); /* * An interrupt is only generated if bits are set that * aren't already in the ICR, these bits are unmasked, * and there isn't an interrupt already pending. */ new = bits & ~sc->esc_ICR & sc->esc_IMS; sc->esc_ICR |= bits; if (new == 0) { DPRINTF("icr assert: masked %x, ims %x", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("icr assert: throttled %x, ims %x", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("icr assert: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_ims_change(struct e82545_softc *sc, uint32_t bits) { uint32_t new; /* * Changing the mask may allow previously asserted * but masked interrupt requests to generate an interrupt. */ new = bits & sc->esc_ICR & ~sc->esc_IMS; sc->esc_IMS |= bits; if (new == 0) { DPRINTF("ims change: masked %x, ims %x", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("ims change: throttled %x, ims %x", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("ims change: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits) { DPRINTF("icr deassert: 0x%x", bits); sc->esc_ICR &= ~bits; /* * If there are no longer any interrupt sources and there * was an asserted interrupt, clear it */ if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { DPRINTF("icr deassert: lintr deassert %x", bits); pci_lintr_deassert(sc->esc_pi); sc->esc_irq_asserted = 0; } } static void e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value) { DPRINTF("intr_write: off %x, val %x", offset, value); switch (offset) { case E1000_ICR: e82545_icr_deassert(sc, value); break; case E1000_ITR: sc->esc_ITR = value; break; case E1000_ICS: sc->esc_ICS = value; /* not used: store for debug */ e82545_icr_assert(sc, value); break; case E1000_IMS: e82545_ims_change(sc, value); break; case E1000_IMC: sc->esc_IMC = value; /* for debug */ sc->esc_IMS &= ~value; // XXX clear interrupts if all ICR bits now masked // and interrupt was pending ? break; default: break; } } static uint32_t e82545_intr_read(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; retval = 0; DPRINTF("intr_read: off %x", offset); switch (offset) { case E1000_ICR: retval = sc->esc_ICR; sc->esc_ICR = 0; e82545_icr_deassert(sc, ~0); break; case E1000_ITR: retval = sc->esc_ITR; break; case E1000_ICS: /* write-only register */ break; case E1000_IMS: retval = sc->esc_IMS; break; case E1000_IMC: /* write-only register */ break; default: break; } return (retval); } static void e82545_devctl(struct e82545_softc *sc, uint32_t val) { sc->esc_CTRL = val & ~E1000_CTRL_RST; if (val & E1000_CTRL_RST) { DPRINTF("e1k: s/w reset, ctl %x", val); e82545_reset(sc, 1); } /* XXX check for phy reset ? */ } static void e82545_rx_update_rdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 | sc->esc_RDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, sc->esc_rdba, sc->esc_RDLEN); } static void e82545_rx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN); /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */ sc->esc_RCTL = val & ~0xF9204c01; DPRINTF("rx_ctl - %s RCTL %x, val %x", on ? "on" : "off", sc->esc_RCTL, val); /* state change requested */ if (on != sc->esc_rx_enabled) { if (on) { /* Catch disallowed/unimplemented settings */ //assert(!(val & E1000_RCTL_LBM_TCVR)); if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) { sc->esc_rx_loopback = 1; } else { sc->esc_rx_loopback = 0; } e82545_rx_update_rdba(sc); e82545_rx_enable(sc); } else { e82545_rx_disable(sc); sc->esc_rx_loopback = 0; sc->esc_rdba = 0; sc->esc_rxdesc = NULL; } } } static void e82545_tx_update_tdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba, sc->esc_TDLEN); } static void e82545_tx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN); /* ignore TCTL_EN settings that don't change state */ if (on == sc->esc_tx_enabled) return; if (on) { e82545_tx_update_tdba(sc); e82545_tx_enable(sc); } else { e82545_tx_disable(sc); sc->esc_tdba = 0; sc->esc_txdesc = NULL; } /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */ sc->esc_TCTL = val & ~0xFE800005; } static int e82545_bufsz(uint32_t rctl) { switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) { case (E1000_RCTL_SZ_2048): return (2048); case (E1000_RCTL_SZ_1024): return (1024); case (E1000_RCTL_SZ_512): return (512); case (E1000_RCTL_SZ_256): return (256); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096); } return (256); /* Forbidden value. */ } /* XXX one packet at a time until this is debugged */ static void e82545_rx_callback(int fd __unused, enum ev_type type __unused, void *param) { struct e82545_softc *sc = param; struct e1000_rx_desc *rxd; struct iovec vec[64]; ssize_t len; int left, lim, maxpktsz, maxpktdesc, bufsz, i, n, size; uint32_t cause = 0; uint16_t *tp, tag, head; pthread_mutex_lock(&sc->esc_mtx); DPRINTF("rx_run: head %x, tail %x", sc->esc_RDH, sc->esc_RDT); if (!sc->esc_rx_enabled || sc->esc_rx_loopback) { DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped", sc->esc_rx_enabled, sc->esc_rx_loopback); while (netbe_rx_discard(sc->esc_be) > 0) { } goto done1; } bufsz = e82545_bufsz(sc->esc_RCTL); maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522; maxpktdesc = (maxpktsz + bufsz - 1) / bufsz; size = sc->esc_RDLEN / 16; head = sc->esc_RDH; left = (size + sc->esc_RDT - head) % size; if (left < maxpktdesc) { DPRINTF("rx overflow (%d < %d) -- packet(s) dropped", left, maxpktdesc); while (netbe_rx_discard(sc->esc_be) > 0) { } goto done1; } sc->esc_rx_active = 1; pthread_mutex_unlock(&sc->esc_mtx); for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) { /* Grab rx descriptor pointed to by the head pointer */ for (i = 0; i < maxpktdesc; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; vec[i].iov_base = paddr_guest2host(sc->esc_ctx, rxd->buffer_addr, bufsz); vec[i].iov_len = bufsz; } len = netbe_recv(sc->esc_be, vec, maxpktdesc); if (len <= 0) { DPRINTF("netbe_recv() returned %zd", len); goto done; } /* * Adjust the packet length based on whether the CRC needs * to be stripped or if the packet is less than the minimum * eth packet size. */ if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) len = ETHER_MIN_LEN - ETHER_CRC_LEN; if (!(sc->esc_RCTL & E1000_RCTL_SECRC)) len += ETHER_CRC_LEN; n = (len + bufsz - 1) / bufsz; DPRINTF("packet read %zd bytes, %d segs, head %d", len, n, head); /* Apply VLAN filter. */ tp = (uint16_t *)vec[0].iov_base + 6; if ((sc->esc_RCTL & E1000_RCTL_VFE) && (ntohs(tp[0]) == sc->esc_VET)) { tag = ntohs(tp[1]) & 0x0fff; if ((sc->esc_fvlan[tag >> 5] & (1 << (tag & 0x1f))) != 0) { DPRINTF("known VLAN %d", tag); } else { DPRINTF("unknown VLAN %d", tag); n = 0; continue; } } /* Update all consumed descriptors. */ for (i = 0; i < n - 1; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; rxd->status = E1000_RXD_STAT_DD; } rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = len % bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; /* XXX signal no checksum for now */ rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM | E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD; /* Schedule receive interrupts. */ if ((uint32_t)len <= sc->esc_RSRPD) { cause |= E1000_ICR_SRPD | E1000_ICR_RXT0; } else { /* XXX: RDRT and RADV timers should be here. */ cause |= E1000_ICR_RXT0; } head = (head + n) % size; left -= n; } done: pthread_mutex_lock(&sc->esc_mtx); sc->esc_rx_active = 0; if (sc->esc_rx_enabled == 0) pthread_cond_signal(&sc->esc_rx_cond); sc->esc_RDH = head; /* Respect E1000_RCTL_RDMTS */ left = (size + sc->esc_RDT - head) % size; if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1))) cause |= E1000_ICR_RXDMT0; /* Assert all accumulated interrupts. */ if (cause != 0) e82545_icr_assert(sc, cause); done1: DPRINTF("rx_run done: head %x, tail %x", sc->esc_RDH, sc->esc_RDT); pthread_mutex_unlock(&sc->esc_mtx); } static uint16_t e82545_carry(uint32_t sum) { sum = (sum & 0xFFFF) + (sum >> 16); if (sum > 0xFFFF) sum -= 0xFFFF; return (sum); } static uint16_t e82545_buf_checksum(uint8_t *buf, int len) { int i; uint32_t sum = 0; /* Checksum all the pairs of bytes first... */ for (i = 0; i < (len & ~1); i += 2) sum += *((u_int16_t *)(buf + i)); /* * If there's a single byte left over, checksum it, too. * Network byte order is big-endian, so the remaining byte is * the high byte. */ if (i < len) sum += htons(buf[i] << 8); return (e82545_carry(sum)); } static uint16_t e82545_iov_checksum(struct iovec *iov, int iovcnt, unsigned int off, unsigned int len) { unsigned int now, odd; uint32_t sum = 0, s; /* Skip completely unneeded vectors. */ while (iovcnt > 0 && iov->iov_len <= off && off > 0) { off -= iov->iov_len; iov++; iovcnt--; } /* Calculate checksum of requested range. */ odd = 0; while (len > 0 && iovcnt > 0) { now = MIN(len, iov->iov_len - off); s = e82545_buf_checksum((uint8_t *)iov->iov_base + off, now); sum += odd ? (s << 8) : s; odd ^= (now & 1); len -= now; off = 0; iov++; iovcnt--; } return (e82545_carry(sum)); } /* * Return the transmit descriptor type. */ static int e82545_txdesc_type(uint32_t lower) { int type; type = 0; if (lower & E1000_TXD_CMD_DEXT) type = lower & E1000_TXD_MASK; return (type); } static void e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck) { uint16_t cksum; unsigned int cklen; DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d", iovcnt, ck->ck_start, ck->ck_off, ck->ck_len); cklen = ck->ck_len ? ck->ck_len - ck->ck_start + 1U : UINT_MAX; cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen); *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum; } static void e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt) { if (sc->esc_be == NULL) return; (void) netbe_send(sc->esc_be, iov, iovcnt); } static void e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, int *tdwb) { union e1000_tx_udesc *dsc; for ( ; head != tail; head = (head + 1) % dsize) { dsc = &sc->esc_txdesc[head]; if (dsc->td.lower.data & E1000_TXD_CMD_RS) { dsc->td.upper.data |= E1000_TXD_STAT_DD; *tdwb = 1; } } } static int e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, uint16_t *rhead, int *tdwb) { uint8_t *hdr, *hdrp; struct iovec iovb[I82545_MAX_TXSEGS + 2]; struct iovec tiov[I82545_MAX_TXSEGS + 2]; struct e1000_context_desc *cd; struct ck_info ckinfo[2]; struct iovec *iov; union e1000_tx_udesc *dsc; int desc, dtype, ntype, iovcnt, tcp, tso, paylen, seg, tiovcnt, pv; unsigned hdrlen, vlen, pktlen, len, left, mss, now, nnow, nleft, pvoff; uint32_t tcpsum, tcpseq; uint16_t ipcs, tcpcs, ipid, ohead; bool invalid; ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0; iovcnt = 0; ntype = 0; tso = 0; pktlen = 0; ohead = head; invalid = false; /* iovb[0/1] may be used for writable copy of headers. */ iov = &iovb[2]; for (desc = 0; ; desc++, head = (head + 1) % dsize) { if (head == tail) { *rhead = head; return (0); } dsc = &sc->esc_txdesc[head]; dtype = e82545_txdesc_type(dsc->td.lower.data); if (desc == 0) { switch (dtype) { case E1000_TXD_TYP_C: DPRINTF("tx ctxt desc idx %d: %016jx " "%08x%08x", head, dsc->td.buffer_addr, dsc->td.upper.data, dsc->td.lower.data); /* Save context and return */ sc->esc_txctx = dsc->cd; goto done; case E1000_TXD_TYP_L: DPRINTF("tx legacy desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); /* * legacy cksum start valid in first descriptor */ ntype = dtype; ckinfo[0].ck_start = dsc->td.upper.fields.css; break; case E1000_TXD_TYP_D: DPRINTF("tx data desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); ntype = dtype; break; default: break; } } else { /* Descriptor type must be consistent */ assert(dtype == ntype); DPRINTF("tx next desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); } len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length : dsc->dd.lower.data & 0xFFFFF; /* Strip checksum supplied by guest. */ if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 && (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0) { if (len <= 2) { WPRINTF("final descriptor too short (%d) -- dropped", len); invalid = true; } else len -= 2; } if (len > 0 && iovcnt < I82545_MAX_TXSEGS) { iov[iovcnt].iov_base = paddr_guest2host(sc->esc_ctx, dsc->td.buffer_addr, len); iov[iovcnt].iov_len = len; iovcnt++; pktlen += len; } /* * Pull out info that is valid in the final descriptor * and exit descriptor loop. */ if (dsc->td.lower.data & E1000_TXD_CMD_EOP) { if (dtype == E1000_TXD_TYP_L) { if (dsc->td.lower.data & E1000_TXD_CMD_IC) { ckinfo[0].ck_valid = 1; ckinfo[0].ck_off = dsc->td.lower.flags.cso; ckinfo[0].ck_len = 0; } } else { cd = &sc->esc_txctx; if (dsc->dd.lower.data & E1000_TXD_CMD_TSE) tso = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM) ckinfo[0].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM || tso) { ckinfo[0].ck_start = cd->lower_setup.ip_fields.ipcss; ckinfo[0].ck_off = cd->lower_setup.ip_fields.ipcso; ckinfo[0].ck_len = cd->lower_setup.ip_fields.ipcse; } if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM) ckinfo[1].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM || tso) { ckinfo[1].ck_start = cd->upper_setup.tcp_fields.tucss; ckinfo[1].ck_off = cd->upper_setup.tcp_fields.tucso; ckinfo[1].ck_len = cd->upper_setup.tcp_fields.tucse; } } break; } } if (invalid) goto done; if (iovcnt > I82545_MAX_TXSEGS) { WPRINTF("tx too many descriptors (%d > %d) -- dropped", iovcnt, I82545_MAX_TXSEGS); goto done; } hdrlen = vlen = 0; /* Estimate writable space for VLAN header insertion. */ if ((sc->esc_CTRL & E1000_CTRL_VME) && (dsc->td.lower.data & E1000_TXD_CMD_VLE)) { hdrlen = ETHER_ADDR_LEN*2; vlen = ETHER_VLAN_ENCAP_LEN; } if (!tso) { /* Estimate required writable space for checksums. */ if (ckinfo[0].ck_valid) hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2U); if (ckinfo[1].ck_valid) hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2U); /* Round up writable space to the first vector. */ if (hdrlen != 0 && iov[0].iov_len > hdrlen && iov[0].iov_len < hdrlen + 100) hdrlen = iov[0].iov_len; } else { /* In case of TSO header length provided by software. */ hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len; /* * Cap the header length at 240 based on 7.2.4.5 of * the Intel 82576EB (Rev 2.63) datasheet. */ if (hdrlen > 240) { WPRINTF("TSO hdrlen too large: %d", hdrlen); goto done; } /* * If VLAN insertion is requested, ensure the header * at least holds the amount of data copied during * VLAN insertion below. * * XXX: Realistic packets will include a full Ethernet * header before the IP header at ckinfo[0].ck_start, * but this check is sufficient to prevent * out-of-bounds access below. */ if (vlen != 0 && hdrlen < ETHER_ADDR_LEN*2) { WPRINTF("TSO hdrlen too small for vlan insertion " "(%d vs %d) -- dropped", hdrlen, ETHER_ADDR_LEN*2); goto done; } /* * Ensure that the header length covers the used fields * in the IP and TCP headers as well as the IP and TCP * checksums. The following fields are accessed below: * * Header | Field | Offset | Length * -------+-------+--------+------- * IPv4 | len | 2 | 2 * IPv4 | ID | 4 | 2 * IPv6 | len | 4 | 2 * TCP | seq # | 4 | 4 * TCP | flags | 13 | 1 * UDP | len | 4 | 4 */ if (hdrlen < ckinfo[0].ck_start + 6U || hdrlen < ckinfo[0].ck_off + 2U) { WPRINTF("TSO hdrlen too small for IP fields (%d) " "-- dropped", hdrlen); goto done; } if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) { if (hdrlen < ckinfo[1].ck_start + 14U) { WPRINTF("TSO hdrlen too small for TCP fields " "(%d) -- dropped", hdrlen); goto done; } } else { if (hdrlen < ckinfo[1].ck_start + 8U) { WPRINTF("TSO hdrlen too small for UDP fields " "(%d) -- dropped", hdrlen); goto done; } } if (ckinfo[1].ck_valid && hdrlen < ckinfo[1].ck_off + 2U) { WPRINTF("TSO hdrlen too small for TCP/UDP fields " "(%d) -- dropped", hdrlen); goto done; } } if (pktlen < hdrlen + vlen) { WPRINTF("packet too small for writable header"); goto done; } /* Allocate, fill and prepend writable header vector. */ if (hdrlen + vlen != 0) { hdr = __builtin_alloca(hdrlen + vlen); hdr += vlen; for (left = hdrlen, hdrp = hdr; left > 0; left -= now, hdrp += now) { now = MIN(left, iov->iov_len); memcpy(hdrp, iov->iov_base, now); iov->iov_base = (uint8_t *)iov->iov_base + now; iov->iov_len -= now; if (iov->iov_len == 0) { iov++; iovcnt--; } } iov--; iovcnt++; iov->iov_base = hdr; iov->iov_len = hdrlen; } else hdr = NULL; /* Insert VLAN tag. */ if (vlen != 0) { hdr -= ETHER_VLAN_ENCAP_LEN; memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2); hdrlen += ETHER_VLAN_ENCAP_LEN; hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8; hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff; hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8; hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff; iov->iov_base = hdr; iov->iov_len += ETHER_VLAN_ENCAP_LEN; /* Correct checksum offsets after VLAN tag insertion. */ ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[0].ck_len != 0) ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[1].ck_len != 0) ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN; } /* Simple non-TSO case. */ if (!tso) { /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]); if (ckinfo[1].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]); e82545_transmit_backend(sc, iov, iovcnt); goto done; } /* Doing TSO. */ tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0; mss = sc->esc_txctx.tcp_seg_setup.fields.mss; paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff); DPRINTF("tx %s segmentation offload %d+%d/%u bytes %d iovs", tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt); ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]); tcpseq = 0; if (tcp) tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]); ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off]; tcpcs = 0; if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */ tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off]; pv = 1; pvoff = 0; for (seg = 0, left = paylen; left > 0; seg++, left -= now) { now = MIN(left, mss); /* Construct IOVs for the segment. */ /* Include whole original header. */ tiov[0].iov_base = hdr; tiov[0].iov_len = hdrlen; tiovcnt = 1; /* Include respective part of payload IOV. */ for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) { nnow = MIN(nleft, iov[pv].iov_len - pvoff); tiov[tiovcnt].iov_base = (uint8_t *)iov[pv].iov_base + pvoff; tiov[tiovcnt++].iov_len = nnow; if (pvoff + nnow == iov[pv].iov_len) { pv++; pvoff = 0; } else pvoff += nnow; } DPRINTF("tx segment %d %d+%d bytes %d iovs", seg, hdrlen, now, tiovcnt); /* Update IP header. */ if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) { /* IPv4 -- set length and ID */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] = htons(hdrlen - ckinfo[0].ck_start + now); *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(ipid + seg); } else { /* IPv6 -- set length */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(hdrlen - ckinfo[0].ck_start - 40 + now); } /* Update pseudo-header checksum. */ tcpsum = tcpcs; tcpsum += htons(hdrlen - ckinfo[1].ck_start + now); /* Update TCP/UDP headers. */ if (tcp) { /* Update sequence number and FIN/PUSH flags. */ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = htonl(tcpseq + paylen - left); if (now < left) { hdr[ckinfo[1].ck_start + 13] &= ~(TH_FIN | TH_PUSH); } } else { /* Update payload length. */ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = hdrlen - ckinfo[1].ck_start + now; } /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) { *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs; e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]); } if (ckinfo[1].ck_valid) { *(uint16_t *)&hdr[ckinfo[1].ck_off] = e82545_carry(tcpsum); e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]); } e82545_transmit_backend(sc, tiov, tiovcnt); } done: head = (head + 1) % dsize; e82545_transmit_done(sc, ohead, head, dsize, tdwb); *rhead = head; return (desc + 1); } static void e82545_tx_run(struct e82545_softc *sc) { uint32_t cause; uint16_t head, rhead, tail, size; int lim, tdwb, sent; size = sc->esc_TDLEN / 16; if (size == 0) return; head = sc->esc_TDH % size; tail = sc->esc_TDT % size; DPRINTF("tx_run: head %x, rhead %x, tail %x", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); pthread_mutex_unlock(&sc->esc_mtx); rhead = head; tdwb = 0; for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) { sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb); if (sent == 0) break; head = rhead; } pthread_mutex_lock(&sc->esc_mtx); sc->esc_TDH = head; sc->esc_TDHr = rhead; cause = 0; if (tdwb) cause |= E1000_ICR_TXDW; if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT) cause |= E1000_ICR_TXQE; if (cause) e82545_icr_assert(sc, cause); DPRINTF("tx_run done: head %x, rhead %x, tail %x", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); } static _Noreturn void * e82545_tx_thread(void *param) { struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); for (;;) { while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) { if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT) break; sc->esc_tx_active = 0; if (sc->esc_tx_enabled == 0) pthread_cond_signal(&sc->esc_tx_cond); pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } sc->esc_tx_active = 1; /* Process some tx descriptors. Lock dropped inside. */ e82545_tx_run(sc); } } static void e82545_tx_start(struct e82545_softc *sc) { if (sc->esc_tx_active == 0) pthread_cond_signal(&sc->esc_tx_cond); } static void e82545_tx_enable(struct e82545_softc *sc) { sc->esc_tx_enabled = 1; } static void e82545_tx_disable(struct e82545_softc *sc) { sc->esc_tx_enabled = 0; while (sc->esc_tx_active) pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } static void e82545_rx_enable(struct e82545_softc *sc) { sc->esc_rx_enabled = 1; } static void e82545_rx_disable(struct e82545_softc *sc) { sc->esc_rx_enabled = 0; while (sc->esc_rx_active) pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx); } static void e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) { struct eth_uni *eu; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV); eu->eu_addrsel = (wval >> 16) & 0x3; eu->eu_eth.octet[5] = wval >> 8; eu->eu_eth.octet[4] = wval; } else { /* RAL */ eu->eu_eth.octet[3] = wval >> 24; eu->eu_eth.octet[2] = wval >> 16; eu->eu_eth.octet[1] = wval >> 8; eu->eu_eth.octet[0] = wval; } } static uint32_t e82545_read_ra(struct e82545_softc *sc, int reg) { struct eth_uni *eu; uint32_t retval; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ retval = (eu->eu_valid << 31) | (eu->eu_addrsel << 16) | (eu->eu_eth.octet[5] << 8) | eu->eu_eth.octet[4]; } else { /* RAL */ retval = (eu->eu_eth.octet[3] << 24) | (eu->eu_eth.octet[2] << 16) | (eu->eu_eth.octet[1] << 8) | eu->eu_eth.octet[0]; } return (retval); } static void e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value) { int ridx; if (offset & 0x3) { DPRINTF("Unaligned register write offset:0x%x value:0x%x", offset, value); return; } DPRINTF("Register write: 0x%x value: 0x%x", offset, value); switch (offset) { case E1000_CTRL: case E1000_CTRL_DUP: e82545_devctl(sc, value); break; case E1000_FCAL: sc->esc_FCAL = value; break; case E1000_FCAH: sc->esc_FCAH = value & ~0xFFFF0000; break; case E1000_FCT: sc->esc_FCT = value & ~0xFFFF0000; break; case E1000_VET: sc->esc_VET = value & ~0xFFFF0000; break; case E1000_FCTTV: sc->esc_FCTTV = value & ~0xFFFF0000; break; case E1000_LEDCTL: sc->esc_LEDCTL = value & ~0x30303000; break; case E1000_PBA: sc->esc_PBA = value & 0x0000FF80; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: e82545_intr_write(sc, offset, value); break; case E1000_RCTL: e82545_rx_ctl(sc, value); break; case E1000_FCRTL: sc->esc_FCRTL = value & ~0xFFFF0007; break; case E1000_FCRTH: sc->esc_FCRTH = value & ~0xFFFF0007; break; case E1000_RDBAL(0): sc->esc_RDBAL = value & ~0xF; if (sc->esc_rx_enabled) { /* Apparently legal: update cached address */ e82545_rx_update_rdba(sc); } break; case E1000_RDBAH(0): assert(!sc->esc_rx_enabled); sc->esc_RDBAH = value; break; case E1000_RDLEN(0): assert(!sc->esc_rx_enabled); sc->esc_RDLEN = value & ~0xFFF0007F; break; case E1000_RDH(0): /* XXX should only ever be zero ? Range check ? */ sc->esc_RDH = value; break; case E1000_RDT(0): /* XXX if this opens up the rx ring, do something ? */ sc->esc_RDT = value; break; case E1000_RDTR: /* ignore FPD bit 31 */ sc->esc_RDTR = value & ~0xFFFF0000; break; case E1000_RXDCTL(0): sc->esc_RXDCTL = value & ~0xFEC0C0C0; break; case E1000_RADV: sc->esc_RADV = value & ~0xFFFF0000; break; case E1000_RSRPD: sc->esc_RSRPD = value & ~0xFFFFF000; break; case E1000_RXCSUM: sc->esc_RXCSUM = value & ~0xFFFFF800; break; case E1000_TXCW: sc->esc_TXCW = value & ~0x3FFF0000; break; case E1000_TCTL: e82545_tx_ctl(sc, value); break; case E1000_TIPG: sc->esc_TIPG = value; break; case E1000_AIT: sc->esc_AIT = value; break; case E1000_TDBAL(0): sc->esc_TDBAL = value & ~0xF; if (sc->esc_tx_enabled) e82545_tx_update_tdba(sc); break; case E1000_TDBAH(0): sc->esc_TDBAH = value; if (sc->esc_tx_enabled) e82545_tx_update_tdba(sc); break; case E1000_TDLEN(0): sc->esc_TDLEN = value & ~0xFFF0007F; if (sc->esc_tx_enabled) e82545_tx_update_tdba(sc); break; case E1000_TDH(0): if (sc->esc_tx_enabled) { WPRINTF("ignoring write to TDH while transmit enabled"); break; } if (value != 0) { WPRINTF("ignoring non-zero value written to TDH"); break; } sc->esc_TDHr = sc->esc_TDH = value; break; case E1000_TDT(0): sc->esc_TDT = value; if (sc->esc_tx_enabled) e82545_tx_start(sc); break; case E1000_TIDV: sc->esc_TIDV = value & ~0xFFFF0000; break; case E1000_TXDCTL(0): //assert(!sc->esc_tx_enabled); sc->esc_TXDCTL = value & ~0xC0C0C0; break; case E1000_TADV: sc->esc_TADV = value & ~0xFFFF0000; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; e82545_write_ra(sc, ridx, value); break; case E1000_MTA ... (E1000_MTA + (127*4)): sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value; break; case E1000_EECD: { //DPRINTF("EECD write 0x%x -> 0x%x", sc->eeprom_control, value); /* edge triggered low->high */ uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ? 0 : (value & E1000_EECD_SK)); uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS| E1000_EECD_DI|E1000_EECD_REQ); sc->eeprom_control &= ~eecd_mask; sc->eeprom_control |= (value & eecd_mask); /* grant/revoke immediately */ if (value & E1000_EECD_REQ) { sc->eeprom_control |= E1000_EECD_GNT; } else { sc->eeprom_control &= ~E1000_EECD_GNT; } if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) { e82545_eecd_strobe(sc); } return; } case E1000_MDIC: { uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >> E1000_MDIC_REG_SHIFT); uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT); sc->mdi_control = (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST)); if ((value & E1000_MDIC_READY) != 0) { DPRINTF("Incorrect MDIC ready bit: 0x%x", value); return; } switch (value & E82545_MDIC_OP_MASK) { case E1000_MDIC_OP_READ: sc->mdi_control &= ~E82545_MDIC_DATA_MASK; sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr); break; case E1000_MDIC_OP_WRITE: e82545_write_mdi(sc, reg_addr, phy_addr, value & E82545_MDIC_DATA_MASK); break; default: DPRINTF("Unknown MDIC op: 0x%x", value); return; } /* TODO: barrier? */ sc->mdi_control |= E1000_MDIC_READY; if (value & E82545_MDIC_IE) { // TODO: generate interrupt } return; } case E1000_MANC: case E1000_STATUS: return; default: DPRINTF("Unknown write register: 0x%x value:%x", offset, value); return; } } static uint32_t e82545_read_register(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; int ridx; if (offset & 0x3) { DPRINTF("Unaligned register read offset:0x%x", offset); return 0; } DPRINTF("Register read: 0x%x", offset); switch (offset) { case E1000_CTRL: retval = sc->esc_CTRL; break; case E1000_STATUS: retval = E1000_STATUS_FD | E1000_STATUS_LU | E1000_STATUS_SPEED_1000; break; case E1000_FCAL: retval = sc->esc_FCAL; break; case E1000_FCAH: retval = sc->esc_FCAH; break; case E1000_FCT: retval = sc->esc_FCT; break; case E1000_VET: retval = sc->esc_VET; break; case E1000_FCTTV: retval = sc->esc_FCTTV; break; case E1000_LEDCTL: retval = sc->esc_LEDCTL; break; case E1000_PBA: retval = sc->esc_PBA; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: retval = e82545_intr_read(sc, offset); break; case E1000_RCTL: retval = sc->esc_RCTL; break; case E1000_FCRTL: retval = sc->esc_FCRTL; break; case E1000_FCRTH: retval = sc->esc_FCRTH; break; case E1000_RDBAL(0): retval = sc->esc_RDBAL; break; case E1000_RDBAH(0): retval = sc->esc_RDBAH; break; case E1000_RDLEN(0): retval = sc->esc_RDLEN; break; case E1000_RDH(0): retval = sc->esc_RDH; break; case E1000_RDT(0): retval = sc->esc_RDT; break; case E1000_RDTR: retval = sc->esc_RDTR; break; case E1000_RXDCTL(0): retval = sc->esc_RXDCTL; break; case E1000_RADV: retval = sc->esc_RADV; break; case E1000_RSRPD: retval = sc->esc_RSRPD; break; case E1000_RXCSUM: retval = sc->esc_RXCSUM; break; case E1000_TXCW: retval = sc->esc_TXCW; break; case E1000_TCTL: retval = sc->esc_TCTL; break; case E1000_TIPG: retval = sc->esc_TIPG; break; case E1000_AIT: retval = sc->esc_AIT; break; case E1000_TDBAL(0): retval = sc->esc_TDBAL; break; case E1000_TDBAH(0): retval = sc->esc_TDBAH; break; case E1000_TDLEN(0): retval = sc->esc_TDLEN; break; case E1000_TDH(0): retval = sc->esc_TDH; break; case E1000_TDT(0): retval = sc->esc_TDT; break; case E1000_TIDV: retval = sc->esc_TIDV; break; case E1000_TXDCTL(0): retval = sc->esc_TXDCTL; break; case E1000_TADV: retval = sc->esc_TADV; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; retval = e82545_read_ra(sc, ridx); break; case E1000_MTA ... (E1000_MTA + (127*4)): retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2]; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2]; break; case E1000_EECD: //DPRINTF("EECD read %x", sc->eeprom_control); retval = sc->eeprom_control; break; case E1000_MDIC: retval = sc->mdi_control; break; case E1000_MANC: retval = 0; break; /* stats that we emulate. */ case E1000_MPC: retval = sc->missed_pkt_count; break; case E1000_PRC64: retval = sc->pkt_rx_by_size[0]; break; case E1000_PRC127: retval = sc->pkt_rx_by_size[1]; break; case E1000_PRC255: retval = sc->pkt_rx_by_size[2]; break; case E1000_PRC511: retval = sc->pkt_rx_by_size[3]; break; case E1000_PRC1023: retval = sc->pkt_rx_by_size[4]; break; case E1000_PRC1522: retval = sc->pkt_rx_by_size[5]; break; case E1000_GPRC: retval = sc->good_pkt_rx_count; break; case E1000_BPRC: retval = sc->bcast_pkt_rx_count; break; case E1000_MPRC: retval = sc->mcast_pkt_rx_count; break; case E1000_GPTC: case E1000_TPT: retval = sc->good_pkt_tx_count; break; case E1000_GORCL: retval = (uint32_t)sc->good_octets_rx; break; case E1000_GORCH: retval = (uint32_t)(sc->good_octets_rx >> 32); break; case E1000_TOTL: case E1000_GOTCL: retval = (uint32_t)sc->good_octets_tx; break; case E1000_TOTH: case E1000_GOTCH: retval = (uint32_t)(sc->good_octets_tx >> 32); break; case E1000_ROC: retval = sc->oversize_rx_count; break; case E1000_TORL: retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets); break; case E1000_TORH: retval = (uint32_t)((sc->good_octets_rx + sc->missed_octets) >> 32); break; case E1000_TPR: retval = sc->good_pkt_rx_count + sc->missed_pkt_count + sc->oversize_rx_count; break; case E1000_PTC64: retval = sc->pkt_tx_by_size[0]; break; case E1000_PTC127: retval = sc->pkt_tx_by_size[1]; break; case E1000_PTC255: retval = sc->pkt_tx_by_size[2]; break; case E1000_PTC511: retval = sc->pkt_tx_by_size[3]; break; case E1000_PTC1023: retval = sc->pkt_tx_by_size[4]; break; case E1000_PTC1522: retval = sc->pkt_tx_by_size[5]; break; case E1000_MPTC: retval = sc->mcast_pkt_tx_count; break; case E1000_BPTC: retval = sc->bcast_pkt_tx_count; break; case E1000_TSCTC: retval = sc->tso_tx_count; break; /* stats that are always 0. */ case E1000_CRCERRS: case E1000_ALGNERRC: case E1000_SYMERRS: case E1000_RXERRC: case E1000_SCC: case E1000_ECOL: case E1000_MCC: case E1000_LATECOL: case E1000_COLC: case E1000_DC: case E1000_TNCRS: case E1000_SEC: case E1000_CEXTERR: case E1000_RLEC: case E1000_XONRXC: case E1000_XONTXC: case E1000_XOFFRXC: case E1000_XOFFTXC: case E1000_FCRUC: case E1000_RNBC: case E1000_RUC: case E1000_RFC: case E1000_RJC: case E1000_MGTPRC: case E1000_MGTPDC: case E1000_MGTPTC: case E1000_TSCTFC: retval = 0; break; default: DPRINTF("Unknown read register: 0x%x", offset); retval = 0; break; } return (retval); } static void e82545_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct e82545_softc *sc; //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d", baridx, offset, value, size); sc = pi->pi_arg; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr write sz:%d value:0x%lx", size, value); } else sc->io_addr = (uint32_t)value; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data write size:%d value:0x%lx", size, value); } else if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io write addr:0x%x value:0x%lx", sc->io_addr, value); } else e82545_write_register(sc, sc->io_addr, (uint32_t)value); break; default: DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d", offset, value, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx", size, offset, value); } else e82545_write_register(sc, (uint32_t)offset, (uint32_t)value); break; default: DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d", baridx, offset, value, size); } pthread_mutex_unlock(&sc->esc_mtx); } static uint64_t e82545_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct e82545_softc *sc; uint64_t retval; //DPRINTF("Read bar:%d offset:0x%lx size:%d", baridx, offset, size); sc = pi->pi_arg; retval = 0; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr read sz:%d", size); } else retval = sc->io_addr; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data read sz:%d", size); } if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io read addr:0x%x", sc->io_addr); } else retval = e82545_read_register(sc, sc->io_addr); break; default: DPRINTF("Unknown io bar read offset:0x%lx size:%d", offset, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register read size:%d offset:0x%lx", size, offset); } else retval = e82545_read_register(sc, (uint32_t)offset); break; default: DPRINTF("Unknown read bar:%d offset:0x%lx size:%d", baridx, offset, size); break; } pthread_mutex_unlock(&sc->esc_mtx); return (retval); } static void e82545_reset(struct e82545_softc *sc, int drvr) { int i; e82545_rx_disable(sc); e82545_tx_disable(sc); /* clear outstanding interrupts */ if (sc->esc_irq_asserted) pci_lintr_deassert(sc->esc_pi); /* misc */ if (!drvr) { sc->esc_FCAL = 0; sc->esc_FCAH = 0; sc->esc_FCT = 0; sc->esc_VET = 0; sc->esc_FCTTV = 0; } sc->esc_LEDCTL = 0x07061302; sc->esc_PBA = 0x00100030; /* start nvm in opcode mode. */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN; e82545_init_eeprom(sc); /* interrupt */ sc->esc_ICR = 0; sc->esc_ITR = 250; sc->esc_ICS = 0; sc->esc_IMS = 0; sc->esc_IMC = 0; /* L2 filters */ if (!drvr) { memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan)); memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast)); memset(sc->esc_uni, 0, sizeof(sc->esc_uni)); /* XXX not necessary on 82545 ?? */ sc->esc_uni[0].eu_valid = 1; memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet, ETHER_ADDR_LEN); } else { /* Clear RAH valid bits */ for (i = 0; i < 16; i++) sc->esc_uni[i].eu_valid = 0; } /* receive */ if (!drvr) { sc->esc_RDBAL = 0; sc->esc_RDBAH = 0; } sc->esc_RCTL = 0; sc->esc_FCRTL = 0; sc->esc_FCRTH = 0; sc->esc_RDLEN = 0; sc->esc_RDH = 0; sc->esc_RDT = 0; sc->esc_RDTR = 0; sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */ sc->esc_RADV = 0; sc->esc_RXCSUM = 0; /* transmit */ if (!drvr) { sc->esc_TDBAL = 0; sc->esc_TDBAH = 0; sc->esc_TIPG = 0; sc->esc_AIT = 0; sc->esc_TIDV = 0; sc->esc_TADV = 0; } sc->esc_tdba = 0; sc->esc_txdesc = NULL; sc->esc_TXCW = 0; sc->esc_TCTL = 0; sc->esc_TDLEN = 0; sc->esc_TDT = 0; sc->esc_TDHr = sc->esc_TDH = 0; sc->esc_TXDCTL = 0; } static int e82545_init(struct pci_devinst *pi, nvlist_t *nvl) { char nstr[80]; struct e82545_softc *sc; const char *mac; int err; /* Setup our softc */ sc = calloc(1, sizeof(*sc)); pi->pi_arg = sc; sc->esc_pi = pi; sc->esc_ctx = pi->pi_vmctx; pthread_mutex_init(&sc->esc_mtx, NULL); pthread_cond_init(&sc->esc_rx_cond, NULL); pthread_cond_init(&sc->esc_tx_cond, NULL); pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc); snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->esc_tx_tid, nstr); pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER); pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1); /* TODO: this card also supports msi, but the freebsd driver for it * does not, so I have not implemented it. */ pci_lintr_request(pi); pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32, E82545_BAR_REGISTER_LEN); pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32, E82545_BAR_FLASH_LEN); pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO, E82545_BAR_IO_LEN); mac = get_config_value_node(nvl, "mac"); if (mac != NULL) { err = net_parsemac(mac, sc->esc_mac.octet); if (err) { free(sc); return (err); } } else net_genmac(pi, sc->esc_mac.octet); err = netbe_init(&sc->esc_be, nvl, e82545_rx_callback, sc); if (err) { free(sc); return (err); } netbe_rx_enable(sc->esc_be); /* H/w initiated reset */ e82545_reset(sc, 0); return (0); } #ifdef BHYVE_SNAPSHOT static int e82545_snapshot(struct vm_snapshot_meta *meta) { int i; int ret; struct e82545_softc *sc; struct pci_devinst *pi; uint64_t bitmap_value; pi = meta->dev_data; sc = pi->pi_arg; /* esc_mevp and esc_mevpitr should be reinitiated at init. */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_mac, meta, ret, done); /* General */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_CTRL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_VET, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCTTV, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_LEDCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_PBA, meta, ret, done); /* Interrupt control */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_irq_asserted, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICR, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_ITR, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICS, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMS, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMC, meta, ret, done); /* * Transmit * * The fields in the unions are in superposition to access certain * bytes in the larger uint variables. * e.g., ip_config = [ipcss|ipcso|ipcse0|ipcse1] */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.lower_setup.ip_config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.upper_setup.tcp_config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.cmd_and_length, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.tcp_seg_setup.data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_active, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXCW, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIPG, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_AIT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_tdba, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDLEN, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDHr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIDV, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXDCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TADV, meta, ret, done); /* Has dependency on esc_TDLEN; reoreder of fields from struct. */ - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_txdesc, sc->esc_TDLEN, - true, meta, ret, done); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, sc->esc_txdesc, + sc->esc_TDLEN, true, meta, ret, done); /* L2 frame acceptance */ for (i = 0; i < (int)nitems(sc->esc_uni); i++) { SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_valid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_addrsel, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_eth, meta, ret, done); } SNAPSHOT_BUF_OR_LEAVE(sc->esc_fmcast, sizeof(sc->esc_fmcast), meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->esc_fvlan, sizeof(sc->esc_fvlan), meta, ret, done); /* Receive */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_active, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_loopback, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_rdba, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDLEN, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDTR, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXDCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RADV, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RSRPD, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXCSUM, meta, ret, done); /* Has dependency on esc_RDLEN; reoreder of fields from struct. */ - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_rxdesc, sc->esc_TDLEN, - true, meta, ret, done); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, sc->esc_rxdesc, + sc->esc_TDLEN, true, meta, ret, done); /* IO Port register access */ SNAPSHOT_VAR_OR_LEAVE(sc->io_addr, meta, ret, done); /* Shadow copy of MDIC */ SNAPSHOT_VAR_OR_LEAVE(sc->mdi_control, meta, ret, done); /* Shadow copy of EECD */ SNAPSHOT_VAR_OR_LEAVE(sc->eeprom_control, meta, ret, done); /* Latest NVM in/out */ SNAPSHOT_VAR_OR_LEAVE(sc->nvm_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->nvm_opaddr, meta, ret, done); /* Stats */ SNAPSHOT_VAR_OR_LEAVE(sc->missed_pkt_count, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->pkt_rx_by_size, sizeof(sc->pkt_rx_by_size), meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->pkt_tx_by_size, sizeof(sc->pkt_tx_by_size), meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->oversize_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->tso_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_rx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_tx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->missed_octets, meta, ret, done); if (meta->op == VM_SNAPSHOT_SAVE) bitmap_value = sc->nvm_bits; SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) sc->nvm_bits = bitmap_value; if (meta->op == VM_SNAPSHOT_SAVE) bitmap_value = sc->nvm_bits; SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) sc->nvm_bits = bitmap_value; /* EEPROM data */ SNAPSHOT_BUF_OR_LEAVE(sc->eeprom_data, sizeof(sc->eeprom_data), meta, ret, done); done: return (ret); } #endif static const struct pci_devemu pci_de_e82545 = { .pe_emu = "e1000", .pe_init = e82545_init, .pe_legacy_config = netbe_legacy_config, .pe_barwrite = e82545_write, .pe_barread = e82545_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = e82545_snapshot, #endif }; PCI_EMUL_SET(pci_de_e82545); diff --git a/usr.sbin/bhyve/pci_xhci.c b/usr.sbin/bhyve/pci_xhci.c index 0ac1423f0736..2c5f36eb130d 100644 --- a/usr.sbin/bhyve/pci_xhci.c +++ b/usr.sbin/bhyve/pci_xhci.c @@ -1,3210 +1,3211 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* XHCI options: -s ,xhci,{devices} devices: tablet USB tablet mouse */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include -#include - #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "pci_xhci.h" +#ifdef BHYVE_SNAPSHOT +#include "snapshot.h" +#endif #include "usb_emul.h" static int xhci_debug = 0; #define DPRINTF(params) if (xhci_debug) PRINTLN params #define WPRINTF(params) PRINTLN params #define XHCI_NAME "xhci" #define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */ #define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */ /* * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping * to 4k to avoid going over the guest physical memory barrier. */ #define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */ #define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */ #define XHCI_CAPLEN (4*8) /* offset of op register space */ #define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */ #define XHCI_PORTREGS_START 0x400 #define XHCI_DOORBELL_MAX 256 #define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */ /* caplength and hci-version registers */ #define XHCI_SET_CAPLEN(x) ((x) & 0xFF) #define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16) #define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF) /* hcsparams1 register */ #define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF) #define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8) #define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24) /* hcsparams2 register */ #define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F) #define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4) #define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21) #define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27) /* hcsparams3 register */ #define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF) #define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16) /* hccparams1 register */ #define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01) #define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5) #define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6) #define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7) #define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8) #define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9) #define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10) #define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11) #define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12) #define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16) /* hccparams2 register */ #define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01) #define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP2_CIC(x) (((x) & 0x01) << 5) /* other registers */ #define XHCI_SET_DOORBELL(x) ((x) & ~0x03) #define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F) /* register masks */ #define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */ #define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */ #define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */ /* port register set */ #define XHCI_PORTREGS_BASE 0x400 /* base offset */ #define XHCI_PORTREGS_PORT0 0x3F0 #define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */ #define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL) #define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL) #define FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & (m)) << (s))) #define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & ((m) << (s))))) #define SNAP_DEV_NAME_LEN 128 struct pci_xhci_trb_ring { uint64_t ringaddr; /* current dequeue guest address */ uint32_t ccs; /* consumer cycle state */ }; /* device endpoint transfer/stream rings */ struct pci_xhci_dev_ep { union { struct xhci_trb *_epu_tr; struct xhci_stream_ctx *_epu_sctx; } _ep_trbsctx; #define ep_tr _ep_trbsctx._epu_tr #define ep_sctx _ep_trbsctx._epu_sctx /* * Caches the value of MaxPStreams from the endpoint context * when an endpoint is initialized and is used to validate the * use of ep_ringaddr vs ep_sctx_trbs[] as well as the length * of ep_sctx_trbs[]. */ uint32_t ep_MaxPStreams; union { struct pci_xhci_trb_ring _epu_trb; struct pci_xhci_trb_ring *_epu_sctx_trbs; } _ep_trb_rings; #define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr #define ep_ccs _ep_trb_rings._epu_trb.ccs #define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs struct usb_data_xfer *ep_xfer; /* transfer chain */ }; /* device context base address array: maps slot->device context */ struct xhci_dcbaa { uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */ }; /* port status registers */ struct pci_xhci_portregs { uint32_t portsc; /* port status and control */ uint32_t portpmsc; /* port pwr mgmt status & control */ uint32_t portli; /* port link info */ uint32_t porthlpmc; /* port hardware LPM control */ } __packed; #define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10) /* xHC operational registers */ struct pci_xhci_opregs { uint32_t usbcmd; /* usb command */ uint32_t usbsts; /* usb status */ uint32_t pgsz; /* page size */ uint32_t dnctrl; /* device notification control */ uint64_t crcr; /* command ring control */ uint64_t dcbaap; /* device ctx base addr array ptr */ uint32_t config; /* configure */ /* guest mapped addresses: */ struct xhci_trb *cr_p; /* crcr dequeue */ struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */ }; /* xHC runtime registers */ struct pci_xhci_rtsregs { uint32_t mfindex; /* microframe index */ struct { /* interrupter register set */ uint32_t iman; /* interrupter management */ uint32_t imod; /* interrupter moderation */ uint32_t erstsz; /* event ring segment table size */ uint32_t rsvd; uint64_t erstba; /* event ring seg-tbl base addr */ uint64_t erdp; /* event ring dequeue ptr */ } intrreg __packed; /* guest mapped addresses */ struct xhci_event_ring_seg *erstba_p; struct xhci_trb *erst_p; /* event ring segment tbl */ int er_deq_seg; /* event ring dequeue segment */ int er_enq_idx; /* event ring enqueue index - xHCI */ int er_enq_seg; /* event ring enqueue segment */ uint32_t er_events_cnt; /* number of events in ER */ uint32_t event_pcs; /* producer cycle state flag */ }; struct pci_xhci_softc; /* * USB device emulation container. * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each * emulated device instance. */ struct pci_xhci_dev_emu { struct pci_xhci_softc *xsc; /* XHCI contexts */ struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS]; int dev_slotstate; struct usb_devemu *dev_ue; /* USB emulated dev */ void *dev_sc; /* device's softc */ struct usb_hci hci; }; struct pci_xhci_softc { struct pci_devinst *xsc_pi; pthread_mutex_t mtx; uint32_t caplength; /* caplen & hciversion */ uint32_t hcsparams1; /* structural parameters 1 */ uint32_t hcsparams2; /* structural parameters 2 */ uint32_t hcsparams3; /* structural parameters 3 */ uint32_t hccparams1; /* capability parameters 1 */ uint32_t dboff; /* doorbell offset */ uint32_t rtsoff; /* runtime register space offset */ uint32_t hccparams2; /* capability parameters 2 */ uint32_t regsend; /* end of configuration registers */ struct pci_xhci_opregs opregs; struct pci_xhci_rtsregs rtsregs; struct pci_xhci_portregs *portregs; struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */ struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */ int usb2_port_start; int usb3_port_start; }; /* port and slot numbering start from 1 */ #define XHCI_PORTREG_PTR(x,n) &((x)->portregs[(n) - 1]) #define XHCI_DEVINST_PTR(x,n) ((x)->devices[(n) - 1]) #define XHCI_SLOTDEV_PTR(x,n) ((x)->slots[(n) - 1]) #define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) #define XHCI_GADDR_SIZE(a) (XHCI_PADDR_SZ - \ (((uint64_t) (a)) & (XHCI_PADDR_SZ - 1))) #define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ (a), XHCI_GADDR_SIZE(a)) static int xhci_in_use; /* map USB errors to XHCI */ static const int xhci_usb_errors[USB_ERR_MAX] = { [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS, [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID, [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED, [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL, [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB, [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT, [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL, [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF, [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB, [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED, }; #define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? xhci_usb_errors[(e)] : \ XHCI_TRB_ERROR_INVALID) static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr); static void pci_xhci_dump_trb(struct xhci_trb *trb); static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc); static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot); static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm); static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs); static void pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode, uint32_t evtype) { evtrb->qwTrb0 = port << 24; evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode); evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype); } /* controller reset */ static void pci_xhci_reset(struct pci_xhci_softc *sc) { int i; sc->rtsregs.er_enq_idx = 0; sc->rtsregs.er_events_cnt = 0; sc->rtsregs.event_pcs = 1; for (i = 1; i <= XHCI_MAX_SLOTS; i++) { pci_xhci_reset_slot(sc, i); } } static uint32_t pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd) { int do_intr = 0; int i; if (cmd & XHCI_CMD_RS) { do_intr = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0; sc->opregs.usbcmd |= XHCI_CMD_RS; sc->opregs.usbsts &= ~XHCI_STS_HCH; sc->opregs.usbsts |= XHCI_STS_PCD; /* Queue port change event on controller run from stop */ if (do_intr) for (i = 1; i <= XHCI_MAX_DEVS; i++) { struct pci_xhci_dev_emu *dev; struct pci_xhci_portregs *port; struct xhci_trb evtrb; if ((dev = XHCI_DEVINST_PTR(sc, i)) == NULL) continue; port = XHCI_PORTREG_PTR(sc, i); port->portsc |= XHCI_PS_CSC | XHCI_PS_CCS; port->portsc &= ~XHCI_PS_PLS_MASK; /* * XHCI 4.19.3 USB2 RxDetect->Polling, * USB3 Polling->U0 */ if (dev->dev_ue->ue_usbver == 2) port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL); else port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0); pci_xhci_set_evtrb(&evtrb, i, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); if (pci_xhci_insert_event(sc, &evtrb, 0) != XHCI_TRB_ERROR_SUCCESS) break; } } else { sc->opregs.usbcmd &= ~XHCI_CMD_RS; sc->opregs.usbsts |= XHCI_STS_HCH; sc->opregs.usbsts &= ~XHCI_STS_PCD; } /* start execution of schedule; stop when set to 0 */ cmd |= sc->opregs.usbcmd & XHCI_CMD_RS; if (cmd & XHCI_CMD_HCRST) { /* reset controller */ pci_xhci_reset(sc); cmd &= ~XHCI_CMD_HCRST; } cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS); if (do_intr) pci_xhci_assert_interrupt(sc); return (cmd); } static void pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct xhci_trb evtrb; struct pci_xhci_portregs *p; int port; uint32_t oldpls, newpls; if (sc->portregs == NULL) return; port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx", offset, port, value)); assert(port >= 0); if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_write port %d > ndevices", port)); return; } if (XHCI_DEVINST_PTR(sc, port) == NULL) { DPRINTF(("pci_xhci: portregs_write to unattached port %d", port)); } p = XHCI_PORTREG_PTR(sc, port); switch (offset) { case 0: /* port reset or warm reset */ if (value & (XHCI_PS_PR | XHCI_PS_WPR)) { pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR); break; } if ((p->portsc & XHCI_PS_PP) == 0) { WPRINTF(("pci_xhci: portregs_write to unpowered " "port %d", port)); break; } /* Port status and control register */ oldpls = XHCI_PS_PLS_GET(p->portsc); newpls = XHCI_PS_PLS_GET(value); p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK | XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK; if (XHCI_DEVINST_PTR(sc, port)) p->portsc |= XHCI_PS_CCS; p->portsc |= (value & ~(XHCI_PS_OCA | XHCI_PS_PR | XHCI_PS_PED | XHCI_PS_PLS_MASK | /* link state */ XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK | /* port indicator */ XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR)); /* clear control bits */ p->portsc &= ~(value & (XHCI_PS_CSC | XHCI_PS_PEC | XHCI_PS_WRC | XHCI_PS_OCC | XHCI_PS_PRC | XHCI_PS_PLC | XHCI_PS_CEC | XHCI_PS_CAS)); /* port disable request; for USB3, don't care */ if (value & XHCI_PS_PED) DPRINTF(("Disable port %d request", port)); if (!(value & XHCI_PS_LWS)) break; DPRINTF(("Port new PLS: %d", newpls)); switch (newpls) { case 0: /* U0 */ case 3: /* U3 */ if (oldpls != newpls) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(newpls) | XHCI_PS_PLC; if (oldpls != 0 && newpls == 0) { pci_xhci_set_evtrb(&evtrb, port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); pci_xhci_insert_event(sc, &evtrb, 1); } } break; default: DPRINTF(("Unhandled change port %d PLS %u", port, newpls)); break; } break; case 4: /* Port power management status and control register */ p->portpmsc = value; break; case 8: /* Port link information register */ DPRINTF(("pci_xhci attempted write to PORTLI, port %d", port)); break; case 12: /* * Port hardware LPM control register. * For USB3, this register is reserved. */ p->porthlpmc = value; break; default: DPRINTF(("pci_xhci: unaligned portreg write offset %#lx", offset)); break; } } static struct xhci_dev_ctx * pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot) { uint64_t devctx_addr; struct xhci_dev_ctx *devctx; assert(slot > 0 && slot <= XHCI_MAX_DEVS); assert(XHCI_SLOTDEV_PTR(sc, slot) != NULL); assert(sc->opregs.dcbaa_p != NULL); devctx_addr = sc->opregs.dcbaa_p->dcba[slot]; if (devctx_addr == 0) { DPRINTF(("get_dev_ctx devctx_addr == 0")); return (NULL); } DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx", slot, devctx_addr)); devctx = XHCI_GADDR(sc, devctx_addr & ~0x3FUL); return (devctx); } static struct xhci_trb * pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb, uint64_t *guestaddr) { struct xhci_trb *next; assert(curtrb != NULL); if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) == XHCI_TRB_TYPE_LINK) { if (guestaddr) *guestaddr = curtrb->qwTrb0 & ~0xFUL; next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL); } else { if (guestaddr) *guestaddr += sizeof(struct xhci_trb) & ~0xFUL; next = curtrb + 1; } return (next); } static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc) { sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY; sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND; sc->opregs.usbsts |= XHCI_STS_EINT; /* only trigger interrupt if permitted */ if ((sc->opregs.usbcmd & XHCI_CMD_INTE) && (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA)) { if (pci_msi_enabled(sc->xsc_pi)) pci_generate_msi(sc->xsc_pi, 0); else pci_lintr_assert(sc->xsc_pi); } } static void pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc) { if (!pci_msi_enabled(sc->xsc_pi)) pci_lintr_assert(sc->xsc_pi); } static void pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; uint32_t i, pstreams; dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0); if (pstreams > 0) { DPRINTF(("init_ep %d with pstreams %d", epid, pstreams)); assert(devep->ep_sctx_trbs == NULL); devep->ep_sctx = XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK); devep->ep_sctx_trbs = calloc(pstreams, sizeof(struct pci_xhci_trb_ring)); for (i = 0; i < pstreams; i++) { devep->ep_sctx_trbs[i].ringaddr = devep->ep_sctx[i].qwSctx0 & XHCI_SCTX_0_TR_DQ_PTR_MASK; devep->ep_sctx_trbs[i].ccs = XHCI_SCTX_0_DCS_GET(devep->ep_sctx[i].qwSctx0); } } else { DPRINTF(("init_ep %d with no pstreams", epid)); devep->ep_ringaddr = ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK; devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2); devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr); DPRINTF(("init_ep tr DCS %x", devep->ep_ccs)); } devep->ep_MaxPStreams = pstreams; if (devep->ep_xfer == NULL) { devep->ep_xfer = malloc(sizeof(struct usb_data_xfer)); USB_DATA_XFER_INIT(devep->ep_xfer); } } static void pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; DPRINTF(("pci_xhci disable_ep %d", epid)); dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED; devep = &dev->eps[epid]; if (devep->ep_MaxPStreams > 0) free(devep->ep_sctx_trbs); if (devep->ep_xfer != NULL) { free(devep->ep_xfer); devep->ep_xfer = NULL; } memset(devep, 0, sizeof(struct pci_xhci_dev_ep)); } /* reset device at slot and data structures related to it */ static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot) { struct pci_xhci_dev_emu *dev; dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev) { DPRINTF(("xhci reset unassigned slot (%d)?", slot)); } else { dev->dev_slotstate = XHCI_ST_DISABLED; } /* TODO: reset ring buffer pointers */ } static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr) { struct pci_xhci_rtsregs *rts; uint64_t erdp; int erdp_idx; int err; struct xhci_trb *evtrbptr; err = XHCI_TRB_ERROR_SUCCESS; rts = &sc->rtsregs; erdp = rts->intrreg.erdp & ~0xF; erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) / sizeof(struct xhci_trb); DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]", evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3)); DPRINTF(("\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u", erdp_idx, rts->er_deq_seg, rts->er_enq_idx, rts->er_enq_seg, rts->event_pcs)); DPRINTF(("\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)", erdp, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize, do_intr)); evtrbptr = &rts->erst_p[rts->er_enq_idx]; /* TODO: multi-segment table */ if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) { DPRINTF(("pci_xhci[%d] cannot insert event; ring full", __LINE__)); err = XHCI_TRB_ERROR_EV_RING_FULL; goto done; } if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) { struct xhci_trb errev; if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) { DPRINTF(("pci_xhci[%d] insert evt err: ring full", __LINE__)); errev.qwTrb0 = 0; errev.dwTrb2 = XHCI_TRB_2_ERROR_SET( XHCI_TRB_ERROR_EV_RING_FULL); errev.dwTrb3 = XHCI_TRB_3_TYPE_SET( XHCI_TRB_EVENT_HOST_CTRL) | rts->event_pcs; rts->er_events_cnt++; memcpy(&rts->erst_p[rts->er_enq_idx], &errev, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; err = XHCI_TRB_ERROR_EV_RING_FULL; do_intr = 1; goto done; } } else { rts->er_events_cnt++; } evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT; evtrb->dwTrb3 |= rts->event_pcs; memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; if (rts->er_enq_idx == 0) rts->event_pcs ^= 1; done: if (do_intr) pci_xhci_assert_interrupt(sc); return (err); } static uint32_t pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs != NULL) for (i = 1; i <= XHCI_MAX_SLOTS; i++) { dev = XHCI_SLOTDEV_PTR(sc, i); if (dev && dev->dev_slotstate == XHCI_ST_DISABLED) { *slot = i; dev->dev_slotstate = XHCI_ST_ENABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; dev->hci.hci_address = i; break; } } DPRINTF(("pci_xhci enable slot (error=%d) slot %u", cmderr != XHCI_TRB_ERROR_SUCCESS, *slot)); return (cmderr); } static uint32_t pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; DPRINTF(("pci_xhci disable slot %u", slot)); cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; if (slot > XHCI_MAX_SLOTS) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } dev = XHCI_SLOTDEV_PTR(sc, slot); if (dev) { if (dev->dev_slotstate == XHCI_ST_DISABLED) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; } else { dev->dev_slotstate = XHCI_ST_DISABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; /* TODO: reset events and endpoints */ } } else cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; DPRINTF(("pci_xhci reset device slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev || dev->dev_slotstate == XHCI_ST_DISABLED) cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; else { dev->dev_slotstate = XHCI_ST_DEFAULT; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT, 0x1F, 27); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* reset all eps other than ep-0 */ for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_DISABLED, 0x7, 0); } cmderr = XHCI_TRB_ERROR_SUCCESS; } pci_xhci_reset_slot(sc, slot); done: return (cmderr); } static uint32_t pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) { DPRINTF(("pci_xhci: address device, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot */ dev_ctx = pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: address device, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); dev->hci.hci_address = slot; dev->dev_ctx = dev_ctx; if (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx)); dev_ctx->ctx_slot.dwSctx3 = XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) | XHCI_SCTX_3_DEV_ADDR_SET(slot); memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx)); ep0_ctx = &dev_ctx->ctx_ep[1]; ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) | XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING); pci_xhci_init_ep(dev, 1); dev->dev_slotstate = XHCI_ST_ADDRESSED; DPRINTF(("pci_xhci: address device, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static uint32_t pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx, *iep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci config_ep slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) { DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u", slot)); if (dev->dev_ue->ue_stop != NULL) dev->dev_ue->ue_stop(dev->dev_sc); dev->dev_slotstate = XHCI_ST_ADDRESSED; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED, 0x1F, 27); /* disable endpoints */ for (i = 2; i < 32; i++) pci_xhci_disable_ep(dev, i); cmderr = XHCI_TRB_ERROR_SUCCESS; goto done; } if (dev->dev_slotstate < XHCI_ST_ADDRESSED) { DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed", dev->dev_slotstate)); cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } /* In addressed/configured state; * for each drop endpoint ctx flag: * ep->state = DISABLED * for each add endpoint ctx flag: * cp(ep-in, ep-out) * ep->state = RUNNING * for each drop+add endpoint flag: * reset ep resources * cp(ep-in, ep-out) * ep->state = RUNNING * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled) * slot->state = configured */ input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); dev_ctx = dev->dev_ctx; DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, input_ctx->ctx_input.dwInCtx7)); for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; if (input_ctx->ctx_input.dwInCtx0 & XHCI_INCTX_0_DROP_MASK(i)) { DPRINTF((" config ep - dropping ep %d", i)); pci_xhci_disable_ep(dev, i); } if (input_ctx->ctx_input.dwInCtx1 & XHCI_INCTX_1_ADD_MASK(i)) { iep_ctx = &input_ctx->ctx_ep[i]; DPRINTF((" enable ep[%d] %08x %08x %016lx %08x", i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1, iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4)); memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx)); pci_xhci_init_ep(dev, i); /* ep state */ ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); } } /* slot state to configured */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27); dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27); dev->dev_slotstate = XHCI_ST_CONFIGURED; DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x " "[3]=0x%08x", slot, dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t type; epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); DPRINTF(("pci_xhci: reset ep %u: slot %u", epid, slot)); cmderr = XHCI_TRB_ERROR_SUCCESS; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if (type == XHCI_TRB_TYPE_STOP_EP && (trb->dwTrb3 & XHCI_TRB_3_SUSP_EP_BIT) != 0) { /* XXX suspend endpoint for 10ms */ } if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: reset ep: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } devep = &dev->eps[epid]; if (devep->ep_xfer != NULL) USB_DATA_XFER_RESET(devep->ep_xfer); dev_ctx = dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; if (devep->ep_MaxPStreams == 0) ep_ctx->qwEpCtx2 = devep->ep_ringaddr | devep->ep_ccs; DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (type == XHCI_TRB_TYPE_RESET_EP && (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0)) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } done: return (cmderr); } static uint32_t pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep, struct pci_xhci_dev_ep *devep, uint32_t streamid) { struct xhci_stream_ctx *sctx; if (devep->ep_MaxPStreams == 0) return (XHCI_TRB_ERROR_TRB); if (devep->ep_MaxPStreams > XHCI_STREAMS_MAX) return (XHCI_TRB_ERROR_INVALID_SID); if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) { DPRINTF(("pci_xhci: find_stream; LSA bit not set")); return (XHCI_TRB_ERROR_INVALID_SID); } /* only support primary stream */ if (streamid > devep->ep_MaxPStreams) return (XHCI_TRB_ERROR_STREAM_TYPE); sctx = (struct xhci_stream_ctx *)XHCI_GADDR(sc, ep->qwEpCtx2 & ~0xFUL) + streamid; if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0)) return (XHCI_TRB_ERROR_STREAM_TYPE); return (XHCI_TRB_ERROR_SUCCESS); } static uint32_t pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t streamid; cmderr = XHCI_TRB_ERROR_SUCCESS; dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u", (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7), (uint32_t)(trb->qwTrb0 & 0x1))); DPRINTF((" stream-id %u, slot %u, epid %u, C %u", (trb->dwTrb2 >> 16) & 0xFFFF, XHCI_TRB_3_SLOT_GET(trb->dwTrb3), XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1)); epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } dev_ctx = dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) { case XHCI_ST_EPCTX_STOPPED: case XHCI_ST_EPCTX_ERROR: break; default: DPRINTF(("pci_xhci cmd set_tr invalid state %x", XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0))); cmderr = XHCI_TRB_ERROR_CONTEXT_STATE; goto done; } streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2); if (devep->ep_MaxPStreams > 0) { cmderr = pci_xhci_find_stream(sc, ep_ctx, devep, streamid); if (cmderr == XHCI_TRB_ERROR_SUCCESS) { assert(devep->ep_sctx != NULL); devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0; devep->ep_sctx_trbs[streamid].ringaddr = trb->qwTrb0 & ~0xF; devep->ep_sctx_trbs[streamid].ccs = XHCI_EPCTX_2_DCS_GET(trb->qwTrb0); } } else { if (streamid != 0) { DPRINTF(("pci_xhci cmd set_tr streamid %x != 0", streamid)); } ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL; devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL; devep->ep_ccs = trb->qwTrb0 & 0x1; devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr); DPRINTF(("pci_xhci set_tr first TRB:")); pci_xhci_dump_trb(devep->ep_tr); } ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; done: return (cmderr); } static uint32_t pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* this command expects drop-ctx=0 & add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) { DPRINTF(("pci_xhci: eval ctx, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot; in this emulation, slot_id = address */ dev_ctx = pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: eval ctx, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */ /* set max exit latency */ dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1, 0xFFFF, 0); /* set interrupter target */ dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2, 0x3FF, 22); } if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */ /* set max packet size */ dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY( dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1, 0xFFFF, 16); ep0_ctx = &dev_ctx->ctx_ep[1]; } DPRINTF(("pci_xhci: eval ctx, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static int pci_xhci_complete_commands(struct pci_xhci_softc *sc) { struct xhci_trb evtrb; struct xhci_trb *trb; uint64_t crcr; uint32_t ccs; /* cycle state (XHCI 4.9.2) */ uint32_t type; uint32_t slot; uint32_t cmderr; int error; error = 0; sc->opregs.crcr |= XHCI_CRCR_LO_CRR; trb = sc->opregs.cr_p; ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS; crcr = sc->opregs.crcr & ~0xF; while (1) { sc->opregs.cr_p = trb; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) break; DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x" " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u", type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs)); cmderr = XHCI_TRB_ERROR_SUCCESS; evtrb.dwTrb2 = 0; evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) | XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE); slot = 0; switch (type) { case XHCI_TRB_TYPE_LINK: /* 0x06 */ if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= XHCI_CRCR_LO_RCS; break; case XHCI_TRB_TYPE_ENABLE_SLOT: /* 0x09 */ cmderr = pci_xhci_cmd_enable_slot(sc, &slot); break; case XHCI_TRB_TYPE_DISABLE_SLOT: /* 0x0A */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_disable_slot(sc, slot); break; case XHCI_TRB_TYPE_ADDRESS_DEVICE: /* 0x0B */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_address_device(sc, slot, trb); break; case XHCI_TRB_TYPE_CONFIGURE_EP: /* 0x0C */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_config_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_EVALUATE_CTX: /* 0x0D */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_EP: /* 0x0E */ DPRINTF(("Reset Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_STOP_EP: /* 0x0F */ DPRINTF(("Stop Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_SET_TR_DEQUEUE: /* 0x10 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_set_tr(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_DEVICE: /* 0x11 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_device(sc, slot); break; case XHCI_TRB_TYPE_FORCE_EVENT: /* 0x12 */ /* TODO: */ break; case XHCI_TRB_TYPE_NEGOTIATE_BW: /* 0x13 */ break; case XHCI_TRB_TYPE_SET_LATENCY_TOL: /* 0x14 */ break; case XHCI_TRB_TYPE_GET_PORT_BW: /* 0x15 */ break; case XHCI_TRB_TYPE_FORCE_HEADER: /* 0x16 */ break; case XHCI_TRB_TYPE_NOOP_CMD: /* 0x17 */ break; default: DPRINTF(("pci_xhci: unsupported cmd %x", type)); break; } if (type != XHCI_TRB_TYPE_LINK) { /* * insert command completion event and assert intr */ evtrb.qwTrb0 = crcr; evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr); evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot); DPRINTF(("pci_xhci: command 0x%x result: 0x%x", type, cmderr)); pci_xhci_insert_event(sc, &evtrb, 1); } trb = pci_xhci_trb_next(sc, trb, &crcr); } sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs; sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR; return (error); } static void pci_xhci_dump_trb(struct xhci_trb *trb) { static const char *trbtypes[] = { "RESERVED", "NORMAL", "SETUP_STAGE", "DATA_STAGE", "STATUS_STAGE", "ISOCH", "LINK", "EVENT_DATA", "NOOP", "ENABLE_SLOT", "DISABLE_SLOT", "ADDRESS_DEVICE", "CONFIGURE_EP", "EVALUATE_CTX", "RESET_EP", "STOP_EP", "SET_TR_DEQUEUE", "RESET_DEVICE", "FORCE_EVENT", "NEGOTIATE_BW", "SET_LATENCY_TOL", "GET_PORT_BW", "FORCE_HEADER", "NOOP_CMD" }; uint32_t type; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); DPRINTF(("pci_xhci: trb[@%p] type x%02x %s 0:x%016lx 2:x%08x 3:x%08x", trb, type, type <= XHCI_TRB_TYPE_NOOP_CMD ? trbtypes[type] : "INVALID", trb->qwTrb0, trb->dwTrb2, trb->dwTrb3)); } static int pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer, uint32_t slot, uint32_t epid, int *do_intr) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct xhci_trb *trb; struct xhci_trb evtrb; uint32_t trbflags; uint32_t edtla; int i, err; dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; err = XHCI_TRB_ERROR_SUCCESS; *do_intr = 0; edtla = 0; /* go through list of TRBs and insert event(s) */ for (i = xfer->head; xfer->ndata > 0; ) { evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data; trb = XHCI_GADDR(sc, evtrb.qwTrb0); trbflags = trb->dwTrb3; DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x " "(err %d) IOC?%d", i, xfer->data[i].processed, xfer->data[i].blen, XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0, trbflags, err, trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0)); if (!xfer->data[i].processed) { xfer->head = i; break; } xfer->ndata--; edtla += xfer->data[i].bdone; trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs); pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx, xfer->data[i].streamid, xfer->data[i].trbnext, xfer->data[i].ccs); /* Only interrupt if IOC or short packet */ if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) && !((err == XHCI_TRB_ERROR_SHORT_PKT) && (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) { i = (i + 1) % USB_MAX_XFER_BLOCKS; continue; } evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) | XHCI_TRB_2_REM_SET(xfer->data[i].blen); evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) | XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid); if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) { DPRINTF(("pci_xhci EVENT_DATA edtla %u", edtla)); evtrb.qwTrb0 = trb->qwTrb0; evtrb.dwTrb2 = (edtla & 0xFFFFF) | XHCI_TRB_2_ERROR_SET(err); evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT; edtla = 0; } *do_intr = 1; err = pci_xhci_insert_event(sc, &evtrb, 0); if (err != XHCI_TRB_ERROR_SUCCESS) { break; } i = (i + 1) % USB_MAX_XFER_BLOCKS; } return (err); } static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev __unused, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs) { if (devep->ep_MaxPStreams != 0) { devep->ep_sctx[streamid].qwSctx0 = (ringaddr & ~0xFUL) | (ccs & 0x1); devep->ep_sctx_trbs[streamid].ringaddr = ringaddr & ~0xFUL; devep->ep_sctx_trbs[streamid].ccs = ccs & 0x1; ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | (ccs & 0x1); DPRINTF(("xhci update ep-ring stream %d, addr %lx", streamid, devep->ep_sctx[streamid].qwSctx0)); } else { devep->ep_ringaddr = ringaddr & ~0xFUL; devep->ep_ccs = ccs & 0x1; devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL); ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1); DPRINTF(("xhci update ep-ring, addr %lx", (devep->ep_ringaddr | devep->ep_ccs))); } } /* * Outstanding transfer still in progress (device NAK'd earlier) so retry * the transfer again to see if it succeeds. */ static int pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid) { struct usb_data_xfer *xfer; int err; int do_intr; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); err = 0; do_intr = 0; xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); /* outstanding requests queued up */ if (dev->dev_ue->ue_data != NULL) { err = dev->dev_ue->ue_data(dev->dev_sc, xfer, epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2); if (err == USB_ERR_CANCELLED) { if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) == USB_NAK) err = XHCI_TRB_ERROR_SUCCESS; } else { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) { pci_xhci_assert_interrupt(sc); } /* XXX should not do it if error? */ USB_DATA_XFER_RESET(xfer); } } USB_DATA_XFER_UNLOCK(xfer); return (err); } static int pci_xhci_handle_transfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot, uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid) { struct xhci_trb *setup_trb; struct usb_data_xfer *xfer; struct usb_data_xfer_block *xfer_block; uint64_t val; uint32_t trbflags; int do_intr, err; int do_retry; ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); DPRINTF(("pci_xhci handle_transfer slot %u", slot)); retry: err = XHCI_TRB_ERROR_INVALID; do_retry = 0; do_intr = 0; setup_trb = NULL; while (1) { pci_xhci_dump_trb(trb); trbflags = trb->dwTrb3; if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK && (trbflags & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) { DPRINTF(("Cycle-bit changed trbflags %x, ccs %x", trbflags & XHCI_TRB_3_CYCLE_BIT, ccs)); break; } xfer_block = NULL; switch (XHCI_TRB_3_TYPE_GET(trbflags)) { case XHCI_TRB_TYPE_LINK: if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= 0x1; xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_SETUP_STAGE: if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 || XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) { DPRINTF(("pci_xhci: invalid setup trb")); err = XHCI_TRB_ERROR_TRB; goto errout; } setup_trb = trb; val = trb->qwTrb0; if (!xfer->ureq) xfer->ureq = malloc( sizeof(struct usb_device_request)); memcpy(xfer->ureq, &val, sizeof(struct usb_device_request)); xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_NORMAL: case XHCI_TRB_TYPE_ISOCH: if (setup_trb != NULL) { DPRINTF(("pci_xhci: trb not supposed to be in " "ctl scope")); err = XHCI_TRB_ERROR_TRB; goto errout; } /* fall through */ case XHCI_TRB_TYPE_DATA_STAGE: xfer_block = usb_data_xfer_append(xfer, (void *)(trbflags & XHCI_TRB_3_IDT_BIT ? &trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)), trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs); break; case XHCI_TRB_TYPE_STATUS_STAGE: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); break; case XHCI_TRB_TYPE_NOOP: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_EVENT_DATA: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) { xfer_block->processed = 1; } break; default: DPRINTF(("pci_xhci: handle xfer unexpected trb type " "0x%x", XHCI_TRB_3_TYPE_GET(trbflags))); err = XHCI_TRB_ERROR_TRB; goto errout; } trb = pci_xhci_trb_next(sc, trb, &addr); DPRINTF(("pci_xhci: next trb: 0x%lx", (uint64_t)trb)); if (xfer_block) { xfer_block->trbnext = addr; xfer_block->streamid = streamid; } if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) && XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) { break; } /* handle current batch that requires interrupt on complete */ if (trbflags & XHCI_TRB_3_IOC_BIT) { DPRINTF(("pci_xhci: trb IOC bit set")); if (epid == 1) do_retry = 1; break; } } DPRINTF(("pci_xhci[%d]: xfer->ndata %u", __LINE__, xfer->ndata)); if (xfer->ndata <= 0) goto errout; if (epid == 1) { int usberr; if (dev->dev_ue->ue_request != NULL) usberr = dev->dev_ue->ue_request(dev->dev_sc, xfer); else usberr = USB_ERR_NOT_STARTED; err = USB_TO_XHCI_ERR(usberr); if (err == XHCI_TRB_ERROR_SUCCESS || err == XHCI_TRB_ERROR_STALL || err == XHCI_TRB_ERROR_SHORT_PKT) { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err != XHCI_TRB_ERROR_SUCCESS) do_retry = 0; } } else { /* handle data transfer */ pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); err = XHCI_TRB_ERROR_SUCCESS; } errout: if (err == XHCI_TRB_ERROR_EV_RING_FULL) DPRINTF(("pci_xhci[%d]: event ring full", __LINE__)); if (!do_retry) USB_DATA_XFER_UNLOCK(xfer); if (do_intr) pci_xhci_assert_interrupt(sc); if (do_retry) { USB_DATA_XFER_RESET(xfer); DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs", __LINE__)); goto retry; } if (epid == 1) USB_DATA_XFER_RESET(xfer); return (err); } static void pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, uint32_t epid, uint32_t streamid) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct pci_xhci_trb_ring *sctx_tr; struct xhci_trb *trb; uint64_t ringaddr; uint32_t ccs; int error; DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u", slot, epid, streamid)); if (slot == 0 || slot > XHCI_MAX_SLOTS) { DPRINTF(("pci_xhci: invalid doorbell slot %u", slot)); return; } if (epid == 0 || epid >= XHCI_MAX_ENDPOINTS) { DPRINTF(("pci_xhci: invalid endpoint %u", epid)); return; } dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); if (!dev_ctx) { return; } ep_ctx = &dev_ctx->ctx_ep[epid]; sctx_tr = NULL; DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (ep_ctx->qwEpCtx2 == 0) return; /* handle pending transfers */ if (devep->ep_xfer->ndata > 0) { pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); return; } /* get next trb work item */ if (devep->ep_MaxPStreams != 0) { /* * Stream IDs of 0, 65535 (any stream), and 65534 * (prime) are invalid. */ if (streamid == 0 || streamid == 65534 || streamid == 65535) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } error = pci_xhci_find_stream(sc, ep_ctx, devep, streamid); if (error != XHCI_TRB_ERROR_SUCCESS) { DPRINTF(("pci_xhci: invalid stream %u: %d", streamid, error)); return; } sctx_tr = &devep->ep_sctx_trbs[streamid]; ringaddr = sctx_tr->ringaddr; ccs = sctx_tr->ccs; trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL); DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x", streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } else { if (streamid != 0) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } ringaddr = devep->ep_ringaddr; ccs = devep->ep_ccs; trb = devep->ep_tr; DPRINTF(("doorbell, ccs %lx, trb ccs %x", ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) { DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?", ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid)); return; } pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid, ringaddr, ccs, streamid); } static void pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset = (offset - sc->dboff) / sizeof(uint32_t); DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx", offset, value)); if (XHCI_HALTED(sc)) { DPRINTF(("pci_xhci: controller halted")); return; } if (offset == 0) pci_xhci_complete_commands(sc); else if (sc->portregs != NULL) pci_xhci_device_doorbell(sc, offset, XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value)); } static void pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct pci_xhci_rtsregs *rts; offset -= sc->rtsoff; if (offset == 0) { DPRINTF(("pci_xhci attempted write to MFINDEX")); return; } DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx", offset, value)); offset -= 0x20; /* start of intrreg */ rts = &sc->rtsregs; switch (offset) { case 0x00: if (value & XHCI_IMAN_INTR_PEND) rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) | (rts->intrreg.iman & XHCI_IMAN_INTR_PEND); if (!(value & XHCI_IMAN_INTR_ENA)) pci_xhci_deassert_interrupt(sc); break; case 0x04: rts->intrreg.imod = value; break; case 0x08: rts->intrreg.erstsz = value & 0xFFFF; break; case 0x10: /* ERSTBA low bits */ rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) | (value & ~0x3F); break; case 0x14: /* ERSTBA high bits */ rts->intrreg.erstba = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erstba); rts->erstba_p = XHCI_GADDR(sc, sc->rtsregs.intrreg.erstba & ~0x3FUL); rts->erst_p = XHCI_GADDR(sc, sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL); rts->er_enq_idx = 0; rts->er_events_cnt = 0; DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u", rts->erstba_p, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize)); break; case 0x18: /* ERDP low bits */ rts->intrreg.erdp = MASK_64_HI(sc->rtsregs.intrreg.erdp) | (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) | (value & ~0xF); if (value & XHCI_ERDP_LO_BUSY) { rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY; rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; } rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value); break; case 0x1C: /* ERDP high bits */ rts->intrreg.erdp = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erdp); if (rts->er_events_cnt > 0) { uint64_t erdp; int erdp_i; erdp = rts->intrreg.erdp & ~0xF; erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) / sizeof(struct xhci_trb); if (erdp_i <= rts->er_enq_idx) rts->er_events_cnt = rts->er_enq_idx - erdp_i; else rts->er_events_cnt = rts->erstba_p->dwEvrsTableSize - (erdp_i - rts->er_enq_idx); DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u", erdp, rts->er_events_cnt)); } break; default: DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx", offset)); break; } } static uint64_t pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset) { struct pci_xhci_portregs *portregs; int port; uint32_t reg; if (sc->portregs == NULL) return (0); port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_read port %d >= XHCI_MAX_DEVS", port)); /* return default value for unused port */ return (XHCI_PS_SPEED_SET(3)); } portregs = XHCI_PORTREG_PTR(sc, port); switch (offset) { case 0: reg = portregs->portsc; break; case 4: reg = portregs->portpmsc; break; case 8: reg = portregs->portli; break; case 12: reg = portregs->porthlpmc; break; default: DPRINTF(("pci_xhci: unaligned portregs read offset %#lx", offset)); reg = 0xffffffff; break; } DPRINTF(("pci_xhci: portregs read offset 0x%lx port %u -> 0x%x", offset, port, reg)); return (reg); } static void pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset -= XHCI_CAPLEN; if (offset < 0x400) DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx", offset, value)); switch (offset) { case XHCI_USBCMD: sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F); break; case XHCI_USBSTS: /* clear bits on write */ sc->opregs.usbsts &= ~(value & (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS| XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR)); break; case XHCI_PAGESIZE: /* read only */ break; case XHCI_DNCTRL: sc->opregs.dnctrl = value & 0xFFFF; break; case XHCI_CRCR_LO: if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) { sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); sc->opregs.crcr |= value & (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); } else { sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) | (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS)); } break; case XHCI_CRCR_HI: if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) { sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) | (value << 32); sc->opregs.cr_p = XHCI_GADDR(sc, sc->opregs.crcr & ~0xF); } if (sc->opregs.crcr & XHCI_CRCR_LO_CS) { /* Stop operation of Command Ring */ } if (sc->opregs.crcr & XHCI_CRCR_LO_CA) { /* Abort command */ } break; case XHCI_DCBAAP_LO: sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) | (value & 0xFFFFFFC0); break; case XHCI_DCBAAP_HI: sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) | (value << 32); sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL); DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)", sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p)); break; case XHCI_CONFIG: sc->opregs.config = value & 0x03FF; break; default: if (offset >= 0x400) pci_xhci_portregs_write(sc, offset, value); break; } } static void pci_xhci_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size __unused, uint64_t value) { struct pci_xhci_softc *sc; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) /* read only registers */ WPRINTF(("pci_xhci: write RO-CAPs offset %ld", offset)); else if (offset < sc->dboff) pci_xhci_hostop_write(sc, offset, value); else if (offset < sc->rtsoff) pci_xhci_dbregs_write(sc, offset, value); else if (offset < sc->regsend) pci_xhci_rtsregs_write(sc, offset, value); else WPRINTF(("pci_xhci: write invalid offset %ld", offset)); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; switch (offset) { case XHCI_CAPLENGTH: /* 0x00 */ value = sc->caplength; break; case XHCI_HCSPARAMS1: /* 0x04 */ value = sc->hcsparams1; break; case XHCI_HCSPARAMS2: /* 0x08 */ value = sc->hcsparams2; break; case XHCI_HCSPARAMS3: /* 0x0C */ value = sc->hcsparams3; break; case XHCI_HCSPARAMS0: /* 0x10 */ value = sc->hccparams1; break; case XHCI_DBOFF: /* 0x14 */ value = sc->dboff; break; case XHCI_RTSOFF: /* 0x18 */ value = sc->rtsoff; break; case XHCI_HCCPRAMS2: /* 0x1C */ value = sc->hccparams2; break; default: value = 0; break; } DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; offset = (offset - XHCI_CAPLEN); switch (offset) { case XHCI_USBCMD: /* 0x00 */ value = sc->opregs.usbcmd; break; case XHCI_USBSTS: /* 0x04 */ value = sc->opregs.usbsts; break; case XHCI_PAGESIZE: /* 0x08 */ value = sc->opregs.pgsz; break; case XHCI_DNCTRL: /* 0x14 */ value = sc->opregs.dnctrl; break; case XHCI_CRCR_LO: /* 0x18 */ value = sc->opregs.crcr & XHCI_CRCR_LO_CRR; break; case XHCI_CRCR_HI: /* 0x1C */ value = 0; break; case XHCI_DCBAAP_LO: /* 0x30 */ value = sc->opregs.dcbaap & 0xFFFFFFFF; break; case XHCI_DCBAAP_HI: /* 0x34 */ value = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF; break; case XHCI_CONFIG: /* 0x38 */ value = sc->opregs.config; break; default: if (offset >= 0x400) value = pci_xhci_portregs_read(sc, offset); else value = 0; break; } if (offset < 0x400) DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_dbregs_read(struct pci_xhci_softc *sc __unused, uint64_t offset __unused) { /* read doorbell always returns 0 */ return (0); } static uint64_t pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->rtsoff; value = 0; if (offset == XHCI_MFINDEX) { value = sc->rtsregs.mfindex; } else if (offset >= 0x20) { int item; uint32_t *p; offset -= 0x20; item = offset % 32; assert(offset < sizeof(sc->rtsregs.intrreg)); p = &sc->rtsregs.intrreg.iman; p += item / sizeof(uint32_t); value = *p; } DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->regsend; value = 0; switch (offset) { case 0: /* rev major | rev minor | next-cap | cap-id */ value = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS; break; case 4: /* name string = "USB" */ value = 0x20425355; break; case 8: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start; break; case 12: break; case 16: /* rev major | rev minor | next-cap | cap-id */ value = (0x03 << 24) | XHCI_ID_PROTOCOLS; break; case 20: /* name string = "USB" */ value = 0x20425355; break; case 24: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start; break; case 28: break; default: DPRINTF(("pci_xhci: xecp invalid offset 0x%lx", offset)); break; } DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_xhci_softc *sc; uint32_t value; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) value = pci_xhci_hostcap_read(sc, offset); else if (offset < sc->dboff) value = pci_xhci_hostop_read(sc, offset); else if (offset < sc->rtsoff) value = pci_xhci_dbregs_read(sc, offset); else if (offset < sc->regsend) value = pci_xhci_rtsregs_read(sc, offset); else if (offset < (sc->regsend + 4*32)) value = pci_xhci_xecp_read(sc, offset); else { value = 0; WPRINTF(("pci_xhci: read invalid offset %ld", offset)); } pthread_mutex_unlock(&sc->mtx); switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } return (value); } static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; struct xhci_trb evtrb; int error; assert(portn <= XHCI_MAX_DEVS); DPRINTF(("xhci reset port %d", portn)); port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC); port->portsc |= XHCI_PS_PED | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); if (warm && dev->dev_ue->ue_usbver == 3) { port->portsc |= XHCI_PS_WRC; } if ((port->portsc & XHCI_PS_PRC) == 0) { port->portsc |= XHCI_PS_PRC; pci_xhci_set_evtrb(&evtrb, portn, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 1); if (error != XHCI_TRB_ERROR_SUCCESS) DPRINTF(("xhci reset port insert event " "failed")); } } } static void pci_xhci_init_port(struct pci_xhci_softc *sc, int portn) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc = XHCI_PS_CCS | /* connected */ XHCI_PS_PP; /* port power */ if (dev->dev_ue->ue_usbver == 2) { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL) | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } else { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) | XHCI_PS_PED | /* enabled */ XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } DPRINTF(("Init port %d 0x%x", portn, port->portsc)); } else { port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP; DPRINTF(("Init empty port %d 0x%x", portn, port->portsc)); } } static int pci_xhci_dev_intr(struct usb_hci *hci, int epctx) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_trb evtrb; struct pci_xhci_softc *sc; struct pci_xhci_portregs *p; struct xhci_endp_ctx *ep_ctx; int error = 0; int dir_in; int epid; dir_in = epctx & 0x80; epid = epctx & ~0x80; /* HW endpoint contexts are 0-15; convert to epid based on dir */ epid = (epid * 2) + (dir_in ? 1 : 0); assert(epid >= 1 && epid <= 31); dev = hci->hci_sc; sc = dev->xsc; /* check if device is ready; OS has to initialise it */ if (sc->rtsregs.erstba_p == NULL || (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 || dev->dev_ctx == NULL) return (0); p = XHCI_PORTREG_PTR(sc, hci->hci_port); /* raise event if link U3 (suspended) state */ if (XHCI_PS_PLS_GET(p->portsc) == 3) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME); if ((p->portsc & XHCI_PS_PLC) != 0) return (0); p->portsc |= XHCI_PS_PLC; pci_xhci_set_evtrb(&evtrb, hci->hci_port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 0); if (error != XHCI_TRB_ERROR_SUCCESS) goto done; } dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) { DPRINTF(("xhci device interrupt on disabled endpoint %d", epid)); return (0); } DPRINTF(("xhci device interrupt on endpoint %d", epid)); pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0); done: return (error); } static int pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid __unused, void *param __unused) { DPRINTF(("xhci device event port %d", hci->hci_port)); return (0); } /* * Each controller contains a "slot" node which contains a list of * child nodes each of which is a device. Each slot node's name * corresponds to a specific controller slot. These nodes * contain a "device" variable identifying the device model of the * USB device. For example: * * pci.0.1.0 * .device="xhci" * .slot * .1 * .device="tablet" */ static int pci_xhci_legacy_config(nvlist_t *nvl, const char *opts) { char node_name[16]; nvlist_t *slots_nvl, *slot_nvl; char *cp, *opt, *str, *tofree; int slot; if (opts == NULL) return (0); slots_nvl = create_relative_config_node(nvl, "slot"); slot = 1; tofree = str = strdup(opts); while ((opt = strsep(&str, ",")) != NULL) { /* device[=] */ cp = strchr(opt, '='); if (cp != NULL) { *cp = '\0'; cp++; } snprintf(node_name, sizeof(node_name), "%d", slot); slot++; slot_nvl = create_relative_config_node(slots_nvl, node_name); set_config_value_node(slot_nvl, "device", opt); /* * NB: Given that we split on commas above, the legacy * format only supports a single option. */ if (cp != NULL && *cp != '\0') pci_parse_legacy_config(slot_nvl, cp); } free(tofree); return (0); } static int pci_xhci_parse_devices(struct pci_xhci_softc *sc, nvlist_t *nvl) { struct pci_xhci_dev_emu *dev; struct usb_devemu *ue; const nvlist_t *slots_nvl, *slot_nvl; const char *name, *device; char *cp; void *devsc, *cookie; long slot; int type, usb3_port, usb2_port, i, ndevices; usb3_port = sc->usb3_port_start; usb2_port = sc->usb2_port_start; sc->devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); ndevices = 0; slots_nvl = find_relative_config_node(nvl, "slot"); if (slots_nvl == NULL) goto portsfinal; cookie = NULL; while ((name = nvlist_next(slots_nvl, &type, &cookie)) != NULL) { if (usb2_port == ((sc->usb2_port_start) + XHCI_MAX_DEVS/2) || usb3_port == ((sc->usb3_port_start) + XHCI_MAX_DEVS/2)) { WPRINTF(("pci_xhci max number of USB 2 or 3 " "devices reached, max %d", XHCI_MAX_DEVS/2)); goto bad; } if (type != NV_TYPE_NVLIST) { EPRINTLN( "pci_xhci: config variable '%s' under slot node", name); goto bad; } slot = strtol(name, &cp, 0); if (*cp != '\0' || slot <= 0 || slot > XHCI_MAX_SLOTS) { EPRINTLN("pci_xhci: invalid slot '%s'", name); goto bad; } if (XHCI_SLOTDEV_PTR(sc, slot) != NULL) { EPRINTLN("pci_xhci: duplicate slot '%s'", name); goto bad; } slot_nvl = nvlist_get_nvlist(slots_nvl, name); device = get_config_value_node(slot_nvl, "device"); if (device == NULL) { EPRINTLN( "pci_xhci: missing \"device\" value for slot '%s'", name); goto bad; } ue = usb_emu_finddev(device); if (ue == NULL) { EPRINTLN("pci_xhci: unknown device model \"%s\"", device); goto bad; } DPRINTF(("pci_xhci adding device %s", device)); dev = calloc(1, sizeof(struct pci_xhci_dev_emu)); dev->xsc = sc; dev->hci.hci_sc = dev; dev->hci.hci_intr = pci_xhci_dev_intr; dev->hci.hci_event = pci_xhci_dev_event; if (ue->ue_usbver == 2) { if (usb2_port == sc->usb2_port_start + XHCI_MAX_DEVS / 2) { WPRINTF(("pci_xhci max number of USB 2 devices " "reached, max %d", XHCI_MAX_DEVS / 2)); goto bad; } dev->hci.hci_port = usb2_port; usb2_port++; } else { if (usb3_port == sc->usb3_port_start + XHCI_MAX_DEVS / 2) { WPRINTF(("pci_xhci max number of USB 3 devices " "reached, max %d", XHCI_MAX_DEVS / 2)); goto bad; } dev->hci.hci_port = usb3_port; usb3_port++; } XHCI_DEVINST_PTR(sc, dev->hci.hci_port) = dev; dev->hci.hci_address = 0; devsc = ue->ue_init(&dev->hci, nvl); if (devsc == NULL) { goto bad; } dev->dev_ue = ue; dev->dev_sc = devsc; XHCI_SLOTDEV_PTR(sc, slot) = dev; ndevices++; } portsfinal: sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs)); if (ndevices > 0) { for (i = 1; i <= XHCI_MAX_DEVS; i++) { pci_xhci_init_port(sc, i); } } else { WPRINTF(("pci_xhci no USB devices configured")); } return (0); bad: for (i = 1; i <= XHCI_MAX_DEVS; i++) { free(XHCI_DEVINST_PTR(sc, i)); } free(sc->devices); free(sc->slots); return (-1); } static int pci_xhci_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_xhci_softc *sc; int error; if (xhci_in_use) { WPRINTF(("pci_xhci controller already defined")); return (-1); } xhci_in_use = 1; sc = calloc(1, sizeof(struct pci_xhci_softc)); pi->pi_arg = sc; sc->xsc_pi = pi; sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1; sc->usb3_port_start = 1; /* discover devices */ error = pci_xhci_parse_devices(sc, nvl); if (error < 0) goto done; else error = 0; sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) | XHCI_SET_HCIVERSION(0x0100); sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) | XHCI_SET_HCSP1_MAXINTR(1) | /* interrupters */ XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS); sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | XHCI_SET_HCSP2_IST(0x04); sc->hcsparams3 = 0; /* no latency */ sc->hccparams1 = XHCI_SET_HCCP1_AC64(1) | /* 64-bit addrs */ XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ XHCI_SET_HCCP1_SPC(1) | /* short packet */ XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | XHCI_SET_HCCP2_U3C(1); sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START + XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs)); /* dboff must be 32-bit aligned */ if (sc->dboff & 0x3) sc->dboff = (sc->dboff + 0x3) & ~0x3; /* rtsoff must be 32-bytes aligned */ sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32); if (sc->rtsoff & 0x1F) sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F; DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x", sc->dboff, sc->rtsoff)); sc->opregs.usbsts = XHCI_STS_HCH; sc->opregs.pgsz = XHCI_PAGESIZE_4K; pci_xhci_reset(sc); sc->regsend = sc->rtsoff + 0x20 + 32; /* only 1 intrpter */ /* * Set extended capabilities pointer to be after regsend; * value of xecp field is 32-bit offset. */ sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB); pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI); pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0); pci_emul_add_msicap(pi, 1); /* regsend + xecp registers */ pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32); DPRINTF(("pci_xhci pci_emu_alloc: %d", sc->regsend + 4*32)); pci_lintr_request(pi); pthread_mutex_init(&sc->mtx, NULL); done: if (error) { free(sc); } return (error); } #ifdef BHYVE_SNAPSHOT static void pci_xhci_map_devs_slots(struct pci_xhci_softc *sc, int maps[]) { int i, j; struct pci_xhci_dev_emu *dev, *slot; memset(maps, 0, sizeof(maps[0]) * XHCI_MAX_SLOTS); for (i = 1; i <= XHCI_MAX_SLOTS; i++) { for (j = 1; j <= XHCI_MAX_DEVS; j++) { slot = XHCI_SLOTDEV_PTR(sc, i); dev = XHCI_DEVINST_PTR(sc, j); if (slot == dev) maps[i] = j; } } } static int -pci_xhci_snapshot_ep(struct pci_xhci_softc *sc __unused, - struct pci_xhci_dev_emu *dev, int idx, struct vm_snapshot_meta *meta) +pci_xhci_snapshot_ep(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, + int idx, struct vm_snapshot_meta *meta) { int k; int ret; struct usb_data_xfer *xfer; struct usb_data_xfer_block *xfer_block; /* some sanity checks */ if (meta->op == VM_SNAPSHOT_SAVE) xfer = dev->eps[idx].ep_xfer; SNAPSHOT_VAR_OR_LEAVE(xfer, meta, ret, done); if (xfer == NULL) { ret = 0; goto done; } if (meta->op == VM_SNAPSHOT_RESTORE) { pci_xhci_init_ep(dev, idx); xfer = dev->eps[idx].ep_xfer; } /* save / restore proper */ for (k = 0; k < USB_MAX_XFER_BLOCKS; k++) { xfer_block = &xfer->data[k]; - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(xfer_block->buf, - XHCI_GADDR_SIZE(xfer_block->buf), true, meta, ret, - done); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->xsc_pi->pi_vmctx, + xfer_block->buf, XHCI_GADDR_SIZE(xfer_block->buf), true, + meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->blen, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->bdone, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->processed, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->hci_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->ccs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->streamid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->trbnext, meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(xfer->ureq, meta, ret, done); if (xfer->ureq) { /* xfer->ureq is not allocated at restore time */ if (meta->op == VM_SNAPSHOT_RESTORE) xfer->ureq = malloc(sizeof(struct usb_device_request)); SNAPSHOT_BUF_OR_LEAVE(xfer->ureq, sizeof(struct usb_device_request), meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(xfer->ndata, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer->head, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer->tail, meta, ret, done); done: return (ret); } static int pci_xhci_snapshot(struct vm_snapshot_meta *meta) { int i, j; int ret; int restore_idx; struct pci_devinst *pi; struct pci_xhci_softc *sc; struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; char dname[SNAP_DEV_NAME_LEN]; int maps[XHCI_MAX_SLOTS + 1]; pi = meta->dev_data; sc = pi->pi_arg; SNAPSHOT_VAR_OR_LEAVE(sc->caplength, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams1, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams3, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hccparams1, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->dboff, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsoff, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hccparams2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->regsend, meta, ret, done); /* opregs */ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbcmd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbsts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.pgsz, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dnctrl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.crcr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dcbaap, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.config, meta, ret, done); /* opregs.cr_p */ - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.cr_p, + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, sc->opregs.cr_p, XHCI_GADDR_SIZE(sc->opregs.cr_p), true, meta, ret, done); /* opregs.dcbaa_p */ - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.dcbaa_p, + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, sc->opregs.dcbaa_p, XHCI_GADDR_SIZE(sc->opregs.dcbaa_p), true, meta, ret, done); /* rtsregs */ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.mfindex, meta, ret, done); /* rtsregs.intrreg */ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.iman, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.imod, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstsz, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.rsvd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstba, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erdp, meta, ret, done); /* rtsregs.erstba_p */ - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erstba_p, + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, sc->rtsregs.erstba_p, XHCI_GADDR_SIZE(sc->rtsregs.erstba_p), true, meta, ret, done); /* rtsregs.erst_p */ - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erst_p, + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, sc->rtsregs.erst_p, XHCI_GADDR_SIZE(sc->rtsregs.erst_p), true, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_deq_seg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_idx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_seg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_events_cnt, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.event_pcs, meta, ret, done); /* sanity checking */ for (i = 1; i <= XHCI_MAX_DEVS; i++) { dev = XHCI_DEVINST_PTR(sc, i); if (dev == NULL) continue; if (meta->op == VM_SNAPSHOT_SAVE) restore_idx = i; SNAPSHOT_VAR_OR_LEAVE(restore_idx, meta, ret, done); /* check if the restored device (when restoring) is sane */ if (restore_idx != i) { fprintf(stderr, "%s: idx not matching: actual: %d, " "expected: %d\r\n", __func__, restore_idx, i); ret = EINVAL; goto done; } if (meta->op == VM_SNAPSHOT_SAVE) { memset(dname, 0, sizeof(dname)); strncpy(dname, dev->dev_ue->ue_emu, sizeof(dname) - 1); } SNAPSHOT_BUF_OR_LEAVE(dname, sizeof(dname), meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) { dname[sizeof(dname) - 1] = '\0'; if (strcmp(dev->dev_ue->ue_emu, dname)) { fprintf(stderr, "%s: device names mismatch: " "actual: %s, expected: %s\r\n", __func__, dname, dev->dev_ue->ue_emu); ret = EINVAL; goto done; } } } /* portregs */ for (i = 1; i <= XHCI_MAX_DEVS; i++) { port = XHCI_PORTREG_PTR(sc, i); dev = XHCI_DEVINST_PTR(sc, i); if (dev == NULL) continue; SNAPSHOT_VAR_OR_LEAVE(port->portsc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->portpmsc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->portli, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->porthlpmc, meta, ret, done); } /* slots */ if (meta->op == VM_SNAPSHOT_SAVE) pci_xhci_map_devs_slots(sc, maps); for (i = 1; i <= XHCI_MAX_SLOTS; i++) { SNAPSHOT_VAR_OR_LEAVE(maps[i], meta, ret, done); if (meta->op == VM_SNAPSHOT_SAVE) { dev = XHCI_SLOTDEV_PTR(sc, i); } else if (meta->op == VM_SNAPSHOT_RESTORE) { if (maps[i] != 0) dev = XHCI_DEVINST_PTR(sc, maps[i]); else dev = NULL; XHCI_SLOTDEV_PTR(sc, i) = dev; } else { /* error */ ret = EINVAL; goto done; } if (dev == NULL) continue; - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(dev->dev_ctx, + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, dev->dev_ctx, XHCI_GADDR_SIZE(dev->dev_ctx), true, meta, ret, done); if (dev->dev_ctx != NULL) { for (j = 1; j < XHCI_MAX_ENDPOINTS; j++) { ret = pci_xhci_snapshot_ep(sc, dev, j, meta); if (ret != 0) goto done; } } SNAPSHOT_VAR_OR_LEAVE(dev->dev_slotstate, meta, ret, done); /* devices[i]->dev_sc */ dev->dev_ue->ue_snapshot(dev->dev_sc, meta); /* devices[i]->hci */ SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_address, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_port, meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(sc->usb2_port_start, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->usb3_port_start, meta, ret, done); done: return (ret); } #endif static const struct pci_devemu pci_de_xhci = { .pe_emu = "xhci", .pe_init = pci_xhci_init, .pe_legacy_config = pci_xhci_legacy_config, .pe_barwrite = pci_xhci_write, .pe_barread = pci_xhci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_xhci_snapshot, #endif }; PCI_EMUL_SET(pci_de_xhci); diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c index 72b63c506c41..866fc265b8aa 100644 --- a/usr.sbin/bhyve/snapshot.c +++ b/usr.sbin/bhyve/snapshot.c @@ -1,1708 +1,1702 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Flavius Anton * Copyright (c) 2016 Mihai Tiganus * Copyright (c) 2016-2019 Mihai Carabas * Copyright (c) 2017-2019 Darius Mihai * Copyright (c) 2017-2019 Elena Mihailescu * Copyright (c) 2018-2019 Sergiu Weisz * All rights reserved. * The bhyve-snapshot feature was developed under sponsorships * from Matthew Grooms. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "debug.h" #include "inout.h" #include "ipc.h" #include "fwctl.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #include "snapshot.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include #include struct spinner_info { const size_t *crtval; const size_t maxval; const size_t total; }; extern int guest_ncpus; static struct winsize winsize; static sig_t old_winch_handler; #define KB (1024UL) #define MB (1024UL * KB) #define GB (1024UL * MB) #define SNAPSHOT_CHUNK (4 * MB) #define PROG_BUF_SZ (8192) #define SNAPSHOT_BUFFER_SIZE (20 * MB) #define JSON_STRUCT_ARR_KEY "structs" #define JSON_DEV_ARR_KEY "devices" #define JSON_BASIC_METADATA_KEY "basic metadata" #define JSON_SNAPSHOT_REQ_KEY "snapshot_req" #define JSON_SIZE_KEY "size" #define JSON_FILE_OFFSET_KEY "file_offset" #define JSON_NCPUS_KEY "ncpus" #define JSON_VMNAME_KEY "vmname" #define JSON_MEMSIZE_KEY "memsize" #define JSON_MEMFLAGS_KEY "memflags" #define min(a,b) \ ({ \ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a < _b ? _a : _b; \ }) static const struct vm_snapshot_dev_info snapshot_devs[] = { { "atkbdc", atkbdc_snapshot, NULL, NULL }, { "virtio-net", pci_snapshot, pci_pause, pci_resume }, { "virtio-blk", pci_snapshot, pci_pause, pci_resume }, { "virtio-rnd", pci_snapshot, NULL, NULL }, { "lpc", pci_snapshot, NULL, NULL }, { "fbuf", pci_snapshot, NULL, NULL }, { "xhci", pci_snapshot, NULL, NULL }, { "e1000", pci_snapshot, NULL, NULL }, { "ahci", pci_snapshot, pci_pause, pci_resume }, { "ahci-hd", pci_snapshot, pci_pause, pci_resume }, { "ahci-cd", pci_snapshot, pci_pause, pci_resume }, }; static const struct vm_snapshot_kern_info snapshot_kern_structs[] = { { "vhpet", STRUCT_VHPET }, { "vm", STRUCT_VM }, { "vioapic", STRUCT_VIOAPIC }, { "vlapic", STRUCT_VLAPIC }, { "vmcx", STRUCT_VMCX }, { "vatpit", STRUCT_VATPIT }, { "vatpic", STRUCT_VATPIC }, { "vpmtmr", STRUCT_VPMTMR }, { "vrtc", STRUCT_VRTC }, }; static cpuset_t vcpus_active, vcpus_suspended; static pthread_mutex_t vcpu_lock; static pthread_cond_t vcpus_idle, vcpus_can_run; static bool checkpoint_active; /* * TODO: Harden this function and all of its callers since 'base_str' is a user * provided string. */ static char * strcat_extension(const char *base_str, const char *ext) { char *res; size_t base_len, ext_len; base_len = strnlen(base_str, NAME_MAX); ext_len = strnlen(ext, NAME_MAX); if (base_len + ext_len > NAME_MAX) { fprintf(stderr, "Filename exceeds maximum length.\n"); return (NULL); } res = malloc(base_len + ext_len + 1); if (res == NULL) { perror("Failed to allocate memory."); return (NULL); } memcpy(res, base_str, base_len); memcpy(res + base_len, ext, ext_len); res[base_len + ext_len] = 0; return (res); } void destroy_restore_state(struct restore_state *rstate) { if (rstate == NULL) { fprintf(stderr, "Attempting to destroy NULL restore struct.\n"); return; } if (rstate->kdata_map != MAP_FAILED) munmap(rstate->kdata_map, rstate->kdata_len); if (rstate->kdata_fd > 0) close(rstate->kdata_fd); if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); if (rstate->meta_root_obj != NULL) ucl_object_unref(rstate->meta_root_obj); if (rstate->meta_parser != NULL) ucl_parser_free(rstate->meta_parser); } static int load_vmmem_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->vmmem_fd = open(filename, O_RDONLY); if (rstate->vmmem_fd < 0) { perror("Failed to open restore file"); return (-1); } err = fstat(rstate->vmmem_fd, &sb); if (err < 0) { perror("Failed to stat restore file"); goto err_load_vmmem; } if (sb.st_size == 0) { fprintf(stderr, "Restore file is empty.\n"); goto err_load_vmmem; } rstate->vmmem_len = sb.st_size; return (0); err_load_vmmem: if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); return (-1); } static int load_kdata_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->kdata_fd = open(filename, O_RDONLY); if (rstate->kdata_fd < 0) { perror("Failed to open kernel data file"); return (-1); } err = fstat(rstate->kdata_fd, &sb); if (err < 0) { perror("Failed to stat kernel data file"); goto err_load_kdata; } if (sb.st_size == 0) { fprintf(stderr, "Kernel data file is empty.\n"); goto err_load_kdata; } rstate->kdata_len = sb.st_size; rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ, MAP_SHARED, rstate->kdata_fd, 0); if (rstate->kdata_map == MAP_FAILED) { perror("Failed to map restore file"); goto err_load_kdata; } return (0); err_load_kdata: if (rstate->kdata_fd > 0) close(rstate->kdata_fd); return (-1); } static int load_metadata_file(const char *filename, struct restore_state *rstate) { ucl_object_t *obj; struct ucl_parser *parser; int err; parser = ucl_parser_new(UCL_PARSER_DEFAULT); if (parser == NULL) { fprintf(stderr, "Failed to initialize UCL parser.\n"); err = -1; goto err_load_metadata; } err = ucl_parser_add_file(parser, filename); if (err == 0) { fprintf(stderr, "Failed to parse metadata file: '%s'\n", filename); err = -1; goto err_load_metadata; } obj = ucl_parser_get_object(parser); if (obj == NULL) { fprintf(stderr, "Failed to parse object.\n"); err = -1; goto err_load_metadata; } rstate->meta_parser = parser; rstate->meta_root_obj = (ucl_object_t *)obj; return (0); err_load_metadata: if (parser != NULL) ucl_parser_free(parser); return (err); } int load_restore_file(const char *filename, struct restore_state *rstate) { int err = 0; char *kdata_filename = NULL, *meta_filename = NULL; assert(filename != NULL); assert(rstate != NULL); memset(rstate, 0, sizeof(*rstate)); rstate->kdata_map = MAP_FAILED; err = load_vmmem_file(filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest RAM file.\n"); goto err_restore; } kdata_filename = strcat_extension(filename, ".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); goto err_restore; } err = load_kdata_file(kdata_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest kernel data file.\n"); goto err_restore; } meta_filename = strcat_extension(filename, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct kernel metadata filename.\n"); goto err_restore; } err = load_metadata_file(meta_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest metadata file.\n"); goto err_restore; } return (0); err_restore: destroy_restore_state(rstate); if (kdata_filename != NULL) free(kdata_filename); if (meta_filename != NULL) free(meta_filename); return (-1); } #define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_toint_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to int.", key); \ return (ret); \ } \ } while(0) #define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_tostring_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to string.", key); \ return (ret); \ } \ } while(0) static void * lookup_struct(enum snapshot_req struct_id, struct restore_state *rstate, size_t *struct_size) { const ucl_object_t *structs = NULL, *obj = NULL; ucl_object_iter_t it = NULL; int64_t snapshot_req, size, file_offset; structs = ucl_object_lookup(rstate->meta_root_obj, JSON_STRUCT_ARR_KEY); if (structs == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_STRUCT_ARR_KEY); return (NULL); } if (ucl_object_type(structs) != UCL_ARRAY) { fprintf(stderr, "Object '%s' is not an array.\n", JSON_STRUCT_ARR_KEY); return (NULL); } while ((obj = ucl_object_iterate(structs, &it, true)) != NULL) { snapshot_req = -1; JSON_GET_INT_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, &snapshot_req, NULL); assert(snapshot_req >= 0); if ((enum snapshot_req) snapshot_req == struct_id) { JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, &size, NULL); assert(size >= 0); JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, &file_offset, NULL); assert(file_offset >= 0); assert((uint64_t)file_offset + size <= rstate->kdata_len); *struct_size = (size_t)size; return ((uint8_t *)rstate->kdata_map + file_offset); } } return (NULL); } static void * lookup_check_dev(const char *dev_name, struct restore_state *rstate, const ucl_object_t *obj, size_t *data_size) { const char *snapshot_req; int64_t size, file_offset; snapshot_req = NULL; JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, &snapshot_req, NULL); assert(snapshot_req != NULL); if (!strcmp(snapshot_req, dev_name)) { JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, &size, NULL); assert(size >= 0); JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, &file_offset, NULL); assert(file_offset >= 0); assert((uint64_t)file_offset + size <= rstate->kdata_len); *data_size = (size_t)size; return ((uint8_t *)rstate->kdata_map + file_offset); } return (NULL); } static void* lookup_dev(const char *dev_name, struct restore_state *rstate, size_t *data_size) { const ucl_object_t *devs = NULL, *obj = NULL; ucl_object_iter_t it = NULL; void *ret; devs = ucl_object_lookup(rstate->meta_root_obj, JSON_DEV_ARR_KEY); if (devs == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_DEV_ARR_KEY); return (NULL); } if (ucl_object_type(devs) != UCL_ARRAY) { fprintf(stderr, "Object '%s' is not an array.\n", JSON_DEV_ARR_KEY); return (NULL); } while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) { ret = lookup_check_dev(dev_name, rstate, obj, data_size); if (ret != NULL) return (ret); } return (NULL); } static const ucl_object_t * lookup_basic_metadata_object(struct restore_state *rstate) { const ucl_object_t *basic_meta_obj = NULL; basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj, JSON_BASIC_METADATA_KEY); if (basic_meta_obj == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } if (ucl_object_type(basic_meta_obj) != UCL_OBJECT) { fprintf(stderr, "Object '%s' is not a JSON object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } return (basic_meta_obj); } const char * lookup_vmname(struct restore_state *rstate) { const char *vmname; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (NULL); JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL); return (vmname); } int lookup_memflags(struct restore_state *rstate) { int64_t memflags; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0); return ((int)memflags); } size_t lookup_memsize(struct restore_state *rstate) { int64_t memsize; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0); if (memsize < 0) memsize = 0; return ((size_t)memsize); } int lookup_guest_ncpus(struct restore_state *rstate) { int64_t ncpus; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0); return ((int)ncpus); } static void winch_handler(int signal __unused) { #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ } static int print_progress(size_t crtval, const size_t maxval) { size_t rc; double crtval_gb, maxval_gb; size_t i, win_width, prog_start, prog_done, prog_end; int mval_len; static char prog_buf[PROG_BUF_SZ]; static const size_t len = sizeof(prog_buf); static size_t div; static const char *div_str; static char wip_bar[] = { '/', '-', '\\', '|' }; static int wip_idx = 0; if (maxval == 0) { printf("[0B / 0B]\r\n"); return (0); } if (crtval > maxval) crtval = maxval; if (maxval > 10 * GB) { div = GB; div_str = "GiB"; } else if (maxval > 10 * MB) { div = MB; div_str = "MiB"; } else { div = KB; div_str = "KiB"; } crtval_gb = (double) crtval / div; maxval_gb = (double) maxval / div; rc = snprintf(prog_buf, len, "%.03lf", maxval_gb); if (rc == len) { fprintf(stderr, "Maxval too big\n"); return (-1); } mval_len = rc; rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |", mval_len, crtval_gb, div_str, maxval_gb, div_str); if (rc == len) { fprintf(stderr, "Buffer too small to print progress\n"); return (-1); } win_width = min(winsize.ws_col, len); prog_start = rc; if (prog_start < (win_width - 2)) { prog_end = win_width - prog_start - 2; prog_done = prog_end * (crtval_gb / maxval_gb); for (i = prog_start; i < prog_start + prog_done; i++) prog_buf[i] = '#'; if (crtval != maxval) { prog_buf[i] = wip_bar[wip_idx]; wip_idx = (wip_idx + 1) % sizeof(wip_bar); i++; } else { prog_buf[i++] = '#'; } for (; i < win_width - 2; i++) prog_buf[i] = '_'; prog_buf[win_width - 2] = '|'; } prog_buf[win_width - 1] = '\0'; write(STDOUT_FILENO, prog_buf, win_width); return (0); } static void * snapshot_spinner_cb(void *arg) { int rc; size_t crtval, maxval, total; struct spinner_info *si; struct timespec ts; si = arg; if (si == NULL) pthread_exit(NULL); ts.tv_sec = 0; ts.tv_nsec = 50 * 1000 * 1000; /* 50 ms sleep time */ do { crtval = *si->crtval; maxval = si->maxval; total = si->total; rc = print_progress(crtval, total); if (rc < 0) { fprintf(stderr, "Failed to parse progress\n"); break; } nanosleep(&ts, NULL); } while (crtval < maxval); pthread_exit(NULL); return NULL; } static int vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src, const size_t len, const size_t totalmem, const bool op_wr) { int rc; size_t part_done, todo, rem; ssize_t done; bool show_progress; pthread_t spinner_th; struct spinner_info *si; if (lseek(snapfd, foff, SEEK_SET) < 0) { perror("Failed to change file offset"); return (-1); } show_progress = false; if (isatty(STDIN_FILENO) && (winsize.ws_col != 0)) show_progress = true; part_done = foff; rem = len; if (show_progress) { si = &(struct spinner_info) { .crtval = &part_done, .maxval = foff + len, .total = totalmem }; rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, si); if (rc) { perror("Unable to create spinner thread"); show_progress = false; } } while (rem > 0) { if (show_progress) todo = min(SNAPSHOT_CHUNK, rem); else todo = rem; if (op_wr) done = write(snapfd, src, todo); else done = read(snapfd, src, todo); if (done < 0) { perror("Failed to write in file"); return (-1); } src = (uint8_t *)src + done; part_done += done; rem -= done; } if (show_progress) { rc = pthread_join(spinner_th, NULL); if (rc) perror("Unable to end spinner thread"); } return (0); } static size_t vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr) { int ret; size_t lowmem, highmem, totalmem; char *baseaddr; ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem); if (ret) { fprintf(stderr, "%s: unable to retrieve guest memory size\r\n", __func__); return (0); } totalmem = lowmem + highmem; if ((op_wr == false) && (totalmem != memsz)) { fprintf(stderr, "%s: mem size mismatch: %ld vs %ld\r\n", __func__, totalmem, memsz); return (0); } winsize.ws_col = 80; #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ old_winch_handler = signal(SIGWINCH, winch_handler); ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem, totalmem, op_wr); if (ret) { fprintf(stderr, "%s: Could not %s lowmem\r\n", __func__, op_wr ? "write" : "read"); totalmem = 0; goto done; } if (highmem == 0) goto done; ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4*GB, highmem, totalmem, op_wr); if (ret) { fprintf(stderr, "%s: Could not %s highmem\r\n", __func__, op_wr ? "write" : "read"); totalmem = 0; goto done; } done: printf("\r\n"); signal(SIGWINCH, old_winch_handler); return (totalmem); } int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate) { size_t restored; restored = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len, false); if (restored != rstate->vmmem_len) return (-1); return (0); } static int vm_restore_kern_struct(struct vmctx *ctx, struct restore_state *rstate, const struct vm_snapshot_kern_info *info) { void *struct_ptr; size_t struct_size; int ret; struct vm_snapshot_meta *meta; struct_ptr = lookup_struct(info->req, rstate, &struct_size); if (struct_ptr == NULL) { fprintf(stderr, "%s: Failed to lookup struct %s\r\n", __func__, info->struct_name); ret = -1; goto done; } if (struct_size == 0) { fprintf(stderr, "%s: Kernel struct size was 0 for: %s\r\n", __func__, info->struct_name); ret = -1; goto done; } meta = &(struct vm_snapshot_meta) { - .ctx = ctx, .dev_name = info->struct_name, .dev_req = info->req, .buffer.buf_start = struct_ptr, .buffer.buf_size = struct_size, .buffer.buf = struct_ptr, .buffer.buf_rem = struct_size, .op = VM_SNAPSHOT_RESTORE, }; - ret = vm_snapshot_req(meta); + ret = vm_snapshot_req(ctx, meta); if (ret != 0) { fprintf(stderr, "%s: Failed to restore struct: %s\r\n", __func__, info->struct_name); goto done; } done: return (ret); } int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate) { size_t i; int ret; for (i = 0; i < nitems(snapshot_kern_structs); i++) { ret = vm_restore_kern_struct(ctx, rstate, &snapshot_kern_structs[i]); if (ret != 0) return (ret); } return (0); } static int -vm_restore_user_dev(struct vmctx *ctx, struct restore_state *rstate, +vm_restore_user_dev(struct restore_state *rstate, const struct vm_snapshot_dev_info *info) { void *dev_ptr; size_t dev_size; int ret; struct vm_snapshot_meta *meta; dev_ptr = lookup_dev(info->dev_name, rstate, &dev_size); if (dev_ptr == NULL) { fprintf(stderr, "Failed to lookup dev: %s\r\n", info->dev_name); fprintf(stderr, "Continuing the restore/migration process\r\n"); return (0); } if (dev_size == 0) { fprintf(stderr, "%s: Device size is 0. " "Assuming %s is not used\r\n", __func__, info->dev_name); return (0); } meta = &(struct vm_snapshot_meta) { - .ctx = ctx, .dev_name = info->dev_name, .buffer.buf_start = dev_ptr, .buffer.buf_size = dev_size, .buffer.buf = dev_ptr, .buffer.buf_rem = dev_size, .op = VM_SNAPSHOT_RESTORE, }; ret = (*info->snapshot_cb)(meta); if (ret != 0) { fprintf(stderr, "Failed to restore dev: %s\r\n", info->dev_name); return (-1); } return (0); } int -vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate) +vm_restore_user_devs(struct restore_state *rstate) { size_t i; int ret; for (i = 0; i < nitems(snapshot_devs); i++) { - ret = vm_restore_user_dev(ctx, rstate, &snapshot_devs[i]); + ret = vm_restore_user_dev(rstate, &snapshot_devs[i]); if (ret != 0) return (ret); } return 0; } int vm_pause_user_devs(void) { const struct vm_snapshot_dev_info *info; size_t i; int ret; for (i = 0; i < nitems(snapshot_devs); i++) { info = &snapshot_devs[i]; if (info->pause_cb == NULL) continue; ret = info->pause_cb(info->dev_name); if (ret != 0) return (ret); } return (0); } int vm_resume_user_devs(void) { const struct vm_snapshot_dev_info *info; size_t i; int ret; for (i = 0; i < nitems(snapshot_devs); i++) { info = &snapshot_devs[i]; if (info->resume_cb == NULL) continue; ret = info->resume_cb(info->dev_name); if (ret != 0) return (ret); } return (0); } static int -vm_snapshot_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key, - struct vm_snapshot_meta *meta, off_t *offset) +vm_snapshot_kern_struct(struct vmctx *ctx, int data_fd, xo_handle_t *xop, + const char *array_key, struct vm_snapshot_meta *meta, off_t *offset) { int ret; size_t data_size; ssize_t write_cnt; - ret = vm_snapshot_req(meta); + ret = vm_snapshot_req(ctx, meta); if (ret != 0) { fprintf(stderr, "%s: Failed to snapshot struct %s\r\n", __func__, meta->dev_name); ret = -1; goto done; } data_size = vm_get_snapshot_size(meta); /* XXX-MJ no handling for short writes. */ write_cnt = write(data_fd, meta->buffer.buf_start, data_size); if (write_cnt < 0 || (size_t)write_cnt != data_size) { perror("Failed to write all snapshotted data."); ret = -1; goto done; } /* Write metadata. */ xo_open_instance_h(xop, array_key); xo_emit_h(xop, "{:debug_name/%s}\n", meta->dev_name); xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%d}\n", meta->dev_req); xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); xo_close_instance_h(xop, JSON_STRUCT_ARR_KEY); *offset += data_size; done: return (ret); } static int vm_snapshot_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) { int ret, error; size_t buf_size, i, offset; char *buffer; struct vm_snapshot_meta *meta; error = 0; offset = 0; buf_size = SNAPSHOT_BUFFER_SIZE; buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char)); if (buffer == NULL) { error = ENOMEM; perror("Failed to allocate memory for snapshot buffer"); goto err_vm_snapshot_kern_data; } meta = &(struct vm_snapshot_meta) { - .ctx = ctx, - .buffer.buf_start = buffer, .buffer.buf_size = buf_size, .op = VM_SNAPSHOT_SAVE, }; xo_open_list_h(xop, JSON_STRUCT_ARR_KEY); for (i = 0; i < nitems(snapshot_kern_structs); i++) { meta->dev_name = snapshot_kern_structs[i].struct_name; meta->dev_req = snapshot_kern_structs[i].req; memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); meta->buffer.buf = meta->buffer.buf_start; meta->buffer.buf_rem = meta->buffer.buf_size; - ret = vm_snapshot_kern_struct(data_fd, xop, JSON_DEV_ARR_KEY, - meta, &offset); + ret = vm_snapshot_kern_struct(ctx, data_fd, xop, + JSON_DEV_ARR_KEY, meta, &offset); if (ret != 0) { error = -1; goto err_vm_snapshot_kern_data; } } xo_close_list_h(xop, JSON_STRUCT_ARR_KEY); err_vm_snapshot_kern_data: if (buffer != NULL) free(buffer); return (error); } static int vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz) { xo_open_container_h(xop, JSON_BASIC_METADATA_KEY); xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus); xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vm_get_name(ctx)); xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz); xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", vm_get_memflags(ctx)); xo_close_container_h(xop, JSON_BASIC_METADATA_KEY); return (0); } static int vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key, struct vm_snapshot_meta *meta, off_t *offset) { ssize_t ret; size_t data_size; data_size = vm_get_snapshot_size(meta); /* XXX-MJ no handling for short writes. */ ret = write(data_fd, meta->buffer.buf_start, data_size); if (ret < 0 || (size_t)ret != data_size) { perror("Failed to write all snapshotted data."); return (-1); } /* Write metadata. */ xo_open_instance_h(xop, array_key); xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name); xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); xo_close_instance_h(xop, array_key); *offset += data_size; return (0); } static int vm_snapshot_user_dev(const struct vm_snapshot_dev_info *info, int data_fd, xo_handle_t *xop, struct vm_snapshot_meta *meta, off_t *offset) { int ret; ret = (*info->snapshot_cb)(meta); if (ret != 0) { fprintf(stderr, "Failed to snapshot %s; ret=%d\r\n", meta->dev_name, ret); return (ret); } ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta, offset); if (ret != 0) return (ret); return (0); } static int -vm_snapshot_user_devs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) +vm_snapshot_user_devs(int data_fd, xo_handle_t *xop) { int ret; off_t offset; void *buffer; size_t buf_size, i; struct vm_snapshot_meta *meta; buf_size = SNAPSHOT_BUFFER_SIZE; offset = lseek(data_fd, 0, SEEK_CUR); if (offset < 0) { perror("Failed to get data file current offset."); return (-1); } buffer = malloc(buf_size); if (buffer == NULL) { perror("Failed to allocate memory for snapshot buffer"); ret = ENOSPC; goto snapshot_err; } meta = &(struct vm_snapshot_meta) { - .ctx = ctx, - .buffer.buf_start = buffer, .buffer.buf_size = buf_size, .op = VM_SNAPSHOT_SAVE, }; xo_open_list_h(xop, JSON_DEV_ARR_KEY); /* Restore other devices that support this feature */ for (i = 0; i < nitems(snapshot_devs); i++) { meta->dev_name = snapshot_devs[i].dev_name; memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); meta->buffer.buf = meta->buffer.buf_start; meta->buffer.buf_rem = meta->buffer.buf_size; ret = vm_snapshot_user_dev(&snapshot_devs[i], data_fd, xop, meta, &offset); if (ret != 0) goto snapshot_err; } xo_close_list_h(xop, JSON_DEV_ARR_KEY); snapshot_err: if (buffer != NULL) free(buffer); return (ret); } void checkpoint_cpu_add(int vcpu) { pthread_mutex_lock(&vcpu_lock); CPU_SET(vcpu, &vcpus_active); if (checkpoint_active) { CPU_SET(vcpu, &vcpus_suspended); while (checkpoint_active) pthread_cond_wait(&vcpus_can_run, &vcpu_lock); CPU_CLR(vcpu, &vcpus_suspended); } pthread_mutex_unlock(&vcpu_lock); } /* * When a vCPU is suspended for any reason, it calls * checkpoint_cpu_suspend(). This records that the vCPU is idle. * Before returning from suspension, checkpoint_cpu_resume() is * called. In suspend we note that the vCPU is idle. In resume we * pause the vCPU thread until the checkpoint is complete. The reason * for the two-step process is that vCPUs might already be stopped in * the debug server when a checkpoint is requested. This approach * allows us to account for and handle those vCPUs. */ void checkpoint_cpu_suspend(int vcpu) { pthread_mutex_lock(&vcpu_lock); CPU_SET(vcpu, &vcpus_suspended); if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0) pthread_cond_signal(&vcpus_idle); pthread_mutex_unlock(&vcpu_lock); } void checkpoint_cpu_resume(int vcpu) { pthread_mutex_lock(&vcpu_lock); while (checkpoint_active) pthread_cond_wait(&vcpus_can_run, &vcpu_lock); CPU_CLR(vcpu, &vcpus_suspended); pthread_mutex_unlock(&vcpu_lock); } static void vm_vcpu_pause(struct vmctx *ctx) { pthread_mutex_lock(&vcpu_lock); checkpoint_active = true; vm_suspend_all_cpus(ctx); while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0) pthread_cond_wait(&vcpus_idle, &vcpu_lock); pthread_mutex_unlock(&vcpu_lock); } static void vm_vcpu_resume(struct vmctx *ctx) { pthread_mutex_lock(&vcpu_lock); checkpoint_active = false; pthread_mutex_unlock(&vcpu_lock); vm_resume_all_cpus(ctx); pthread_cond_broadcast(&vcpus_can_run); } static int vm_checkpoint(struct vmctx *ctx, const char *checkpoint_file, bool stop_vm) { int fd_checkpoint = 0, kdata_fd = 0; int ret = 0; int error = 0; size_t memsz; xo_handle_t *xop = NULL; char *meta_filename = NULL; char *kdata_filename = NULL; FILE *meta_file = NULL; kdata_filename = strcat_extension(checkpoint_file, ".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); return (-1); } kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); if (kdata_fd < 0) { perror("Failed to open kernel data snapshot file."); error = -1; goto done; } fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); if (fd_checkpoint < 0) { perror("Failed to create checkpoint file"); error = -1; goto done; } meta_filename = strcat_extension(checkpoint_file, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct vm metadata filename.\n"); goto done; } meta_file = fopen(meta_filename, "w"); if (meta_file == NULL) { perror("Failed to open vm metadata snapshot file."); goto done; } xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY); if (xop == NULL) { perror("Failed to get libxo handle on metadata file."); goto done; } vm_vcpu_pause(ctx); ret = vm_pause_user_devs(); if (ret != 0) { fprintf(stderr, "Could not pause devices\r\n"); error = ret; goto done; } memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true); if (memsz == 0) { perror("Could not write guest memory to file"); error = -1; goto done; } ret = vm_snapshot_basic_metadata(ctx, xop, memsz); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm basic metadata.\n"); error = -1; goto done; } ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm kernel data.\n"); error = -1; goto done; } - ret = vm_snapshot_user_devs(ctx, kdata_fd, xop); + ret = vm_snapshot_user_devs(kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot device state.\n"); error = -1; goto done; } xo_finish_h(xop); if (stop_vm) { vm_destroy(ctx); exit(0); } done: ret = vm_resume_user_devs(); if (ret != 0) fprintf(stderr, "Could not resume devices\r\n"); vm_vcpu_resume(ctx); if (fd_checkpoint > 0) close(fd_checkpoint); if (meta_filename != NULL) free(meta_filename); if (kdata_filename != NULL) free(kdata_filename); if (xop != NULL) xo_destroy(xop); if (meta_file != NULL) fclose(meta_file); if (kdata_fd > 0) close(kdata_fd); return (error); } static int handle_message(struct vmctx *ctx, nvlist_t *nvl) { const char *cmd; struct ipc_command **ipc_cmd; if (!nvlist_exists_string(nvl, "cmd")) return (EINVAL); cmd = nvlist_get_string(nvl, "cmd"); IPC_COMMAND_FOREACH(ipc_cmd, ipc_cmd_set) { if (strcmp(cmd, (*ipc_cmd)->name) == 0) return ((*ipc_cmd)->handler(ctx, nvl)); } return (EOPNOTSUPP); } /* * Listen for commands from bhyvectl */ void * checkpoint_thread(void *param) { int fd; struct checkpoint_thread_info *thread_info; nvlist_t *nvl; pthread_set_name_np(pthread_self(), "checkpoint thread"); thread_info = (struct checkpoint_thread_info *)param; while ((fd = accept(thread_info->socket_fd, NULL, NULL)) != -1) { nvl = nvlist_recv(fd, 0); if (nvl != NULL) handle_message(thread_info->ctx, nvl); else EPRINTLN("nvlist_recv() failed: %s", strerror(errno)); close(fd); nvlist_destroy(nvl); } return (NULL); } static int vm_do_checkpoint(struct vmctx *ctx, const nvlist_t *nvl) { int error; if (!nvlist_exists_string(nvl, "filename") || !nvlist_exists_bool(nvl, "suspend")) error = EINVAL; else error = vm_checkpoint(ctx, nvlist_get_string(nvl, "filename"), nvlist_get_bool(nvl, "suspend")); return (error); } IPC_COMMAND(ipc_cmd_set, checkpoint, vm_do_checkpoint); void init_snapshot(void) { int err; err = pthread_mutex_init(&vcpu_lock, NULL); if (err != 0) errc(1, err, "checkpoint mutex init"); err = pthread_cond_init(&vcpus_idle, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_idle)"); err = pthread_cond_init(&vcpus_can_run, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_can_run)"); } /* * Create the listening socket for IPC with bhyvectl */ int init_checkpoint_thread(struct vmctx *ctx) { struct checkpoint_thread_info *checkpoint_info = NULL; struct sockaddr_un addr; int socket_fd; pthread_t checkpoint_pthread; int err; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif memset(&addr, 0, sizeof(addr)); socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); if (socket_fd < 0) { EPRINTLN("Socket creation failed: %s", strerror(errno)); err = -1; goto fail; } addr.sun_family = AF_UNIX; snprintf(addr.sun_path, sizeof(addr.sun_path), "%s%s", BHYVE_RUN_DIR, vm_get_name(ctx)); addr.sun_len = SUN_LEN(&addr); unlink(addr.sun_path); if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) { EPRINTLN("Failed to bind socket \"%s\": %s\n", addr.sun_path, strerror(errno)); err = -1; goto fail; } if (listen(socket_fd, 10) < 0) { EPRINTLN("ipc socket listen: %s\n", strerror(errno)); err = errno; goto fail; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_RECV, CAP_WRITE, CAP_SEND, CAP_GETSOCKOPT); if (caph_rights_limit(socket_fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif checkpoint_info = calloc(1, sizeof(*checkpoint_info)); checkpoint_info->ctx = ctx; checkpoint_info->socket_fd = socket_fd; err = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread, checkpoint_info); if (err != 0) goto fail; return (0); fail: free(checkpoint_info); if (socket_fd > 0) close(socket_fd); unlink(addr.sun_path); return (err); } void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) { const char *__op; if (op == VM_SNAPSHOT_SAVE) __op = "save"; else if (op == VM_SNAPSHOT_RESTORE) __op = "restore"; else __op = "unknown"; fprintf(stderr, "%s: snapshot-%s failed for %s\r\n", __func__, __op, bufname); } int vm_snapshot_buf(void *data, size_t data_size, struct vm_snapshot_meta *meta) { struct vm_snapshot_buffer *buffer; int op; buffer = &meta->buffer; op = meta->op; if (buffer->buf_rem < data_size) { fprintf(stderr, "%s: buffer too small\r\n", __func__); return (E2BIG); } if (op == VM_SNAPSHOT_SAVE) memcpy(buffer->buf, data, data_size); else if (op == VM_SNAPSHOT_RESTORE) memcpy(data, buffer->buf, data_size); else return (EINVAL); buffer->buf += data_size; buffer->buf_rem -= data_size; return (0); } size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta) { size_t length; struct vm_snapshot_buffer *buffer; buffer = &meta->buffer; if (buffer->buf_size < buffer->buf_rem) { fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n", __func__, buffer->buf_size, buffer->buf_rem); length = 0; } else { length = buffer->buf_size - buffer->buf_rem; } return (length); } int -vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null, - struct vm_snapshot_meta *meta) +vm_snapshot_guest2host_addr(struct vmctx *ctx, void **addrp, size_t len, + bool restore_null, struct vm_snapshot_meta *meta) { int ret; vm_paddr_t gaddr; if (meta->op == VM_SNAPSHOT_SAVE) { - gaddr = paddr_host2guest(meta->ctx, *addrp); + gaddr = paddr_host2guest(ctx, *addrp); if (gaddr == (vm_paddr_t) -1) { if (!restore_null || (restore_null && (*addrp != NULL))) { ret = EFAULT; goto done; } } SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); } else if (meta->op == VM_SNAPSHOT_RESTORE) { SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); if (gaddr == (vm_paddr_t) -1) { if (!restore_null) { ret = EFAULT; goto done; } } - *addrp = paddr_guest2host(meta->ctx, gaddr, len); + *addrp = paddr_guest2host(ctx, gaddr, len); } else { ret = EINVAL; } done: return (ret); } int vm_snapshot_buf_cmp(void *data, size_t data_size, struct vm_snapshot_meta *meta) { struct vm_snapshot_buffer *buffer; int op; int ret; buffer = &meta->buffer; op = meta->op; if (buffer->buf_rem < data_size) { fprintf(stderr, "%s: buffer too small\r\n", __func__); ret = E2BIG; goto done; } if (op == VM_SNAPSHOT_SAVE) { ret = 0; memcpy(buffer->buf, data, data_size); } else if (op == VM_SNAPSHOT_RESTORE) { ret = memcmp(data, buffer->buf, data_size); } else { ret = EINVAL; goto done; } buffer->buf += data_size; buffer->buf_rem -= data_size; done: return (ret); } diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h index e59c92f2326f..a125dddc1e71 100644 --- a/usr.sbin/bhyve/snapshot.h +++ b/usr.sbin/bhyve/snapshot.h @@ -1,109 +1,130 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Flavius Anton * Copyright (c) 2016 Mihai Tiganus * Copyright (c) 2016-2019 Mihai Carabas * Copyright (c) 2017-2019 Darius Mihai * Copyright (c) 2017-2019 Elena Mihailescu * Copyright (c) 2018-2019 Sergiu Weisz * All rights reserved. * The bhyve-snapshot feature was developed under sponsorships * from Matthew Grooms. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _BHYVE_SNAPSHOT_ #define _BHYVE_SNAPSHOT_ #include #include #include #define BHYVE_RUN_DIR "/var/run/bhyve/" #define MAX_SNAPSHOT_FILENAME PATH_MAX struct vmctx; struct restore_state { int kdata_fd; int vmmem_fd; void *kdata_map; size_t kdata_len; size_t vmmem_len; struct ucl_parser *meta_parser; ucl_object_t *meta_root_obj; }; struct checkpoint_thread_info { struct vmctx *ctx; int socket_fd; }; typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *); typedef int (*vm_pause_dev_cb) (const char *); typedef int (*vm_resume_dev_cb) (const char *); struct vm_snapshot_dev_info { const char *dev_name; /* device name */ vm_snapshot_dev_cb snapshot_cb; /* callback for device snapshot */ vm_pause_dev_cb pause_cb; /* callback for device pause */ vm_resume_dev_cb resume_cb; /* callback for device resume */ }; struct vm_snapshot_kern_info { const char *struct_name; /* kernel structure name*/ enum snapshot_req req; /* request type */ }; void destroy_restore_state(struct restore_state *rstate); const char *lookup_vmname(struct restore_state *rstate); int lookup_memflags(struct restore_state *rstate); size_t lookup_memsize(struct restore_state *rstate); int lookup_guest_ncpus(struct restore_state *rstate); void checkpoint_cpu_add(int vcpu); void checkpoint_cpu_resume(int vcpu); void checkpoint_cpu_suspend(int vcpu); int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate); int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate); -int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate); +int vm_restore_user_devs(struct restore_state *rstate); int vm_pause_user_devs(void); int vm_resume_user_devs(void); int get_checkpoint_msg(int conn_fd, struct vmctx *ctx); void *checkpoint_thread(void *param); int init_checkpoint_thread(struct vmctx *ctx); void init_snapshot(void); int load_restore_file(const char *filename, struct restore_state *rstate); +int vm_snapshot_guest2host_addr(struct vmctx *ctx, void **addrp, size_t len, + bool restore_null, struct vm_snapshot_meta *meta); + +/* + * Address variables are pointers to guest memory. + * + * When RNULL != 0, do not enforce invalid address checks; instead, make the + * pointer NULL at restore time. + */ +#define SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(CTX, ADDR, LEN, RNULL, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_guest2host_addr((CTX), (void **)&(ADDR), (LEN), \ + (RNULL), (META)); \ + if ((RES) != 0) { \ + if ((RES) == EFAULT) \ + fprintf(stderr, "%s: invalid address: %s\r\n", \ + __func__, #ADDR); \ + goto LABEL; \ + } \ +} while (0) + #endif diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c index 68304f76dc49..12d5f9b147d0 100644 --- a/usr.sbin/bhyve/virtio.c +++ b/usr.sbin/bhyve/virtio.c @@ -1,962 +1,966 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" +#ifdef BHYVE_SNAPSHOT +#include "snapshot.h" +#endif #include "virtio.h" /* * Functions for dealing with generalized "virtual devices" as * defined by */ /* * In case we decide to relax the "virtio softc comes at the * front of virtio-based device softc" constraint, let's use * this to convert. */ #define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and * the PCI emulation. */ void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues) { int i; /* vs and dev_softc addresses must match */ assert((void *)vs == dev_softc); vs->vs_vc = vc; vs->vs_pi = pi; pi->pi_arg = vs; vs->vs_queues = queues; for (i = 0; i < vc->vc_nvq; i++) { queues[i].vq_vs = vs; queues[i].vq_num = i; } } /* * Reset device (device-wide). This erases all queues, i.e., * all the queues become invalid (though we don't wipe out the * internal pointers, we just clear the VQ_ALLOC flag). * * It resets negotiated features to "none". * * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. */ void vi_reset_dev(struct virtio_softc *vs) { struct vqueue_info *vq; int i, nvq; if (vs->vs_mtx) assert(pthread_mutex_isowned_np(vs->vs_mtx)); nvq = vs->vs_vc->vc_nvq; for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } vs->vs_negotiated_caps = 0; vs->vs_curq = 0; /* vs->vs_status = 0; -- redundant */ if (vs->vs_isr) pci_lintr_deassert(vs->vs_pi); vs->vs_isr = 0; vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; } /* * Set I/O BAR (usually 0) to map PCI config registers. */ void vi_set_io_bar(struct virtio_softc *vs, int barnum) { size_t size; /* * ??? should we use VIRTIO_PCI_CONFIG_OFF(0) if MSI-X is disabled? * Existing code did not... */ size = VIRTIO_PCI_CONFIG_OFF(1) + vs->vs_vc->vc_cfgsize; pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); } /* * Initialize MSI-X vector capabilities if we're to use MSI-X, * or MSI capabilities if not. * * We assume we want one MSI-X vector per queue, here, plus one * for the config vec. */ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) { int nvec; if (use_msix) { vs->vs_flags |= VIRTIO_USE_MSIX; VS_LOCK(vs); vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ VS_UNLOCK(vs); nvec = vs->vs_vc->vc_nvq + 1; if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); /* Legacy interrupts are mandatory for virtio devices */ pci_lintr_request(vs->vs_pi); return (0); } /* * Initialize the currently-selected virtio queue (vs->vs_curq). * The guest just gave us a page frame number, from which we can * calculate the addresses of the queue. */ static void vi_vq_init(struct virtio_softc *vs, uint32_t pfn) { struct vqueue_info *vq; uint64_t phys; size_t size; char *base; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_pfn = pfn; phys = (uint64_t)pfn << VRING_PFN; size = vring_size_aligned(vq->vq_qsize); base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); /* First page(s) are descriptors... */ vq->vq_desc = (struct vring_desc *)base; base += vq->vq_qsize * sizeof(struct vring_desc); /* ... immediately followed by "avail" ring (entirely uint16_t's) */ vq->vq_avail = (struct vring_avail *)base; base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); /* Then it's rounded up to the next page... */ base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); /* ... and the last page(s) are the used ring. */ vq->vq_used = (struct vring_used *)base; /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; } /* * Helper inline for vq_getchain(): record the i'th "real" * descriptor. */ static inline void _vq_record(int i, struct vring_desc *vd, struct vmctx *ctx, struct iovec *iov, int n_iov, struct vi_req *reqp) { if (i >= n_iov) return; iov[i].iov_base = paddr_guest2host(ctx, vd->addr, vd->len); iov[i].iov_len = vd->len; if ((vd->flags & VRING_DESC_F_WRITE) == 0) reqp->readable++; else reqp->writable++; } #define VQ_MAX_DESCRIPTORS 512 /* see below */ /* * Examine the chain of descriptors starting at the "next one" to * make sure that they describe a sensible request. If so, return * the number of "real" descriptors that would be needed/used in * acting on this request. This may be smaller than the number of * available descriptors, e.g., if there are two available but * they are two separate requests, this just returns 1. Or, it * may be larger: if there are indirect descriptors involved, * there may only be one descriptor available but it may be an * indirect pointing to eight more. We return 8 in this case, * i.e., we do not count the indirect descriptors, only the "real" * ones. * * Basically, this vets the "flags" and "next" field of each * descriptor and tells you how many are involved. Since some may * be indirect, this also needs the vmctx (in the pci_devinst * at vs->vs_pi) so that it can find indirect descriptors. * * As we process each descriptor, we copy and adjust it (guest to * host address wise, also using the vmtctx) into the given iov[] * array (of the given size). If the array overflows, we stop * placing values into the array but keep processing descriptors, * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. * So you, the caller, must not assume that iov[] is as big as the * return value (you can process the same thing twice to allocate * a larger iov array if needed, or supply a zero length to find * out how much space is needed). * * If some descriptor(s) are invalid, this prints a diagnostic message * and returns -1. If no descriptors are ready now it simply returns 0. * * You are assumed to have done a vq_ring_ready() if needed (note * that vq_has_descs() does one). */ int vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov, struct vi_req *reqp) { int i; u_int ndesc, n_indir; u_int idx, next; struct vi_req req; struct vring_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; const char *name; vs = vq->vq_vs; name = vs->vs_vc->vc_name; memset(&req, 0, sizeof(req)); /* * Note: it's the responsibility of the guest not to * update vq->vq_avail->idx until all of the descriptors * the guest has written are valid (including all their * "next" fields and "flags"). * * Compute (vq_avail->idx - last_avail) in integers mod 2**16. This is * the number of descriptors the device has made available * since the last time we updated vq->vq_last_avail. * * We just need to do the subtraction as an unsigned int, * then trim off excess bits. */ idx = vq->vq_last_avail; ndesc = (uint16_t)((u_int)vq->vq_avail->idx - idx); if (ndesc == 0) return (0); if (ndesc > vq->vq_qsize) { /* XXX need better way to diagnose issues */ EPRINTLN( "%s: ndesc (%u) out of range, driver confused?", name, (u_int)ndesc); return (-1); } /* * Now count/parse "involved" descriptors starting from * the head of the chain. * * To prevent loops, we could be more complicated and * check whether we're re-visiting a previously visited * index, but we just abort if the count gets excessive. */ ctx = vs->vs_pi->pi_vmctx; req.idx = next = vq->vq_avail->ring[idx & (vq->vq_qsize - 1)]; vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->next) { if (next >= vq->vq_qsize) { EPRINTLN( "%s: descriptor index %u out of range, " "driver confused?", name, next); return (-1); } vdir = &vq->vq_desc[next]; if ((vdir->flags & VRING_DESC_F_INDIRECT) == 0) { _vq_record(i, vdir, ctx, iov, niov, &req); i++; } else if ((vs->vs_vc->vc_hv_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { EPRINTLN( "%s: descriptor has forbidden INDIRECT flag, " "driver confused?", name); return (-1); } else { n_indir = vdir->len / 16; if ((vdir->len & 0xf) || n_indir == 0) { EPRINTLN( "%s: invalid indir len 0x%x, " "driver confused?", name, (u_int)vdir->len); return (-1); } vindir = paddr_guest2host(ctx, vdir->addr, vdir->len); /* * Indirects start at the 0th, then follow * their own embedded "next"s until those run * out. Each one's indirect flag must be off * (we don't really have to check, could just * ignore errors...). */ next = 0; for (;;) { vp = &vindir[next]; if (vp->flags & VRING_DESC_F_INDIRECT) { EPRINTLN( "%s: indirect desc has INDIR flag," " driver confused?", name); return (-1); } _vq_record(i, vp, ctx, iov, niov, &req); if (++i > VQ_MAX_DESCRIPTORS) goto loopy; if ((vp->flags & VRING_DESC_F_NEXT) == 0) break; next = vp->next; if (next >= n_indir) { EPRINTLN( "%s: invalid next %u > %u, " "driver confused?", name, (u_int)next, n_indir); return (-1); } } } if ((vdir->flags & VRING_DESC_F_NEXT) == 0) goto done; } loopy: EPRINTLN( "%s: descriptor loop? count > %d - driver confused?", name, i); return (-1); done: *reqp = req; return (i); } /* * Return the first n_chain request chains back to the available queue. * * (These chains are the ones you handled when you called vq_getchain() * and used its positive return value.) */ void vq_retchains(struct vqueue_info *vq, uint16_t n_chains) { vq->vq_last_avail -= n_chains; } void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { struct vring_used *vuh; struct vring_used_elem *vue; uint16_t mask; /* * Notes: * - mask is N-1 where N is a power of 2 so computes x % N * - vuh points to the "used" data shared with guest * - vue points to the "used" ring entry we want to update */ mask = vq->vq_qsize - 1; vuh = vq->vq_used; vue = &vuh->ring[vq->vq_next_used++ & mask]; vue->id = idx; vue->len = iolen; } void vq_relchain_publish(struct vqueue_info *vq) { /* * Ensure the used descriptor is visible before updating the index. * This is necessary on ISAs with memory ordering less strict than x86 * (and even on x86 to act as a compiler barrier). */ atomic_thread_fence_rel(); vq->vq_used->idx = vq->vq_next_used; } /* * Return specified request chain to the guest, setting its I/O length * to the provided value. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { vq_relchain_prepare(vq, idx, iolen); vq_relchain_publish(vq); } /* * Driver has finished processing "available" chains and calling * vq_relchain on each one. If driver used all the available * chains, used_all should be set. * * If the "used" index moved we may need to inform the guest, i.e., * deliver an interrupt. Even if the used index did NOT move we * may need to deliver an interrupt, if the avail ring is empty and * we are supposed to interrupt on empty. * * Note that used_all_avail is provided by the caller because it's * a snapshot of the ring state when he decided to finish interrupt * processing -- it's possible that descriptors became available after * that point. (It's also typically a constant 1/True as well.) */ void vq_endchains(struct vqueue_info *vq, int used_all_avail) { struct virtio_softc *vs; uint16_t event_idx, new_idx, old_idx; int intr; /* * Interrupt generation: if we're using EVENT_IDX, * interrupt if we've crossed the event threshold. * Otherwise interrupt is generated if we added "used" entries, * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. * * In any case, though, if NOTIFY_ON_EMPTY is set and the * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; old_idx = vq->vq_save_used; vq->vq_save_used = new_idx = vq->vq_used->idx; /* * Use full memory barrier between "idx" store from preceding * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or * "flags" field below. */ atomic_thread_fence_seq_cst(); if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { event_idx = VQ_USED_EVENT_IDX(vq); /* * This calculation is per docs and the kernel * (see src/sys/dev/virtio/virtio_ring.h). */ intr = (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx); } else { intr = new_idx != old_idx && !(vq->vq_avail->flags & VRING_AVAIL_F_NO_INTERRUPT); } if (intr) vq_interrupt(vs, vq); } /* Note: these are in sorted order to make for a fast search */ static struct config_reg { uint16_t cr_offset; /* register offset */ uint8_t cr_size; /* size (bytes) */ uint8_t cr_ro; /* true => reg is read only */ const char *cr_name; /* name of reg */ } config_regs[] = { { VIRTIO_PCI_HOST_FEATURES, 4, 1, "HOST_FEATURES" }, { VIRTIO_PCI_GUEST_FEATURES, 4, 0, "GUEST_FEATURES" }, { VIRTIO_PCI_QUEUE_PFN, 4, 0, "QUEUE_PFN" }, { VIRTIO_PCI_QUEUE_NUM, 2, 1, "QUEUE_NUM" }, { VIRTIO_PCI_QUEUE_SEL, 2, 0, "QUEUE_SEL" }, { VIRTIO_PCI_QUEUE_NOTIFY, 2, 0, "QUEUE_NOTIFY" }, { VIRTIO_PCI_STATUS, 1, 0, "STATUS" }, { VIRTIO_PCI_ISR, 1, 0, "ISR" }, { VIRTIO_MSI_CONFIG_VECTOR, 2, 0, "CONFIG_VECTOR" }, { VIRTIO_MSI_QUEUE_VECTOR, 2, 0, "QUEUE_VECTOR" }, }; static inline struct config_reg * vi_find_cr(int offset) { u_int hi, lo, mid; struct config_reg *cr; lo = 0; hi = sizeof(config_regs) / sizeof(*config_regs) - 1; while (hi >= lo) { mid = (hi + lo) >> 1; cr = &config_regs[mid]; if (cr->cr_offset == offset) return (cr); if (cr->cr_offset < offset) lo = mid + 1; else hi = mid - 1; } return (NULL); } /* * Handle pci config space reads. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ uint64_t vi_pci_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct virtio_softc *vs = pi->pi_arg; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; uint32_t value; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { return (pci_emul_msix_tread(pi, offset, size)); } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; if (size != 1 && size != 2 && size != 4) goto bad; virtio_config_size = VIRTIO_PCI_CONFIG_OFF(pci_msix_enabled(pi)); if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. * If that fails, fall into general code. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; if (vc->vc_cfgread != NULL) error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); else error = 0; if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size) { if (cr != NULL) { /* offset must be OK, so size must be bad */ EPRINTLN( "%s: read from %s: bad size %d", name, cr->cr_name, size); } else { EPRINTLN( "%s: read from bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VIRTIO_PCI_HOST_FEATURES: value = vc->vc_hv_caps; break; case VIRTIO_PCI_GUEST_FEATURES: value = vs->vs_negotiated_caps; break; case VIRTIO_PCI_QUEUE_PFN: if (vs->vs_curq < vc->vc_nvq) value = vs->vs_queues[vs->vs_curq].vq_pfn; break; case VIRTIO_PCI_QUEUE_NUM: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_qsize : 0; break; case VIRTIO_PCI_QUEUE_SEL: value = vs->vs_curq; break; case VIRTIO_PCI_QUEUE_NOTIFY: value = 0; /* XXX */ break; case VIRTIO_PCI_STATUS: value = vs->vs_status; break; case VIRTIO_PCI_ISR: value = vs->vs_isr; vs->vs_isr = 0; /* a read clears this flag */ if (value) pci_lintr_deassert(pi); break; case VIRTIO_MSI_CONFIG_VECTOR: value = vs->vs_msix_cfg_idx; break; case VIRTIO_MSI_QUEUE_VECTOR: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_msix_idx : VIRTIO_MSI_NO_VECTOR; break; } done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); return (value); } /* * Handle pci config space writes. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ void vi_pci_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct virtio_softc *vs = pi->pi_arg; struct vqueue_info *vq; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { pci_emul_msix_twrite(pi, offset, size, value); return; } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; if (size != 1 && size != 2 && size != 4) goto bad; virtio_config_size = VIRTIO_PCI_CONFIG_OFF(pci_msix_enabled(pi)); if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; if (vc->vc_cfgwrite != NULL) error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); else error = 0; if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size || cr->cr_ro) { if (cr != NULL) { /* offset must be OK, wrong size and/or reg is R/O */ if (cr->cr_size != size) EPRINTLN( "%s: write to %s: bad size %d", name, cr->cr_name, size); if (cr->cr_ro) EPRINTLN( "%s: write to read-only reg %s", name, cr->cr_name); } else { EPRINTLN( "%s: write to bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VIRTIO_PCI_GUEST_FEATURES: vs->vs_negotiated_caps = value & vc->vc_hv_caps; if (vc->vc_apply_features) (*vc->vc_apply_features)(DEV_SOFTC(vs), vs->vs_negotiated_caps); break; case VIRTIO_PCI_QUEUE_PFN: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vi_vq_init(vs, value); break; case VIRTIO_PCI_QUEUE_SEL: /* * Note that the guest is allowed to select an * invalid queue; we just need to return a QNUM * of 0 while the bad queue is selected. */ vs->vs_curq = value; break; case VIRTIO_PCI_QUEUE_NOTIFY: if (value >= (unsigned int)vc->vc_nvq) { EPRINTLN("%s: queue %d notify out of range", name, (int)value); goto done; } vq = &vs->vs_queues[value]; if (vq->vq_notify) (*vq->vq_notify)(DEV_SOFTC(vs), vq); else if (vc->vc_qnotify) (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); else EPRINTLN( "%s: qnotify queue %d: missing vq/vc notify", name, (int)value); break; case VIRTIO_PCI_STATUS: vs->vs_status = value; if (value == 0) (*vc->vc_reset)(DEV_SOFTC(vs)); break; case VIRTIO_MSI_CONFIG_VECTOR: vs->vs_msix_cfg_idx = value; break; case VIRTIO_MSI_QUEUE_VECTOR: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_msix_idx = value; break; } goto done; bad_qindex: EPRINTLN( "%s: write config reg %s: curq %d >= max %d", name, cr->cr_name, vs->vs_curq, vc->vc_nvq); done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); } #ifdef BHYVE_SNAPSHOT int vi_pci_pause(struct pci_devinst *pi) { struct virtio_softc *vs; struct virtio_consts *vc; vs = pi->pi_arg; vc = vs->vs_vc; vc = vs->vs_vc; assert(vc->vc_pause != NULL); (*vc->vc_pause)(DEV_SOFTC(vs)); return (0); } int vi_pci_resume(struct pci_devinst *pi) { struct virtio_softc *vs; struct virtio_consts *vc; vs = pi->pi_arg; vc = vs->vs_vc; vc = vs->vs_vc; assert(vc->vc_resume != NULL); (*vc->vc_resume)(DEV_SOFTC(vs)); return (0); } static int vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, ret, done); done: return (ret); } static int vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, ret, done); done: return (ret); } static int vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta) { int i; int ret; struct virtio_consts *vc; struct vqueue_info *vq; + struct vmctx *ctx; uint64_t addr_size; + ctx = vs->vs_pi->pi_vmctx; vc = vs->vs_vc; /* Save virtio queue info */ for (i = 0; i < vc->vc_nvq; i++) { vq = &vs->vs_queues[i]; SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_next_used, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done); if (!vq_ring_ready(vq)) continue; addr_size = vq->vq_qsize * sizeof(struct vring_desc); - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_desc, addr_size, + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ctx, vq->vq_desc, addr_size, false, meta, ret, done); addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t); - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_avail, addr_size, + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ctx, vq->vq_avail, addr_size, false, meta, ret, done); addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t); - SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_used, addr_size, + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ctx, vq->vq_used, addr_size, false, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc, vring_size_aligned(vq->vq_qsize), meta, ret, done); } done: return (ret); } int vi_pci_snapshot(struct vm_snapshot_meta *meta) { int ret; struct pci_devinst *pi; struct virtio_softc *vs; struct virtio_consts *vc; pi = meta->dev_data; vs = pi->pi_arg; vc = vs->vs_vc; /* Save virtio softc */ ret = vi_pci_snapshot_softc(vs, meta); if (ret != 0) goto done; /* Save virtio consts */ ret = vi_pci_snapshot_consts(vc, meta); if (ret != 0) goto done; /* Save virtio queue info */ ret = vi_pci_snapshot_queues(vs, meta); if (ret != 0) goto done; /* Save device softc, if needed */ if (vc->vc_snapshot != NULL) { ret = (*vc->vc_snapshot)(DEV_SOFTC(vs), meta); if (ret != 0) goto done; } done: return (ret); } #endif