Index: releng/11.1/lib/libvmmapi/vmmapi.c =================================================================== --- releng/11.1/lib/libvmmapi/vmmapi.c (revision 320933) +++ releng/11.1/lib/libvmmapi/vmmapi.c (revision 320934) @@ -1,1418 +1,1460 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmmapi.h" #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) /* * Size of the guard region before and after the virtual address space * mapping the guest physical memory. This must be a multiple of the * superpage size for performance reasons. */ #define VM_MMAP_GUARD_SIZE (4 * MB) #define PROT_RW (PROT_READ | PROT_WRITE) #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) struct vmctx { int fd; uint32_t lowmem_limit; int memflags; size_t lowmem; size_t highmem; char *baseaddr; char *name; }; #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) static int vm_device_open(const char *name) { int fd, len; char *vmfile; len = strlen("/dev/vmm/") + strlen(name) + 1; vmfile = malloc(len); assert(vmfile != NULL); snprintf(vmfile, len, "/dev/vmm/%s", name); /* Open the device file */ fd = open(vmfile, O_RDWR, 0); free(vmfile); return (fd); } int vm_create(const char *name) { return (CREATE((char *)name)); } struct vmctx * vm_open(const char *name) { struct vmctx *vm; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); vm->fd = -1; vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); if ((vm->fd = vm_device_open(vm->name)) < 0) goto err; return (vm); err: vm_destroy(vm); return (NULL); } void vm_destroy(struct vmctx *vm) { assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); DESTROY(vm->name); free(vm); } int vm_parse_memsize(const char *optarg, size_t *ret_memsize) { char *endptr; size_t optval; int error; optval = strtoul(optarg, &endptr, 0); if (*optarg != '\0' && *endptr == '\0') { /* * For the sake of backward compatibility if the memory size * specified on the command line is less than a megabyte then * it is interpreted as being in units of MB. */ if (optval < MB) optval *= MB; *ret_memsize = optval; error = 0; } else error = expand_number(optarg, ret_memsize); return (error); } uint32_t vm_get_lowmem_limit(struct vmctx *ctx) { return (ctx->lowmem_limit); } void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { ctx->lowmem_limit = limit; } void vm_set_memflags(struct vmctx *ctx, int flags) { ctx->memflags = flags; } int vm_get_memflags(struct vmctx *ctx) { return (ctx->memflags); } /* * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, size_t len, int prot) { struct vm_memmap memmap; int error, flags; memmap.gpa = gpa; memmap.segid = segid; memmap.segoff = off; memmap.len = len; memmap.prot = prot; memmap.flags = 0; if (ctx->memflags & VM_MEM_F_WIRED) memmap.flags |= VM_MEMMAP_F_WIRED; /* * If this mapping already exists then don't create it again. This * is the common case for SYSMEM mappings created by bhyveload(8). */ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); if (error == 0 && gpa == memmap.gpa) { if (segid != memmap.segid || off != memmap.segoff || prot != memmap.prot || flags != memmap.flags) { errno = EEXIST; return (-1); } else { return (0); } } error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { struct vm_memmap memmap; int error; bzero(&memmap, sizeof(struct vm_memmap)); memmap.gpa = *gpa; error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); if (error == 0) { *gpa = memmap.gpa; *segid = memmap.segid; *segoff = memmap.segoff; *len = memmap.len; *prot = memmap.prot; *flags = memmap.flags; } return (error); } /* * Return 0 if the segments are identical and non-zero otherwise. * * This is slightly complicated by the fact that only device memory segments * are named. */ static int cmpseg(size_t len, const char *str, size_t len2, const char *str2) { if (len == len2) { if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) return (0); } return (-1); } static int vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { struct vm_memseg memseg; size_t n; int error; /* * If the memory segment has already been created then just return. * This is the usual case for the SYSMEM segment created by userspace * loaders like bhyveload(8). */ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, sizeof(memseg.name)); if (error) return (error); if (memseg.len != 0) { if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { errno = EINVAL; return (-1); } else { return (0); } } bzero(&memseg, sizeof(struct vm_memseg)); memseg.segid = segid; memseg.len = len; if (name != NULL) { n = strlcpy(memseg.name, name, sizeof(memseg.name)); if (n >= sizeof(memseg.name)) { errno = ENAMETOOLONG; return (-1); } } error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); return (error); } int vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, size_t bufsize) { struct vm_memseg memseg; size_t n; int error; memseg.segid = segid; error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); if (error == 0) { *lenp = memseg.len; n = strlcpy(namebuf, memseg.name, bufsize); if (n >= bufsize) { errno = ENAMETOOLONG; error = -1; } } return (error); } static int setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) { char *ptr; int error, flags; /* Map 'len' bytes starting at 'gpa' in the guest address space */ error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); if (error) return (error); flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap into the process address space on the host */ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); if (ptr == MAP_FAILED) return (-1); return (0); } int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { size_t objsize, len; vm_paddr_t gpa; char *baseaddr, *ptr; int error, flags; assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then * create another 'highmem' segment above 4GB for the remainder. */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; ctx->highmem = memsize - ctx->lowmem_limit; objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; objsize = ctx->lowmem; } error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); if (error) return (error); /* * Stake out a contiguous region covering the guest physical memory * and the adjoining guard regions. */ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER; ptr = mmap(NULL, len, PROT_NONE, flags, -1, 0); if (ptr == MAP_FAILED) return (-1); baseaddr = ptr + VM_MMAP_GUARD_SIZE; if (ctx->highmem > 0) { gpa = 4*GB; len = ctx->highmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } if (ctx->lowmem > 0) { gpa = 0; len = ctx->lowmem; error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } ctx->baseaddr = baseaddr; return (0); } /* * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in * the lowmem or highmem regions. * * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. * The instruction emulation code depends on this behavior. */ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { if (ctx->lowmem > 0) { if (gaddr < ctx->lowmem && len <= ctx->lowmem && gaddr + len <= ctx->lowmem) return (ctx->baseaddr + gaddr); } if (ctx->highmem > 0) { if (gaddr >= 4*GB) { if (gaddr < 4*GB + ctx->highmem && len <= ctx->highmem && gaddr + len <= 4*GB + ctx->highmem) return (ctx->baseaddr + gaddr); } } return (NULL); } size_t vm_get_lowmem_size(struct vmctx *ctx) { return (ctx->lowmem); } size_t vm_get_highmem_size(struct vmctx *ctx) { return (ctx->highmem); } void * vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) { char pathname[MAXPATHLEN]; size_t len2; char *base, *ptr; int fd, error, flags; fd = -1; ptr = MAP_FAILED; if (name == NULL || strlen(name) == 0) { errno = EINVAL; goto done; } error = vm_alloc_memseg(ctx, segid, len, name); if (error) goto done; strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); strlcat(pathname, ctx->name, sizeof(pathname)); strlcat(pathname, ".", sizeof(pathname)); strlcat(pathname, name, sizeof(pathname)); fd = open(pathname, O_RDWR); if (fd < 0) goto done; /* * Stake out a contiguous region covering the device memory and the * adjoining guard regions. */ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER; base = mmap(NULL, len2, PROT_NONE, flags, -1, 0); if (base == MAP_FAILED) goto done; flags = MAP_SHARED | MAP_FIXED; if ((ctx->memflags & VM_MEM_F_INCORE) == 0) flags |= MAP_NOCORE; /* mmap the devmem region in the host address space */ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); done: if (fd >= 0) close(fd); return (ptr); } int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; vmsegdesc.desc.base = base; vmsegdesc.desc.limit = limit; vmsegdesc.desc.access = access; error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); return (error); } int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); if (error == 0) { *base = vmsegdesc.desc.base; *limit = vmsegdesc.desc.limit; *access = vmsegdesc.desc.access; } return (error); } int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) { int error; error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, &seg_desc->access); return (error); } int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; vmreg.regval = val; error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); return (error); } int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { int error; struct vm_run vmrun; bzero(&vmrun, sizeof(vmrun)); vmrun.cpuid = vcpu; error = ioctl(ctx->fd, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); return (error); } int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { struct vm_suspend vmsuspend; bzero(&vmsuspend, sizeof(vmsuspend)); vmsuspend.how = how; return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); } int vm_reinit(struct vmctx *ctx) { return (ioctl(ctx->fd, VM_REINIT, 0)); } int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction) { struct vm_exception exc; exc.cpuid = vcpu; exc.vector = vector; exc.error_code = errcode; exc.error_code_valid = errcode_valid; exc.restart_instruction = restart_instruction; return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); } int vm_apicid2vcpu(struct vmctx *ctx, int apicid) { /* * The apic id associated with the 'vcpu' has the same numerical value * as the 'vcpu' itself. */ return (apicid); } int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); } int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq)); } int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg) { struct vm_lapic_msi vmmsi; bzero(&vmmsi, sizeof(vmmsi)); vmmsi.addr = addr; vmmsi.msg = msg; return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi)); } int vm_ioapic_assert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq)); } int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq) { struct vm_ioapic_irq ioapic_irq; bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq)); ioapic_irq.irq = irq; return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq)); } int vm_ioapic_pincount(struct vmctx *ctx, int *pincount) { return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount)); } int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); } int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); } int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) { struct vm_isa_irq isa_irq; bzero(&isa_irq, sizeof(struct vm_isa_irq)); isa_irq.atpic_irq = atpic_irq; isa_irq.ioapic_irq = ioapic_irq; return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); } int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger) { struct vm_isa_irq_trigger isa_irq_trigger; bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); isa_irq_trigger.atpic_irq = atpic_irq; isa_irq_trigger.trigger = trigger; return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); } int vm_inject_nmi(struct vmctx *ctx, int vcpu) { struct vm_nmi vmnmi; bzero(&vmnmi, sizeof(vmnmi)); vmnmi.cpuid = vcpu; return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); } static struct { const char *name; int type; } capstrmap[] = { { "hlt_exit", VM_CAP_HALT_EXIT }, { "mtrap_exit", VM_CAP_MTRAP_EXIT }, { "pause_exit", VM_CAP_PAUSE_EXIT }, { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST }, { "enable_invpcid", VM_CAP_ENABLE_INVPCID }, { 0 } }; int vm_capability_name2type(const char *capname) { int i; for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) { if (strcmp(capstrmap[i].name, capname) == 0) return (capstrmap[i].type); } return (-1); } const char * vm_capability_type2name(int type) { int i; for (i = 0; capstrmap[i].name != NULL; i++) { if (capstrmap[i].type == type) return (capstrmap[i].name); } return (NULL); } int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); } int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); } int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; pptmmio.hpa = hpa; return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); } int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec) { struct vm_pptdev_msi pptmsi; bzero(&pptmsi, sizeof(pptmsi)); pptmsi.vcpu = vcpu; pptmsi.bus = bus; pptmsi.slot = slot; pptmsi.func = func; pptmsi.msg = msg; pptmsi.addr = addr; pptmsi.numvec = numvec; return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct vm_pptdev_msix pptmsix; bzero(&pptmsix, sizeof(pptmsix)); pptmsix.vcpu = vcpu; pptmsix.bus = bus; pptmsix.slot = slot; pptmsix.func = func; pptmsix.idx = idx; pptmsix.msg = msg; pptmsix.addr = addr; pptmsix.vector_control = vector_control; return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) { int error; static struct vm_stats vmstats; vmstats.cpuid = vcpu; error = ioctl(ctx->fd, VM_STATS, &vmstats); if (error == 0) { if (ret_entries) *ret_entries = vmstats.num_entries; if (ret_tv) *ret_tv = vmstats.tv; return (vmstats.statbuf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); *state = x2apic.state; return (error); } int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; x2apic.state = state; error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); return (error); } /* * From Intel Vol 3a: * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT */ int vcpu_reset(struct vmctx *vmctx, int vcpu) { int error; uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; uint32_t desc_access, desc_limit; uint16_t sel; zero = 0; rflags = 0x2; error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; rip = 0xfff0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; cr0 = CR0_NE; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) goto done; cr4 = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; /* * CS: present, r/w, accessed, 16-bit, byte granularity, usable */ desc_base = 0xffff0000; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0xf000; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) goto done; /* * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) goto done; /* General purpose registers */ rdx = 0xf00; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) goto done; /* GDTR, IDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; /* TR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0000008b; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) goto done; /* LDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x00000082; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; /* XXX cr2, debug registers */ error = 0; done: return (error); } int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) { int error, i; struct vm_gpa_pte gpapte; bzero(&gpapte, sizeof(gpapte)); gpapte.gpa = gpa; error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); if (error == 0) { *num = gpapte.ptenum; for (i = 0; i < gpapte.ptenum; i++) pte[i] = gpapte.pte[i]; } return (error); } int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) { int error; struct vm_hpet_cap cap; bzero(&cap, sizeof(struct vm_hpet_cap)); error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); if (capabilities != NULL) *capabilities = cap.capabilities; return (error); } int vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; bzero(&gg, sizeof(struct vm_gla2gpa)); gg.vcpuid = vcpu; gg.prot = prot; gg.gla = gla; gg.paging = *paging; error = ioctl(ctx->fd, VM_GLA2GPA, &gg); if (error == 0) { *fault = gg.fault; *gpa = gg.gpa; } return (error); } #ifndef min #define min(a,b) (((a) < (b)) ? (a) : (b)) #endif int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault) { void *va; uint64_t gpa; int error, i, n, off; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; iov[i].iov_len = 0; } while (len) { assert(iovcnt > 0); error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault); if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(len, PAGE_SIZE - off); va = vm_map_gpa(ctx, gpa, n); if (va == NULL) return (EFAULT); iov->iov_base = va; iov->iov_len = n; iov++; iovcnt--; gla += n; len -= n; } return (0); } void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt) { return; } void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len) { const char *src; char *dst; size_t n; dst = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); src = iov->iov_base; bcopy(src, dst, n); iov++; dst += n; len -= n; } } void vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, size_t len) { const char *src; char *dst; size_t n; src = vp; while (len) { assert(iov->iov_len); n = min(len, iov->iov_len); dst = iov->iov_base; bcopy(src, dst, n); iov++; src += n; len -= n; } } static int vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) { struct vm_cpuset vm_cpuset; int error; bzero(&vm_cpuset, sizeof(struct vm_cpuset)); vm_cpuset.which = which; vm_cpuset.cpusetsize = sizeof(cpuset_t); vm_cpuset.cpus = cpus; error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); return (error); } int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); } int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) { return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); } int vm_activate_cpu(struct vmctx *ctx, int vcpu) { struct vm_activate_cpu ac; int error; bzero(&ac, sizeof(struct vm_activate_cpu)); ac.vcpuid = vcpu; error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); return (error); } int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); if (error == 0) { *info1 = vmii.info1; *info2 = vmii.info2; } return (error); } int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) { struct vm_intinfo vmii; int error; bzero(&vmii, sizeof(struct vm_intinfo)); vmii.vcpuid = vcpu; vmii.info1 = info1; error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); return (error); } int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; rtcdata.value = value; error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); return (error); } int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) { struct vm_rtc_data rtcdata; int error; bzero(&rtcdata, sizeof(struct vm_rtc_data)); rtcdata.offset = offset; error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); if (error == 0) *retval = rtcdata.value; return (error); } int vm_rtc_settime(struct vmctx *ctx, time_t secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); rtctime.secs = secs; error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); return (error); } int vm_rtc_gettime(struct vmctx *ctx, time_t *secs) { struct vm_rtc_time rtctime; int error; bzero(&rtctime, sizeof(struct vm_rtc_time)); error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); if (error == 0) *secs = rtctime.secs; return (error); } int vm_restart_instruction(void *arg, int vcpu) { struct vmctx *ctx = arg; return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } + +int +vm_get_device_fd(struct vmctx *ctx) +{ + + return (ctx->fd); +} + +const cap_ioctl_t * +vm_get_ioctls(size_t *len) +{ + cap_ioctl_t *cmds; + /* keep in sync with machine/vmm_dev.h */ + static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, + VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, + VM_MMAP_GETNEXT, VM_SET_REGISTER, VM_GET_REGISTER, + VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR, + VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, + VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, + VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, + VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, + VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, + VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, + VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, + VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, + VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, + VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SET_INTINFO, VM_GET_INTINFO, + VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, + VM_RESTART_INSTRUCTION }; + + if (len == NULL) { + cmds = malloc(sizeof(vm_ioctl_cmds)); + if (cmds == NULL) + return (NULL); + bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); + return (cmds); + } + + *len = nitems(vm_ioctl_cmds); + return (NULL); +} + Index: releng/11.1/lib/libvmmapi/vmmapi.h =================================================================== --- releng/11.1/lib/libvmmapi/vmmapi.h (revision 320933) +++ releng/11.1/lib/libvmmapi/vmmapi.h (revision 320934) @@ -1,219 +1,222 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include #include /* * API version for out-of-tree consumers like grub-bhyve for making compile * time decisions. */ -#define VMMAPI_VERSION 0102 /* 2 digit major followed by 2 digit minor */ +#define VMMAPI_VERSION 0103 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vmctx; enum x2apic_state; /* * Different styles of mapping the memory assigned to a VM into the address * space of the controlling process. */ enum vm_mmap_style { VM_MMAP_NONE, /* no mapping */ VM_MMAP_ALL, /* fully and statically mapped */ VM_MMAP_SPARSE, /* mappings created on-demand */ }; /* * 'flags' value passed to 'vm_set_memflags()'. */ #define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ #define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ /* * Identifiers for memory segments: * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. * - the remaining identifiers can be used to create devmem segments. */ enum { VM_SYSMEM, VM_BOOTROM, VM_FRAMEBUFFER, }; /* * Get the length and name of the memory segment identified by 'segid'. * Note that system memory segments are identified with a nul name. * * Returns 0 on success and non-zero otherwise. */ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, size_t namesiz); /* * Iterate over the guest address space. This function finds an address range * that starts at an address >= *gpa. * * Returns 0 if the next address range was found and non-zero otherwise. */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); /* * Create a device memory segment identified by 'segid'. * * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. */ void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len); /* * Map the memory segment identified by 'segid' into the guest address space * at [gpa,gpa+len) with protection 'prot'. */ int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t segoff, size_t len, int prot); int vm_create(const char *name); +int vm_get_device_fd(struct vmctx *ctx); struct vmctx *vm_open(const char *name); void vm_destroy(struct vmctx *ctx); int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access); int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access); int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, enum vm_intr_trigger trigger); int vm_inject_nmi(struct vmctx *ctx, int vcpu); int vm_capability_name2type(const char *capname); const char *vm_capability_type2name(int type); int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval); int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val); int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); + +const cap_ioctl_t *vm_get_ioctls(size_t *len); /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries); const char *vm_get_stat_desc(struct vmctx *ctx, int index); int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s); int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s); int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. * The 'iovcnt' should be big enough to accommodate all GPA segments. * * retval fault Interpretation * 0 0 Success * 0 1 An exception was injected into the guest * EFAULT N/A Error */ int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, int *fault); void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, struct iovec *guest_iov, size_t len); void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt); /* RTC */ int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); int vm_rtc_settime(struct vmctx *ctx, time_t secs); int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vmctx *ctx, int vcpu); /* * FreeBSD specific APIs */ int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, uint64_t rip, uint64_t cr3, uint64_t gdtbase, uint64_t rsp); int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); #endif /* _VMMAPI_H_ */ Index: releng/11.1/usr.sbin/bhyve/bhyverun.c =================================================================== --- releng/11.1/usr.sbin/bhyve/bhyverun.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/bhyverun.c (revision 320934) @@ -1,971 +1,1049 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include +#ifndef WITHOUT_CAPSICUM +#include +#include +#endif #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "inout.h" #include "dbgport.h" #include "fwctl.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); char *vmname; int guest_ncpus; char *guest_uuid_str; static int guest_vmexit_on_hlt, guest_vmexit_on_pause; static int virtio_msix = 1; static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; static int acpi; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; static void usage(int code) { fprintf(stderr, "Usage: %s [-abehuwxACHPSWY] [-c vcpus] [-g ] [-l ]\n" " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create ACPI tables\n" " -c: # cpus (default 1)\n" " -C: include guest memory in core file\n" " -e: exit on unhandled I/O access\n" " -g: gdb port\n" " -h: help\n" " -H: vmexit from the guest on hlt\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" " -p: pin 'vcpu' to 'hostcpu'\n" " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" " -S: guest memory cannot be swapped\n" " -u: RTC keeps UTC time\n" " -U: uuid\n" " -w: ignore unimplemented MSRs\n" " -W: force virtio to use single-vector MSI\n" " -x: local apic is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), ""); exit(code); } +#ifndef WITHOUT_CAPSICUM +/* + * 11-stable capsicum helpers + */ +static void +bhyve_caph_cache_catpages(void) +{ + + (void)catopen("libc", NL_CAT_LOCALE); +} + static int +bhyve_caph_limit_stdoe(void) +{ + cap_rights_t rights; + unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ }; + int i, fds[] = { STDOUT_FILENO, STDERR_FILENO }; + + cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL); + cap_rights_set(&rights, CAP_WRITE); + + for (i = 0; i < nitems(fds); i++) { + if (cap_rights_limit(fds[i], &rights) < 0 && errno != ENOSYS) + return (-1); + + if (cap_ioctls_limit(fds[i], cmds, nitems(cmds)) < 0 && errno != ENOSYS) + return (-1); + + if (cap_fcntls_limit(fds[i], CAP_FCNTL_GETFL) < 0 && errno != ENOSYS) + return (-1); + } + + return (0); +} + +#endif + +static int pincpu_parse(const char *opt) { int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0 || vcpu >= VM_MAXCPU) { fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", vcpu, VM_MAXCPU - 1); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } if (vcpumap[vcpu] == NULL) { if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { perror("malloc"); return (-1); } CPU_ZERO(vcpumap[vcpu]); } CPU_SET(pcpu, vcpumap[vcpu]); return (0); } void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; int error, restart_instruction; ctx = arg; restart_instruction = 1; error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } int fbsdrun_vmexit_on_pause(void) { return (guest_vmexit_on_pause); } int fbsdrun_vmexit_on_hlt(void) { return (guest_vmexit_on_hlt); } int fbsdrun_virtio_msix(void) { return (virtio_msix); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) { int error; assert(fromcpu == BSP); /* * The 'newcpu' must be activated in the context of 'fromcpu'. If * vm_activate_cpu() is delayed until newcpu's pthread starts running * then vmm.ko is out-of-sync with bhyve and this can create a race * with vm_suspend(). */ error = vm_activate_cpu(ctx, newcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); /* * Set up the vmexit struct to allow execution to start * at the given RIP */ vmexit[newcpu].rip = rip; vmexit[newcpu].inst_length = 0; mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); assert(error == 0); } static int fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(1); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme, strictio); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vmexit->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { (void)spinup_ap(ctx, *pvcpu, vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define EXIT_REASON_EPT_MISCONFIG 49 #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 #define VMCS_IDENT(x) ((x) | 0x80000000) static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(ctx, *pvcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_mtrap++; return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { int err, i; struct vie *vie; stats.vmexit_inst_emul++; vie = &vmexit->u.inst_emul.vie; err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, vie, &vmexit->u.inst_emul.paging); if (err) { if (err == ESRCH) { fprintf(stderr, "Unhandled memory access to 0x%lx\n", vmexit->u.inst_emul.gpa); } fprintf(stderr, "Failed to emulate instruction ["); for (i = 0; i < vie->num_valid; i++) { fprintf(stderr, "0x%02x%s", vie->inst[i], i != (vie->num_valid - 1) ? " " : ""); } fprintf(stderr, "] at 0x%lx\n", vmexit->rip); return (VMEXIT_ABORT); } return (VMEXIT_CONTINUE); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { enum vm_suspend_how how; how = vmexit->u.suspended.how; fbsdrun_deletecpu(ctx, *pvcpu); if (*pvcpu != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, }; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) { int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); while (1) { error = vm_run(ctx, vcpu, &vmexit[vcpu]); if (error != 0) break; exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(1); } rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(1); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { int tmp, error; error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ if (error == 0) return (VM_MAXCPU); else return (1); } void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; if (fbsdrun_vmexit_on_hlt()) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (fbsdrun_vmexit_on_pause()) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (x2apic_mode) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + const cap_ioctl_t *cmds; + size_t ncmds; +#endif reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(1); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(1); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(1); } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); + if (cap_rights_limit(vm_get_device_fd(ctx), &rights) == -1 && + errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + vm_get_ioctls(&ncmds); + cmds = vm_get_ioctls(NULL); + if (cmds == NULL) + errx(EX_OSERR, "out of memory"); + if (cap_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1 && + errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + free((cap_ioctl_t *)cmds); +#endif + if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(1); } } return (ctx); } int main(int argc, char *argv[]) { int c, error, gdb_port, err, bvmcons; int max_vcpus, mptgen, memflags; int rtc_localtime; struct vmctx *ctx; uint64_t rip; size_t memsize; char *optstr; bvmcons = 0; progname = basename(argv[0]); gdb_port = 0; guest_ncpus = 1; memsize = 256 * MB; mptgen = 1; rtc_localtime = 1; memflags = 0; optstr = "abehuwxACHIPSWYp:g:c:s:m:l:U:"; while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': x2apic_mode = 0; break; case 'A': acpi = 1; break; case 'b': bvmcons = 1; break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': guest_ncpus = atoi(optarg); break; case 'C': memflags |= VM_MEM_F_INCORE; break; case 'g': gdb_port = atoi(optarg); break; case 'l': if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; case 's': if (pci_parse_slot(optarg) != 0) exit(1); else break; case 'S': memflags |= VM_MEM_F_WIRED; break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", optarg); break; case 'H': guest_vmexit_on_hlt = 1; break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': guest_vmexit_on_pause = 1; break; case 'e': strictio = 1; break; case 'u': rtc_localtime = 0; break; case 'U': guest_uuid_str = optarg; break; case 'w': strictmsr = 0; break; case 'W': virtio_msix = 0; break; case 'x': x2apic_mode = 1; break; case 'Y': mptgen = 0; break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc != 1) usage(1); vmname = argv[0]; ctx = do_open(vmname); if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(1); } fbsdrun_set_capabilities(ctx, BSP); vm_set_memflags(ctx, memflags); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(1); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(1); } init_mem(); init_inout(); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx, rtc_localtime); sci_init(ctx); /* * Exit if a device emulation finds an error in it's initilization */ if (init_pci(ctx) != 0) exit(1); if (gdb_port != 0) init_dbgport(gdb_port); if (bvmcons) init_bvmcons(); if (lpc_bootrom()) { if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(1); } error = vcpu_reset(ctx, BSP); assert(error == 0); } error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (mptgen) { error = mptable_build(ctx, guest_ncpus); if (error) exit(1); } error = smbios_build(ctx); assert(error == 0); if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } if (lpc_bootrom()) fwctl_init(); + +#ifndef WITHOUT_CAPSICUM + + + if (bhyve_caph_limit_stdoe() == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + + if (cap_enter() == -1 && errno != ENOSYS) + errx(EX_OSERR, "cap_enter() failed"); +#endif /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); /* * Add CPU 0 */ fbsdrun_addcpu(ctx, BSP, BSP, rip); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(1); } Index: releng/11.1/usr.sbin/bhyve/block_if.c =================================================================== --- releng/11.1/usr.sbin/bhyve/block_if.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/block_if.c (revision 320934) @@ -1,821 +1,844 @@ /*- * Copyright (c) 2013 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include "bhyverun.h" #include "mevent.h" #include "block_if.h" #define BLOCKIF_SIG 0xb109b109 #define BLOCKIF_NUMTHR 8 #define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR) enum blockop { BOP_READ, BOP_WRITE, BOP_FLUSH, BOP_DELETE }; enum blockstat { BST_FREE, BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE }; struct blockif_elem { TAILQ_ENTRY(blockif_elem) be_link; struct blockif_req *be_req; enum blockop be_op; enum blockstat be_status; pthread_t be_tid; off_t be_block; }; struct blockif_ctxt { int bc_magic; int bc_fd; int bc_ischr; int bc_isgeom; int bc_candelete; int bc_rdonly; off_t bc_size; int bc_sectsz; int bc_psectsz; int bc_psectoff; int bc_closing; pthread_t bc_btid[BLOCKIF_NUMTHR]; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_pendq; TAILQ_HEAD(, blockif_elem) bc_busyq; struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; }; static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; struct blockif_sig_elem { pthread_mutex_t bse_mtx; pthread_cond_t bse_cond; int bse_pending; struct blockif_sig_elem *bse_next; }; static struct blockif_sig_elem *blockif_bse_head; static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { struct blockif_elem *be, *tbe; off_t off; int i; be = TAILQ_FIRST(&bc->bc_freeq); assert(be != NULL); assert(be->be_status == BST_FREE); TAILQ_REMOVE(&bc->bc_freeq, be, be_link); be->be_req = breq; be->be_op = op; switch (op) { case BOP_READ: case BOP_WRITE: case BOP_DELETE: off = breq->br_offset; for (i = 0; i < breq->br_iovcnt; i++) off += breq->br_iov[i].iov_len; break; default: off = OFF_MAX; } be->be_block = off; TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_block == breq->br_offset) break; } if (tbe == NULL) { TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { if (tbe->be_block == breq->br_offset) break; } } if (tbe == NULL) be->be_status = BST_PEND; else be->be_status = BST_BLOCK; TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); return (be->be_status == BST_PEND); } static int blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_status == BST_PEND) break; assert(be->be_status == BST_BLOCK); } if (be == NULL) return (0); TAILQ_REMOVE(&bc->bc_pendq, be, be_link); be->be_status = BST_BUSY; be->be_tid = t; TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); *bep = be; return (1); } static void blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) { struct blockif_elem *tbe; if (be->be_status == BST_DONE || be->be_status == BST_BUSY) TAILQ_REMOVE(&bc->bc_busyq, be, be_link); else TAILQ_REMOVE(&bc->bc_pendq, be, be_link); TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_req->br_offset == be->be_block) tbe->be_status = BST_PEND; } be->be_tid = 0; be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); } static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct blockif_req *br; off_t arg[2]; ssize_t clen, len, off, boff, voff; int i, err; br = be->be_req; if (br->br_iovcnt <= 1) buf = NULL; err = 0; switch (be->be_op) { case BOP_READ: if (buf == NULL) { if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= len; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); if (pread(bc->bc_fd, buf, len, br->br_offset + off) < 0) { err = errno; break; } boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(br->br_iov[i].iov_base + voff, buf + boff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); off += len; br->br_resid -= len; } break; case BOP_WRITE: if (bc->bc_rdonly) { err = EROFS; break; } if (buf == NULL) { if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= len; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(buf + boff, br->br_iov[i].iov_base + voff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); if (pwrite(bc->bc_fd, buf, len, br->br_offset + off) < 0) { err = errno; break; } off += len; br->br_resid -= len; } break; case BOP_FLUSH: if (bc->bc_ischr) { if (ioctl(bc->bc_fd, DIOCGFLUSH)) err = errno; } else if (fsync(bc->bc_fd)) err = errno; break; case BOP_DELETE: if (!bc->bc_candelete) err = EOPNOTSUPP; else if (bc->bc_rdonly) err = EROFS; else if (bc->bc_ischr) { arg[0] = br->br_offset; arg[1] = br->br_resid; if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) err = errno; else br->br_resid = 0; } else err = EOPNOTSUPP; break; default: err = EINVAL; break; } be->be_status = BST_DONE; (*br->br_callback)(br, err); } static void * blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; pthread_t t; uint8_t *buf; bc = arg; if (bc->bc_isgeom) buf = malloc(MAXPHYS); else buf = NULL; t = pthread_self(); pthread_mutex_lock(&bc->bc_mtx); for (;;) { while (blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) break; pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } pthread_mutex_unlock(&bc->bc_mtx); if (buf) free(buf); pthread_exit(NULL); return (NULL); } static void blockif_sigcont_handler(int signal, enum ev_type type, void *arg) { struct blockif_sig_elem *bse; for (;;) { /* * Process the entire list even if not intended for * this thread. */ do { bse = blockif_bse_head; if (bse == NULL) return; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)bse, (uintptr_t)bse->bse_next)); pthread_mutex_lock(&bse->bse_mtx); bse->bse_pending = 0; pthread_cond_signal(&bse->bse_cond); pthread_mutex_unlock(&bse->bse_mtx); } } static void blockif_init(void) { mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); (void) signal(SIGCONT, SIG_IGN); } struct blockif_ctxt * blockif_open(const char *optstr, const char *ident) { char tname[MAXCOMLEN + 1]; char name[MAXPATHLEN]; char *nopt, *xopts, *cp; struct blockif_ctxt *bc; struct stat sbuf; struct diocgattr_arg arg; off_t size, psectsz, psectoff; int extra, fd, i, sectsz; int nocache, sync, ro, candelete, geom, ssopt, pssopt; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; +#endif pthread_once(&blockif_once, blockif_init); fd = -1; ssopt = 0; nocache = 0; sync = 0; ro = 0; /* * The first element in the optstring is always a pathname. * Optional elements follow */ nopt = xopts = strdup(optstr); while (xopts != NULL) { cp = strsep(&xopts, ","); if (cp == nopt) /* file or device pathname */ continue; else if (!strcmp(cp, "nocache")) nocache = 1; else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) sync = 1; else if (!strcmp(cp, "ro")) ro = 1; else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) ; else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) pssopt = ssopt; else { fprintf(stderr, "Invalid device option \"%s\"\n", cp); goto err; } } extra = 0; if (nocache) extra |= O_DIRECT; if (sync) extra |= O_SYNC; fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); if (fd < 0 && !ro) { /* Attempt a r/w fail with a r/o open */ fd = open(nopt, O_RDONLY | extra); ro = 1; } if (fd < 0) { warn("Could not open backing file: %s", nopt); goto err; } if (fstat(fd, &sbuf) < 0) { warn("Could not stat backing file %s", nopt); goto err; } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, + CAP_WRITE); + if (ro) + cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); + + if (cap_rights_limit(fd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; psectsz = psectoff = 0; candelete = geom = 0; if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, §sz)) { perror("Could not fetch dev blk/sector size"); goto err; } assert(size != 0); assert(sectsz != 0); if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); if (ioctl(fd, DIOCGATTR, &arg) == 0) candelete = arg.value.i; if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) geom = 1; } else psectsz = sbuf.st_blksize; + +#ifndef WITHOUT_CAPSICUM + if (cap_ioctls_limit(fd, cmds, nitems(cmds)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif if (ssopt != 0) { if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || ssopt > pssopt) { fprintf(stderr, "Invalid sector size %d/%d\n", ssopt, pssopt); goto err; } /* * Some backend drivers (e.g. cd0, ada0) require that the I/O * size be a multiple of the device's sector size. * * Validate that the emulated sector size complies with this * requirement. */ if (S_ISCHR(sbuf.st_mode)) { if (ssopt < sectsz || (ssopt % sectsz) != 0) { fprintf(stderr, "Sector size %d incompatible " "with underlying device sector size %d\n", ssopt, sectsz); goto err; } } sectsz = ssopt; psectsz = pssopt; psectoff = 0; } bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { perror("calloc"); goto err; } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; bc->bc_ischr = S_ISCHR(sbuf.st_mode); bc->bc_isgeom = geom; bc->bc_candelete = candelete; bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; bc->bc_psectsz = psectsz; bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } for (i = 0; i < BLOCKIF_NUMTHR; i++) { pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); pthread_set_name_np(bc->bc_btid[i], tname); } return (bc); err: if (fd >= 0) close(fd); return (NULL); } static int blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { int err; err = 0; pthread_mutex_lock(&bc->bc_mtx); if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ if (blockif_enqueue(bc, breq, op)) pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than * the specified blockif queue limit. Return an * error to indicate that the queue length has been * exceeded. */ err = E2BIG; } pthread_mutex_unlock(&bc->bc_mtx); return (err); } int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_READ)); } int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_WRITE)); } int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_FLUSH)); } int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_DELETE)); } int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { struct blockif_elem *be; assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); /* * Check pending requests. */ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_req == breq) break; } if (be != NULL) { /* * Found it. */ blockif_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); } /* * Check in-flight requests. */ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { if (be->be_req == breq) break; } if (be == NULL) { /* * Didn't find it. */ pthread_mutex_unlock(&bc->bc_mtx); return (EINVAL); } /* * Interrupt the processing thread to force it return * prematurely via it's normal callback path. */ while (be->be_status == BST_BUSY) { struct blockif_sig_elem bse, *old_head; pthread_mutex_init(&bse.bse_mtx, NULL); pthread_cond_init(&bse.bse_cond, NULL); bse.bse_pending = 1; do { old_head = blockif_bse_head; bse.bse_next = old_head; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)old_head, (uintptr_t)&bse)); pthread_kill(be->be_tid, SIGCONT); pthread_mutex_lock(&bse.bse_mtx); while (bse.bse_pending) pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); pthread_mutex_unlock(&bse.bse_mtx); } pthread_mutex_unlock(&bc->bc_mtx); /* * The processing thread has been interrupted. Since it's not * clear if the callback has been invoked yet, return EBUSY. */ return (EBUSY); } int blockif_close(struct blockif_ctxt *bc) { void *jval; int i; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ pthread_mutex_lock(&bc->bc_mtx); bc->bc_closing = 1; pthread_mutex_unlock(&bc->bc_mtx); pthread_cond_broadcast(&bc->bc_cond); for (i = 0; i < BLOCKIF_NUMTHR; i++) pthread_join(bc->bc_btid[i], &jval); /* XXX Cancel queued i/o's ??? */ /* * Release resources */ bc->bc_magic = 0; close(bc->bc_fd); free(bc); return (0); } /* * Return virtual C/H/S values for a given block. Use the algorithm * outlined in the VHD specification to calculate values. */ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { off_t sectors; /* total sectors of the block dev */ off_t hcyl; /* cylinders times heads */ uint16_t secpt; /* sectors per track */ uint8_t heads; assert(bc->bc_magic == BLOCKIF_SIG); sectors = bc->bc_size / bc->bc_sectsz; /* Clamp the size to the largest possible with CHS */ if (sectors > 65535UL*16*255) sectors = 65535UL*16*255; if (sectors >= 65536UL*16*63) { secpt = 255; heads = 16; hcyl = sectors / secpt; } else { secpt = 17; hcyl = sectors / secpt; heads = (hcyl + 1023) / 1024; if (heads < 4) heads = 4; if (hcyl >= (heads * 1024) || heads > 16) { secpt = 31; heads = 16; hcyl = sectors / secpt; } if (hcyl >= (heads * 1024)) { secpt = 63; heads = 16; hcyl = sectors / secpt; } } *c = hcyl / heads; *h = heads; *s = secpt; } /* * Accessors */ off_t blockif_size(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_size); } int blockif_sectsz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_sectsz); } void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) { assert(bc->bc_magic == BLOCKIF_SIG); *size = bc->bc_psectsz; *off = bc->bc_psectoff; } int blockif_queuesz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (BLOCKIF_MAXREQ - 1); } int blockif_is_ro(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } int blockif_candelete(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_candelete); } Index: releng/11.1/usr.sbin/bhyve/consport.c =================================================================== --- releng/11.1/usr.sbin/bhyve/consport.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/consport.c (revision 320934) @@ -1,153 +1,170 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include +#include +#include #include #include #include #include #include +#include #include "inout.h" #include "pci_lpc.h" #define BVM_CONSOLE_PORT 0x220 #define BVM_CONS_SIG ('b' << 8 | 'v') static struct termios tio_orig, tio_new; static void ttyclose(void) { tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); } static void ttyopen(void) { tcgetattr(STDIN_FILENO, &tio_orig); cfmakeraw(&tio_new); tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); atexit(ttyclose); } static bool tty_char_available(void) { fd_set rfds; struct timeval tv; FD_ZERO(&rfds); FD_SET(STDIN_FILENO, &rfds); tv.tv_sec = 0; tv.tv_usec = 0; if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { return (true); } else { return (false); } } static int ttyread(void) { char rb; if (tty_char_available()) { read(STDIN_FILENO, &rb, 1); return (rb & 0xff); } else { return (-1); } } static void ttywrite(unsigned char wb) { (void) write(STDOUT_FILENO, &wb, 1); } static int console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { static int opened; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; +#endif if (bytes == 2 && in) { *eax = BVM_CONS_SIG; return (0); } /* * Guests might probe this port to look for old ISA devices * using single-byte reads. Return 0xff for those. */ if (bytes == 1 && in) { *eax = 0xff; return (0); } if (bytes != 4) return (-1); if (!opened) { +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); + if (cap_rights_limit(STDIN_FILENO, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(STDIN_FILENO, cmds, nitems(cmds)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif ttyopen(); opened = 1; } if (in) *eax = ttyread(); else ttywrite(*eax); return (0); } SYSRES_IO(BVM_CONSOLE_PORT, 4); static struct inout_port consport = { "bvmcons", BVM_CONSOLE_PORT, 1, IOPORT_F_INOUT, console_handler }; void init_bvmcons(void) { register_inout(&consport); } Index: releng/11.1/usr.sbin/bhyve/dbgport.c =================================================================== --- releng/11.1/usr.sbin/bhyve/dbgport.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/dbgport.c (revision 320934) @@ -1,159 +1,173 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include +#include #include #include +#include #include #include #include #include "inout.h" #include "dbgport.h" #include "pci_lpc.h" #define BVM_DBG_PORT 0x224 #define BVM_DBG_SIG ('B' << 8 | 'V') static int listen_fd, conn_fd; static struct sockaddr_in sin; static int dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { int nwritten, nread, printonce; int on = 1; char ch; if (bytes == 2 && in) { *eax = BVM_DBG_SIG; return (0); } if (bytes != 4) return (-1); again: printonce = 0; while (conn_fd < 0) { if (!printonce) { printf("Waiting for connection from gdb\r\n"); printonce = 1; } conn_fd = accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK); if (conn_fd >= 0) { /* Avoid EPIPE after the client drops off. */ (void)setsockopt(conn_fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on)); /* Improve latency for one byte at a time tranfers. */ (void)setsockopt(conn_fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); } else if (errno != EINTR) { perror("accept"); } } if (in) { nread = read(conn_fd, &ch, 1); if (nread == -1 && errno == EAGAIN) *eax = -1; else if (nread == 1) *eax = ch; else { close(conn_fd); conn_fd = -1; goto again; } } else { ch = *eax; nwritten = write(conn_fd, &ch, 1); if (nwritten != 1) { close(conn_fd); conn_fd = -1; goto again; } } return (0); } static struct inout_port dbgport = { "bvmdbg", BVM_DBG_PORT, 1, IOPORT_F_INOUT, dbg_handler }; SYSRES_IO(BVM_DBG_PORT, 4); void init_dbgport(int sport) { int reuse; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif conn_fd = -1; if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { perror("socket"); exit(1); } sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_ANY); sin.sin_port = htons(sport); reuse = 1; if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)) < 0) { perror("setsockopt"); exit(1); } if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { perror("bind"); exit(1); } if (listen(listen_fd, 1) < 0) { perror("listen"); exit(1); } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_WRITE); + if (cap_rights_limit(listen_fd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif register_inout(&dbgport); } Index: releng/11.1/usr.sbin/bhyve/mevent.c =================================================================== --- releng/11.1/usr.sbin/bhyve/mevent.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/mevent.c (revision 320934) @@ -1,456 +1,478 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Micro event library for FreeBSD, designed for a single i/o thread * using kqueue, and having events be persistent by default. */ #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include +#include #include #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include #include "mevent.h" #define MEVENT_MAX 64 #define MEV_ADD 1 #define MEV_ENABLE 2 #define MEV_DISABLE 3 #define MEV_DEL_PENDING 4 extern char *vmname; static pthread_t mevent_tid; static int mevent_timid = 43; static int mevent_pipefd[2]; static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; struct mevent { void (*me_func)(int, enum ev_type, void *); #define me_msecs me_fd int me_fd; int me_timid; enum ev_type me_type; void *me_param; int me_cq; int me_state; int me_closefd; LIST_ENTRY(mevent) me_list; }; static LIST_HEAD(listhead, mevent) global_head, change_head; static void mevent_qlock(void) { pthread_mutex_lock(&mevent_lmutex); } static void mevent_qunlock(void) { pthread_mutex_unlock(&mevent_lmutex); } static void mevent_pipe_read(int fd, enum ev_type type, void *param) { char buf[MEVENT_MAX]; int status; /* * Drain the pipe read side. The fd is non-blocking so this is * safe to do. */ do { status = read(fd, buf, sizeof(buf)); } while (status == MEVENT_MAX); } static void mevent_notify(void) { char c; /* * If calling from outside the i/o thread, write a byte on the * pipe to force the i/o thread to exit the blocking kevent call. */ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { write(mevent_pipefd[1], &c, 1); } } static int mevent_kq_filter(struct mevent *mevp) { int retval; retval = 0; if (mevp->me_type == EVF_READ) retval = EVFILT_READ; if (mevp->me_type == EVF_WRITE) retval = EVFILT_WRITE; if (mevp->me_type == EVF_TIMER) retval = EVFILT_TIMER; if (mevp->me_type == EVF_SIGNAL) retval = EVFILT_SIGNAL; return (retval); } static int mevent_kq_flags(struct mevent *mevp) { int ret; switch (mevp->me_state) { case MEV_ADD: ret = EV_ADD; /* implicitly enabled */ break; case MEV_ENABLE: ret = EV_ENABLE; break; case MEV_DISABLE: ret = EV_DISABLE; break; case MEV_DEL_PENDING: ret = EV_DELETE; break; default: assert(0); break; } return (ret); } static int mevent_kq_fflags(struct mevent *mevp) { /* XXX nothing yet, perhaps EV_EOF for reads ? */ return (0); } static int mevent_build(int mfd, struct kevent *kev) { struct mevent *mevp, *tmpp; int i; i = 0; mevent_qlock(); LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { if (mevp->me_closefd) { /* * A close of the file descriptor will remove the * event */ close(mevp->me_fd); } else { if (mevp->me_type == EVF_TIMER) { kev[i].ident = mevp->me_timid; kev[i].data = mevp->me_msecs; } else { kev[i].ident = mevp->me_fd; kev[i].data = 0; } kev[i].filter = mevent_kq_filter(mevp); kev[i].flags = mevent_kq_flags(mevp); kev[i].fflags = mevent_kq_fflags(mevp); kev[i].udata = mevp; i++; } mevp->me_cq = 0; LIST_REMOVE(mevp, me_list); if (mevp->me_state == MEV_DEL_PENDING) { free(mevp); } else { LIST_INSERT_HEAD(&global_head, mevp, me_list); } assert(i < MEVENT_MAX); } mevent_qunlock(); return (i); } static void mevent_handle(struct kevent *kev, int numev) { struct mevent *mevp; int i; for (i = 0; i < numev; i++) { mevp = kev[i].udata; /* XXX check for EV_ERROR ? */ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); } } struct mevent * mevent_add(int tfd, enum ev_type type, void (*func)(int, enum ev_type, void *), void *param) { struct mevent *lp, *mevp; if (tfd < 0 || func == NULL) { return (NULL); } mevp = NULL; mevent_qlock(); /* * Verify that the fd/type tuple is not present in any list */ LIST_FOREACH(lp, &global_head, me_list) { if (type != EVF_TIMER && lp->me_fd == tfd && lp->me_type == type) { goto exit; } } LIST_FOREACH(lp, &change_head, me_list) { if (type != EVF_TIMER && lp->me_fd == tfd && lp->me_type == type) { goto exit; } } /* * Allocate an entry, populate it, and add it to the change list. */ mevp = calloc(1, sizeof(struct mevent)); if (mevp == NULL) { goto exit; } if (type == EVF_TIMER) { mevp->me_msecs = tfd; mevp->me_timid = mevent_timid++; } else mevp->me_fd = tfd; mevp->me_type = type; mevp->me_func = func; mevp->me_param = param; LIST_INSERT_HEAD(&change_head, mevp, me_list); mevp->me_cq = 1; mevp->me_state = MEV_ADD; mevent_notify(); exit: mevent_qunlock(); return (mevp); } static int mevent_update(struct mevent *evp, int newstate) { /* * It's not possible to enable/disable a deleted event */ if (evp->me_state == MEV_DEL_PENDING) return (EINVAL); /* * No update needed if state isn't changing */ if (evp->me_state == newstate) return (0); mevent_qlock(); evp->me_state = newstate; /* * Place the entry onto the changed list if not already there. */ if (evp->me_cq == 0) { evp->me_cq = 1; LIST_REMOVE(evp, me_list); LIST_INSERT_HEAD(&change_head, evp, me_list); mevent_notify(); } mevent_qunlock(); return (0); } int mevent_enable(struct mevent *evp) { return (mevent_update(evp, MEV_ENABLE)); } int mevent_disable(struct mevent *evp) { return (mevent_update(evp, MEV_DISABLE)); } static int mevent_delete_event(struct mevent *evp, int closefd) { mevent_qlock(); /* * Place the entry onto the changed list if not already there, and * mark as to be deleted. */ if (evp->me_cq == 0) { evp->me_cq = 1; LIST_REMOVE(evp, me_list); LIST_INSERT_HEAD(&change_head, evp, me_list); mevent_notify(); } evp->me_state = MEV_DEL_PENDING; if (closefd) evp->me_closefd = 1; mevent_qunlock(); return (0); } int mevent_delete(struct mevent *evp) { return (mevent_delete_event(evp, 0)); } int mevent_delete_close(struct mevent *evp) { return (mevent_delete_event(evp, 1)); } static void mevent_set_name(void) { pthread_set_name_np(mevent_tid, "mevent"); } void mevent_dispatch(void) { struct kevent changelist[MEVENT_MAX]; struct kevent eventlist[MEVENT_MAX]; struct mevent *pipev; int mfd; int numev; int ret; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif mevent_tid = pthread_self(); mevent_set_name(); mfd = kqueue(); assert(mfd > 0); +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_KQUEUE); + if (cap_rights_limit(mfd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + /* * Open the pipe that will be used for other threads to force * the blocking kqueue call to exit by writing to it. Set the * descriptor to non-blocking. */ ret = pipe(mevent_pipefd); if (ret < 0) { perror("pipe"); exit(0); } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (cap_rights_limit(mevent_pipefd[0], &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_rights_limit(mevent_pipefd[1], &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif /* * Add internal event handler for the pipe write fd */ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); assert(pipev != NULL); for (;;) { /* * Build changelist if required. * XXX the changelist can be put into the blocking call * to eliminate the extra syscall. Currently better for * debug. */ numev = mevent_build(mfd, changelist); if (numev) { ret = kevent(mfd, changelist, numev, NULL, 0, NULL); if (ret == -1) { perror("Error return from kevent change"); } } /* * Block awaiting events */ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); if (ret == -1 && errno != EINTR) { perror("Error return from kevent monitor"); } /* * Handle reported events */ mevent_handle(eventlist, ret); } } Index: releng/11.1/usr.sbin/bhyve/pci_e82545.c =================================================================== --- releng/11.1/usr.sbin/bhyve/pci_e82545.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/pci_e82545.c (revision 320934) @@ -1,2372 +1,2386 @@ /* * Copyright (c) 2016 Alexander Motin * Copyright (c) 2015 Peter Grehan * Copyright (c) 2013 Jeremiah Lott, Avere Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include #include "e1000_regs.h" #include "e1000_defines.h" #include "mii.h" #include "bhyverun.h" #include "pci_emul.h" #include "mevent.h" /* Hardware/register definitions XXX: move some to common code. */ #define E82545_VENDOR_ID_INTEL 0x8086 #define E82545_DEV_ID_82545EM_COPPER 0x100F #define E82545_SUBDEV_ID 0x1008 #define E82545_REVISION_4 4 #define E82545_MDIC_DATA_MASK 0x0000FFFF #define E82545_MDIC_OP_MASK 0x0c000000 #define E82545_MDIC_IE 0x20000000 #define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */ #define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */ #define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */ #define E82545_BAR_REGISTER 0 #define E82545_BAR_REGISTER_LEN (128*1024) #define E82545_BAR_FLASH 1 #define E82545_BAR_FLASH_LEN (64*1024) #define E82545_BAR_IO 2 #define E82545_BAR_IO_LEN 8 #define E82545_IOADDR 0x00000000 #define E82545_IODATA 0x00000004 #define E82545_IO_REGISTER_MAX 0x0001FFFF #define E82545_IO_FLASH_BASE 0x00080000 #define E82545_IO_FLASH_MAX 0x000FFFFF #define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2)) #define E82545_RAR_MAX 15 #define E82545_MTA_MAX 127 #define E82545_VFTA_MAX 127 /* Slightly modified from the driver versions, hardcoded for 3 opcode bits, * followed by 6 address bits. * TODO: make opcode bits and addr bits configurable? * NVM Commands - Microwire */ #define E82545_NVM_OPCODE_BITS 3 #define E82545_NVM_ADDR_BITS 6 #define E82545_NVM_DATA_BITS 16 #define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS) #define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1) #define E82545_NVM_OPCODE_MASK \ (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS) #define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */ #define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */ #define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */ #define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */ #define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128K */ #define E1000_ICR_SRPD 0x00010000 /* This is an arbitrary number. There is no hard limit on the chip. */ #define I82545_MAX_TXSEGS 64 /* Legacy receive descriptor */ struct e1000_rx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ uint16_t length; /* Length of data DMAed into data buffer */ uint16_t csum; /* Packet checksum */ uint8_t status; /* Descriptor status */ uint8_t errors; /* Descriptor Errors */ uint16_t special; }; /* Transmit descriptor types */ #define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000) #define E1000_TXD_TYP_L (0) #define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C) #define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D) /* Legacy transmit descriptor */ struct e1000_tx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t cso; /* Checksum offset */ uint8_t cmd; /* Descriptor control */ } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t css; /* Checksum start */ uint16_t special; } fields; } upper; }; /* Context descriptor */ struct e1000_context_desc { union { uint32_t ip_config; struct { uint8_t ipcss; /* IP checksum start */ uint8_t ipcso; /* IP checksum offset */ uint16_t ipcse; /* IP checksum end */ } ip_fields; } lower_setup; union { uint32_t tcp_config; struct { uint8_t tucss; /* TCP checksum start */ uint8_t tucso; /* TCP checksum offset */ uint16_t tucse; /* TCP checksum end */ } tcp_fields; } upper_setup; uint32_t cmd_and_length; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t hdr_len; /* Header length */ uint16_t mss; /* Maximum segment size */ } fields; } tcp_seg_setup; }; /* Data descriptor */ struct e1000_data_desc { uint64_t buffer_addr; /* Address of the descriptor's buffer address */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t typ_len_ext; uint8_t cmd; } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t popts; /* Packet Options */ uint16_t special; } fields; } upper; }; union e1000_tx_udesc { struct e1000_tx_desc td; struct e1000_context_desc cd; struct e1000_data_desc dd; }; /* Tx checksum info for a packet. */ struct ck_info { int ck_valid; /* ck_info is valid */ uint8_t ck_start; /* start byte of cksum calcuation */ uint8_t ck_off; /* offset of cksum insertion */ uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */ }; /* * Debug printf */ static int e82545_debug = 0; #define DPRINTF(msg,params...) if (e82545_debug) fprintf(stderr, "e82545: " msg, params) #define WPRINTF(msg,params...) fprintf(stderr, "e82545: " msg, params) #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) /* s/w representation of the RAL/RAH regs */ struct eth_uni { int eu_valid; int eu_addrsel; struct ether_addr eu_eth; }; struct e82545_softc { struct pci_devinst *esc_pi; struct vmctx *esc_ctx; struct mevent *esc_mevp; struct mevent *esc_mevpitr; pthread_mutex_t esc_mtx; struct ether_addr esc_mac; int esc_tapfd; /* General */ uint32_t esc_CTRL; /* x0000 device ctl */ uint32_t esc_FCAL; /* x0028 flow ctl addr lo */ uint32_t esc_FCAH; /* x002C flow ctl addr hi */ uint32_t esc_FCT; /* x0030 flow ctl type */ uint32_t esc_VET; /* x0038 VLAN eth type */ uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */ uint32_t esc_LEDCTL; /* x0E00 LED control */ uint32_t esc_PBA; /* x1000 pkt buffer allocation */ /* Interrupt control */ int esc_irq_asserted; uint32_t esc_ICR; /* x00C0 cause read/clear */ uint32_t esc_ITR; /* x00C4 intr throttling */ uint32_t esc_ICS; /* x00C8 cause set */ uint32_t esc_IMS; /* x00D0 mask set/read */ uint32_t esc_IMC; /* x00D8 mask clear */ /* Transmit */ union e1000_tx_udesc *esc_txdesc; struct e1000_context_desc esc_txctx; pthread_t esc_tx_tid; pthread_cond_t esc_tx_cond; int esc_tx_enabled; int esc_tx_active; uint32_t esc_TXCW; /* x0178 transmit config */ uint32_t esc_TCTL; /* x0400 transmit ctl */ uint32_t esc_TIPG; /* x0410 inter-packet gap */ uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */ uint64_t esc_tdba; /* verified 64-bit desc table addr */ uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */ uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */ uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */ uint16_t esc_TDH; /* x3810 desc table head idx */ uint16_t esc_TDHr; /* internal read version of TDH */ uint16_t esc_TDT; /* x3818 desc table tail idx */ uint32_t esc_TIDV; /* x3820 intr delay */ uint32_t esc_TXDCTL; /* x3828 desc control */ uint32_t esc_TADV; /* x382C intr absolute delay */ /* L2 frame acceptance */ struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */ uint32_t esc_fmcast[128]; /* Multicast filter bit-match */ uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */ /* Receive */ struct e1000_rx_desc *esc_rxdesc; pthread_cond_t esc_rx_cond; int esc_rx_enabled; int esc_rx_active; int esc_rx_loopback; uint32_t esc_RCTL; /* x0100 receive ctl */ uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */ uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */ uint64_t esc_rdba; /* verified 64-bit desc table addr */ uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */ uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/ uint32_t esc_RDLEN; /* x2808 #descriptors */ uint16_t esc_RDH; /* x2810 desc table head idx */ uint16_t esc_RDT; /* x2818 desc table tail idx */ uint32_t esc_RDTR; /* x2820 intr delay */ uint32_t esc_RXDCTL; /* x2828 desc control */ uint32_t esc_RADV; /* x282C intr absolute delay */ uint32_t esc_RSRPD; /* x2C00 recv small packet detect */ uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */ /* IO Port register access */ uint32_t io_addr; /* Shadow copy of MDIC */ uint32_t mdi_control; /* Shadow copy of EECD */ uint32_t eeprom_control; /* Latest NVM in/out */ uint16_t nvm_data; uint16_t nvm_opaddr; /* stats */ uint32_t missed_pkt_count; /* dropped for no room in rx queue */ uint32_t pkt_rx_by_size[6]; uint32_t pkt_tx_by_size[6]; uint32_t good_pkt_rx_count; uint32_t bcast_pkt_rx_count; uint32_t mcast_pkt_rx_count; uint32_t good_pkt_tx_count; uint32_t bcast_pkt_tx_count; uint32_t mcast_pkt_tx_count; uint32_t oversize_rx_count; uint32_t tso_tx_count; uint64_t good_octets_rx; uint64_t good_octets_tx; uint64_t missed_octets; /* counts missed and oversized */ uint8_t nvm_bits:6; /* number of bits remaining in/out */ uint8_t nvm_mode:2; #define E82545_NVM_MODE_OPADDR 0x0 #define E82545_NVM_MODE_DATAIN 0x1 #define E82545_NVM_MODE_DATAOUT 0x2 /* EEPROM data */ uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; }; static void e82545_reset(struct e82545_softc *sc, int dev); static void e82545_rx_enable(struct e82545_softc *sc); static void e82545_rx_disable(struct e82545_softc *sc); static void e82545_tap_callback(int fd, enum ev_type type, void *param); static void e82545_tx_start(struct e82545_softc *sc); static void e82545_tx_enable(struct e82545_softc *sc); static void e82545_tx_disable(struct e82545_softc *sc); static inline int e82545_size_stat_index(uint32_t size) { if (size <= 64) { return 0; } else if (size >= 1024) { return 5; } else { /* should be 1-4 */ return (ffs(size) - 6); } } static void e82545_init_eeprom(struct e82545_softc *sc) { uint16_t checksum, i; /* mac addr */ sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) | (((uint16_t)sc->esc_mac.octet[1]) << 8); sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) | (((uint16_t)sc->esc_mac.octet[3]) << 8); sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) | (((uint16_t)sc->esc_mac.octet[5]) << 8); /* pci ids */ sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID; sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL; sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER; sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL; /* fill in the checksum */ checksum = 0; for (i = 0; i < NVM_CHECKSUM_REG; i++) { checksum += sc->eeprom_data[i]; } checksum = NVM_SUM - checksum; sc->eeprom_data[NVM_CHECKSUM_REG] = checksum; DPRINTF("eeprom checksum: 0x%x\r\n", checksum); } static void e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr, uint8_t phy_addr, uint32_t data) { DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x\r\n", reg_addr, phy_addr, data); } static uint32_t e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr, uint8_t phy_addr) { //DPRINTF("Read mdi reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr); switch (reg_addr) { case PHY_STATUS: return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | MII_SR_AUTONEG_COMPLETE); case PHY_AUTONEG_ADV: return NWAY_AR_SELECTOR_FIELD; case PHY_LP_ABILITY: return 0; case PHY_1000T_STATUS: return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS | SR_1000T_LOCAL_RX_STATUS); case PHY_ID1: return (M88E1011_I_PHY_ID >> 16) & 0xFFFF; case PHY_ID2: return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF; default: DPRINTF("Unknown mdi read reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr); return 0; } /* not reached */ } static void e82545_eecd_strobe(struct e82545_softc *sc) { /* Microwire state machine */ /* DPRINTF("eeprom state machine srtobe " "0x%x 0x%x 0x%x 0x%x\r\n", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data);*/ if (sc->nvm_bits == 0) { DPRINTF("eeprom state machine not expecting data! " "0x%x 0x%x 0x%x 0x%x\r\n", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); return; } sc->nvm_bits--; if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) { /* shifting out */ if (sc->nvm_data & 0x8000) { sc->eeprom_control |= E1000_EECD_DO; } else { sc->eeprom_control &= ~E1000_EECD_DO; } sc->nvm_data <<= 1; if (sc->nvm_bits == 0) { /* read done, back to opcode mode. */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) { /* shifting in */ sc->nvm_data <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_data |= 1; } if (sc->nvm_bits == 0) { /* eeprom write */ uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; if (op != E82545_NVM_OPCODE_WRITE) { DPRINTF("Illegal eeprom write op 0x%x\r\n", sc->nvm_opaddr); } else if (addr >= E82545_NVM_EEPROM_SIZE) { DPRINTF("Illegal eeprom write addr 0x%x\r\n", sc->nvm_opaddr); } else { DPRINTF("eeprom write eeprom[0x%x] = 0x%x\r\n", addr, sc->nvm_data); sc->eeprom_data[addr] = sc->nvm_data; } /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) { sc->nvm_opaddr <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_opaddr |= 1; } if (sc->nvm_bits == 0) { uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; switch (op) { case E82545_NVM_OPCODE_EWEN: DPRINTF("eeprom write enable: 0x%x\r\n", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; break; case E82545_NVM_OPCODE_READ: { uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; sc->nvm_mode = E82545_NVM_MODE_DATAOUT; sc->nvm_bits = E82545_NVM_DATA_BITS; if (addr < E82545_NVM_EEPROM_SIZE) { sc->nvm_data = sc->eeprom_data[addr]; DPRINTF("eeprom read: eeprom[0x%x] = 0x%x\r\n", addr, sc->nvm_data); } else { DPRINTF("eeprom illegal read: 0x%x\r\n", sc->nvm_opaddr); sc->nvm_data = 0; } break; } case E82545_NVM_OPCODE_WRITE: sc->nvm_mode = E82545_NVM_MODE_DATAIN; sc->nvm_bits = E82545_NVM_DATA_BITS; sc->nvm_data = 0; break; default: DPRINTF("eeprom unknown op: 0x%x\r\r", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } } else { DPRINTF("eeprom state machine wrong state! " "0x%x 0x%x 0x%x 0x%x\r\n", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); } } static void e82545_itr_callback(int fd, enum ev_type type, void *param) { uint32_t new; struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); new = sc->esc_ICR & sc->esc_IMS; if (new && !sc->esc_irq_asserted) { DPRINTF("itr callback: lintr assert %x\r\n", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); } else { mevent_delete(sc->esc_mevpitr); sc->esc_mevpitr = NULL; } pthread_mutex_unlock(&sc->esc_mtx); } static void e82545_icr_assert(struct e82545_softc *sc, uint32_t bits) { uint32_t new; DPRINTF("icr assert: 0x%x\r\n", bits); /* * An interrupt is only generated if bits are set that * aren't already in the ICR, these bits are unmasked, * and there isn't an interrupt already pending. */ new = bits & ~sc->esc_ICR & sc->esc_IMS; sc->esc_ICR |= bits; if (new == 0) { DPRINTF("icr assert: masked %x, ims %x\r\n", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("icr assert: throttled %x, ims %x\r\n", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("icr assert: lintr assert %x\r\n", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_ims_change(struct e82545_softc *sc, uint32_t bits) { uint32_t new; /* * Changing the mask may allow previously asserted * but masked interrupt requests to generate an interrupt. */ new = bits & sc->esc_ICR & ~sc->esc_IMS; sc->esc_IMS |= bits; if (new == 0) { DPRINTF("ims change: masked %x, ims %x\r\n", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("ims change: throttled %x, ims %x\r\n", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("ims change: lintr assert %x\n\r", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits) { DPRINTF("icr deassert: 0x%x\r\n", bits); sc->esc_ICR &= ~bits; /* * If there are no longer any interrupt sources and there * was an asserted interrupt, clear it */ if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { DPRINTF("icr deassert: lintr deassert %x\r\n", bits); pci_lintr_deassert(sc->esc_pi); sc->esc_irq_asserted = 0; } } static void e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value) { DPRINTF("intr_write: off %x, val %x\n\r", offset, value); switch (offset) { case E1000_ICR: e82545_icr_deassert(sc, value); break; case E1000_ITR: sc->esc_ITR = value; break; case E1000_ICS: sc->esc_ICS = value; /* not used: store for debug */ e82545_icr_assert(sc, value); break; case E1000_IMS: e82545_ims_change(sc, value); break; case E1000_IMC: sc->esc_IMC = value; /* for debug */ sc->esc_IMS &= ~value; // XXX clear interrupts if all ICR bits now masked // and interrupt was pending ? break; default: break; } } static uint32_t e82545_intr_read(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; retval = 0; DPRINTF("intr_read: off %x\n\r", offset); switch (offset) { case E1000_ICR: retval = sc->esc_ICR; sc->esc_ICR = 0; e82545_icr_deassert(sc, ~0); break; case E1000_ITR: retval = sc->esc_ITR; break; case E1000_ICS: /* write-only register */ break; case E1000_IMS: retval = sc->esc_IMS; break; case E1000_IMC: /* write-only register */ break; default: break; } return (retval); } static void e82545_devctl(struct e82545_softc *sc, uint32_t val) { sc->esc_CTRL = val & ~E1000_CTRL_RST; if (val & E1000_CTRL_RST) { DPRINTF("e1k: s/w reset, ctl %x\n", val); e82545_reset(sc, 1); } /* XXX check for phy reset ? */ } static void e82545_rx_update_rdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 | sc->esc_RDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, sc->esc_rdba, sc->esc_RDLEN); } static void e82545_rx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN); /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */ sc->esc_RCTL = val & ~0xF9204c01; DPRINTF("rx_ctl - %s RCTL %x, val %x\n", on ? "on" : "off", sc->esc_RCTL, val); /* state change requested */ if (on != sc->esc_rx_enabled) { if (on) { /* Catch disallowed/unimplemented settings */ //assert(!(val & E1000_RCTL_LBM_TCVR)); if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) { sc->esc_rx_loopback = 1; } else { sc->esc_rx_loopback = 0; } e82545_rx_update_rdba(sc); e82545_rx_enable(sc); } else { e82545_rx_disable(sc); sc->esc_rx_loopback = 0; sc->esc_rdba = 0; sc->esc_rxdesc = NULL; } } } static void e82545_tx_update_tdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba, sc->esc_TDLEN); } static void e82545_tx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN); /* ignore TCTL_EN settings that don't change state */ if (on == sc->esc_tx_enabled) return; if (on) { e82545_tx_update_tdba(sc); e82545_tx_enable(sc); } else { e82545_tx_disable(sc); sc->esc_tdba = 0; sc->esc_txdesc = NULL; } /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */ sc->esc_TCTL = val & ~0xFE800005; } int e82545_bufsz(uint32_t rctl) { switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) { case (E1000_RCTL_SZ_2048): return (2048); case (E1000_RCTL_SZ_1024): return (1024); case (E1000_RCTL_SZ_512): return (512); case (E1000_RCTL_SZ_256): return (256); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096); } return (256); /* Forbidden value. */ } static uint8_t dummybuf[2048]; /* XXX one packet at a time until this is debugged */ static void e82545_tap_callback(int fd, enum ev_type type, void *param) { struct e82545_softc *sc = param; struct e1000_rx_desc *rxd; struct iovec vec[64]; int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size; uint32_t cause = 0; uint16_t *tp, tag, head; pthread_mutex_lock(&sc->esc_mtx); DPRINTF("rx_run: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT); if (!sc->esc_rx_enabled || sc->esc_rx_loopback) { DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped\r\n", sc->esc_rx_enabled, sc->esc_rx_loopback); while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) { } goto done1; } bufsz = e82545_bufsz(sc->esc_RCTL); maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522; maxpktdesc = (maxpktsz + bufsz - 1) / bufsz; size = sc->esc_RDLEN / 16; head = sc->esc_RDH; left = (size + sc->esc_RDT - head) % size; if (left < maxpktdesc) { DPRINTF("rx overflow (%d < %d) -- packet(s) dropped\r\n", left, maxpktdesc); while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) { } goto done1; } sc->esc_rx_active = 1; pthread_mutex_unlock(&sc->esc_mtx); for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) { /* Grab rx descriptor pointed to by the head pointer */ for (i = 0; i < maxpktdesc; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; vec[i].iov_base = paddr_guest2host(sc->esc_ctx, rxd->buffer_addr, bufsz); vec[i].iov_len = bufsz; } len = readv(sc->esc_tapfd, vec, maxpktdesc); if (len <= 0) { DPRINTF("tap: readv() returned %d\n", len); goto done; } /* * Adjust the packet length based on whether the CRC needs * to be stripped or if the packet is less than the minimum * eth packet size. */ if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) len = ETHER_MIN_LEN - ETHER_CRC_LEN; if (!(sc->esc_RCTL & E1000_RCTL_SECRC)) len += ETHER_CRC_LEN; n = (len + bufsz - 1) / bufsz; DPRINTF("packet read %d bytes, %d segs, head %d\r\n", len, n, head); /* Apply VLAN filter. */ tp = (uint16_t *)vec[0].iov_base + 6; if ((sc->esc_RCTL & E1000_RCTL_VFE) && (ntohs(tp[0]) == sc->esc_VET)) { tag = ntohs(tp[1]) & 0x0fff; if ((sc->esc_fvlan[tag >> 5] & (1 << (tag & 0x1f))) != 0) { DPRINTF("known VLAN %d\r\n", tag); } else { DPRINTF("unknown VLAN %d\r\n", tag); n = 0; continue; } } /* Update all consumed descriptors. */ for (i = 0; i < n - 1; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; rxd->status = E1000_RXD_STAT_DD; } rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = len % bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; /* XXX signal no checksum for now */ rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM | E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD; /* Schedule receive interrupts. */ if (len <= sc->esc_RSRPD) { cause |= E1000_ICR_SRPD | E1000_ICR_RXT0; } else { /* XXX: RDRT and RADV timers should be here. */ cause |= E1000_ICR_RXT0; } head = (head + n) % size; left -= n; } done: pthread_mutex_lock(&sc->esc_mtx); sc->esc_rx_active = 0; if (sc->esc_rx_enabled == 0) pthread_cond_signal(&sc->esc_rx_cond); sc->esc_RDH = head; /* Respect E1000_RCTL_RDMTS */ left = (size + sc->esc_RDT - head) % size; if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1))) cause |= E1000_ICR_RXDMT0; /* Assert all accumulated interrupts. */ if (cause != 0) e82545_icr_assert(sc, cause); done1: DPRINTF("rx_run done: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT); pthread_mutex_unlock(&sc->esc_mtx); } static uint16_t e82545_carry(uint32_t sum) { sum = (sum & 0xFFFF) + (sum >> 16); if (sum > 0xFFFF) sum -= 0xFFFF; return (sum); } static uint16_t e82545_buf_checksum(uint8_t *buf, int len) { int i; uint32_t sum = 0; /* Checksum all the pairs of bytes first... */ for (i = 0; i < (len & ~1U); i += 2) sum += *((u_int16_t *)(buf + i)); /* * If there's a single byte left over, checksum it, too. * Network byte order is big-endian, so the remaining byte is * the high byte. */ if (i < len) sum += htons(buf[i] << 8); return (e82545_carry(sum)); } static uint16_t e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len) { int now, odd; uint32_t sum = 0, s; /* Skip completely unneeded vectors. */ while (iovcnt > 0 && iov->iov_len <= off && off > 0) { off -= iov->iov_len; iov++; iovcnt--; } /* Calculate checksum of requested range. */ odd = 0; while (len > 0 && iovcnt > 0) { now = MIN(len, iov->iov_len - off); s = e82545_buf_checksum(iov->iov_base + off, now); sum += odd ? (s << 8) : s; odd ^= (now & 1); len -= now; off = 0; iov++; iovcnt--; } return (e82545_carry(sum)); } /* * Return the transmit descriptor type. */ int e82545_txdesc_type(uint32_t lower) { int type; type = 0; if (lower & E1000_TXD_CMD_DEXT) type = lower & E1000_TXD_MASK; return (type); } static void e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck) { uint16_t cksum; int cklen; DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d\r\n", iovcnt, ck->ck_start, ck->ck_off, ck->ck_len); cklen = ck->ck_len ? ck->ck_len - ck->ck_start + 1 : INT_MAX; cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen); *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum; } static void e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt) { if (sc->esc_tapfd == -1) return; (void) writev(sc->esc_tapfd, iov, iovcnt); } static void e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, int *tdwb) { union e1000_tx_udesc *dsc; for ( ; head != tail; head = (head + 1) % dsize) { dsc = &sc->esc_txdesc[head]; if (dsc->td.lower.data & E1000_TXD_CMD_RS) { dsc->td.upper.data |= E1000_TXD_STAT_DD; *tdwb = 1; } } } static int e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, uint16_t *rhead, int *tdwb) { uint8_t *hdr, *hdrp; struct iovec iovb[I82545_MAX_TXSEGS + 2]; struct iovec tiov[I82545_MAX_TXSEGS + 2]; struct e1000_context_desc *cd; struct ck_info ckinfo[2]; struct iovec *iov; union e1000_tx_udesc *dsc; int desc, dtype, len, ntype, iovcnt, tlen, hdrlen, vlen, tcp, tso; int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff; uint32_t tcpsum, tcpseq; uint16_t ipcs, tcpcs, ipid, ohead; ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0; iovcnt = 0; tlen = 0; ntype = 0; tso = 0; ohead = head; /* iovb[0/1] may be used for writable copy of headers. */ iov = &iovb[2]; for (desc = 0; ; desc++, head = (head + 1) % dsize) { if (head == tail) { *rhead = head; return (0); } dsc = &sc->esc_txdesc[head]; dtype = e82545_txdesc_type(dsc->td.lower.data); if (desc == 0) { switch (dtype) { case E1000_TXD_TYP_C: DPRINTF("tx ctxt desc idx %d: %016jx " "%08x%08x\r\n", head, dsc->td.buffer_addr, dsc->td.upper.data, dsc->td.lower.data); /* Save context and return */ sc->esc_txctx = dsc->cd; goto done; case E1000_TXD_TYP_L: DPRINTF("tx legacy desc idx %d: %08x%08x\r\n", head, dsc->td.upper.data, dsc->td.lower.data); /* * legacy cksum start valid in first descriptor */ ntype = dtype; ckinfo[0].ck_start = dsc->td.upper.fields.css; break; case E1000_TXD_TYP_D: DPRINTF("tx data desc idx %d: %08x%08x\r\n", head, dsc->td.upper.data, dsc->td.lower.data); ntype = dtype; break; default: break; } } else { /* Descriptor type must be consistent */ assert(dtype == ntype); DPRINTF("tx next desc idx %d: %08x%08x\r\n", head, dsc->td.upper.data, dsc->td.lower.data); } len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length : dsc->dd.lower.data & 0xFFFFF; if (len > 0) { /* Strip checksum supplied by guest. */ if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 && (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0) len -= 2; tlen += len; if (iovcnt < I82545_MAX_TXSEGS) { iov[iovcnt].iov_base = paddr_guest2host( sc->esc_ctx, dsc->td.buffer_addr, len); iov[iovcnt].iov_len = len; } iovcnt++; } /* * Pull out info that is valid in the final descriptor * and exit descriptor loop. */ if (dsc->td.lower.data & E1000_TXD_CMD_EOP) { if (dtype == E1000_TXD_TYP_L) { if (dsc->td.lower.data & E1000_TXD_CMD_IC) { ckinfo[0].ck_valid = 1; ckinfo[0].ck_off = dsc->td.lower.flags.cso; ckinfo[0].ck_len = 0; } } else { cd = &sc->esc_txctx; if (dsc->dd.lower.data & E1000_TXD_CMD_TSE) tso = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM) ckinfo[0].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM || tso) { ckinfo[0].ck_start = cd->lower_setup.ip_fields.ipcss; ckinfo[0].ck_off = cd->lower_setup.ip_fields.ipcso; ckinfo[0].ck_len = cd->lower_setup.ip_fields.ipcse; } if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM) ckinfo[1].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM || tso) { ckinfo[1].ck_start = cd->upper_setup.tcp_fields.tucss; ckinfo[1].ck_off = cd->upper_setup.tcp_fields.tucso; ckinfo[1].ck_len = cd->upper_setup.tcp_fields.tucse; } } break; } } if (iovcnt > I82545_MAX_TXSEGS) { WPRINTF("tx too many descriptors (%d > %d) -- dropped\r\n", iovcnt, I82545_MAX_TXSEGS); goto done; } hdrlen = vlen = 0; /* Estimate writable space for VLAN header insertion. */ if ((sc->esc_CTRL & E1000_CTRL_VME) && (dsc->td.lower.data & E1000_TXD_CMD_VLE)) { hdrlen = ETHER_ADDR_LEN*2; vlen = ETHER_VLAN_ENCAP_LEN; } if (!tso) { /* Estimate required writable space for checksums. */ if (ckinfo[0].ck_valid) hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2); if (ckinfo[1].ck_valid) hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2); /* Round up writable space to the first vector. */ if (hdrlen != 0 && iov[0].iov_len > hdrlen && iov[0].iov_len < hdrlen + 100) hdrlen = iov[0].iov_len; } else { /* In case of TSO header length provided by software. */ hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len; } /* Allocate, fill and prepend writable header vector. */ if (hdrlen != 0) { hdr = __builtin_alloca(hdrlen + vlen); hdr += vlen; for (left = hdrlen, hdrp = hdr; left > 0; left -= now, hdrp += now) { now = MIN(left, iov->iov_len); memcpy(hdrp, iov->iov_base, now); iov->iov_base += now; iov->iov_len -= now; if (iov->iov_len == 0) { iov++; iovcnt--; } } iov--; iovcnt++; iov->iov_base = hdr; iov->iov_len = hdrlen; } /* Insert VLAN tag. */ if (vlen != 0) { hdr -= ETHER_VLAN_ENCAP_LEN; memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2); hdrlen += ETHER_VLAN_ENCAP_LEN; hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8; hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff; hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8; hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff; iov->iov_base = hdr; iov->iov_len += ETHER_VLAN_ENCAP_LEN; /* Correct checksum offsets after VLAN tag insertion. */ ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[0].ck_len != 0) ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[1].ck_len != 0) ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN; } /* Simple non-TSO case. */ if (!tso) { /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]); if (ckinfo[1].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]); e82545_transmit_backend(sc, iov, iovcnt); goto done; } /* Doing TSO. */ tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0; mss = sc->esc_txctx.tcp_seg_setup.fields.mss; paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff); DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs\r\n", tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt); ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]); tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]); ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off]; tcpcs = 0; if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */ tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off]; pv = 1; pvoff = 0; for (seg = 0, left = paylen; left > 0; seg++, left -= now) { now = MIN(left, mss); /* Construct IOVs for the segment. */ /* Include whole original header. */ tiov[0].iov_base = hdr; tiov[0].iov_len = hdrlen; tiovcnt = 1; /* Include respective part of payload IOV. */ for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) { nnow = MIN(nleft, iov[pv].iov_len - pvoff); tiov[tiovcnt].iov_base = iov[pv].iov_base + pvoff; tiov[tiovcnt++].iov_len = nnow; if (pvoff + nnow == iov[pv].iov_len) { pv++; pvoff = 0; } else pvoff += nnow; } DPRINTF("tx segment %d %d+%d bytes %d iovs\r\n", seg, hdrlen, now, tiovcnt); /* Update IP header. */ if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) { /* IPv4 -- set length and ID */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] = htons(hdrlen - ckinfo[0].ck_start + now); *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(ipid + seg); } else { /* IPv6 -- set length */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(hdrlen - ckinfo[0].ck_start - 40 + now); } /* Update pseudo-header checksum. */ tcpsum = tcpcs; tcpsum += htons(hdrlen - ckinfo[1].ck_start + now); /* Update TCP/UDP headers. */ if (tcp) { /* Update sequence number and FIN/PUSH flags. */ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = htonl(tcpseq + paylen - left); if (now < left) { hdr[ckinfo[1].ck_start + 13] &= ~(TH_FIN | TH_PUSH); } } else { /* Update payload length. */ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = hdrlen - ckinfo[1].ck_start + now; } /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) { *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs; e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]); } if (ckinfo[1].ck_valid) { *(uint16_t *)&hdr[ckinfo[1].ck_off] = e82545_carry(tcpsum); e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]); } e82545_transmit_backend(sc, tiov, tiovcnt); } done: head = (head + 1) % dsize; e82545_transmit_done(sc, ohead, head, dsize, tdwb); *rhead = head; return (desc + 1); } static void e82545_tx_run(struct e82545_softc *sc) { uint32_t cause; uint16_t head, rhead, tail, size; int lim, tdwb, sent; head = sc->esc_TDH; tail = sc->esc_TDT; size = sc->esc_TDLEN / 16; DPRINTF("tx_run: head %x, rhead %x, tail %x\r\n", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); pthread_mutex_unlock(&sc->esc_mtx); rhead = head; tdwb = 0; for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) { sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb); if (sent == 0) break; head = rhead; } pthread_mutex_lock(&sc->esc_mtx); sc->esc_TDH = head; sc->esc_TDHr = rhead; cause = 0; if (tdwb) cause |= E1000_ICR_TXDW; if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT) cause |= E1000_ICR_TXQE; if (cause) e82545_icr_assert(sc, cause); DPRINTF("tx_run done: head %x, rhead %x, tail %x\r\n", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); } static void * e82545_tx_thread(void *param) { struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); for (;;) { while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) { if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT) break; sc->esc_tx_active = 0; if (sc->esc_tx_enabled == 0) pthread_cond_signal(&sc->esc_tx_cond); pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } sc->esc_tx_active = 1; /* Process some tx descriptors. Lock dropped inside. */ e82545_tx_run(sc); } } static void e82545_tx_start(struct e82545_softc *sc) { if (sc->esc_tx_active == 0) pthread_cond_signal(&sc->esc_tx_cond); } static void e82545_tx_enable(struct e82545_softc *sc) { sc->esc_tx_enabled = 1; } static void e82545_tx_disable(struct e82545_softc *sc) { sc->esc_tx_enabled = 0; while (sc->esc_tx_active) pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } static void e82545_rx_enable(struct e82545_softc *sc) { sc->esc_rx_enabled = 1; } static void e82545_rx_disable(struct e82545_softc *sc) { sc->esc_rx_enabled = 0; while (sc->esc_rx_active) pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx); } static void e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) { struct eth_uni *eu; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV); eu->eu_addrsel = (wval >> 16) & 0x3; eu->eu_eth.octet[5] = wval >> 8; eu->eu_eth.octet[4] = wval; } else { /* RAL */ eu->eu_eth.octet[3] = wval >> 24; eu->eu_eth.octet[2] = wval >> 16; eu->eu_eth.octet[1] = wval >> 8; eu->eu_eth.octet[0] = wval; } } static uint32_t e82545_read_ra(struct e82545_softc *sc, int reg) { struct eth_uni *eu; uint32_t retval; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ retval = (eu->eu_valid << 31) | (eu->eu_addrsel << 16) | (eu->eu_eth.octet[5] << 8) | eu->eu_eth.octet[4]; } else { /* RAL */ retval = (eu->eu_eth.octet[3] << 24) | (eu->eu_eth.octet[2] << 16) | (eu->eu_eth.octet[1] << 8) | eu->eu_eth.octet[0]; } return (retval); } static void e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value) { int ridx; if (offset & 0x3) { DPRINTF("Unaligned register write offset:0x%x value:0x%x\r\n", offset, value); return; } DPRINTF("Register write: 0x%x value: 0x%x\r\n", offset, value); switch (offset) { case E1000_CTRL: case E1000_CTRL_DUP: e82545_devctl(sc, value); break; case E1000_FCAL: sc->esc_FCAL = value; break; case E1000_FCAH: sc->esc_FCAH = value & ~0xFFFF0000; break; case E1000_FCT: sc->esc_FCT = value & ~0xFFFF0000; break; case E1000_VET: sc->esc_VET = value & ~0xFFFF0000; break; case E1000_FCTTV: sc->esc_FCTTV = value & ~0xFFFF0000; break; case E1000_LEDCTL: sc->esc_LEDCTL = value & ~0x30303000; break; case E1000_PBA: sc->esc_PBA = value & 0x0000FF80; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: e82545_intr_write(sc, offset, value); break; case E1000_RCTL: e82545_rx_ctl(sc, value); break; case E1000_FCRTL: sc->esc_FCRTL = value & ~0xFFFF0007; break; case E1000_FCRTH: sc->esc_FCRTH = value & ~0xFFFF0007; break; case E1000_RDBAL(0): sc->esc_RDBAL = value & ~0xF; if (sc->esc_rx_enabled) { /* Apparently legal: update cached address */ e82545_rx_update_rdba(sc); } break; case E1000_RDBAH(0): assert(!sc->esc_rx_enabled); sc->esc_RDBAH = value; break; case E1000_RDLEN(0): assert(!sc->esc_rx_enabled); sc->esc_RDLEN = value & ~0xFFF0007F; break; case E1000_RDH(0): /* XXX should only ever be zero ? Range check ? */ sc->esc_RDH = value; break; case E1000_RDT(0): /* XXX if this opens up the rx ring, do something ? */ sc->esc_RDT = value; break; case E1000_RDTR: /* ignore FPD bit 31 */ sc->esc_RDTR = value & ~0xFFFF0000; break; case E1000_RXDCTL(0): sc->esc_RXDCTL = value & ~0xFEC0C0C0; break; case E1000_RADV: sc->esc_RADV = value & ~0xFFFF0000; break; case E1000_RSRPD: sc->esc_RSRPD = value & ~0xFFFFF000; break; case E1000_RXCSUM: sc->esc_RXCSUM = value & ~0xFFFFF800; break; case E1000_TXCW: sc->esc_TXCW = value & ~0x3FFF0000; break; case E1000_TCTL: e82545_tx_ctl(sc, value); break; case E1000_TIPG: sc->esc_TIPG = value; break; case E1000_AIT: sc->esc_AIT = value; break; case E1000_TDBAL(0): sc->esc_TDBAL = value & ~0xF; if (sc->esc_tx_enabled) { /* Apparently legal */ e82545_tx_update_tdba(sc); } break; case E1000_TDBAH(0): //assert(!sc->esc_tx_enabled); sc->esc_TDBAH = value; break; case E1000_TDLEN(0): //assert(!sc->esc_tx_enabled); sc->esc_TDLEN = value & ~0xFFF0007F; break; case E1000_TDH(0): //assert(!sc->esc_tx_enabled); /* XXX should only ever be zero ? Range check ? */ sc->esc_TDHr = sc->esc_TDH = value; break; case E1000_TDT(0): /* XXX range check ? */ sc->esc_TDT = value; if (sc->esc_tx_enabled) e82545_tx_start(sc); break; case E1000_TIDV: sc->esc_TIDV = value & ~0xFFFF0000; break; case E1000_TXDCTL(0): //assert(!sc->esc_tx_enabled); sc->esc_TXDCTL = value & ~0xC0C0C0; break; case E1000_TADV: sc->esc_TADV = value & ~0xFFFF0000; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; e82545_write_ra(sc, ridx, value); break; case E1000_MTA ... (E1000_MTA + (127*4)): sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value; break; case E1000_EECD: { //DPRINTF("EECD write 0x%x -> 0x%x\r\n", sc->eeprom_control, value); /* edge triggered low->high */ uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ? 0 : (value & E1000_EECD_SK)); uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS| E1000_EECD_DI|E1000_EECD_REQ); sc->eeprom_control &= ~eecd_mask; sc->eeprom_control |= (value & eecd_mask); /* grant/revoke immediately */ if (value & E1000_EECD_REQ) { sc->eeprom_control |= E1000_EECD_GNT; } else { sc->eeprom_control &= ~E1000_EECD_GNT; } if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) { e82545_eecd_strobe(sc); } return; } case E1000_MDIC: { uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >> E1000_MDIC_REG_SHIFT); uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT); sc->mdi_control = (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST)); if ((value & E1000_MDIC_READY) != 0) { DPRINTF("Incorrect MDIC ready bit: 0x%x\r\n", value); return; } switch (value & E82545_MDIC_OP_MASK) { case E1000_MDIC_OP_READ: sc->mdi_control &= ~E82545_MDIC_DATA_MASK; sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr); break; case E1000_MDIC_OP_WRITE: e82545_write_mdi(sc, reg_addr, phy_addr, value & E82545_MDIC_DATA_MASK); break; default: DPRINTF("Unknown MDIC op: 0x%x\r\n", value); return; } /* TODO: barrier? */ sc->mdi_control |= E1000_MDIC_READY; if (value & E82545_MDIC_IE) { // TODO: generate interrupt } return; } case E1000_MANC: case E1000_STATUS: return; default: DPRINTF("Unknown write register: 0x%x value:%x\r\n", offset, value); return; } } static uint32_t e82545_read_register(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; int ridx; if (offset & 0x3) { DPRINTF("Unaligned register read offset:0x%x\r\n", offset); return 0; } DPRINTF("Register read: 0x%x\r\n", offset); switch (offset) { case E1000_CTRL: retval = sc->esc_CTRL; break; case E1000_STATUS: retval = E1000_STATUS_FD | E1000_STATUS_LU | E1000_STATUS_SPEED_1000; break; case E1000_FCAL: retval = sc->esc_FCAL; break; case E1000_FCAH: retval = sc->esc_FCAH; break; case E1000_FCT: retval = sc->esc_FCT; break; case E1000_VET: retval = sc->esc_VET; break; case E1000_FCTTV: retval = sc->esc_FCTTV; break; case E1000_LEDCTL: retval = sc->esc_LEDCTL; break; case E1000_PBA: retval = sc->esc_PBA; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: retval = e82545_intr_read(sc, offset); break; case E1000_RCTL: retval = sc->esc_RCTL; break; case E1000_FCRTL: retval = sc->esc_FCRTL; break; case E1000_FCRTH: retval = sc->esc_FCRTH; break; case E1000_RDBAL(0): retval = sc->esc_RDBAL; break; case E1000_RDBAH(0): retval = sc->esc_RDBAH; break; case E1000_RDLEN(0): retval = sc->esc_RDLEN; break; case E1000_RDH(0): retval = sc->esc_RDH; break; case E1000_RDT(0): retval = sc->esc_RDT; break; case E1000_RDTR: retval = sc->esc_RDTR; break; case E1000_RXDCTL(0): retval = sc->esc_RXDCTL; break; case E1000_RADV: retval = sc->esc_RADV; break; case E1000_RSRPD: retval = sc->esc_RSRPD; break; case E1000_RXCSUM: retval = sc->esc_RXCSUM; break; case E1000_TXCW: retval = sc->esc_TXCW; break; case E1000_TCTL: retval = sc->esc_TCTL; break; case E1000_TIPG: retval = sc->esc_TIPG; break; case E1000_AIT: retval = sc->esc_AIT; break; case E1000_TDBAL(0): retval = sc->esc_TDBAL; break; case E1000_TDBAH(0): retval = sc->esc_TDBAH; break; case E1000_TDLEN(0): retval = sc->esc_TDLEN; break; case E1000_TDH(0): retval = sc->esc_TDH; break; case E1000_TDT(0): retval = sc->esc_TDT; break; case E1000_TIDV: retval = sc->esc_TIDV; break; case E1000_TXDCTL(0): retval = sc->esc_TXDCTL; break; case E1000_TADV: retval = sc->esc_TADV; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; retval = e82545_read_ra(sc, ridx); break; case E1000_MTA ... (E1000_MTA + (127*4)): retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2]; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2]; break; case E1000_EECD: //DPRINTF("EECD read %x\r\n", sc->eeprom_control); retval = sc->eeprom_control; break; case E1000_MDIC: retval = sc->mdi_control; break; case E1000_MANC: retval = 0; break; /* stats that we emulate. */ case E1000_MPC: retval = sc->missed_pkt_count; break; case E1000_PRC64: retval = sc->pkt_rx_by_size[0]; break; case E1000_PRC127: retval = sc->pkt_rx_by_size[1]; break; case E1000_PRC255: retval = sc->pkt_rx_by_size[2]; break; case E1000_PRC511: retval = sc->pkt_rx_by_size[3]; break; case E1000_PRC1023: retval = sc->pkt_rx_by_size[4]; break; case E1000_PRC1522: retval = sc->pkt_rx_by_size[5]; break; case E1000_GPRC: retval = sc->good_pkt_rx_count; break; case E1000_BPRC: retval = sc->bcast_pkt_rx_count; break; case E1000_MPRC: retval = sc->mcast_pkt_rx_count; break; case E1000_GPTC: case E1000_TPT: retval = sc->good_pkt_tx_count; break; case E1000_GORCL: retval = (uint32_t)sc->good_octets_rx; break; case E1000_GORCH: retval = (uint32_t)(sc->good_octets_rx >> 32); break; case E1000_TOTL: case E1000_GOTCL: retval = (uint32_t)sc->good_octets_tx; break; case E1000_TOTH: case E1000_GOTCH: retval = (uint32_t)(sc->good_octets_tx >> 32); break; case E1000_ROC: retval = sc->oversize_rx_count; break; case E1000_TORL: retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets); break; case E1000_TORH: retval = (uint32_t)((sc->good_octets_rx + sc->missed_octets) >> 32); break; case E1000_TPR: retval = sc->good_pkt_rx_count + sc->missed_pkt_count + sc->oversize_rx_count; break; case E1000_PTC64: retval = sc->pkt_tx_by_size[0]; break; case E1000_PTC127: retval = sc->pkt_tx_by_size[1]; break; case E1000_PTC255: retval = sc->pkt_tx_by_size[2]; break; case E1000_PTC511: retval = sc->pkt_tx_by_size[3]; break; case E1000_PTC1023: retval = sc->pkt_tx_by_size[4]; break; case E1000_PTC1522: retval = sc->pkt_tx_by_size[5]; break; case E1000_MPTC: retval = sc->mcast_pkt_tx_count; break; case E1000_BPTC: retval = sc->bcast_pkt_tx_count; break; case E1000_TSCTC: retval = sc->tso_tx_count; break; /* stats that are always 0. */ case E1000_CRCERRS: case E1000_ALGNERRC: case E1000_SYMERRS: case E1000_RXERRC: case E1000_SCC: case E1000_ECOL: case E1000_MCC: case E1000_LATECOL: case E1000_COLC: case E1000_DC: case E1000_TNCRS: case E1000_SEC: case E1000_CEXTERR: case E1000_RLEC: case E1000_XONRXC: case E1000_XONTXC: case E1000_XOFFRXC: case E1000_XOFFTXC: case E1000_FCRUC: case E1000_RNBC: case E1000_RUC: case E1000_RFC: case E1000_RJC: case E1000_MGTPRC: case E1000_MGTPDC: case E1000_MGTPTC: case E1000_TSCTFC: retval = 0; break; default: DPRINTF("Unknown read register: 0x%x\r\n", offset); retval = 0; break; } return (retval); } static void e82545_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct e82545_softc *sc; //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d\r\n", baridx, offset, value, size); sc = pi->pi_arg; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr write sz:%d value:0x%lx\r\n", size, value); } else sc->io_addr = (uint32_t)value; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data write size:%d value:0x%lx\r\n", size, value); } else if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io write addr:0x%x value:0x%lx\r\n", sc->io_addr, value); } else e82545_write_register(sc, sc->io_addr, (uint32_t)value); break; default: DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d\r\n", offset, value, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx\r\n", size, offset, value); } else e82545_write_register(sc, (uint32_t)offset, (uint32_t)value); break; default: DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d\r\n", baridx, offset, value, size); } pthread_mutex_unlock(&sc->esc_mtx); } static uint64_t e82545_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct e82545_softc *sc; uint64_t retval; //DPRINTF("Read bar:%d offset:0x%lx size:%d\r\n", baridx, offset, size); sc = pi->pi_arg; retval = 0; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr read sz:%d\r\n", size); } else retval = sc->io_addr; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data read sz:%d\r\n", size); } if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io read addr:0x%x\r\n", sc->io_addr); } else retval = e82545_read_register(sc, sc->io_addr); break; default: DPRINTF("Unknown io bar read offset:0x%lx size:%d\r\n", offset, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register read size:%d offset:0x%lx\r\n", size, offset); } else retval = e82545_read_register(sc, (uint32_t)offset); break; default: DPRINTF("Unknown read bar:%d offset:0x%lx size:%d\r\n", baridx, offset, size); break; } pthread_mutex_unlock(&sc->esc_mtx); return (retval); } static void e82545_reset(struct e82545_softc *sc, int drvr) { int i; e82545_rx_disable(sc); e82545_tx_disable(sc); /* clear outstanding interrupts */ if (sc->esc_irq_asserted) pci_lintr_deassert(sc->esc_pi); /* misc */ if (!drvr) { sc->esc_FCAL = 0; sc->esc_FCAH = 0; sc->esc_FCT = 0; sc->esc_VET = 0; sc->esc_FCTTV = 0; } sc->esc_LEDCTL = 0x07061302; sc->esc_PBA = 0x00100030; /* start nvm in opcode mode. */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN; e82545_init_eeprom(sc); /* interrupt */ sc->esc_ICR = 0; sc->esc_ITR = 250; sc->esc_ICS = 0; sc->esc_IMS = 0; sc->esc_IMC = 0; /* L2 filters */ if (!drvr) { memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan)); memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast)); memset(sc->esc_uni, 0, sizeof(sc->esc_uni)); /* XXX not necessary on 82545 ?? */ sc->esc_uni[0].eu_valid = 1; memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet, ETHER_ADDR_LEN); } else { /* Clear RAH valid bits */ for (i = 0; i < 16; i++) sc->esc_uni[i].eu_valid = 0; } /* receive */ if (!drvr) { sc->esc_RDBAL = 0; sc->esc_RDBAH = 0; } sc->esc_RCTL = 0; sc->esc_FCRTL = 0; sc->esc_FCRTH = 0; sc->esc_RDLEN = 0; sc->esc_RDH = 0; sc->esc_RDT = 0; sc->esc_RDTR = 0; sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */ sc->esc_RADV = 0; sc->esc_RXCSUM = 0; /* transmit */ if (!drvr) { sc->esc_TDBAL = 0; sc->esc_TDBAH = 0; sc->esc_TIPG = 0; sc->esc_AIT = 0; sc->esc_TIDV = 0; sc->esc_TADV = 0; } sc->esc_tdba = 0; sc->esc_txdesc = NULL; sc->esc_TXCW = 0; sc->esc_TCTL = 0; sc->esc_TDLEN = 0; sc->esc_TDT = 0; sc->esc_TDHr = sc->esc_TDH = 0; sc->esc_TXDCTL = 0; } static void e82545_open_tap(struct e82545_softc *sc, char *opts) { char tbuf[80]; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif if (opts == NULL) { sc->esc_tapfd = -1; return; } strcpy(tbuf, "/dev/"); strlcat(tbuf, opts, sizeof(tbuf)); sc->esc_tapfd = open(tbuf, O_RDWR); if (sc->esc_tapfd == -1) { DPRINTF("unable to open tap device %s\n", opts); exit(1); } /* * Set non-blocking and register for read * notifications with the event loop */ int opt = 1; if (ioctl(sc->esc_tapfd, FIONBIO, &opt) < 0) { WPRINTF("tap device O_NONBLOCK failed: %d\n", errno); close(sc->esc_tapfd); sc->esc_tapfd = -1; } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (cap_rights_limit(sc->esc_tapfd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + sc->esc_mevp = mevent_add(sc->esc_tapfd, EVF_READ, e82545_tap_callback, sc); if (sc->esc_mevp == NULL) { DPRINTF("Could not register mevent %d\n", EVF_READ); close(sc->esc_tapfd); sc->esc_tapfd = -1; } } static int e82545_parsemac(char *mac_str, uint8_t *mac_addr) { struct ether_addr *ea; char *tmpstr; char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; tmpstr = strsep(&mac_str,"="); if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { ea = ether_aton(mac_str); if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { fprintf(stderr, "Invalid MAC %s\n", mac_str); return (1); } else memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); } return (0); } static int e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { DPRINTF("Loading with options: %s\r\n", opts); MD5_CTX mdctx; unsigned char digest[16]; char nstr[80]; struct e82545_softc *sc; char *devname; char *vtopts; int mac_provided; /* Setup our softc */ sc = calloc(1, sizeof(*sc)); pi->pi_arg = sc; sc->esc_pi = pi; sc->esc_ctx = ctx; pthread_mutex_init(&sc->esc_mtx, NULL); pthread_cond_init(&sc->esc_rx_cond, NULL); pthread_cond_init(&sc->esc_tx_cond, NULL); pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc); snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->esc_tx_tid, nstr); pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER); pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1); /* TODO: this card also supports msi, but the freebsd driver for it * does not, so I have not implemented it. */ pci_lintr_request(pi); pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32, E82545_BAR_REGISTER_LEN); pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32, E82545_BAR_FLASH_LEN); pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO, E82545_BAR_IO_LEN); /* * Attempt to open the tap device and read the MAC address * if specified. Copied from virtio-net, slightly modified. */ mac_provided = 0; sc->esc_tapfd = -1; if (opts != NULL) { int err; devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); if (vtopts != NULL) { err = e82545_parsemac(vtopts, sc->esc_mac.octet); if (err != 0) { free(devname); return (err); } mac_provided = 1; } if (strncmp(devname, "tap", 3) == 0 || strncmp(devname, "vmnet", 5) == 0) e82545_open_tap(sc, devname); free(devname); } /* * The default MAC address is the standard NetApp OUI of 00-a0-98, * followed by an MD5 of the PCI slot/func number and dev name */ if (!mac_provided) { snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, pi->pi_func, vmname); MD5Init(&mdctx); MD5Update(&mdctx, nstr, strlen(nstr)); MD5Final(digest, &mdctx); sc->esc_mac.octet[0] = 0x00; sc->esc_mac.octet[1] = 0xa0; sc->esc_mac.octet[2] = 0x98; sc->esc_mac.octet[3] = digest[0]; sc->esc_mac.octet[4] = digest[1]; sc->esc_mac.octet[5] = digest[2]; } /* H/w initiated reset */ e82545_reset(sc, 0); return (0); } struct pci_devemu pci_de_e82545 = { .pe_emu = "e1000", .pe_init = e82545_init, .pe_barwrite = e82545_write, .pe_barread = e82545_read }; PCI_EMUL_SET(pci_de_e82545); Index: releng/11.1/usr.sbin/bhyve/pci_passthru.c =================================================================== --- releng/11.1/usr.sbin/bhyve/pci_passthru.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/pci_passthru.c (revision 320934) @@ -1,897 +1,932 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include #include #include #include #include #include #include #include +#include #include +#include #include #include #include #include "pci_emul.h" #include "mem.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" #endif #ifndef _PATH_DEVIO #define _PATH_DEVIO "/dev/io" #endif #ifndef _PATH_MEM #define _PATH_MEM "/dev/mem" #endif #define LEGACY_SUPPORT 1 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 static int pcifd = -1; static int iofd = -1; static int memfd = -1; struct passthru_softc { struct pci_devinst *psc_pi; struct pcibar psc_bar[PCI_BARMAX + 1]; struct { int capoff; int msgctrl; int emulated; } psc_msi; struct { int capoff; } psc_msix; struct pcisel psc_sel; }; static int msi_caplen(int msgctrl) { int len; len = 10; /* minimum length of msi capability */ if (msgctrl & PCIM_MSICTRL_64BIT) len += 4; #if 0 /* * Ignore the 'mask' and 'pending' bits in the MSI capability. * We'll let the guest manipulate them directly. */ if (msgctrl & PCIM_MSICTRL_VECTOR) len += 10; #endif return (len); } static uint32_t read_config(const struct pcisel *sel, long reg, int width) { struct pci_io pi; bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; if (ioctl(pcifd, PCIOCREAD, &pi) < 0) return (0); /* XXX */ else return (pi.pi_data); } static void write_config(const struct pcisel *sel, long reg, int width, uint32_t data) { struct pci_io pi; bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; pi.pi_data = data; (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ } #ifdef LEGACY_SUPPORT static int passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) { int capoff, i; struct msicap msicap; u_char *capdata; pci_populate_msicap(&msicap, msgnum, nextptr); /* * XXX * Copy the msi capability structure in the last 16 bytes of the * config space. This is wrong because it could shadow something * useful to the device. */ capoff = 256 - roundup(sizeof(msicap), 4); capdata = (u_char *)&msicap; for (i = 0; i < sizeof(msicap); i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); return (capoff); } #endif /* LEGACY_SUPPORT */ static int cfginitmsi(struct passthru_softc *sc) { int i, ptr, capptr, cap, sts, caplen, table_size; uint32_t u32; struct pcisel sel; struct pci_devinst *pi; struct msixcap msixcap; uint32_t *msixcap_ptr; pi = sc->psc_pi; sel = sc->psc_sel; /* * Parse the capabilities and cache the location of the MSI * and MSI-X capabilities. */ sts = read_config(&sel, PCIR_STATUS, 2); if (sts & PCIM_STATUS_CAPPRESENT) { ptr = read_config(&sel, PCIR_CAP_PTR, 1); while (ptr != 0 && ptr != 0xff) { cap = read_config(&sel, ptr + PCICAP_ID, 1); if (cap == PCIY_MSI) { /* * Copy the MSI capability into the config * space of the emulated pci device */ sc->psc_msi.capoff = ptr; sc->psc_msi.msgctrl = read_config(&sel, ptr + 2, 2); sc->psc_msi.emulated = 0; caplen = msi_caplen(sc->psc_msi.msgctrl); capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; } } else if (cap == PCIY_MSIX) { /* * Copy the MSI-X capability */ sc->psc_msix.capoff = ptr; caplen = 12; msixcap_ptr = (uint32_t*) &msixcap; capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); *msixcap_ptr = u32; pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; msixcap_ptr++; } } ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); } } if (sc->psc_msix.capoff != 0) { pi->pi_msix.pba_bar = msixcap.pba_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.pba_offset = msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_bar = msixcap.table_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.table_offset = msixcap.table_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); /* Allocate the emulated MSI-X table array */ table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* Mask all table entries */ for (i = 0; i < pi->pi_msix.table_count; i++) { pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } } #ifdef LEGACY_SUPPORT /* * If the passthrough device does not support MSI then craft a * MSI capability for it. We link the new MSI capability at the * head of the list of capabilities. */ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { int origptr, msiptr; origptr = read_config(&sel, PCIR_CAP_PTR, 1); msiptr = passthru_add_msicap(pi, 1, origptr); sc->psc_msi.capoff = msiptr; sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); sc->psc_msi.emulated = 1; pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); } #endif /* Make sure one of the capabilities is present */ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) return (-1); else return (0); } static uint64_t msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *src8; uint16_t *src16; uint32_t *src32; uint64_t *src64; uint64_t data; size_t entry_offset; int index; pi = sc->psc_pi; if (offset >= pi->pi_msix.pba_offset && offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { switch(size) { case 1: src8 = (uint8_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src8; break; case 2: src16 = (uint16_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src16; break; case 4: src32 = (uint32_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src32; break; case 8: src64 = (uint64_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); data = *src64; break; default: return (-1); } return (data); } if (offset < pi->pi_msix.table_offset) return (-1); offset -= pi->pi_msix.table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; if (index >= pi->pi_msix.table_count) return (-1); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; switch(size) { case 1: src8 = (uint8_t *)((void *)entry + entry_offset); data = *src8; break; case 2: src16 = (uint16_t *)((void *)entry + entry_offset); data = *src16; break; case 4: src32 = (uint32_t *)((void *)entry + entry_offset); data = *src32; break; case 8: src64 = (uint64_t *)((void *)entry + entry_offset); data = *src64; break; default: return (-1); } return (data); } static void msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, uint64_t offset, int size, uint64_t data) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *dest8; uint16_t *dest16; uint32_t *dest32; uint64_t *dest64; size_t entry_offset; uint32_t vector_control; int index; pi = sc->psc_pi; if (offset >= pi->pi_msix.pba_offset && offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { switch(size) { case 1: dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest8 = data; break; case 2: dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest16 = data; break; case 4: dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest32 = data; break; case 8: dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset - pi->pi_msix.pba_page_offset); *dest64 = data; break; default: break; } return; } if (offset < pi->pi_msix.table_offset) return; offset -= pi->pi_msix.table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; if (index >= pi->pi_msix.table_count) return; entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* Only 4 byte naturally-aligned writes are supported */ assert(size == 4); assert(entry_offset % 4 == 0); vector_control = entry->vector_control; dest32 = (uint32_t *)((void *)entry + entry_offset); *dest32 = data; /* If MSI-X hasn't been enabled, do nothing */ if (pi->pi_msix.enabled) { /* If the entry is masked, don't set it up */ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { (void)vm_setup_pptdev_msix(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, index, entry->addr, entry->msg_data, entry->vector_control); } } } static int init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) { int b, s, f; int error, idx; size_t len, remaining; uint32_t table_size, table_offset; uint32_t pba_size, pba_offset; vm_paddr_t start; struct pci_devinst *pi = sc->psc_pi; assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); b = sc->psc_sel.pc_bus; s = sc->psc_sel.pc_dev; f = sc->psc_sel.pc_func; /* * If the MSI-X table BAR maps memory intended for * other uses, it is at least assured that the table * either resides in its own page within the region, * or it resides in a page shared with only the PBA. */ table_offset = rounddown2(pi->pi_msix.table_offset, 4096); table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); idx = pi->pi_msix.table_bar; start = pi->pi_bar[idx].addr; remaining = pi->pi_bar[idx].size; if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { pba_offset = pi->pi_msix.pba_offset; pba_size = pi->pi_msix.pba_size; if (pba_offset >= table_offset + table_size || table_offset >= pba_offset + pba_size) { /* * If the PBA does not share a page with the MSI-x * tables, no PBA emulation is required. */ pi->pi_msix.pba_page = NULL; pi->pi_msix.pba_page_offset = 0; } else { /* * The PBA overlaps with either the first or last * page of the MSI-X table region. Map the * appropriate page. */ if (pba_offset <= table_offset) pi->pi_msix.pba_page_offset = table_offset; else pi->pi_msix.pba_page_offset = table_offset + table_size - 4096; pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, start + pi->pi_msix.pba_page_offset); if (pi->pi_msix.pba_page == MAP_FAILED) { warn( "Failed to map PBA page for MSI-X on %d/%d/%d", b, s, f); return (-1); } } } /* Map everything before the MSI-X table */ if (table_offset > 0) { len = table_offset; error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); if (error) return (error); base += len; start += len; remaining -= len; } /* Skip the MSI-X table */ base += table_size; start += table_size; remaining -= table_size; /* Map everything beyond the end of the MSI-X table */ if (remaining > 0) { len = remaining; error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); if (error) return (error); } return (0); } static int cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) { int i, error; struct pci_devinst *pi; struct pci_bar_io bar; enum pcibar_type bartype; uint64_t base, size; pi = sc->psc_pi; /* * Initialize BAR registers */ for (i = 0; i <= PCI_BARMAX; i++) { bzero(&bar, sizeof(bar)); bar.pbi_sel = sc->psc_sel; bar.pbi_reg = PCIR_BAR(i); if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) continue; if (PCI_BAR_IO(bar.pbi_base)) { bartype = PCIBAR_IO; base = bar.pbi_base & PCIM_BAR_IO_BASE; } else { switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { case PCIM_BAR_MEM_64: bartype = PCIBAR_MEM64; break; default: bartype = PCIBAR_MEM32; break; } base = bar.pbi_base & PCIM_BAR_MEM_BASE; } size = bar.pbi_length; if (bartype != PCIBAR_IO) { if (((base | size) & PAGE_MASK) != 0) { warnx("passthru device %d/%d/%d BAR %d: " "base %#lx or size %#lx not page aligned\n", sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, base, size); return (-1); } } /* Cache information about the "real" BAR */ sc->psc_bar[i].type = bartype; sc->psc_bar[i].size = size; sc->psc_bar[i].addr = base; /* Allocate the BAR in the guest I/O or MMIO space */ error = pci_emul_alloc_pbar(pi, i, base, bartype, size); if (error) return (-1); /* The MSI-X table needs special handling */ if (i == pci_msix_table_bar(pi)) { error = init_msix_table(ctx, sc, base); if (error) return (-1); } else if (bartype != PCIBAR_IO) { /* Map the physical BAR in the guest MMIO space */ error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_bar[i].addr, pi->pi_bar[i].size, base); if (error) return (-1); } /* * 64-bit BAR takes up two slots so skip the next one. */ if (bartype == PCIBAR_MEM64) { i++; assert(i <= PCI_BARMAX); sc->psc_bar[i].type = PCIBAR_MEMHI64; } } return (0); } static int cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) { int error; struct passthru_softc *sc; error = 1; sc = pi->pi_arg; bzero(&sc->psc_sel, sizeof(struct pcisel)); sc->psc_sel.pc_bus = bus; sc->psc_sel.pc_dev = slot; sc->psc_sel.pc_func = func; if (cfginitmsi(sc) != 0) { warnx("failed to initialize MSI for PCI %d/%d/%d", bus, slot, func); goto done; } if (cfginitbar(ctx, sc) != 0) { warnx("failed to initialize BARs for PCI %d/%d/%d", bus, slot, func); goto done; } error = 0; /* success */ done: return (error); } static int passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { int bus, slot, func, error, memflags; struct passthru_softc *sc; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR }; + cap_ioctl_t io_ioctls[] = { IODEV_PIO }; +#endif sc = NULL; error = 1; +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE); +#endif + memflags = vm_get_memflags(ctx); if (!(memflags & VM_MEM_F_WIRED)) { warnx("passthru requires guest memory to be wired"); goto done; } if (pcifd < 0) { pcifd = open(_PATH_DEVPCI, O_RDWR, 0); if (pcifd < 0) { warn("failed to open %s", _PATH_DEVPCI); goto done; } } +#ifndef WITHOUT_CAPSICUM + if (cap_rights_limit(pcifd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + if (iofd < 0) { iofd = open(_PATH_DEVIO, O_RDWR, 0); if (iofd < 0) { warn("failed to open %s", _PATH_DEVIO); goto done; } } +#ifndef WITHOUT_CAPSICUM + if (cap_rights_limit(iofd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + if (memfd < 0) { memfd = open(_PATH_MEM, O_RDWR, 0); if (memfd < 0) { warn("failed to open %s", _PATH_MEM); goto done; } } + +#ifndef WITHOUT_CAPSICUM + cap_rights_clear(&rights, CAP_IOCTL); + cap_rights_set(&rights, CAP_MMAP_RW); + if (cap_rights_limit(memfd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif if (opts == NULL || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) { warnx("invalid passthru options"); goto done; } if (vm_assign_pptdev(ctx, bus, slot, func) != 0) { warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", bus, slot, func); goto done; } sc = calloc(1, sizeof(struct passthru_softc)); pi->pi_arg = sc; sc->psc_pi = pi; /* initialize config space */ if ((error = cfginit(ctx, pi, bus, slot, func)) != 0) goto done; error = 0; /* success */ done: if (error) { free(sc); vm_unassign_pptdev(ctx, bus, slot, func); } return (error); } static int bar_access(int coff) { if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) return (1); else return (0); } static int msicap_access(struct passthru_softc *sc, int coff) { int caplen; if (sc->psc_msi.capoff == 0) return (0); caplen = msi_caplen(sc->psc_msi.msgctrl); if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) return (1); else return (0); } static int msixcap_access(struct passthru_softc *sc, int coff) { if (sc->psc_msix.capoff == 0) return (0); return (coff >= sc->psc_msix.capoff && coff < sc->psc_msix.capoff + MSIX_CAPLEN); } static int passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t *rv) { struct passthru_softc *sc; sc = pi->pi_arg; /* * PCI BARs and MSI capability is emulated. */ if (bar_access(coff) || msicap_access(sc, coff)) return (-1); #ifdef LEGACY_SUPPORT /* * Emulate PCIR_CAP_PTR if this device does not support MSI capability * natively. */ if (sc->psc_msi.emulated) { if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) return (-1); } #endif /* Everything else just read from the device's config space */ *rv = read_config(&sc->psc_sel, coff, bytes); return (0); } static int passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int error, msix_table_entries, i; struct passthru_softc *sc; sc = pi->pi_arg; /* * PCI BARs are emulated */ if (bar_access(coff)) return (-1); /* * MSI capability is emulated */ if (msicap_access(sc, coff)) { msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val); error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum); if (error != 0) err(1, "vm_setup_pptdev_msi"); return (0); } if (msixcap_access(sc, coff)) { msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val); if (pi->pi_msix.enabled) { msix_table_entries = pi->pi_msix.table_count; for (i = 0; i < msix_table_entries; i++) { error = vm_setup_pptdev_msix(ctx, vcpu, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, pi->pi_msix.table[i].addr, pi->pi_msix.table[i].msg_data, pi->pi_msix.table[i].vector_control); if (error) err(1, "vm_setup_pptdev_msix"); } } return (0); } #ifdef LEGACY_SUPPORT /* * If this device does not support MSI natively then we cannot let * the guest disable legacy interrupts from the device. It is the * legacy interrupt that is triggering the virtual MSI to the guest. */ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { if (coff == PCIR_COMMAND && bytes == 2) val &= ~PCIM_CMD_INTxDIS; } #endif write_config(&sc->psc_sel, coff, bytes, val); return (0); } static void passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct passthru_softc *sc; struct iodev_pio_req pio; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { msix_table_write(ctx, vcpu, sc, offset, size, value); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); bzero(&pio, sizeof(struct iodev_pio_req)); pio.access = IODEV_PIO_WRITE; pio.port = sc->psc_bar[baridx].addr + offset; pio.width = size; pio.val = value; (void)ioctl(iofd, IODEV_PIO, &pio); } } static uint64_t passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct passthru_softc *sc; struct iodev_pio_req pio; uint64_t val; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { val = msix_table_read(sc, offset, size); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); bzero(&pio, sizeof(struct iodev_pio_req)); pio.access = IODEV_PIO_READ; pio.port = sc->psc_bar[baridx].addr + offset; pio.width = size; pio.val = 0; (void)ioctl(iofd, IODEV_PIO, &pio); val = pio.val; } return (val); } struct pci_devemu passthru = { .pe_emu = "passthru", .pe_init = passthru_init, .pe_cfgwrite = passthru_cfgwrite, .pe_cfgread = passthru_cfgread, .pe_barwrite = passthru_write, .pe_barread = passthru_read, }; PCI_EMUL_SET(passthru); Index: releng/11.1/usr.sbin/bhyve/pci_virtio_net.c =================================================================== --- releng/11.1/usr.sbin/bhyve/pci_virtio_net.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/pci_virtio_net.c (revision 320934) @@ -1,976 +1,990 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include #include #include #ifndef NETMAP_WITH_LIBS #define NETMAP_WITH_LIBS #endif #include +#include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include "bhyverun.h" #include "pci_emul.h" #include "mevent.h" #include "virtio.h" #define VTNET_RINGSZ 1024 #define VTNET_MAXSEGS 256 /* * Host capabilities. Note that we only offer a few of these. */ #define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ #define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ #define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ #define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ #define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ #define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ #define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ #define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ #define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ #define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ #define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ #define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ #define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ #define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ #define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ #define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ #define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ #define VIRTIO_NET_F_GUEST_ANNOUNCE \ (1 << 21) /* guest can send gratuitous pkts */ #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) /* * PCI config-space "registers" */ struct virtio_net_config { uint8_t mac[6]; uint16_t status; } __packed; /* * Queue definitions. */ #define VTNET_RXQ 0 #define VTNET_TXQ 1 #define VTNET_CTLQ 2 /* NB: not yet supported */ #define VTNET_MAXQ 3 /* * Fixed network header size */ struct virtio_net_rxhdr { uint8_t vrh_flags; uint8_t vrh_gso_type; uint16_t vrh_hdr_len; uint16_t vrh_gso_size; uint16_t vrh_csum_start; uint16_t vrh_csum_offset; uint16_t vrh_bufs; } __packed; /* * Debug printf */ static int pci_vtnet_debug; #define DPRINTF(params) if (pci_vtnet_debug) printf params #define WPRINTF(params) printf params /* * Per-device softc */ struct pci_vtnet_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; pthread_mutex_t vsc_mtx; struct mevent *vsc_mevp; int vsc_tapfd; struct nm_desc *vsc_nmd; int vsc_rx_ready; volatile int resetting; /* set and checked outside lock */ uint64_t vsc_features; /* negotiated features */ struct virtio_net_config vsc_config; pthread_mutex_t rx_mtx; int rx_in_progress; int rx_vhdrlen; int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc); void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int len); }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); static void pci_vtnet_neg_features(void *, uint64_t); static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ sizeof(struct virtio_net_config), /* config reg size */ pci_vtnet_reset, /* reset */ NULL, /* device-wide qnotify -- not used */ pci_vtnet_cfgread, /* read PCI config */ pci_vtnet_cfgwrite, /* write PCI config */ pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ }; /* * If the transmit thread is active then stall until it is done. */ static void pci_vtnet_txwait(struct pci_vtnet_softc *sc) { pthread_mutex_lock(&sc->tx_mtx); while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } pthread_mutex_unlock(&sc->tx_mtx); } /* * If the receive thread is active then stall until it is done. */ static void pci_vtnet_rxwait(struct pci_vtnet_softc *sc) { pthread_mutex_lock(&sc->rx_mtx); while (sc->rx_in_progress) { pthread_mutex_unlock(&sc->rx_mtx); usleep(10000); pthread_mutex_lock(&sc->rx_mtx); } pthread_mutex_unlock(&sc->rx_mtx); } static void pci_vtnet_reset(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device reset requested !\n")); sc->resetting = 1; /* * Wait for the transmit and receive threads to finish their * processing. */ pci_vtnet_txwait(sc); pci_vtnet_rxwait(sc); sc->vsc_rx_ready = 0; sc->rx_merge = 1; sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); /* now reset rings, MSI-X vectors, and negotiated capabilities */ vi_reset_dev(&sc->vsc_vs); sc->resetting = 0; } /* * Called to send a buffer chain out to the tap device */ static void pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int len) { static char pad[60]; /* all zero bytes */ if (sc->vsc_tapfd == -1) return; /* * If the length is < 60, pad out to that and add the * extra zero'd segment to the iov. It is guaranteed that * there is always an extra iov available by the caller. */ if (len < 60) { iov[iovcnt].iov_base = pad; iov[iovcnt].iov_len = 60 - len; iovcnt++; } (void) writev(sc->vsc_tapfd, iov, iovcnt); } /* * Called when there is read activity on the tap file descriptor. * Each buffer posted by the guest is assumed to be able to contain * an entire ethernet frame + rx header. * MP note: the dummybuf is only used for discarding frames, so there * is no need for it to be per-vtnet or locked. */ static uint8_t dummybuf[2048]; static __inline struct iovec * rx_iov_trim(struct iovec *iov, int *niov, int tlen) { struct iovec *riov; /* XXX short-cut: assume first segment is >= tlen */ assert(iov[0].iov_len >= tlen); iov[0].iov_len -= tlen; if (iov[0].iov_len == 0) { assert(*niov > 1); *niov -= 1; riov = &iov[1]; } else { iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen); riov = &iov[0]; } return (riov); } static void pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) { struct iovec iov[VTNET_MAXSEGS], *riov; struct vqueue_info *vq; void *vrx; int len, n; uint16_t idx; /* * Should never be called without a valid tap fd */ assert(sc->vsc_tapfd != -1); /* * But, will be called when the rx ring hasn't yet * been set up or the guest is resetting the device. */ if (!sc->vsc_rx_ready || sc->resetting) { /* * Drop the packet and try later. */ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); return; } /* * Check for available rx buffers */ vq = &sc->vsc_queues[VTNET_RXQ]; if (!vq_has_descs(vq)) { /* * Drop the packet and try later. Interrupt on * empty, if that's negotiated. */ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); vq_endchains(vq, 1); return; } do { /* * Get descriptor chain. */ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); /* * Get a pointer to the rx header, and use the * data immediately following it for the packet buffer. */ vrx = iov[0].iov_base; riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); len = readv(sc->vsc_tapfd, riov, n); if (len < 0 && errno == EWOULDBLOCK) { /* * No more packets, but still some avail ring * entries. Interrupt if needed/appropriate. */ vq_retchain(vq); vq_endchains(vq, 0); return; } /* * The only valid field in the rx packet header is the * number of buffers if merged rx bufs were negotiated. */ memset(vrx, 0, sc->rx_vhdrlen); if (sc->rx_merge) { struct virtio_net_rxhdr *vrxh; vrxh = vrx; vrxh->vrh_bufs = 1; } /* * Release this chain and handle more chains. */ vq_relchain(vq, idx, len + sc->rx_vhdrlen); } while (vq_has_descs(vq)); /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ vq_endchains(vq, 1); } static __inline int pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt) { int r, i; int len = 0; for (r = nmd->cur_tx_ring; ; ) { struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r); uint32_t cur, idx; char *buf; if (nm_ring_empty(ring)) { r++; if (r > nmd->last_tx_ring) r = nmd->first_tx_ring; if (r == nmd->cur_tx_ring) break; continue; } cur = ring->cur; idx = ring->slot[cur].buf_idx; buf = NETMAP_BUF(ring, idx); for (i = 0; i < iovcnt; i++) { if (len + iov[i].iov_len > 2048) break; memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len); len += iov[i].iov_len; } ring->slot[cur].len = len; ring->head = ring->cur = nm_ring_next(ring, cur); nmd->cur_tx_ring = r; ioctl(nmd->fd, NIOCTXSYNC, NULL); break; } return (len); } static __inline int pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt) { int len = 0; int i = 0; int r; for (r = nmd->cur_rx_ring; ; ) { struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r); uint32_t cur, idx; char *buf; size_t left; if (nm_ring_empty(ring)) { r++; if (r > nmd->last_rx_ring) r = nmd->first_rx_ring; if (r == nmd->cur_rx_ring) break; continue; } cur = ring->cur; idx = ring->slot[cur].buf_idx; buf = NETMAP_BUF(ring, idx); left = ring->slot[cur].len; for (i = 0; i < iovcnt && left > 0; i++) { if (iov[i].iov_len > left) iov[i].iov_len = left; memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len); len += iov[i].iov_len; left -= iov[i].iov_len; } ring->head = ring->cur = nm_ring_next(ring, cur); nmd->cur_rx_ring = r; ioctl(nmd->fd, NIOCRXSYNC, NULL); break; } for (; i < iovcnt; i++) iov[i].iov_len = 0; return (len); } /* * Called to send a buffer chain out to the vale port */ static void pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int len) { static char pad[60]; /* all zero bytes */ if (sc->vsc_nmd == NULL) return; /* * If the length is < 60, pad out to that and add the * extra zero'd segment to the iov. It is guaranteed that * there is always an extra iov available by the caller. */ if (len < 60) { iov[iovcnt].iov_base = pad; iov[iovcnt].iov_len = 60 - len; iovcnt++; } (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt); } static void pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc) { struct iovec iov[VTNET_MAXSEGS], *riov; struct vqueue_info *vq; void *vrx; int len, n; uint16_t idx; /* * Should never be called without a valid netmap descriptor */ assert(sc->vsc_nmd != NULL); /* * But, will be called when the rx ring hasn't yet * been set up or the guest is resetting the device. */ if (!sc->vsc_rx_ready || sc->resetting) { /* * Drop the packet and try later. */ (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); return; } /* * Check for available rx buffers */ vq = &sc->vsc_queues[VTNET_RXQ]; if (!vq_has_descs(vq)) { /* * Drop the packet and try later. Interrupt on * empty, if that's negotiated. */ (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); vq_endchains(vq, 1); return; } do { /* * Get descriptor chain. */ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); /* * Get a pointer to the rx header, and use the * data immediately following it for the packet buffer. */ vrx = iov[0].iov_base; riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n); if (len == 0) { /* * No more packets, but still some avail ring * entries. Interrupt if needed/appropriate. */ vq_retchain(vq); vq_endchains(vq, 0); return; } /* * The only valid field in the rx packet header is the * number of buffers if merged rx bufs were negotiated. */ memset(vrx, 0, sc->rx_vhdrlen); if (sc->rx_merge) { struct virtio_net_rxhdr *vrxh; vrxh = vrx; vrxh->vrh_bufs = 1; } /* * Release this chain and handle more chains. */ vq_relchain(vq, idx, len + sc->rx_vhdrlen); } while (vq_has_descs(vq)); /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ vq_endchains(vq, 1); } static void pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); sc->rx_in_progress = 1; sc->pci_vtnet_rx(sc); sc->rx_in_progress = 0; pthread_mutex_unlock(&sc->rx_mtx); } static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * A qnotify means that the rx process can now begin */ if (sc->vsc_rx_ready == 0) { sc->vsc_rx_ready = 1; vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; } } static void pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) { struct iovec iov[VTNET_MAXSEGS + 1]; int i, n; int plen, tlen; uint16_t idx; /* * Obtain chain of descriptors. The first one is * really the header descriptor, so we need to sum * up two lengths: packet length and transfer length. */ n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); plen = 0; tlen = iov[0].iov_len; for (i = 1; i < n; i++) { plen += iov[i].iov_len; tlen += iov[i].iov_len; } DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen); /* chain is processed, release it and set tlen */ vq_relchain(vq, idx, tlen); } static void pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * Any ring entries to process? */ if (!vq_has_descs(vq)) return; /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); } /* * Thread which will handle processing of TX desc */ static void * pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; int error; vq = &sc->vsc_queues[VTNET_TXQ]; /* * Let us wait till the tx queue pointers get initialised & * first tx signaled */ pthread_mutex_lock(&sc->tx_mtx); error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); for (;;) { /* note - tx mutex is locked here */ while (sc->resetting || !vq_has_descs(vq)) { vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; mb(); if (!sc->resetting && vq_has_descs(vq)) break; sc->tx_in_progress = 0; error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); } vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); do { /* * Run through entries, placing them into * iovecs and sending when an end-of-packet * is found */ pci_vtnet_proctx(sc, vq); } while (vq_has_descs(vq)); /* * Generate an interrupt if needed. */ vq_endchains(vq, 1); pthread_mutex_lock(&sc->tx_mtx); } } #ifdef notyet static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!\n\r")); } #endif static int pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) { struct ether_addr *ea; char *tmpstr; char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; tmpstr = strsep(&mac_str,"="); if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { ea = ether_aton(mac_str); if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { fprintf(stderr, "Invalid MAC %s\n", mac_str); return (EINVAL); } else memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); } return (0); } static void pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname) { char tbuf[80]; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif strcpy(tbuf, "/dev/"); strlcat(tbuf, devname, sizeof(tbuf)); sc->pci_vtnet_rx = pci_vtnet_tap_rx; sc->pci_vtnet_tx = pci_vtnet_tap_tx; sc->vsc_tapfd = open(tbuf, O_RDWR); if (sc->vsc_tapfd == -1) { WPRINTF(("open of tap device %s failed\n", tbuf)); return; } /* * Set non-blocking and register for read * notifications with the event loop */ int opt = 1; if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { WPRINTF(("tap device O_NONBLOCK failed\n")); close(sc->vsc_tapfd); sc->vsc_tapfd = -1; } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (cap_rights_limit(sc->vsc_tapfd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif sc->vsc_mevp = mevent_add(sc->vsc_tapfd, EVF_READ, pci_vtnet_rx_callback, sc); if (sc->vsc_mevp == NULL) { WPRINTF(("Could not register event\n")); close(sc->vsc_tapfd); sc->vsc_tapfd = -1; } } static void pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname) { sc->pci_vtnet_rx = pci_vtnet_netmap_rx; sc->pci_vtnet_tx = pci_vtnet_netmap_tx; sc->vsc_nmd = nm_open(ifname, NULL, 0, 0); if (sc->vsc_nmd == NULL) { WPRINTF(("open of netmap device %s failed\n", ifname)); return; } sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd, EVF_READ, pci_vtnet_rx_callback, sc); if (sc->vsc_mevp == NULL) { WPRINTF(("Could not register event\n")); nm_close(sc->vsc_nmd); sc->vsc_nmd = NULL; } } static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { MD5_CTX mdctx; unsigned char digest[16]; char nstr[80]; char tname[MAXCOMLEN + 1]; struct pci_vtnet_softc *sc; char *devname; char *vtopts; int mac_provided; sc = calloc(1, sizeof(struct pci_vtnet_softc)); pthread_mutex_init(&sc->vsc_mtx, NULL); vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; #ifdef notyet sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif /* * Attempt to open the tap device and read the MAC address * if specified */ mac_provided = 0; sc->vsc_tapfd = -1; sc->vsc_nmd = NULL; if (opts != NULL) { int err; devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); if (vtopts != NULL) { err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac); if (err != 0) { free(devname); return (err); } mac_provided = 1; } if (strncmp(devname, "vale", 4) == 0) pci_vtnet_netmap_setup(sc, devname); if (strncmp(devname, "tap", 3) == 0 || strncmp(devname, "vmnet", 5) == 0) pci_vtnet_tap_setup(sc, devname); free(devname); } /* * The default MAC address is the standard NetApp OUI of 00-a0-98, * followed by an MD5 of the PCI slot/func number and dev name */ if (!mac_provided) { snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, pi->pi_func, vmname); MD5Init(&mdctx); MD5Update(&mdctx, nstr, strlen(nstr)); MD5Final(digest, &mdctx); sc->vsc_config.mac[0] = 0x00; sc->vsc_config.mac[1] = 0xa0; sc->vsc_config.mac[2] = 0x98; sc->vsc_config.mac[3] = digest[0]; sc->vsc_config.mac[4] = digest[1]; sc->vsc_config.mac[5] = digest[2]; } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); /* Link is up if we managed to open tap device or vale port. */ sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 || sc->vsc_nmd != NULL); /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) return (1); /* use BAR 0 to map config regs in IO space */ vi_set_io_bar(&sc->vsc_vs, 0); sc->resetting = 0; sc->rx_merge = 1; sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); sc->rx_in_progress = 0; pthread_mutex_init(&sc->rx_mtx, NULL); /* * Initialize tx semaphore & spawn TX processing thread. * As of now, only one thread for TX desc processing is * spawned. */ sc->tx_in_progress = 0; pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->tx_tid, tname); return (0); } static int pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) { struct pci_vtnet_softc *sc = vsc; void *ptr; if (offset < 6) { assert(offset + size <= 6); /* * The driver is allowed to change the MAC address */ ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); } return (0); } static int pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtnet_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static void pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtnet_softc *sc = vsc; sc->vsc_features = negotiated_features; if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) { sc->rx_merge = 0; /* non-merge rx header is 2 bytes shorter */ sc->rx_vhdrlen -= 2; } } struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vnet); Index: releng/11.1/usr.sbin/bhyve/pci_virtio_rnd.c =================================================================== --- releng/11.1/usr.sbin/bhyve/pci_virtio_rnd.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/pci_virtio_rnd.c (revision 320934) @@ -1,189 +1,203 @@ /*- * Copyright (c) 2014 Nahanni Systems Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * virtio entropy device emulation. * Randomness is sourced from /dev/random which does not block * once it has been seeded at bootup. */ #include __FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include +#include #include #include #include #include #include #include #include #include +#include #include "bhyverun.h" #include "pci_emul.h" #include "virtio.h" #define VTRND_RINGSZ 64 static int pci_vtrnd_debug; #define DPRINTF(params) if (pci_vtrnd_debug) printf params #define WPRINTF(params) printf params /* * Per-device softc */ struct pci_vtrnd_softc { struct virtio_softc vrsc_vs; struct vqueue_info vrsc_vq; pthread_mutex_t vrsc_mtx; uint64_t vrsc_cfg; int vrsc_fd; }; static void pci_vtrnd_reset(void *); static void pci_vtrnd_notify(void *, struct vqueue_info *); static struct virtio_consts vtrnd_vi_consts = { "vtrnd", /* our name */ 1, /* we support 1 virtqueue */ 0, /* config reg size */ pci_vtrnd_reset, /* reset */ pci_vtrnd_notify, /* device-wide qnotify */ NULL, /* read virtio config */ NULL, /* write virtio config */ NULL, /* apply negotiated features */ 0, /* our capabilities */ }; static void pci_vtrnd_reset(void *vsc) { struct pci_vtrnd_softc *sc; sc = vsc; DPRINTF(("vtrnd: device reset requested !\n")); vi_reset_dev(&sc->vrsc_vs); } static void pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) { struct iovec iov; struct pci_vtrnd_softc *sc; int len; uint16_t idx; sc = vsc; if (sc->vrsc_fd < 0) { vq_endchains(vq, 0); return; } while (vq_has_descs(vq)) { vq_getchain(vq, &idx, &iov, 1, NULL); len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); DPRINTF(("vtrnd: vtrnd_notify(): %d\r\n", len)); /* Catastrophe if unable to read from /dev/random */ assert(len > 0); /* * Release this chain and handle more */ vq_relchain(vq, idx, len); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ } static int pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { struct pci_vtrnd_softc *sc; int fd; int len; uint8_t v; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif /* * Should always be able to open /dev/random. */ fd = open("/dev/random", O_RDONLY | O_NONBLOCK); assert(fd >= 0); + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_READ); + if (cap_rights_limit(fd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif /* * Check that device is seeded and non-blocking. */ len = read(fd, &v, sizeof(v)); if (len <= 0) { WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); return (1); } sc = calloc(1, sizeof(struct pci_vtrnd_softc)); vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; /* keep /dev/random opened while emulating */ sc->vrsc_fd = fd; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vrsc_vs, 0); return (0); } struct pci_devemu pci_de_vrnd = { .pe_emu = "virtio-rnd", .pe_init = pci_vtrnd_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vrnd); Index: releng/11.1/usr.sbin/bhyve/rfb.c =================================================================== --- releng/11.1/usr.sbin/bhyve/rfb.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/rfb.c (revision 320934) @@ -1,926 +1,941 @@ /*- * Copyright (c) 2015 Tycho Nightingale * Copyright (c) 2015 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include #include +#include #include #include #include "bhyvegc.h" #include "console.h" #include "rfb.h" #include "sockstream.h" static int rfb_debug = 0; #define DPRINTF(params) if (rfb_debug) printf params #define WPRINTF(params) printf params struct rfb_softc { int sfd; pthread_t tid; int cfd; int width, height; bool enc_raw_ok; bool enc_zlib_ok; bool enc_resize_ok; z_stream zstream; uint8_t *zbuf; int zbuflen; int conn_wait; int sending; pthread_mutex_t mtx; pthread_cond_t cond; int hw_crc; uint32_t *crc; /* WxH crc cells */ uint32_t *crc_tmp; /* buffer to store single crc row */ int crc_width, crc_height; }; struct rfb_pixfmt { uint8_t bpp; uint8_t depth; uint8_t bigendian; uint8_t truecolor; uint16_t red_max; uint16_t green_max; uint16_t blue_max; uint8_t red_shift; uint8_t green_shift; uint8_t blue_shift; uint8_t pad[3]; }; struct rfb_srvr_info { uint16_t width; uint16_t height; struct rfb_pixfmt pixfmt; uint32_t namelen; }; struct rfb_pixfmt_msg { uint8_t type; uint8_t pad[3]; struct rfb_pixfmt pixfmt; }; #define RFB_ENCODING_RAW 0 #define RFB_ENCODING_ZLIB 6 #define RFB_ENCODING_RESIZE -223 #define RFB_MAX_WIDTH 2000 #define RFB_MAX_HEIGHT 1200 #define RFB_ZLIB_BUFSZ RFB_MAX_WIDTH*RFB_MAX_HEIGHT*4 /* percentage changes to screen before sending the entire screen */ #define RFB_SEND_ALL_THRESH 25 struct rfb_enc_msg { uint8_t type; uint8_t pad; uint16_t numencs; }; struct rfb_updt_msg { uint8_t type; uint8_t incremental; uint16_t x; uint16_t y; uint16_t width; uint16_t height; }; struct rfb_key_msg { uint8_t type; uint8_t down; uint16_t pad; uint32_t code; }; struct rfb_ptr_msg { uint8_t type; uint8_t button; uint16_t x; uint16_t y; }; struct rfb_srvr_updt_msg { uint8_t type; uint8_t pad; uint16_t numrects; }; struct rfb_srvr_rect_hdr { uint16_t x; uint16_t y; uint16_t width; uint16_t height; uint32_t encoding; }; struct rfb_cuttext_msg { uint8_t type; uint8_t padding[3]; uint32_t length; }; static void rfb_send_server_init_msg(int cfd) { struct bhyvegc_image *gc_image; struct rfb_srvr_info sinfo; gc_image = console_get_image(); sinfo.width = htons(gc_image->width); sinfo.height = htons(gc_image->height); sinfo.pixfmt.bpp = 32; sinfo.pixfmt.depth = 32; sinfo.pixfmt.bigendian = 0; sinfo.pixfmt.truecolor = 1; sinfo.pixfmt.red_max = htons(255); sinfo.pixfmt.green_max = htons(255); sinfo.pixfmt.blue_max = htons(255); sinfo.pixfmt.red_shift = 16; sinfo.pixfmt.green_shift = 8; sinfo.pixfmt.blue_shift = 0; sinfo.namelen = htonl(strlen("bhyve")); (void)stream_write(cfd, &sinfo, sizeof(sinfo)); (void)stream_write(cfd, "bhyve", strlen("bhyve")); } static void rfb_send_resize_update_msg(struct rfb_softc *rc, int cfd) { struct rfb_srvr_updt_msg supdt_msg; struct rfb_srvr_rect_hdr srect_hdr; /* Number of rectangles: 1 */ supdt_msg.type = 0; supdt_msg.pad = 0; supdt_msg.numrects = htons(1); stream_write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); /* Rectangle header */ srect_hdr.x = htons(0); srect_hdr.y = htons(0); srect_hdr.width = htons(rc->width); srect_hdr.height = htons(rc->height); srect_hdr.encoding = htonl(RFB_ENCODING_RESIZE); stream_write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); } static void rfb_recv_set_pixfmt_msg(struct rfb_softc *rc, int cfd) { struct rfb_pixfmt_msg pixfmt_msg; (void)stream_read(cfd, ((void *)&pixfmt_msg)+1, sizeof(pixfmt_msg)-1); } static void rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) { struct rfb_enc_msg enc_msg; int i; uint32_t encoding; assert((sizeof(enc_msg) - 1) == 3); (void)stream_read(cfd, ((void *)&enc_msg)+1, sizeof(enc_msg)-1); for (i = 0; i < htons(enc_msg.numencs); i++) { (void)stream_read(cfd, &encoding, sizeof(encoding)); switch (htonl(encoding)) { case RFB_ENCODING_RAW: rc->enc_raw_ok = true; break; case RFB_ENCODING_ZLIB: rc->enc_zlib_ok = true; deflateInit(&rc->zstream, Z_BEST_SPEED); break; case RFB_ENCODING_RESIZE: rc->enc_resize_ok = true; break; } } } /* * Calculate CRC32 using SSE4.2; Intel or AMD Bulldozer+ CPUs only */ static __inline uint32_t fast_crc32(void *buf, int len, uint32_t crcval) { uint32_t q = len / sizeof(uint32_t); uint32_t *p = (uint32_t *)buf; while (q--) { asm volatile ( ".byte 0xf2, 0xf, 0x38, 0xf1, 0xf1;" :"=S" (crcval) :"0" (crcval), "c" (*p) ); p++; } return (crcval); } static int rfb_send_rect(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc, int x, int y, int w, int h) { struct rfb_srvr_updt_msg supdt_msg; struct rfb_srvr_rect_hdr srect_hdr; unsigned long zlen; ssize_t nwrite, total; int err; uint32_t *p; uint8_t *zbufp; /* * Send a single rectangle of the given x, y, w h dimensions. */ /* Number of rectangles: 1 */ supdt_msg.type = 0; supdt_msg.pad = 0; supdt_msg.numrects = htons(1); nwrite = stream_write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); if (nwrite <= 0) return (nwrite); /* Rectangle header */ srect_hdr.x = htons(x); srect_hdr.y = htons(y); srect_hdr.width = htons(w); srect_hdr.height = htons(h); h = y + h; w *= sizeof(uint32_t); if (rc->enc_zlib_ok) { zbufp = rc->zbuf; rc->zstream.total_in = 0; rc->zstream.total_out = 0; for (p = &gc->data[y * gc->width + x]; y < h; y++) { rc->zstream.next_in = (Bytef *)p; rc->zstream.avail_in = w; rc->zstream.next_out = (Bytef *)zbufp; rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16 - rc->zstream.total_out; rc->zstream.data_type = Z_BINARY; /* Compress with zlib */ err = deflate(&rc->zstream, Z_SYNC_FLUSH); if (err != Z_OK) { WPRINTF(("zlib[rect] deflate err: %d\n", err)); rc->enc_zlib_ok = false; deflateEnd(&rc->zstream); goto doraw; } zbufp = rc->zbuf + rc->zstream.total_out; p += gc->width; } srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB); nwrite = stream_write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); if (nwrite <= 0) return (nwrite); zlen = htonl(rc->zstream.total_out); nwrite = stream_write(cfd, &zlen, sizeof(uint32_t)); if (nwrite <= 0) return (nwrite); return (stream_write(cfd, rc->zbuf, rc->zstream.total_out)); } doraw: total = 0; zbufp = rc->zbuf; for (p = &gc->data[y * gc->width + x]; y < h; y++) { memcpy(zbufp, p, w); zbufp += w; total += w; p += gc->width; } srect_hdr.encoding = htonl(RFB_ENCODING_RAW); nwrite = stream_write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); if (nwrite <= 0) return (nwrite); total = stream_write(cfd, rc->zbuf, total); return (total); } static int rfb_send_all(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc) { struct rfb_srvr_updt_msg supdt_msg; struct rfb_srvr_rect_hdr srect_hdr; ssize_t nwrite; unsigned long zlen; int err; /* * Send the whole thing */ /* Number of rectangles: 1 */ supdt_msg.type = 0; supdt_msg.pad = 0; supdt_msg.numrects = htons(1); nwrite = stream_write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); if (nwrite <= 0) return (nwrite); /* Rectangle header */ srect_hdr.x = 0; srect_hdr.y = 0; srect_hdr.width = htons(gc->width); srect_hdr.height = htons(gc->height); if (rc->enc_zlib_ok) { rc->zstream.next_in = (Bytef *)gc->data; rc->zstream.avail_in = gc->width * gc->height * sizeof(uint32_t); rc->zstream.next_out = (Bytef *)rc->zbuf; rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16; rc->zstream.data_type = Z_BINARY; rc->zstream.total_in = 0; rc->zstream.total_out = 0; /* Compress with zlib */ err = deflate(&rc->zstream, Z_SYNC_FLUSH); if (err != Z_OK) { WPRINTF(("zlib deflate err: %d\n", err)); rc->enc_zlib_ok = false; deflateEnd(&rc->zstream); goto doraw; } srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB); nwrite = stream_write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); if (nwrite <= 0) return (nwrite); zlen = htonl(rc->zstream.total_out); nwrite = stream_write(cfd, &zlen, sizeof(uint32_t)); if (nwrite <= 0) return (nwrite); return (stream_write(cfd, rc->zbuf, rc->zstream.total_out)); } doraw: srect_hdr.encoding = htonl(RFB_ENCODING_RAW); nwrite = stream_write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); if (nwrite <= 0) return (nwrite); nwrite = stream_write(cfd, gc->data, gc->width * gc->height * sizeof(uint32_t)); return (nwrite); } #define PIX_PER_CELL 32 #define PIXCELL_SHIFT 5 #define PIXCELL_MASK 0x1F static int rfb_send_screen(struct rfb_softc *rc, int cfd, int all) { struct bhyvegc_image *gc_image; ssize_t nwrite; int x, y; int celly, cellwidth; int xcells, ycells; int w, h; uint32_t *p; int rem_x, rem_y; /* remainder for resolutions not x32 pixels ratio */ int retval; uint32_t *crc_p, *orig_crc; int changes; console_refresh(); gc_image = console_get_image(); pthread_mutex_lock(&rc->mtx); if (rc->sending) { pthread_mutex_unlock(&rc->mtx); return (1); } rc->sending = 1; pthread_mutex_unlock(&rc->mtx); retval = 0; if (all) { retval = rfb_send_all(rc, cfd, gc_image); goto done; } /* * Calculate the checksum for each 32x32 cell. Send each that * has changed since the last scan. */ /* Resolution changed */ rc->crc_width = gc_image->width; rc->crc_height = gc_image->height; w = rc->crc_width; h = rc->crc_height; xcells = howmany(rc->crc_width, PIX_PER_CELL); ycells = howmany(rc->crc_height, PIX_PER_CELL); rem_x = w & PIXCELL_MASK; rem_y = h & PIXCELL_MASK; if (!rem_y) rem_y = PIX_PER_CELL; p = gc_image->data; /* * Go through all cells and calculate crc. If significant number * of changes, then send entire screen. * crc_tmp is dual purpose: to store the new crc and to flag as * a cell that has changed. */ crc_p = rc->crc_tmp - xcells; orig_crc = rc->crc - xcells; changes = 0; memset(rc->crc_tmp, 0, sizeof(uint32_t) * xcells * ycells); for (y = 0; y < h; y++) { if ((y & PIXCELL_MASK) == 0) { crc_p += xcells; orig_crc += xcells; } for (x = 0; x < xcells; x++) { if (rc->hw_crc) crc_p[x] = fast_crc32(p, PIX_PER_CELL * sizeof(uint32_t), crc_p[x]); else crc_p[x] = (uint32_t)crc32(crc_p[x], (Bytef *)p, PIX_PER_CELL * sizeof(uint32_t)); p += PIX_PER_CELL; /* check for crc delta if last row in cell */ if ((y & PIXCELL_MASK) == PIXCELL_MASK || y == (h-1)) { if (orig_crc[x] != crc_p[x]) { orig_crc[x] = crc_p[x]; crc_p[x] = 1; changes++; } else { crc_p[x] = 0; } } } if (rem_x) { if (rc->hw_crc) crc_p[x] = fast_crc32(p, rem_x * sizeof(uint32_t), crc_p[x]); else crc_p[x] = (uint32_t)crc32(crc_p[x], (Bytef *)p, rem_x * sizeof(uint32_t)); p += rem_x; if ((y & PIXCELL_MASK) == PIXCELL_MASK || y == (h-1)) { if (orig_crc[x] != crc_p[x]) { orig_crc[x] = crc_p[x]; crc_p[x] = 1; changes++; } else { crc_p[x] = 0; } } } } /* If number of changes is > THRESH percent, send the whole screen */ if (((changes * 100) / (xcells * ycells)) >= RFB_SEND_ALL_THRESH) { retval = rfb_send_all(rc, cfd, gc_image); goto done; } /* Go through all cells, and send only changed ones */ crc_p = rc->crc_tmp; for (y = 0; y < h; y += PIX_PER_CELL) { /* previous cell's row */ celly = (y >> PIXCELL_SHIFT); /* Delta check crc to previous set */ for (x = 0; x < xcells; x++) { if (*crc_p++ == 0) continue; if (x == (xcells - 1) && rem_x > 0) cellwidth = rem_x; else cellwidth = PIX_PER_CELL; nwrite = rfb_send_rect(rc, cfd, gc_image, x * PIX_PER_CELL, celly * PIX_PER_CELL, cellwidth, y + PIX_PER_CELL >= h ? rem_y : PIX_PER_CELL); if (nwrite <= 0) { retval = nwrite; goto done; } } } retval = 1; done: pthread_mutex_lock(&rc->mtx); rc->sending = 0; pthread_mutex_unlock(&rc->mtx); return (retval); } static void rfb_recv_update_msg(struct rfb_softc *rc, int cfd, int discardonly) { struct rfb_updt_msg updt_msg; struct bhyvegc_image *gc_image; (void)stream_read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1); console_refresh(); gc_image = console_get_image(); updt_msg.x = htons(updt_msg.x); updt_msg.y = htons(updt_msg.y); updt_msg.width = htons(updt_msg.width); updt_msg.height = htons(updt_msg.height); if (updt_msg.width != gc_image->width || updt_msg.height != gc_image->height) { rc->width = gc_image->width; rc->height = gc_image->height; if (rc->enc_resize_ok) rfb_send_resize_update_msg(rc, cfd); } if (discardonly) return; rfb_send_screen(rc, cfd, 1); } static void rfb_recv_key_msg(struct rfb_softc *rc, int cfd) { struct rfb_key_msg key_msg; (void)stream_read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1); console_key_event(key_msg.down, htonl(key_msg.code)); } static void rfb_recv_ptr_msg(struct rfb_softc *rc, int cfd) { struct rfb_ptr_msg ptr_msg; (void)stream_read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1); console_ptr_event(ptr_msg.button, htons(ptr_msg.x), htons(ptr_msg.y)); } static void rfb_recv_cuttext_msg(struct rfb_softc *rc, int cfd) { struct rfb_cuttext_msg ct_msg; unsigned char buf[32]; int len; len = stream_read(cfd, ((void *)&ct_msg) + 1, sizeof(ct_msg) - 1); ct_msg.length = htonl(ct_msg.length); while (ct_msg.length > 0) { len = stream_read(cfd, buf, ct_msg.length > sizeof(buf) ? sizeof(buf) : ct_msg.length); ct_msg.length -= len; } } static int64_t timeval_delta(struct timeval *prev, struct timeval *now) { int64_t n1, n2; n1 = now->tv_sec * 1000000 + now->tv_usec; n2 = prev->tv_sec * 1000000 + prev->tv_usec; return (n1 - n2); } static void * rfb_wr_thr(void *arg) { struct rfb_softc *rc; fd_set rfds; struct timeval tv; struct timeval prev_tv; int64_t tdiff; int cfd; int err; rc = arg; cfd = rc->cfd; prev_tv.tv_sec = 0; prev_tv.tv_usec = 0; while (rc->cfd >= 0) { FD_ZERO(&rfds); FD_SET(cfd, &rfds); tv.tv_sec = 0; tv.tv_usec = 10000; err = select(cfd+1, &rfds, NULL, NULL, &tv); if (err < 0) return (NULL); /* Determine if its time to push screen; ~24hz */ gettimeofday(&tv, NULL); tdiff = timeval_delta(&prev_tv, &tv); if (tdiff > 40000) { prev_tv.tv_sec = tv.tv_sec; prev_tv.tv_usec = tv.tv_usec; if (rfb_send_screen(rc, cfd, 0) <= 0) { return (NULL); } } else { /* sleep */ usleep(40000 - tdiff); } } return (NULL); } void rfb_handle(struct rfb_softc *rc, int cfd) { const char *vbuf = "RFB 003.008\n"; unsigned char buf[80]; pthread_t tid; uint32_t sres; int len; rc->cfd = cfd; /* 1a. Send server version */ stream_write(cfd, vbuf, strlen(vbuf)); /* 1b. Read client version */ len = read(cfd, buf, sizeof(buf)); /* 2a. Send security type 'none' */ buf[0] = 1; buf[1] = 1; /* none */ stream_write(cfd, buf, 2); /* 2b. Read agreed security type */ len = stream_read(cfd, buf, 1); /* 2c. Write back a status of 0 */ sres = 0; stream_write(cfd, &sres, 4); /* 3a. Read client shared-flag byte */ len = stream_read(cfd, buf, 1); /* 4a. Write server-init info */ rfb_send_server_init_msg(cfd); if (!rc->zbuf) { rc->zbuf = malloc(RFB_ZLIB_BUFSZ + 16); assert(rc->zbuf != NULL); } rfb_send_screen(rc, cfd, 1); pthread_create(&tid, NULL, rfb_wr_thr, rc); pthread_set_name_np(tid, "rfbout"); /* Now read in client requests. 1st byte identifies type */ for (;;) { len = read(cfd, buf, 1); if (len <= 0) { DPRINTF(("rfb client exiting\r\n")); break; } switch (buf[0]) { case 0: rfb_recv_set_pixfmt_msg(rc, cfd); break; case 2: rfb_recv_set_encodings_msg(rc, cfd); break; case 3: rfb_recv_update_msg(rc, cfd, 1); break; case 4: rfb_recv_key_msg(rc, cfd); break; case 5: rfb_recv_ptr_msg(rc, cfd); break; case 6: rfb_recv_cuttext_msg(rc, cfd); break; default: WPRINTF(("rfb unknown cli-code %d!\n", buf[0] & 0xff)); goto done; } } done: rc->cfd = -1; pthread_join(tid, NULL); if (rc->enc_zlib_ok) deflateEnd(&rc->zstream); } static void * rfb_thr(void *arg) { struct rfb_softc *rc; sigset_t set; int cfd; rc = arg; sigemptyset(&set); sigaddset(&set, SIGPIPE); if (pthread_sigmask(SIG_BLOCK, &set, NULL) != 0) { perror("pthread_sigmask"); return (NULL); } for (;;) { rc->enc_raw_ok = false; rc->enc_zlib_ok = false; rc->enc_resize_ok = false; cfd = accept(rc->sfd, NULL, NULL); if (rc->conn_wait) { pthread_mutex_lock(&rc->mtx); pthread_cond_signal(&rc->cond); pthread_mutex_unlock(&rc->mtx); rc->conn_wait = 0; } rfb_handle(rc, cfd); close(cfd); } /* NOTREACHED */ return (NULL); } static int sse42_supported(void) { u_int cpu_registers[4], ecx; do_cpuid(1, cpu_registers); ecx = cpu_registers[2]; return ((ecx & CPUID2_SSE42) != 0); } int rfb_init(char *hostname, int port, int wait) { struct rfb_softc *rc; struct sockaddr_in sin; int on = 1; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif rc = calloc(1, sizeof(struct rfb_softc)); rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), sizeof(uint32_t)); rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), sizeof(uint32_t)); rc->crc_width = RFB_MAX_WIDTH; rc->crc_height = RFB_MAX_HEIGHT; rc->sfd = socket(AF_INET, SOCK_STREAM, 0); if (rc->sfd < 0) { perror("socket"); return (-1); } setsockopt(rc->sfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); sin.sin_len = sizeof(sin); sin.sin_family = AF_INET; sin.sin_port = htons(port); if (hostname && strlen(hostname) > 0) inet_pton(AF_INET, hostname, &(sin.sin_addr)); else sin.sin_addr.s_addr = htonl(INADDR_ANY); if (bind(rc->sfd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { perror("bind"); return (-1); } if (listen(rc->sfd, 1) < 0) { perror("listen"); return (-1); } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (cap_rights_limit(rc->sfd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif rc->hw_crc = sse42_supported(); rc->conn_wait = wait; if (wait) { pthread_mutex_init(&rc->mtx, NULL); pthread_cond_init(&rc->cond, NULL); } pthread_create(&rc->tid, NULL, rfb_thr, rc); pthread_set_name_np(rc->tid, "rfb"); if (wait) { DPRINTF(("Waiting for rfb client...\n")); pthread_mutex_lock(&rc->mtx); pthread_cond_wait(&rc->cond, &rc->mtx); pthread_mutex_unlock(&rc->mtx); } return (0); } Index: releng/11.1/usr.sbin/bhyve/uart_emul.c =================================================================== --- releng/11.1/usr.sbin/bhyve/uart_emul.c (revision 320933) +++ releng/11.1/usr.sbin/bhyve/uart_emul.c (revision 320934) @@ -1,674 +1,702 @@ /*- * Copyright (c) 2012 NetApp, Inc. * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include +#include +#include #include #include #include #include #include #include +#include #include "mevent.h" #include "uart_emul.h" #define COM1_BASE 0x3F8 #define COM1_IRQ 4 #define COM2_BASE 0x2F8 #define COM2_IRQ 3 #define DEFAULT_RCLK 1843200 #define DEFAULT_BAUD 9600 #define FCR_RX_MASK 0xC0 #define MCR_OUT1 0x04 #define MCR_OUT2 0x08 #define MSR_DELTA_MASK 0x0f #ifndef REG_SCR #define REG_SCR com_scr #endif #define FIFOSZ 16 static bool uart_stdio; /* stdio in use for i/o */ static struct termios tio_stdio_orig; static struct { int baseaddr; int irq; bool inuse; } uart_lres[] = { { COM1_BASE, COM1_IRQ, false}, { COM2_BASE, COM2_IRQ, false}, }; #define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0])) struct fifo { uint8_t buf[FIFOSZ]; int rindex; /* index to read from */ int windex; /* index to write to */ int num; /* number of characters in the fifo */ int size; /* size of the fifo */ }; struct ttyfd { bool opened; int fd; /* tty device file descriptor */ struct termios tio_orig, tio_new; /* I/O Terminals */ }; struct uart_softc { pthread_mutex_t mtx; /* protects all softc elements */ uint8_t data; /* Data register (R/W) */ uint8_t ier; /* Interrupt enable register (R/W) */ uint8_t lcr; /* Line control register (R/W) */ uint8_t mcr; /* Modem control register (R/W) */ uint8_t lsr; /* Line status register (R/W) */ uint8_t msr; /* Modem status register (R/W) */ uint8_t fcr; /* FIFO control register (W) */ uint8_t scr; /* Scratch register (R/W) */ uint8_t dll; /* Baudrate divisor latch LSB */ uint8_t dlh; /* Baudrate divisor latch MSB */ struct fifo rxfifo; struct mevent *mev; struct ttyfd tty; bool thre_int_pending; /* THRE interrupt pending */ void *arg; uart_intr_func_t intr_assert; uart_intr_func_t intr_deassert; }; static void uart_drain(int fd, enum ev_type ev, void *arg); static void ttyclose(void) { tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); } static void ttyopen(struct ttyfd *tf) { tcgetattr(tf->fd, &tf->tio_orig); tf->tio_new = tf->tio_orig; cfmakeraw(&tf->tio_new); tf->tio_new.c_cflag |= CLOCAL; tcsetattr(tf->fd, TCSANOW, &tf->tio_new); if (tf->fd == STDIN_FILENO) { tio_stdio_orig = tf->tio_orig; atexit(ttyclose); } } static int ttyread(struct ttyfd *tf) { unsigned char rb; if (read(tf->fd, &rb, 1) == 1) return (rb); else return (-1); } static void ttywrite(struct ttyfd *tf, unsigned char wb) { (void)write(tf->fd, &wb, 1); } static void rxfifo_reset(struct uart_softc *sc, int size) { char flushbuf[32]; struct fifo *fifo; ssize_t nread; int error; fifo = &sc->rxfifo; bzero(fifo, sizeof(struct fifo)); fifo->size = size; if (sc->tty.opened) { /* * Flush any unread input from the tty buffer. */ while (1) { nread = read(sc->tty.fd, flushbuf, sizeof(flushbuf)); if (nread != sizeof(flushbuf)) break; } /* * Enable mevent to trigger when new characters are available * on the tty fd. */ error = mevent_enable(sc->mev); assert(error == 0); } } static int rxfifo_available(struct uart_softc *sc) { struct fifo *fifo; fifo = &sc->rxfifo; return (fifo->num < fifo->size); } static int rxfifo_putchar(struct uart_softc *sc, uint8_t ch) { struct fifo *fifo; int error; fifo = &sc->rxfifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = ch; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; if (!rxfifo_available(sc)) { if (sc->tty.opened) { /* * Disable mevent callback if the FIFO is full. */ error = mevent_disable(sc->mev); assert(error == 0); } } return (0); } else return (-1); } static int rxfifo_getchar(struct uart_softc *sc) { struct fifo *fifo; int c, error, wasfull; wasfull = 0; fifo = &sc->rxfifo; if (fifo->num > 0) { if (!rxfifo_available(sc)) wasfull = 1; c = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; if (wasfull) { if (sc->tty.opened) { error = mevent_enable(sc->mev); assert(error == 0); } } return (c); } else return (-1); } static int rxfifo_numchars(struct uart_softc *sc) { struct fifo *fifo = &sc->rxfifo; return (fifo->num); } static void uart_opentty(struct uart_softc *sc) { ttyopen(&sc->tty); sc->mev = mevent_add(sc->tty.fd, EVF_READ, uart_drain, sc); assert(sc->mev != NULL); } static uint8_t modem_status(uint8_t mcr) { uint8_t msr; if (mcr & MCR_LOOPBACK) { /* * In the loopback mode certain bits from the MCR are * reflected back into MSR. */ msr = 0; if (mcr & MCR_RTS) msr |= MSR_CTS; if (mcr & MCR_DTR) msr |= MSR_DSR; if (mcr & MCR_OUT1) msr |= MSR_RI; if (mcr & MCR_OUT2) msr |= MSR_DCD; } else { /* * Always assert DCD and DSR so tty open doesn't block * even if CLOCAL is turned off. */ msr = MSR_DCD | MSR_DSR; } assert((msr & MSR_DELTA_MASK) == 0); return (msr); } /* * The IIR returns a prioritized interrupt reason: * - receive data available * - transmit holding register empty * - modem status change * * Return an interrupt reason if one is available. */ static int uart_intr_reason(struct uart_softc *sc) { if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) return (IIR_RLS); else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0) return (IIR_RXTOUT); else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) return (IIR_TXRDY); else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0) return (IIR_MLSC); else return (IIR_NOPEND); } static void uart_reset(struct uart_softc *sc) { uint16_t divisor; divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; sc->dll = divisor; sc->dlh = divisor >> 16; sc->msr = modem_status(sc->mcr); rxfifo_reset(sc, 1); /* no fifo until enabled by software */ } /* * Toggle the COM port's intr pin depending on whether or not we have an * interrupt condition to report to the processor. */ static void uart_toggle_intr(struct uart_softc *sc) { uint8_t intr_reason; intr_reason = uart_intr_reason(sc); if (intr_reason == IIR_NOPEND) (*sc->intr_deassert)(sc->arg); else (*sc->intr_assert)(sc->arg); } static void uart_drain(int fd, enum ev_type ev, void *arg) { struct uart_softc *sc; int ch; sc = arg; assert(fd == sc->tty.fd); assert(ev == EVF_READ); /* * This routine is called in the context of the mevent thread * to take out the softc lock to protect against concurrent * access from a vCPU i/o exit */ pthread_mutex_lock(&sc->mtx); if ((sc->mcr & MCR_LOOPBACK) != 0) { (void) ttyread(&sc->tty); } else { while (rxfifo_available(sc) && ((ch = ttyread(&sc->tty)) != -1)) { rxfifo_putchar(sc, ch); } uart_toggle_intr(sc); } pthread_mutex_unlock(&sc->mtx); } void uart_write(struct uart_softc *sc, int offset, uint8_t value) { int fifosz; uint8_t msr; pthread_mutex_lock(&sc->mtx); /* * Take care of the special case DLAB accesses first */ if ((sc->lcr & LCR_DLAB) != 0) { if (offset == REG_DLL) { sc->dll = value; goto done; } if (offset == REG_DLH) { sc->dlh = value; goto done; } } switch (offset) { case REG_DATA: if (sc->mcr & MCR_LOOPBACK) { if (rxfifo_putchar(sc, value) != 0) sc->lsr |= LSR_OE; } else if (sc->tty.opened) { ttywrite(&sc->tty, value); } /* else drop on floor */ sc->thre_int_pending = true; break; case REG_IER: /* * Apply mask so that bits 4-7 are 0 * Also enables bits 0-3 only if they're 1 */ sc->ier = value & 0x0F; break; case REG_FCR: /* * When moving from FIFO and 16450 mode and vice versa, * the FIFO contents are reset. */ if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; rxfifo_reset(sc, fifosz); } /* * The FCR_ENABLE bit must be '1' for the programming * of other FCR bits to be effective. */ if ((value & FCR_ENABLE) == 0) { sc->fcr = 0; } else { if ((value & FCR_RCV_RST) != 0) rxfifo_reset(sc, FIFOSZ); sc->fcr = value & (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); } break; case REG_LCR: sc->lcr = value; break; case REG_MCR: /* Apply mask so that bits 5-7 are 0 */ sc->mcr = value & 0x1F; msr = modem_status(sc->mcr); /* * Detect if there has been any change between the * previous and the new value of MSR. If there is * then assert the appropriate MSR delta bit. */ if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) sc->msr |= MSR_DCTS; if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) sc->msr |= MSR_DDSR; if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) sc->msr |= MSR_DDCD; if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) sc->msr |= MSR_TERI; /* * Update the value of MSR while retaining the delta * bits. */ sc->msr &= MSR_DELTA_MASK; sc->msr |= msr; break; case REG_LSR: /* * Line status register is not meant to be written to * during normal operation. */ break; case REG_MSR: /* * As far as I can tell MSR is a read-only register. */ break; case REG_SCR: sc->scr = value; break; default: break; } done: uart_toggle_intr(sc); pthread_mutex_unlock(&sc->mtx); } uint8_t uart_read(struct uart_softc *sc, int offset) { uint8_t iir, intr_reason, reg; pthread_mutex_lock(&sc->mtx); /* * Take care of the special case DLAB accesses first */ if ((sc->lcr & LCR_DLAB) != 0) { if (offset == REG_DLL) { reg = sc->dll; goto done; } if (offset == REG_DLH) { reg = sc->dlh; goto done; } } switch (offset) { case REG_DATA: reg = rxfifo_getchar(sc); break; case REG_IER: reg = sc->ier; break; case REG_IIR: iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0; intr_reason = uart_intr_reason(sc); /* * Deal with side effects of reading the IIR register */ if (intr_reason == IIR_TXRDY) sc->thre_int_pending = false; iir |= intr_reason; reg = iir; break; case REG_LCR: reg = sc->lcr; break; case REG_MCR: reg = sc->mcr; break; case REG_LSR: /* Transmitter is always ready for more data */ sc->lsr |= LSR_TEMT | LSR_THRE; /* Check for new receive data */ if (rxfifo_numchars(sc) > 0) sc->lsr |= LSR_RXRDY; else sc->lsr &= ~LSR_RXRDY; reg = sc->lsr; /* The LSR_OE bit is cleared on LSR read */ sc->lsr &= ~LSR_OE; break; case REG_MSR: /* * MSR delta bits are cleared on read */ reg = sc->msr; sc->msr &= ~MSR_DELTA_MASK; break; case REG_SCR: reg = sc->scr; break; default: reg = 0xFF; break; } done: uart_toggle_intr(sc); pthread_mutex_unlock(&sc->mtx); return (reg); } int uart_legacy_alloc(int which, int *baseaddr, int *irq) { if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse) return (-1); uart_lres[which].inuse = true; *baseaddr = uart_lres[which].baseaddr; *irq = uart_lres[which].irq; return (0); } struct uart_softc * uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, void *arg) { struct uart_softc *sc; sc = calloc(1, sizeof(struct uart_softc)); sc->arg = arg; sc->intr_assert = intr_assert; sc->intr_deassert = intr_deassert; pthread_mutex_init(&sc->mtx, NULL); uart_reset(sc); return (sc); } static int uart_tty_backend(struct uart_softc *sc, const char *opts) { int fd; int retval; retval = -1; fd = open(opts, O_RDWR | O_NONBLOCK); if (fd > 0 && isatty(fd)) { sc->tty.fd = fd; sc->tty.opened = true; retval = 0; } - + return (retval); } int uart_set_backend(struct uart_softc *sc, const char *opts) { int retval; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; + cap_ioctl_t sicmds[] = { TIOCGETA, TIOCGWINSZ }; +#endif retval = -1; if (opts == NULL) return (0); if (strcmp("stdio", opts) == 0) { if (!uart_stdio) { sc->tty.fd = STDIN_FILENO; sc->tty.opened = true; uart_stdio = true; retval = 0; } } else if (uart_tty_backend(sc, opts) == 0) { retval = 0; } /* Make the backend file descriptor non-blocking */ if (retval == 0) retval = fcntl(sc->tty.fd, F_SETFL, O_NONBLOCK); + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); + if (cap_rights_limit(sc->tty.fd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(sc->tty.fd, cmds, nitems(cmds)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (!uart_stdio) { + cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL, CAP_READ); + if (cap_rights_limit(STDIN_FILENO, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(STDIN_FILENO, sicmds, nitems(sicmds)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_fcntls_limit(STDIN_FILENO, CAP_FCNTL_GETFL) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + } +#endif if (retval == 0) uart_opentty(sc); return (retval); } Index: releng/11.1 =================================================================== --- releng/11.1 (revision 320933) +++ releng/11.1 (revision 320934) Property changes on: releng/11.1 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,2 ## Merged /head:r313727,317483 Merged /stable/11:r320866