Index: projects/bhyve/lib/libvmmapi/vmmapi.c
===================================================================
--- projects/bhyve/lib/libvmmapi/vmmapi.c	(revision 241177)
+++ projects/bhyve/lib/libvmmapi/vmmapi.c	(revision 241178)
@@ -1,724 +1,724 @@
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "vmmapi.h"
#include "mptable.h"

#define	BIOS_ROM_BASE		(0xf0000)
#define	BIOS_ROM_SIZE		(0x10000)

struct vmctx {
        int     fd;
        char    *name;
};

#define	CREATE(x)  sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define	DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))

static int
vm_device_open(const char *name)
{
        int fd, len;
        char *vmfile;

        len = strlen("/dev/vmm/") + strlen(name) + 1;
        vmfile = malloc(len);
        assert(vmfile != NULL);
        snprintf(vmfile, len, "/dev/vmm/%s", name);

        /* Open the device file */
        fd = open(vmfile, O_RDWR, 0);

        free(vmfile);
        return (fd);
}

int
vm_create(const char *name)
{

        return (CREATE((char *)name));
}

struct vmctx *
vm_open(const char *name)
{
        struct vmctx *vm;

        vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
        assert(vm != NULL);

        vm->fd = -1;
        vm->name = (char *)(vm + 1);
        strcpy(vm->name, name);

        if ((vm->fd = vm_device_open(vm->name)) < 0)
                goto err;

        return (vm);
err:
        vm_destroy(vm);
        return (NULL);
}

void
vm_destroy(struct vmctx *vm)
{
        assert(vm != NULL);

-       DESTROY(vm->name);
        if (vm->fd >= 0)
                close(vm->fd);
+       DESTROY(vm->name);
+
+       free(vm);
}

size_t
vmm_get_mem_total(void)
{
        size_t mem_total = 0;
        size_t oldlen = sizeof(mem_total);
        int error;
        error = sysctlbyname("hw.vmm.mem_total", &mem_total, &oldlen, NULL, 0);
        if (error)
                return -1;
        return mem_total;
}

size_t
vmm_get_mem_free(void)
{
        size_t mem_free = 0;
        size_t oldlen = sizeof(mem_free);
        int error;
        error = sysctlbyname("hw.vmm.mem_free", &mem_free, &oldlen, NULL, 0);
        if (error)
                return -1;
        return mem_free;
}

int
vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, vm_paddr_t *ret_hpa,
                  size_t *ret_len)
{
        int error;
        struct vm_memory_segment seg;

        bzero(&seg, sizeof(seg));
        seg.gpa = gpa;
        error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
-       *ret_hpa = seg.hpa;
        *ret_len = seg.len;
        return (error);
}

int
vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **mapaddr) { int error; struct vm_memory_segment seg; /* * Create and optionally map 'len' bytes of memory at guest * physical address 'gpa' */ bzero(&seg, sizeof(seg)); seg.gpa = gpa; seg.len = len; error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); if (error == 0 && mapaddr != NULL) { *mapaddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->fd, gpa); } return (error); } char * vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len) { /* Map 'len' bytes of memory at guest physical address 'gpa' */ return ((char *)mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->fd, gpa)); } int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; vmsegdesc.desc.base = base; vmsegdesc.desc.limit = limit; vmsegdesc.desc.access = access; error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); return (error); } int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t *base, uint32_t *limit, uint32_t *access) { int error; struct vm_seg_desc vmsegdesc; bzero(&vmsegdesc, sizeof(vmsegdesc)); vmsegdesc.cpuid = vcpu; vmsegdesc.regnum = reg; error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); if (error == 0) { *base = vmsegdesc.desc.base; *limit = vmsegdesc.desc.limit; *access = vmsegdesc.desc.access; } return (error); } int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; vmreg.regval = val; error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); return (error); } int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) { int error; struct vm_register vmreg; bzero(&vmreg, sizeof(vmreg)); vmreg.cpuid = vcpu; vmreg.regnum = reg; error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); *ret_val = vmreg.regval; return (error); } int vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid) { int error; struct vm_pin vmpin; bzero(&vmpin, sizeof(vmpin)); vmpin.vm_cpuid = vcpu; error = ioctl(ctx->fd, VM_GET_PINNING, &vmpin); *host_cpuid = vmpin.host_cpuid; return (error); } int vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid) { int error; struct vm_pin vmpin; bzero(&vmpin, sizeof(vmpin)); vmpin.vm_cpuid = vcpu; vmpin.host_cpuid = host_cpuid; error = ioctl(ctx->fd, VM_SET_PINNING, &vmpin); return (error); } int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit) { int error; struct vm_run vmrun; bzero(&vmrun, sizeof(vmrun)); vmrun.cpuid = vcpu; vmrun.rip = rip; error = ioctl(ctx->fd, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); return (error); } static int vm_inject_event_real(struct vmctx *ctx, int vcpu, enum vm_event_type type, int vector, int error_code, int error_code_valid) { struct vm_event ev; bzero(&ev, sizeof(ev)); ev.cpuid = vcpu; ev.type = type; ev.vector = vector; ev.error_code = error_code; ev.error_code_valid = error_code_valid; return (ioctl(ctx->fd, VM_INJECT_EVENT, &ev)); } int vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type, int vector) { return (vm_inject_event_real(ctx, vcpu, type, vector, 0, 0)); } int vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type, int vector, int error_code) { return (vm_inject_event_real(ctx, vcpu, type, vector, error_code, 1)); } int vm_build_tables(struct vmctx 
*ctxt, int ncpu, int ioapic, void *oemtbl, int oemtblsz) { return (vm_build_mptable(ctxt, BIOS_ROM_BASE, BIOS_ROM_SIZE, ncpu, ioapic, oemtbl, oemtblsz)); } int vm_apicid2vcpu(struct vmctx *ctx, int apicid) { /* * The apic id associated with the 'vcpu' has the same numerical value * as the 'vcpu' itself. */ return (apicid); } int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) { struct vm_lapic_irq vmirq; bzero(&vmirq, sizeof(vmirq)); vmirq.cpuid = vcpu; vmirq.vector = vector; return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); } int vm_inject_nmi(struct vmctx *ctx, int vcpu) { struct vm_nmi vmnmi; bzero(&vmnmi, sizeof(vmnmi)); vmnmi.cpuid = vcpu; return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); } int vm_capability_name2type(const char *capname) { int i; static struct { const char *name; int type; } capstrmap[] = { { "hlt_exit", VM_CAP_HALT_EXIT }, { "mtrap_exit", VM_CAP_MTRAP_EXIT }, { "pause_exit", VM_CAP_PAUSE_EXIT }, { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST }, { 0 } }; for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) { if (strcmp(capstrmap[i].name, capname) == 0) return (capstrmap[i].type); } return (-1); } int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int *retval) { int error; struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); *retval = vmcap.capval; return (error); } int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) { struct vm_capability vmcap; bzero(&vmcap, sizeof(vmcap)); vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); } int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) { struct vm_pptdev pptdev; bzero(&pptdev, sizeof(pptdev)); pptdev.bus = bus; pptdev.slot = slot; pptdev.func = func; return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); } int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { struct vm_pptdev_mmio pptmmio; bzero(&pptmmio, sizeof(pptmmio)); pptmmio.bus = bus; pptmmio.slot = slot; pptmmio.func = func; pptmmio.gpa = gpa; pptmmio.len = len; pptmmio.hpa = hpa; return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); } int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int destcpu, int vector, int numvec) { struct vm_pptdev_msi pptmsi; bzero(&pptmsi, sizeof(pptmsi)); pptmsi.vcpu = vcpu; pptmsi.bus = bus; pptmsi.slot = slot; pptmsi.func = func; pptmsi.destcpu = destcpu; pptmsi.vector = vector; pptmsi.numvec = numvec; return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); } int vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint32_t msg, uint32_t vector_control, uint64_t addr) { struct vm_pptdev_msix pptmsix; bzero(&pptmsix, sizeof(pptmsix)); pptmsix.vcpu = vcpu; pptmsix.bus = bus; pptmsix.slot = slot; pptmsix.func = func; pptmsix.idx = idx; pptmsix.msg = msg; pptmsix.addr = addr; pptmsix.vector_control = vector_control; return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) { int error; static struct vm_stats vmstats; vmstats.cpuid = vcpu; error = ioctl(ctx->fd, VM_STATS, 
&vmstats); if (error == 0) { if (ret_entries) *ret_entries = vmstats.num_entries; if (ret_tv) *ret_tv = vmstats.tv; return (vmstats.statbuf); } else return (NULL); } const char * vm_get_stat_desc(struct vmctx *ctx, int index) { static struct vm_stat_desc statdesc; statdesc.index = index; if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) return (statdesc.desc); else return (NULL); } int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); *state = x2apic.state; return (error); } int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) { int error; struct vm_x2apic x2apic; bzero(&x2apic, sizeof(x2apic)); x2apic.cpuid = vcpu; x2apic.state = state; error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); return (error); } /* * From Intel Vol 3a: * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT */ int vcpu_reset(struct vmctx *vmctx, int vcpu) { int error; uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; uint32_t desc_access, desc_limit; uint16_t sel; zero = 0; rflags = 0x2; error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); if (error) goto done; rip = 0xfff0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) goto done; cr0 = CR0_NE; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) goto done; cr4 = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; /* * CS: present, r/w, accessed, 16-bit, byte granularity, usable */ desc_base = 0xffff0000; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0xf000; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) goto done; /* * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0093; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, desc_base, desc_limit, desc_access); if (error) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) goto done; /* General purpose registers */ rdx = 0xf00; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) goto done; 
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) goto done; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) goto done; /* GDTR, IDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, desc_base, desc_limit, desc_access); if (error != 0) goto done; /* TR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x0000008b; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) goto done; /* LDTR */ desc_base = 0; desc_limit = 0xffff; desc_access = 0x00000082; error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, desc_limit, desc_access); if (error) goto done; sel = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) goto done; /* XXX cr2, debug registers */ error = 0; done: return (error); } Index: projects/bhyve/sys/amd64/include/vmm_dev.h =================================================================== --- projects/bhyve/sys/amd64/include/vmm_dev.h (revision 241177) +++ projects/bhyve/sys/amd64/include/vmm_dev.h (revision 241178) @@ -1,216 +1,215 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $ */ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ #ifdef _KERNEL void vmmdev_init(void); void vmmdev_cleanup(void); #endif struct vm_memory_segment { - vm_paddr_t hpa; /* out */ vm_paddr_t gpa; /* in */ size_t len; /* in */ }; struct vm_register { int cpuid; int regnum; /* enum vm_reg_name */ uint64_t regval; }; struct vm_seg_desc { /* data or code segment */ int cpuid; int regnum; /* enum vm_reg_name */ struct seg_desc desc; }; struct vm_pin { int vm_cpuid; int host_cpuid; /* -1 to unpin */ }; struct vm_run { int cpuid; uint64_t rip; /* start running here */ struct vm_exit vm_exit; }; struct vm_event { int cpuid; enum vm_event_type type; int vector; uint32_t error_code; int error_code_valid; }; struct vm_lapic_irq { int cpuid; int vector; }; struct vm_capability { int cpuid; enum vm_cap_type captype; int capval; int allcpus; }; struct vm_pptdev { int bus; int slot; int func; }; struct vm_pptdev_mmio { int bus; int slot; int func; vm_paddr_t gpa; vm_paddr_t hpa; size_t len; }; struct vm_pptdev_msi { int vcpu; int bus; int slot; int func; int numvec; /* 0 means disabled */ int vector; int destcpu; }; struct vm_pptdev_msix { int vcpu; int bus; int slot; int func; int idx; uint32_t msg; uint32_t vector_control; uint64_t addr; }; struct vm_nmi { int cpuid; }; #define MAX_VM_STATS 64 struct vm_stats { int cpuid; /* in */ int num_entries; /* out */ struct timeval tv; uint64_t statbuf[MAX_VM_STATS]; }; struct vm_stat_desc { int index; /* in */ char desc[128]; /* out */ }; struct vm_x2apic { int cpuid; enum x2apic_state state; }; enum { IOCNUM_RUN, IOCNUM_SET_PINNING, IOCNUM_GET_PINNING, IOCNUM_MAP_MEMORY, IOCNUM_GET_MEMORY_SEG, IOCNUM_SET_REGISTER, IOCNUM_GET_REGISTER, IOCNUM_SET_SEGMENT_DESCRIPTOR, IOCNUM_GET_SEGMENT_DESCRIPTOR, IOCNUM_INJECT_EVENT, IOCNUM_LAPIC_IRQ, IOCNUM_SET_CAPABILITY, IOCNUM_GET_CAPABILITY, IOCNUM_BIND_PPTDEV, IOCNUM_UNBIND_PPTDEV, IOCNUM_MAP_PPTDEV_MMIO, IOCNUM_PPTDEV_MSI, IOCNUM_PPTDEV_MSIX, IOCNUM_INJECT_NMI, IOCNUM_VM_STATS, IOCNUM_VM_STAT_DESC, IOCNUM_SET_X2APIC_STATE, IOCNUM_GET_X2APIC_STATE, }; #define VM_RUN \ _IOWR('v', IOCNUM_RUN, struct vm_run) #define VM_SET_PINNING \ _IOW('v', IOCNUM_SET_PINNING, struct vm_pin) #define VM_GET_PINNING \ _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin) #define VM_MAP_MEMORY \ _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) #define VM_GET_MEMORY_SEG \ _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) #define VM_SET_REGISTER \ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) #define VM_GET_REGISTER \ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) #define VM_SET_SEGMENT_DESCRIPTOR \ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_GET_SEGMENT_DESCRIPTOR \ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_INJECT_EVENT \ _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event) #define VM_LAPIC_IRQ \ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) #define VM_SET_CAPABILITY \ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) #define VM_GET_CAPABILITY \ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) #define VM_BIND_PPTDEV \ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) #define VM_UNBIND_PPTDEV \ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) #define VM_MAP_PPTDEV_MMIO \ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) #define VM_PPTDEV_MSI \ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) #define VM_PPTDEV_MSIX \ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) #define VM_INJECT_NMI \ _IOW('v', 
IOCNUM_INJECT_NMI, struct vm_nmi) #define VM_STATS \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) #define VM_STAT_DESC \ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) #define VM_SET_X2APIC_STATE \ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_X2APIC_STATE \ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) #endif Index: projects/bhyve/sys/amd64/vmm/io/ppt.c =================================================================== --- projects/bhyve/sys/amd64/vmm/io/ppt.c (revision 241177) +++ projects/bhyve/sys/amd64/vmm/io/ppt.c (revision 241178) @@ -1,623 +1,622 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_ktr.h" #include "iommu.h" #include "ppt.h" #define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) #define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) #define MAX_MSIMSGS 32 MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); struct pptintr_arg { /* pptintr(pptintr_arg) */ struct pptdev *pptdev; int vec; int vcpu; }; static struct pptdev { device_t dev; struct vm *vm; /* owner of this device */ struct vm_memory_segment mmio[MAX_MMIOSEGS]; struct { int num_msgs; /* guest state */ int vector; int vcpu; int startrid; /* host state */ struct resource *res[MAX_MSIMSGS]; void *cookie[MAX_MSIMSGS]; struct pptintr_arg arg[MAX_MSIMSGS]; } msi; struct { int num_msgs; int startrid; int msix_table_rid; struct resource *msix_table_res; struct resource **res; void **cookie; struct pptintr_arg *arg; } msix; } pptdevs[32]; static int num_pptdevs; static int ppt_probe(device_t dev) { int bus, slot, func; struct pci_devinfo *dinfo; dinfo = (struct pci_devinfo *)device_get_ivars(dev); bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); /* * To qualify as a pci passthrough device a device must: * - be allowed by administrator to be used in this role * - be an endpoint device */ if (vmm_is_pptdev(bus, slot, func) && (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL) return (0); else return (ENXIO); } static int ppt_attach(device_t dev) { int n; if (num_pptdevs >= MAX_PPTDEVS) { printf("ppt_attach: maximum number of pci passthrough devices " "exceeded\n"); return (ENXIO); } n = num_pptdevs++; pptdevs[n].dev = dev; if (bootverbose) device_printf(dev, "attached\n"); return (0); } static int ppt_detach(device_t dev) { /* * XXX check whether there are any pci passthrough devices assigned * to guests before we allow this driver to detach. 
*/ return (0); } static device_method_t ppt_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ppt_probe), DEVMETHOD(device_attach, ppt_attach), DEVMETHOD(device_detach, ppt_detach), {0, 0} }; static devclass_t ppt_devclass; DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0); DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL); static struct pptdev * ppt_find(int bus, int slot, int func) { device_t dev; int i, b, s, f; for (i = 0; i < num_pptdevs; i++) { dev = pptdevs[i].dev; b = pci_get_bus(dev); s = pci_get_slot(dev); f = pci_get_function(dev); if (bus == b && slot == s && func == f) return (&pptdevs[i]); } return (NULL); } static void ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) { int i; struct vm_memory_segment *seg; for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) continue; (void)vm_unmap_mmio(vm, seg->gpa, seg->len); bzero(seg, sizeof(struct vm_memory_segment)); } } static void ppt_teardown_msi(struct pptdev *ppt) { int i, rid; void *cookie; struct resource *res; if (ppt->msi.num_msgs == 0) return; for (i = 0; i < ppt->msi.num_msgs; i++) { rid = ppt->msi.startrid + i; res = ppt->msi.res[i]; cookie = ppt->msi.cookie[i]; if (cookie != NULL) bus_teardown_intr(ppt->dev, res, cookie); if (res != NULL) bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); ppt->msi.res[i] = NULL; ppt->msi.cookie[i] = NULL; } if (ppt->msi.startrid == 1) pci_release_msi(ppt->dev); ppt->msi.num_msgs = 0; } static void ppt_teardown_msix_intr(struct pptdev *ppt, int idx) { int rid; struct resource *res; void *cookie; rid = ppt->msix.startrid + idx; res = ppt->msix.res[idx]; cookie = ppt->msix.cookie[idx]; if (cookie != NULL) bus_teardown_intr(ppt->dev, res, cookie); if (res != NULL) bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); ppt->msix.res[idx] = NULL; ppt->msix.cookie[idx] = NULL; } static void ppt_teardown_msix(struct pptdev *ppt) { int i, error; if (ppt->msix.num_msgs == 0) return; for (i = 0; i < ppt->msix.num_msgs; i++) ppt_teardown_msix_intr(ppt, i); if (ppt->msix.msix_table_res) { bus_release_resource(ppt->dev, SYS_RES_MEMORY, ppt->msix.msix_table_rid, ppt->msix.msix_table_res); ppt->msix.msix_table_res = NULL; ppt->msix.msix_table_rid = 0; } free(ppt->msix.res, M_PPTMSIX); free(ppt->msix.cookie, M_PPTMSIX); free(ppt->msix.arg, M_PPTMSIX); error = pci_release_msi(ppt->dev); if (error) printf("ppt_teardown_msix: Failed to release MSI-X resources (error %i)\n", error); ppt->msix.num_msgs = 0; } int ppt_assign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; ppt = ppt_find(bus, slot, func); if (ppt != NULL) { /* * If this device is owned by a different VM then we * cannot change its owner. */ if (ppt->vm != NULL && ppt->vm != vm) return (EBUSY); ppt->vm = vm; iommu_add_device(vm_iommu_domain(vm), bus, slot, func); return (0); } return (ENOENT); } int ppt_unassign_device(struct vm *vm, int bus, int slot, int func) { struct pptdev *ppt; ppt = ppt_find(bus, slot, func); if (ppt != NULL) { /* * If this device is not owned by this 'vm' then bail out. 
*/ if (ppt->vm != vm) return (EBUSY); ppt_unmap_mmio(vm, ppt); ppt_teardown_msi(ppt); ppt_teardown_msix(ppt); iommu_remove_device(vm_iommu_domain(vm), bus, slot, func); ppt->vm = NULL; return (0); } return (ENOENT); } int ppt_unassign_all(struct vm *vm) { int i, bus, slot, func; device_t dev; for (i = 0; i < num_pptdevs; i++) { if (pptdevs[i].vm == vm) { dev = pptdevs[i].dev; bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); ppt_unassign_device(vm, bus, slot, func); } } return (0); } int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { int i, error; struct vm_memory_segment *seg; struct pptdev *ppt; ppt = ppt_find(bus, slot, func); if (ppt != NULL) { if (ppt->vm != vm) return (EBUSY); for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) { error = vm_map_mmio(vm, gpa, len, hpa); if (error == 0) { seg->gpa = gpa; seg->len = len; - seg->hpa = hpa; } return (error); } } return (ENOSPC); } return (ENOENT); } static int pptintr(void *arg) { int vec; struct pptdev *ppt; struct pptintr_arg *pptarg; pptarg = arg; ppt = pptarg->pptdev; vec = pptarg->vec; if (ppt->vm != NULL) (void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec); else { /* * XXX * This is not expected to happen - panic? */ } /* * For legacy interrupts give other filters a chance in case * the interrupt was not generated by the passthrough device. */ if (ppt->msi.startrid == 0) return (FILTER_STRAY); else return (FILTER_HANDLED); } /* * XXX * When we try to free the MSI resource the kernel will bind the thread to * the host cpu was originally handling the MSI. The function freeing the * MSI vector (apic_free_vector()) will panic the kernel if the thread * is already bound to a cpu. * * So, we temporarily unbind the vcpu thread before freeing the MSI resource. */ static void PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt) { int pincpu = -1; vm_get_pinning(vm, vcpu, &pincpu); if (pincpu >= 0) vm_set_pinning(vm, vcpu, -1); ppt_teardown_msi(ppt); if (pincpu >= 0) vm_set_pinning(vm, vcpu, pincpu); } int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, int destcpu, int vector, int numvec) { int i, rid, flags; int msi_count, startrid, error, tmp; struct pptdev *ppt; if ((destcpu >= VM_MAXCPU || destcpu < 0) || (vector < 0 || vector > 255) || (numvec < 0 || numvec > MAX_MSIMSGS)) return (EINVAL); ppt = ppt_find(bus, slot, func); if (ppt == NULL) return (ENOENT); if (ppt->vm != vm) /* Make sure we own this device */ return (EBUSY); /* Free any allocated resources */ PPT_TEARDOWN_MSI(vm, vcpu, ppt); if (numvec == 0) /* nothing more to do */ return (0); flags = RF_ACTIVE; msi_count = pci_msi_count(ppt->dev); if (msi_count == 0) { startrid = 0; /* legacy interrupt */ msi_count = 1; flags |= RF_SHAREABLE; } else startrid = 1; /* MSI */ /* * The device must be capable of supporting the number of vectors * the guest wants to allocate. */ if (numvec > msi_count) return (EINVAL); /* * Make sure that we can allocate all the MSI vectors that are needed * by the guest. */ if (startrid == 1) { tmp = numvec; error = pci_alloc_msi(ppt->dev, &tmp); if (error) return (error); else if (tmp != numvec) { pci_release_msi(ppt->dev); return (ENOSPC); } else { /* success */ } } ppt->msi.vector = vector; ppt->msi.vcpu = destcpu; ppt->msi.startrid = startrid; /* * Allocate the irq resource and attach it to the interrupt handler. 
*/ for (i = 0; i < numvec; i++) { ppt->msi.num_msgs = i + 1; ppt->msi.cookie[i] = NULL; rid = startrid + i; ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, &rid, flags); if (ppt->msi.res[i] == NULL) break; ppt->msi.arg[i].pptdev = ppt; ppt->msi.arg[i].vec = vector + i; error = bus_setup_intr(ppt->dev, ppt->msi.res[i], INTR_TYPE_NET | INTR_MPSAFE, pptintr, NULL, &ppt->msi.arg[i], &ppt->msi.cookie[i]); if (error != 0) break; } if (i < numvec) { PPT_TEARDOWN_MSI(vm, vcpu, ppt); return (ENXIO); } return (0); } int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, int idx, uint32_t msg, uint32_t vector_control, uint64_t addr) { struct pptdev *ppt; struct pci_devinfo *dinfo; int numvec, vector_count, rid, error; size_t res_size, cookie_size, arg_size; ppt = ppt_find(bus, slot, func); if (ppt == NULL) return (ENOENT); if (ppt->vm != vm) /* Make sure we own this device */ return (EBUSY); dinfo = device_get_ivars(ppt->dev); if (!dinfo) return (ENXIO); /* * First-time configuration: * Allocate the MSI-X table * Allocate the IRQ resources * Set up some variables in ppt->msix */ if (!ppt->msix.msix_table_res) { ppt->msix.res = NULL; ppt->msix.cookie = NULL; ppt->msix.arg = NULL; rid = dinfo->cfg.msix.msix_table_bar; ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (ppt->msix.msix_table_res == NULL) return (ENOSPC); ppt->msix.msix_table_rid = rid; vector_count = numvec = pci_msix_count(ppt->dev); error = pci_alloc_msix(ppt->dev, &numvec); if (error) return (error); else if (vector_count != numvec) { pci_release_msi(ppt->dev); return (ENOSPC); } ppt->msix.num_msgs = numvec; ppt->msix.startrid = 1; res_size = numvec * sizeof(ppt->msix.res[0]); cookie_size = numvec * sizeof(ppt->msix.cookie[0]); arg_size = numvec * sizeof(ppt->msix.arg[0]); ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK); ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, M_WAITOK); ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK); if (ppt->msix.res == NULL || ppt->msix.cookie == NULL || ppt->msix.arg == NULL) { ppt_teardown_msix(ppt); return (ENOSPC); } bzero(ppt->msix.res, res_size); bzero(ppt->msix.cookie, cookie_size); bzero(ppt->msix.arg, arg_size); } if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* Tear down the IRQ if it's already set up */ ppt_teardown_msix_intr(ppt, idx); /* Allocate the IRQ resource */ ppt->msix.cookie[idx] = NULL; rid = ppt->msix.startrid + idx; ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (ppt->msix.res[idx] == NULL) return (ENXIO); ppt->msix.arg[idx].pptdev = ppt; ppt->msix.arg[idx].vec = msg; ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF; /* Setup the MSI-X interrupt */ error = bus_setup_intr(ppt->dev, ppt->msix.res[idx], INTR_TYPE_NET | INTR_MPSAFE, pptintr, NULL, &ppt->msix.arg[idx], &ppt->msix.cookie[idx]); if (error != 0) { bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]); bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]); ppt->msix.cookie[idx] = NULL; ppt->msix.res[idx] = NULL; return (ENXIO); } } else { /* Masked, tear it down if it's already been set up */ ppt_teardown_msix_intr(ppt, idx); } return (0); } Index: projects/bhyve/sys/amd64/vmm/vmm.c =================================================================== --- projects/bhyve/sys/amd64/vmm/vmm.c (revision 241177) +++ projects/bhyve/sys/amd64/vmm/vmm.c (revision 241178) @@ -1,818 +1,853 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_mem.h" #include "vmm_util.h" #include #include "vlapic.h" #include "vmm_msr.h" #include "vmm_ipi.h" #include "vmm_stat.h" #include "io/ppt.h" #include "io/iommu.h" struct vlapic; struct vcpu { int flags; int pincpu; /* host cpuid this vcpu is bound to */ int hostcpu; /* host cpuid this vcpu last ran on */ uint64_t guest_msrs[VMM_MSR_NUM]; struct vlapic *vlapic; int vcpuid; struct savefpu *guestfpu; /* guest fpu state */ void *stats; struct vm_exit exitinfo; enum x2apic_state x2apic_state; }; #define VCPU_F_PINNED 0x0001 #define VCPU_F_RUNNING 0x0002 #define VCPU_PINCPU(vm, vcpuid) \ ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1) #define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED) #define VCPU_PIN(vm, vcpuid, host_cpuid) \ do { \ vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \ vm->vcpu[vcpuid].pincpu = host_cpuid; \ } while(0) #define VM_MAX_MEMORY_SEGMENTS 2 struct vm { void *cookie; /* processor-specific data */ void *iommu; /* iommu-specific data */ struct vcpu vcpu[VM_MAXCPU]; int num_mem_segs; struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; char name[VM_MAX_NAMELEN]; /* * Set of active vcpus. * An active vcpu is one that has been started implicitly (BSP) or * explicitly (AP) by sending it a startup ipi. */ cpuset_t active_cpus; }; static struct vmm_ops *ops; #define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) #define VMRUN(vmi, vcpu, rip) \ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \ (ops != NULL ? \ (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \ ENXIO) #define VMMMAP_GET(vmi, gpa) \ (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ (ops != NULL ? 
(*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) #define VMGETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMSETDESC(vmi, vcpu, num, desc) \ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) #define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) #define VMNMI(vmi, vcpu) \ (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO) #define VMGETCAP(vmi, vcpu, num, retval) \ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) #define fpu_start_emulating() start_emulating() #define fpu_stop_emulating() stop_emulating() static MALLOC_DEFINE(M_VM, "vm", "vm"); CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ /* statistics */ static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); static void vcpu_cleanup(struct vcpu *vcpu) { vlapic_cleanup(vcpu->vlapic); vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); } static void vcpu_init(struct vm *vm, uint32_t vcpu_id) { struct vcpu *vcpu; vcpu = &vm->vcpu[vcpu_id]; vcpu->hostcpu = -1; vcpu->vcpuid = vcpu_id; vcpu->vlapic = vlapic_init(vm, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED); vcpu->guestfpu = fpu_save_area_alloc(); fpu_save_area_reset(vcpu->guestfpu); vcpu->stats = vmm_stat_alloc(); } struct vm_exit * vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; if (cpuid < 0 || cpuid >= VM_MAXCPU) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; return (&vcpu->exitinfo); } static int vmm_init(void) { int error; vmm_ipi_init(); error = vmm_mem_init(); if (error) return (error); if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_amd()) ops = &vmm_ops_amd; else return (ENXIO); vmm_msr_init(); return (VMM_INIT()); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: vmmdev_init(); iommu_init(); error = vmm_init(); break; case MOD_UNLOAD: vmmdev_cleanup(); iommu_cleanup(); vmm_ipi_cleanup(); error = VMM_CLEANUP(); break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * Execute the module load handler after the pci passthru driver has had * a chance to claim devices. We need this information at the time we do * iommu initialization. 
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

struct vm *
vm_create(const char *name)
{
        int i;
        struct vm *vm;
        vm_paddr_t maxaddr;
        const int BSP = 0;

        if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
                return (NULL);

        vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
        strcpy(vm->name, name);
        vm->cookie = VMINIT(vm);

        for (i = 0; i < VM_MAXCPU; i++) {
                vcpu_init(vm, i);
                guest_msrs_init(vm, i);
        }

        maxaddr = vmm_mem_maxaddr();
        vm->iommu = iommu_create_domain(maxaddr);
        vm_activate_cpu(vm, BSP);

        return (vm);
}

+static void
+vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
+{
+       size_t len;
+       vm_paddr_t hpa;
+
+       len = 0;
+       while (len < seg->len) {
+               hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
+               if (hpa == (vm_paddr_t)-1) {
+                       panic("vm_free_mem_segs: cannot free hpa "
+                             "associated with gpa 0x%016lx", seg->gpa + len);
+               }
+
+               vmm_mem_free(hpa, PAGE_SIZE);
+
+               len += PAGE_SIZE;
+       }
+
+       bzero(seg, sizeof(struct vm_memory_segment));
+}
+
void
vm_destroy(struct vm *vm)
{
        int i;

        ppt_unassign_all(vm);

        for (i = 0; i < vm->num_mem_segs; i++)
-               vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len);
+               vm_free_mem_seg(vm, &vm->mem_segs[i]);
+       vm->num_mem_segs = 0;
+
        for (i = 0; i < VM_MAXCPU; i++)
                vcpu_cleanup(&vm->vcpu[i]);

        iommu_destroy_domain(vm->iommu);

        VMCLEANUP(vm->cookie);

        free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
        return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
        const boolean_t spok = TRUE;    /* superpage mappings are ok */

        return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
                           VM_PROT_RW, spok));
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
        const boolean_t spok = TRUE;    /* superpage mappings are ok */

        return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
                           VM_PROT_NONE, spok));
}

/*
 * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
 */
static boolean_t
vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
{
        int i;
        vm_paddr_t gpabase, gpalimit;

        if (gpa & PAGE_MASK)
                panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);

        for (i = 0; i < vm->num_mem_segs; i++) {
                gpabase = vm->mem_segs[i].gpa;
                gpalimit = gpabase + vm->mem_segs[i].len;
                if (gpa >= gpabase && gpa < gpalimit)
                        return (FALSE);
        }

        return (TRUE);
}

int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
        int error, available, allocated;
+       struct vm_memory_segment *seg;
        vm_paddr_t g, hpa;
        const boolean_t spok = TRUE;    /* superpage mappings are ok */

        if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
                return (EINVAL);

        available = allocated = 0;
        g = gpa;
        while (g < gpa + len) {
                if (vm_gpa_available(vm, g))
                        available++;
                else
                        allocated++;

                g += PAGE_SIZE;
        }

        /*
         * If there are some allocated and some available pages in the address
         * range then it is an error.
         */
        if (allocated && available)
                return (EINVAL);

        /*
         * If the entire address range being requested has already been
         * allocated then there isn't anything more to do.
*/ if (allocated && available == 0) return (0); if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) return (E2BIG); - hpa = vmm_mem_alloc(len); - if (hpa == 0) - return (ENOMEM); + seg = &vm->mem_segs[vm->num_mem_segs]; - error = VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, - VM_PROT_ALL, spok); - if (error) { - vmm_mem_free(hpa, len); + seg->gpa = gpa; + seg->len = 0; + while (seg->len < len) { + hpa = vmm_mem_alloc(PAGE_SIZE); + if (hpa == 0) { + error = ENOMEM; + break; + } + + error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE, + VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok); + if (error) + break; + + iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); + + seg->len += PAGE_SIZE; + } + + if (seg->len != len) { + vm_free_mem_seg(vm, seg); return (error); } - iommu_create_mapping(vm->iommu, gpa, hpa, len); - - vm->mem_segs[vm->num_mem_segs].gpa = gpa; - vm->mem_segs[vm->num_mem_segs].hpa = hpa; - vm->mem_segs[vm->num_mem_segs].len = len; vm->num_mem_segs++; return (0); } vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) { vm_paddr_t nextpage; nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE); if (len > nextpage - gpa) panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len); return (VMMMAP_GET(vm->cookie, gpa)); } int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, struct vm_memory_segment *seg) { int i; for (i = 0; i < vm->num_mem_segs; i++) { if (gpabase == vm->mem_segs[i].gpa) { *seg = vm->mem_segs[i]; return (0); } } return (-1); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMGETREG(vm->cookie, vcpu, reg, retval)); } int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (reg >= VM_REG_LAST) return (EINVAL); return (VMSETREG(vm->cookie, vcpu, reg, val)); } static boolean_t is_descriptor_table(int reg) { switch (reg) { case VM_REG_GUEST_IDTR: case VM_REG_GUEST_GDTR: return (TRUE); default: return (FALSE); } } static boolean_t is_segment_register(int reg) { switch (reg) { case VM_REG_GUEST_ES: case VM_REG_GUEST_CS: case VM_REG_GUEST_SS: case VM_REG_GUEST_DS: case VM_REG_GUEST_FS: case VM_REG_GUEST_GS: case VM_REG_GUEST_TR: case VM_REG_GUEST_LDTR: return (TRUE); default: return (FALSE); } } int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMGETDESC(vm->cookie, vcpu, reg, desc)); } int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) return (EINVAL); return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } int vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); *cpuid = VCPU_PINCPU(vm, vcpuid); return (0); } int vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid) { struct thread *td; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); td = curthread; /* XXXSMP only safe when muxing vcpus */ /* unpin */ if (host_cpuid < 0) { VCPU_UNPIN(vm, vcpuid); thread_lock(td); sched_unbind(td); thread_unlock(td); return (0); } if (CPU_ABSENT(host_cpuid)) return (EINVAL); /* * XXX we should check that 'host_cpuid' has not already been pinned * by another vm. 
*/ thread_lock(td); sched_bind(td, host_cpuid); thread_unlock(td); VCPU_PIN(vm, vcpuid, host_cpuid); return (0); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ fpuexit(curthread); fpu_stop_emulating(); fpurestore(vcpu->guestfpu); } static void save_guest_fpustate(struct vcpu *vcpu) { fpusave(vcpu->guestfpu); fpu_start_emulating(); } int vm_run(struct vm *vm, struct vm_run *vmrun) { int error, vcpuid; struct vcpu *vcpu; struct pcb *pcb; uint64_t tscval; vcpuid = vmrun->cpuid; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; critical_enter(); tscval = rdtsc(); pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); vcpu->hostcpu = curcpu; restore_guest_msrs(vm, vcpuid); restore_guest_fpustate(vcpu); error = VMRUN(vm->cookie, vcpuid, vmrun->rip); save_guest_fpustate(vcpu); restore_host_msrs(vm, vcpuid); vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); /* copy the exit information */ bcopy(&vcpu->exitinfo, &vmrun->vm_exit, sizeof(struct vm_exit)); critical_exit(); return (error); } int vm_inject_event(struct vm *vm, int vcpuid, int type, int vector, uint32_t code, int code_valid) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0) return (EINVAL); if (vector < 0 || vector > 255) return (EINVAL); return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); } int vm_inject_nmi(struct vm *vm, int vcpu) { int error; if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); error = VMNMI(vm->cookie, vcpu); vm_interrupt_hostcpu(vm, vcpu); return (error); } int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMGETCAP(vm->cookie, vcpu, type, retval)); } int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { if (vcpu < 0 || vcpu >= VM_MAXCPU) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (VMSETCAP(vm->cookie, vcpu, type, val)); } uint64_t * vm_guest_msrs(struct vm *vm, int cpu) { return (vm->vcpu[cpu].guest_msrs); } struct vlapic * vm_lapic(struct vm *vm, int cpu) { return (vm->vcpu[cpu].vlapic); } boolean_t vmm_is_pptdev(int bus, int slot, int func) { int found, b, s, f, n; char *val, *cp, *cp2; /* * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; cp = val = getenv("pptdevs"); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; n = sscanf(cp, "%d/%d/%d", &b, &s, &f); if (n == 3 && bus == b && slot == s && func == f) { found = 1; break; } if (cp2 != NULL) *cp2++ = ' '; cp = cp2; } freeenv(val); return (found); } void * vm_iommu_domain(struct vm *vm) { return (vm->iommu); } void vm_set_run_state(struct vm *vm, int vcpuid, int state) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if (state == VCPU_RUNNING) { if (vcpu->flags & VCPU_F_RUNNING) { panic("vm_set_run_state: %s[%d] is already running", vm_name(vm), vcpuid); } vcpu->flags |= VCPU_F_RUNNING; } else { if ((vcpu->flags & VCPU_F_RUNNING) == 0) { panic("vm_set_run_state: %s[%d] is already stopped", vm_name(vm), vcpuid); } vcpu->flags &= ~VCPU_F_RUNNING; } } int vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr) { int retval, hostcpu; struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= VM_MAXCPU) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; if 
(vcpu->flags & VCPU_F_RUNNING) { retval = VCPU_RUNNING; hostcpu = vcpu->hostcpu; } else { retval = VCPU_STOPPED; hostcpu = -1; } if (cpuptr) *cpuptr = hostcpu; return (retval); } void vm_activate_cpu(struct vm *vm, int vcpuid) { if (vcpuid >= 0 && vcpuid < VM_MAXCPU) CPU_SET(vcpuid, &vm->active_cpus); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } void * vcpu_stats(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid].stats); } int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; return (0); } int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { if (vcpuid < 0 || vcpuid >= VM_MAXCPU) return (EINVAL); if (state < 0 || state >= X2APIC_STATE_LAST) return (EINVAL); vm->vcpu[vcpuid].x2apic_state = state; vlapic_set_x2apic_state(vm, vcpuid, state); return (0); } Index: projects/bhyve/sys/amd64/vmm/vmm_dev.c =================================================================== --- projects/bhyve/sys/amd64/vmm/vmm_dev.c (revision 241177) +++ projects/bhyve/sys/amd64/vmm/vmm_dev.c (revision 241178) @@ -1,509 +1,509 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_stat.h" #include "vmm_mem.h" #include "io/ppt.h" #include struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; SLIST_ENTRY(vmmdev_softc) link; }; static SLIST_HEAD(, vmmdev_softc) head; static struct mtx vmmdev_mtx; static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); static struct vmmdev_softc * vmmdev_lookup(const char *name) { struct vmmdev_softc *sc; #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif SLIST_FOREACH(sc, &head, link) { if (strcmp(name, vm_name(sc->vm)) == 0) break; } return (sc); } static struct vmmdev_softc * vmmdev_lookup2(struct cdev *cdev) { struct vmmdev_softc *sc; #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif SLIST_FOREACH(sc, &head, link) { if (sc->cdev == cdev) break; } return (sc); } static int vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) { int error, off, c; vm_paddr_t hpa, gpa; struct vmmdev_softc *sc; static char zerobuf[PAGE_SIZE]; error = 0; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup2(cdev); while (uio->uio_resid > 0 && error == 0) { gpa = uio->uio_offset; off = gpa & PAGE_MASK; c = min(uio->uio_resid, PAGE_SIZE - off); /* * The VM has a hole in its physical memory map. If we want to * use 'dd' to inspect memory beyond the hole we need to * provide bogus data for memory that lies in the hole. * * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). */ hpa = vm_gpa2hpa(sc->vm, gpa, c); if (hpa == (vm_paddr_t)-1) { if (uio->uio_rw == UIO_READ) error = uiomove(zerobuf, c, uio); else error = EFAULT; } else error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio); } mtx_unlock(&vmmdev_mtx); return (error); } static int vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error, vcpu; struct vmmdev_softc *sc; struct vm_memory_segment *seg; struct vm_register *vmreg; struct vm_seg_desc* vmsegdesc; struct vm_pin *vmpin; struct vm_run *vmrun; struct vm_event *vmevent; struct vm_lapic_irq *vmirq; struct vm_capability *vmcap; struct vm_pptdev *pptdev; struct vm_pptdev_mmio *pptmmio; struct vm_pptdev_msi *pptmsi; struct vm_pptdev_msix *pptmsix; struct vm_nmi *vmnmi; struct vm_stats *vmstats; struct vm_stat_desc *statdesc; struct vm_x2apic *x2apic; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup2(cdev); if (sc == NULL) { mtx_unlock(&vmmdev_mtx); return (ENXIO); } /* * Some VMM ioctls can operate only on vcpus that are not running. */ switch (cmd) { case VM_RUN: case VM_SET_PINNING: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: case VM_INJECT_EVENT: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: case VM_PPTDEV_MSI: case VM_SET_X2APIC_STATE: /* * XXX fragile, handle with care * Assumes that the first field of the ioctl data is the vcpu. 
*/ vcpu = *(int *)data; if (vcpu < 0 || vcpu >= VM_MAXCPU) { error = EINVAL; goto done; } if (vcpu_is_running(sc->vm, vcpu, NULL)) { error = EBUSY; goto done; } break; default: break; } switch(cmd) { case VM_RUN: vmrun = (struct vm_run *)data; vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_RUNNING); mtx_unlock(&vmmdev_mtx); error = vm_run(sc->vm, vmrun); mtx_lock(&vmmdev_mtx); vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_STOPPED); break; case VM_STAT_DESC: { const char *desc; statdesc = (struct vm_stat_desc *)data; desc = vmm_stat_desc(statdesc->index); if (desc != NULL) { error = 0; strlcpy(statdesc->desc, desc, sizeof(statdesc->desc)); } else error = EINVAL; break; } case VM_STATS: { CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES); vmstats = (struct vm_stats *)data; getmicrotime(&vmstats->tv); error = vmm_stat_copy(sc->vm, vmstats->cpuid, &vmstats->num_entries, vmstats->statbuf); break; } case VM_PPTDEV_MSI: pptmsi = (struct vm_pptdev_msi *)data; error = ppt_setup_msi(sc->vm, pptmsi->vcpu, pptmsi->bus, pptmsi->slot, pptmsi->func, pptmsi->destcpu, pptmsi->vector, pptmsi->numvec); break; case VM_PPTDEV_MSIX: pptmsix = (struct vm_pptdev_msix *)data; error = ppt_setup_msix(sc->vm, pptmsix->vcpu, pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx, pptmsix->msg, pptmsix->vector_control, pptmsix->addr); break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); break; case VM_BIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_UNBIND_PPTDEV: pptdev = (struct vm_pptdev *)data; error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot, pptdev->func); break; case VM_INJECT_EVENT: vmevent = (struct vm_event *)data; error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type, vmevent->vector, vmevent->error_code, vmevent->error_code_valid); break; case VM_INJECT_NMI: vmnmi = (struct vm_nmi *)data; error = vm_inject_nmi(sc->vm, vmnmi->cpuid); break; case VM_LAPIC_IRQ: vmirq = (struct vm_lapic_irq *)data; error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector); break; case VM_SET_PINNING: vmpin = (struct vm_pin *)data; error = vm_set_pinning(sc->vm, vmpin->vm_cpuid, vmpin->host_cpuid); break; case VM_GET_PINNING: vmpin = (struct vm_pin *)data; error = vm_get_pinning(sc->vm, vmpin->vm_cpuid, &vmpin->host_cpuid); break; case VM_MAP_MEMORY: seg = (struct vm_memory_segment *)data; error = vm_malloc(sc->vm, seg->gpa, seg->len); break; case VM_GET_MEMORY_SEG: seg = (struct vm_memory_segment *)data; - seg->hpa = seg->len = 0; + seg->len = 0; (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); error = 0; break; case VM_GET_REGISTER: vmreg = (struct vm_register *)data; error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, &vmreg->regval); break; case VM_SET_REGISTER: vmreg = (struct vm_register *)data; error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, vmreg->regval); break; case VM_SET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_SEGMENT_DESCRIPTOR: vmsegdesc = (struct vm_seg_desc *)data; error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, vmsegdesc->regnum, &vmsegdesc->desc); break; case VM_GET_CAPABILITY: vmcap = (struct vm_capability *)data; error = vm_get_capability(sc->vm, vmcap->cpuid, vmcap->captype, &vmcap->capval); break; case VM_SET_CAPABILITY: 
vmcap = (struct vm_capability *)data; error = vm_set_capability(sc->vm, vmcap->cpuid, vmcap->captype, vmcap->capval); break; case VM_SET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_set_x2apic_state(sc->vm, x2apic->cpuid, x2apic->state); break; case VM_GET_X2APIC_STATE: x2apic = (struct vm_x2apic *)data; error = vm_get_x2apic_state(sc->vm, x2apic->cpuid, &x2apic->state); break; default: error = ENOTTY; break; } done: mtx_unlock(&vmmdev_mtx); return (error); } static int vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { int error; struct vmmdev_softc *sc; error = -1; mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup2(cdev); if (sc != NULL && (nprot & PROT_EXEC) == 0) { *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE); if (*paddr != (vm_paddr_t)-1) error = 0; } mtx_unlock(&vmmdev_mtx); return (error); } static void vmmdev_destroy(struct vmmdev_softc *sc) { #ifdef notyet /* XXX kernel is not compiled with invariants */ mtx_assert(&vmmdev_mtx, MA_OWNED); #endif /* * XXX must stop virtual machine instances that may be still * running and cleanup their state. */ SLIST_REMOVE(&head, sc, vmmdev_softc, link); destroy_dev(sc->cdev); vm_destroy(sc->vm); free(sc, M_VMMDEV); } static int sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { int error; char buf[VM_MAX_NAMELEN]; struct vmmdev_softc *sc; strlcpy(buf, "beavis", sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc == NULL) { mtx_unlock(&vmmdev_mtx); return (EINVAL); } vmmdev_destroy(sc); mtx_unlock(&vmmdev_mtx); return (0); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_vmm_destroy, "A", NULL); static struct cdevsw vmmdevsw = { .d_name = "vmmdev", .d_version = D_VERSION, .d_ioctl = vmmdev_ioctl, .d_mmap = vmmdev_mmap, .d_read = vmmdev_rw, .d_write = vmmdev_rw, }; static int sysctl_vmm_create(SYSCTL_HANDLER_ARGS) { int error; struct vm *vm; struct vmmdev_softc *sc; char buf[VM_MAX_NAMELEN]; strlcpy(buf, "beavis", sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&vmmdev_mtx); sc = vmmdev_lookup(buf); if (sc != NULL) { mtx_unlock(&vmmdev_mtx); return (EEXIST); } vm = vm_create(buf); if (vm == NULL) { mtx_unlock(&vmmdev_mtx); return (EINVAL); } sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->vm = vm; sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); sc->cdev->si_drv1 = sc; SLIST_INSERT_HEAD(&head, sc, link); mtx_unlock(&vmmdev_mtx); return (0); } SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_vmm_create, "A", NULL); static int sysctl_vmm_mem_total(SYSCTL_HANDLER_ARGS) { size_t val = vmm_mem_get_mem_total(); return sysctl_handle_long(oidp, &val, 0, req); } SYSCTL_PROC(_hw_vmm, OID_AUTO, mem_total, CTLTYPE_LONG | CTLFLAG_RD, 0, 0, sysctl_vmm_mem_total, "LU", "Amount of Total memory"); static int sysctl_vmm_mem_free(SYSCTL_HANDLER_ARGS) { size_t val = vmm_mem_get_mem_free(); return sysctl_handle_long(oidp, &val, 0, req); } SYSCTL_PROC(_hw_vmm, OID_AUTO, mem_free, CTLTYPE_LONG | CTLFLAG_RD, 0, 0, sysctl_vmm_mem_free, "LU", "Amount of Free memory"); void vmmdev_init(void) { mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); } void vmmdev_cleanup(void) { struct vmmdev_softc *sc, *sc2; mtx_lock(&vmmdev_mtx); SLIST_FOREACH_SAFE(sc, &head, 
link, sc2) vmmdev_destroy(sc); mtx_unlock(&vmmdev_mtx); } Index: projects/bhyve/sys/amd64/vmm/vmm_mem.c =================================================================== --- projects/bhyve/sys/amd64/vmm/vmm_mem.c (revision 241177) +++ projects/bhyve/sys/amd64/vmm/vmm_mem.c (revision 241178) @@ -1,436 +1,436 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_util.h" #include "vmm_mem.h" static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory"); #define MB (1024 * 1024) #define GB (1024 * MB) #define VMM_MEM_MAXSEGS 64 /* protected by vmm_mem_mtx */ static struct { vm_paddr_t base; vm_size_t length; } vmm_mem_avail[VMM_MEM_MAXSEGS]; static int vmm_mem_nsegs; size_t vmm_mem_total_bytes; static vm_paddr_t maxaddr; static struct mtx vmm_mem_mtx; /* * Steal any memory that was deliberately hidden from FreeBSD either by * the use of MAXMEM kernel config option or the hw.physmem loader tunable. */ static int vmm_mem_steal_memory(void) { int nsegs; caddr_t kmdp; uint32_t smapsize; uint64_t base, length; struct bios_smap *smapbase, *smap, *smapend; /* * Borrowed from hammer_time() and getmemsize() in machdep.c */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) panic("No BIOS smap info from loader!"); smapsize = *((uint32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); vmm_mem_total_bytes = 0; nsegs = 0; for (smap = smapbase; smap < smapend; smap++) { /* * XXX * Assuming non-overlapping, monotonically increasing * memory segments. */ if (smap->type != SMAP_TYPE_MEMORY) continue; if (smap->length == 0) break; base = roundup(smap->base, NBPDR); length = rounddown(smap->length, NBPDR); /* Skip this segment if FreeBSD is using all of it. */ if (base + length <= ptoa(Maxmem)) continue; /* * If FreeBSD is using part of this segment then adjust * 'base' and 'length' accordingly. 
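 * For example (illustrative numbers only): with ptoa(Maxmem) at 4GB
 * (0x100000000) and an SMAP segment rounded to [0xc0000000, 0x140000000),
 * 'used' below is 0x100000000 - 0xc0000000 = 0x40000000, so the range
 * left for vmm shrinks to [0x100000000, 0x140000000).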
*/ if (base < ptoa(Maxmem)) { uint64_t used; used = roundup(ptoa(Maxmem), NBPDR) - base; base += used; length -= used; } if (length == 0) continue; vmm_mem_avail[nsegs].base = base; vmm_mem_avail[nsegs].length = length; vmm_mem_total_bytes += length; if (base + length > maxaddr) maxaddr = base + length; if (0 && bootverbose) { printf("vmm_mem_populate: index %d, base 0x%0lx, " "length %ld\n", nsegs, vmm_mem_avail[nsegs].base, vmm_mem_avail[nsegs].length); } nsegs++; if (nsegs >= VMM_MEM_MAXSEGS) { printf("vmm_mem_populate: maximum number of vmm memory " "segments reached!\n"); return (ENOSPC); } } vmm_mem_nsegs = nsegs; return (0); } static void vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end) { vm_paddr_t addr, remaining; int pdpi, pdi, superpage_size; pml4_entry_t *pml4p; pdp_entry_t *pdp; pd_entry_t *pd; uint64_t page_attr_bits; if (end >= NBPML4) panic("Cannot map memory beyond %ldGB", NBPML4 / GB); if (vmm_supports_1G_pages()) superpage_size = NBPDP; else superpage_size = NBPDR; /* * Get the page directory pointer page that contains the direct * map address mappings. */ pml4p = kernel_pmap->pm_pml4; pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK); page_attr_bits = PG_RW | PG_V | PG_PS | PG_G; addr = start; while (addr < end) { remaining = end - addr; pdpi = addr / NBPDP; if (superpage_size == NBPDP && remaining >= NBPDP && addr % NBPDP == 0) { /* * If there isn't a mapping for this address then * create one but if there is one already make sure * it matches what we expect it to be. */ if (pdp[pdpi] == 0) { pdp[pdpi] = addr | page_attr_bits; if (0 && bootverbose) { printf("vmm_mem_populate: mapping " "0x%lx with 1GB page at " "pdpi %d\n", addr, pdpi); } } else { pdp_entry_t pdpe = pdp[pdpi]; if ((pdpe & ~PAGE_MASK) != addr || (pdpe & page_attr_bits) != page_attr_bits) { panic("An invalid mapping 0x%016lx " "already exists for 0x%016lx\n", pdpe, addr); } } addr += NBPDP; } else { if (remaining < NBPDR) { panic("vmm_mem_populate: remaining (%ld) must " "be greater than NBPDR (%d)\n", remaining, NBPDR); } if (pdp[pdpi] == 0) { /* * XXX we lose this memory forever because * we do not keep track of the virtual address * that would be required to free this page. */ pd = malloc(PAGE_SIZE, M_VMM_MEM, M_WAITOK | M_ZERO); if ((uintptr_t)pd & PAGE_MASK) { panic("vmm_mem_populate: page directory" "page not aligned on %d " "boundary\n", PAGE_SIZE); } pdp[pdpi] = vtophys(pd); pdp[pdpi] |= PG_RW | PG_V | PG_U; if (0 && bootverbose) { printf("Creating page directory " "at pdp index %d for 0x%016lx\n", pdpi, addr); } } pdi = (addr % NBPDP) / NBPDR; pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK); /* * Create a new mapping if one doesn't already exist * or validate it if it does. */ if (pd[pdi] == 0) { pd[pdi] = addr | page_attr_bits; if (0 && bootverbose) { printf("vmm_mem_populate: mapping " "0x%lx with 2MB page at " "pdpi %d, pdi %d\n", addr, pdpi, pdi); } } else { pd_entry_t pde = pd[pdi]; if ((pde & ~PAGE_MASK) != addr || (pde & page_attr_bits) != page_attr_bits) { panic("An invalid mapping 0x%016lx " "already exists for 0x%016lx\n", pde, addr); } } addr += NBPDR; } } } static int vmm_mem_populate(void) { int seg, error; vm_paddr_t start, end; /* populate the vmm_mem_avail[] array */ error = vmm_mem_steal_memory(); if (error) return (error); /* * Now map the memory that was hidden from FreeBSD in * the direct map VA space. 
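 * Illustrative note (not part of this change): vmm_mem_direct_map()
 * above installs only 2MB (NBPDR) or 1GB (NBPDP) superpage mappings in
 * the kernel's direct map, which is why each stolen segment below must
 * start and end on a 2MB boundary before it is mapped.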
*/ for (seg = 0; seg < vmm_mem_nsegs; seg++) { start = vmm_mem_avail[seg].base; end = start + vmm_mem_avail[seg].length; if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) { panic("start (0x%016lx) and end (0x%016lx) must be " "aligned on a %dMB boundary\n", start, end, NBPDR / MB); } vmm_mem_direct_map(start, end); } return (0); } int vmm_mem_init(void) { int error; mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF); error = vmm_mem_populate(); if (error) return (error); return (0); } vm_paddr_t vmm_mem_alloc(size_t size) { int i; vm_paddr_t addr; - if ((size & PDRMASK) != 0) { + if ((size & PAGE_MASK) != 0) { panic("vmm_mem_alloc: size 0x%0lx must be " - "aligned on a 0x%0x boundary\n", size, NBPDR); + "aligned on a 0x%0x boundary\n", size, PAGE_SIZE); } addr = 0; mtx_lock(&vmm_mem_mtx); for (i = 0; i < vmm_mem_nsegs; i++) { if (vmm_mem_avail[i].length >= size) { addr = vmm_mem_avail[i].base; vmm_mem_avail[i].base += size; vmm_mem_avail[i].length -= size; /* remove a zero length segment */ if (vmm_mem_avail[i].length == 0) { memmove(&vmm_mem_avail[i], &vmm_mem_avail[i + 1], (vmm_mem_nsegs - (i + 1)) * sizeof(vmm_mem_avail[0])); vmm_mem_nsegs--; } break; } } mtx_unlock(&vmm_mem_mtx); return (addr); } size_t vmm_mem_get_mem_total(void) { return vmm_mem_total_bytes; } size_t vmm_mem_get_mem_free(void) { size_t length = 0; int i; mtx_lock(&vmm_mem_mtx); for (i = 0; i < vmm_mem_nsegs; i++) { length += vmm_mem_avail[i].length; } mtx_unlock(&vmm_mem_mtx); return(length); } void vmm_mem_free(vm_paddr_t base, size_t length) { int i; - if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) { + if ((base & PAGE_MASK) != 0 || (length & PAGE_MASK) != 0) { panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be " - "aligned on a 0x%0x boundary\n", base, length, NBPDR); + "aligned on a 0x%0x boundary\n", base, length, PAGE_SIZE); } mtx_lock(&vmm_mem_mtx); for (i = 0; i < vmm_mem_nsegs; i++) { if (vmm_mem_avail[i].base > base) break; } if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS) panic("vmm_mem_free: cannot free any more segments"); /* Create a new segment at index 'i' */ memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i], (vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0])); vmm_mem_avail[i].base = base; vmm_mem_avail[i].length = length; vmm_mem_nsegs++; coalesce_some_more: for (i = 0; i < vmm_mem_nsegs - 1; i++) { if (vmm_mem_avail[i].base + vmm_mem_avail[i].length == vmm_mem_avail[i + 1].base) { vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length; memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2], (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0])); vmm_mem_nsegs--; goto coalesce_some_more; } } mtx_unlock(&vmm_mem_mtx); } vm_paddr_t vmm_mem_maxaddr(void) { return (maxaddr); } void vmm_mem_dump(void) { int i; vm_paddr_t base; vm_size_t length; mtx_lock(&vmm_mem_mtx); for (i = 0; i < vmm_mem_nsegs; i++) { base = vmm_mem_avail[i].base; length = vmm_mem_avail[i].length; printf("%-4d0x%016lx 0x%016lx\n", i, base, base + length); } mtx_unlock(&vmm_mem_mtx); }
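A minimal, self-contained sketch of the insert-and-coalesce step used by vmm_mem_free() above (illustration only, not part of this commit): seg_avail[], seg_count and seg_free() are hypothetical user-space stand-ins for vmm_mem_avail[], vmm_mem_nsegs and the locked kernel path, and the goto-based rescan is replaced by an index that only advances when no merge occurred.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>

#define MAXSEGS	64

/* Hypothetical stand-ins for vmm_mem_avail[] and vmm_mem_nsegs. */
static struct {
	uint64_t base;
	uint64_t length;
} seg_avail[MAXSEGS];
static int seg_count;

/* Insert a freed range in address order, then merge adjacent ranges. */
static void
seg_free(uint64_t base, uint64_t length)
{
	int i;

	assert(seg_count < MAXSEGS);

	/* Find the first segment that starts above 'base'. */
	for (i = 0; i < seg_count; i++) {
		if (seg_avail[i].base > base)
			break;
	}

	/* Open a slot at index 'i' and record the freed range. */
	memmove(&seg_avail[i + 1], &seg_avail[i],
	    (seg_count - i) * sizeof(seg_avail[0]));
	seg_avail[i].base = base;
	seg_avail[i].length = length;
	seg_count++;

	/* Coalesce any neighbours that now touch. */
	for (i = 0; i < seg_count - 1; ) {
		if (seg_avail[i].base + seg_avail[i].length ==
		    seg_avail[i + 1].base) {
			seg_avail[i].length += seg_avail[i + 1].length;
			memmove(&seg_avail[i + 1], &seg_avail[i + 2],
			    (seg_count - (i + 2)) * sizeof(seg_avail[0]));
			seg_count--;
		} else
			i++;
	}
}

int
main(void)
{
	int i;

	/* Freeing two touching 4KB pages yields a single 8KB segment. */
	seg_free(0x101000, 0x1000);
	seg_free(0x100000, 0x1000);
	for (i = 0; i < seg_count; i++)
		printf("%d: 0x%016llx 0x%llx\n", i,
		    (unsigned long long)seg_avail[i].base,
		    (unsigned long long)seg_avail[i].length);
	return (0);
}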