Index: lib/Makefile =================================================================== --- lib/Makefile +++ lib/Makefile @@ -205,6 +205,9 @@ .if ${MACHINE_CPUARCH} == "amd64" SUBDIR.${MK_PMC}+= libipt +.endif + +.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "aarch64" SUBDIR.${MK_BHYVE}+= libvmmapi .endif Index: lib/libvmmapi/Makefile =================================================================== --- lib/libvmmapi/Makefile +++ lib/libvmmapi/Makefile @@ -1,12 +1,21 @@ # $FreeBSD$ -PACKAGE=lib${LIB} -LIB= vmmapi -SRCS= vmmapi.c vmmapi_freebsd.c -INCS= vmmapi.h +PACKAGE= lib${LIB} +SHLIBDIR?= /lib +LIB_SRCTOP?= ${.CURDIR} -LIBADD= util +LIB= vmmapi +WARNS?= 2 -CFLAGS+= -I${.CURDIR} +.if exists(${LIB_SRCTOP}/${MACHINE}) +LIB_ARCH= ${MACHINE} +.elif exists(${LIB_SRCTOP}/${MACHINE_ARCH}) +LIB_ARCH= ${MACHINE_ARCH} +.else +LIB_ARCH= ${MACHINE_CPUARCH} +.endif + +CFLAGS+= -I${LIB_SRCTOP}/${LIB_ARCH} +.include "${LIB_SRCTOP}/${LIB_ARCH}/Makefile.inc" .include Index: lib/libvmmapi/amd64/Makefile.inc =================================================================== --- /dev/null +++ lib/libvmmapi/amd64/Makefile.inc @@ -0,0 +1,7 @@ +# $FreeBSD$ +.PATH: ${LIB_SRCTOP}/amd64/ + +SRCS= vmmapi.c vmmapi_freebsd.c +INCS= vmmapi.h + +LIBADD= util Index: lib/libvmmapi/arm64/Makefile.inc =================================================================== --- /dev/null +++ lib/libvmmapi/arm64/Makefile.inc @@ -0,0 +1,7 @@ +# $FreeBSD$ +.PATH: ${LIB_SRCTOP}/arm64/ + +SRCS= vmmapi.c +INCS= vmmapi.h + +LIBADD= util Index: lib/libvmmapi/arm64/vmmapi.h =================================================================== --- /dev/null +++ lib/libvmmapi/arm64/vmmapi.h @@ -0,0 +1,80 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMMAPI_H_ +#define _VMMAPI_H_ + +struct vmctx; +struct vm_exit; +enum vm_cap_type; + +/* + * Different styles of mapping the memory assigned to a VM into the address + * space of the controlling process. 
+ */ +enum vm_mmap_style { + VM_MMAP_NONE, /* no mapping */ + VM_MMAP_ALL, /* fully and statically mapped */ + VM_MMAP_SPARSE, /* mappings created on-demand */ +}; + +int vm_create(const char *name); +struct vmctx *vm_open(const char *name); +void vm_destroy(struct vmctx *ctx); +int vm_parse_memsize(const char *optarg, size_t *memsize); +int vm_get_memory_seg(struct vmctx *ctx, uint64_t gpa, size_t *ret_len); +int vm_setup_memory(struct vmctx *ctx, uint64_t membase, size_t len, enum vm_mmap_style s); +void *vm_map_ipa(struct vmctx *ctx, uint64_t gaddr, size_t len); +uint32_t vm_get_mem_limit(struct vmctx *ctx); +void vm_set_mem_limit(struct vmctx *ctx, uint32_t limit); +int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); +int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); +int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, + struct vm_exit *ret_vmexit); +const char *vm_capability_type2name(int type); +int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval); +int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int val); +int vm_assert_irq(struct vmctx *ctx, uint32_t irq, uint32_t vcpuid); +int vm_deassert_irq(struct vmctx *ctx, uint32_t irq, uint32_t vcpuid); + +/* + * Return a pointer to the statistics buffer. Note that this is not MT-safe. + */ +uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries); +const char *vm_get_stat_desc(struct vmctx *ctx, int index); + +/* Reset vcpu register state */ +int vcpu_reset(struct vmctx *ctx, int vcpu); +int vm_activate_cpu(struct vmctx *ctx, int vcpu); + +int vm_attach_vgic(struct vmctx *ctx, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size); +#endif /* _VMMAPI_H_ */ Index: lib/libvmmapi/arm64/vmmapi.c =================================================================== --- /dev/null +++ lib/libvmmapi/arm64/vmmapi.c @@ -0,0 +1,406 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "vmmapi.h" + +#define MB (1024 * 1024UL) +#define GB (1024 * 1024 * 1024UL) + +struct vmctx { + int fd; + uint32_t mem_limit; + enum vm_mmap_style vms; + size_t mem_size; + uint64_t mem_base; + char *mem_addr; + char *name; +}; + +#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) +#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) + +static int +vm_device_open(const char *name) +{ + int fd, len; + char *vmfile; + + len = strlen("/dev/vmm/") + strlen(name) + 1; + vmfile = malloc(len); + assert(vmfile != NULL); + snprintf(vmfile, len, "/dev/vmm/%s", name); + + /* Open the device file */ + fd = open(vmfile, O_RDWR, 0); + + free(vmfile); + return (fd); +} + +int +vm_create(const char *name) +{ + + return (CREATE((char *)name)); +} + +struct vmctx * +vm_open(const char *name) +{ + struct vmctx *vm; + + vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); + assert(vm != NULL); + + vm->fd = -1; + vm->mem_limit = 2 * GB; + vm->name = (char *)(vm + 1); + strcpy(vm->name, name); + + if ((vm->fd = vm_device_open(vm->name)) < 0) + goto err; + + return (vm); +err: + vm_destroy(vm); + return (NULL); +} + +void +vm_destroy(struct vmctx *vm) +{ + assert(vm != NULL); + + if (vm->fd >= 0) + close(vm->fd); + DESTROY(vm->name); + + free(vm); +} + +int +vm_parse_memsize(const char *optarg, size_t *ret_memsize) +{ + char *endptr; + size_t 
optval; + int error; + + optval = strtoul(optarg, &endptr, 0); + if (*optarg != '\0' && *endptr == '\0') { + /* + * For the sake of backward compatibility if the memory size + * specified on the command line is less than a megabyte then + * it is interpreted as being in units of MB. + */ + if (optval < MB) + optval *= MB; + *ret_memsize = optval; + error = 0; + } else + error = expand_number(optarg, ret_memsize); + + return (error); +} + +int +vm_get_memory_seg(struct vmctx *ctx, uint64_t gpa, size_t *ret_len) +{ + int error; + struct vm_memory_segment seg; + + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); + *ret_len = seg.len; + return (error); +} + +uint32_t +vm_get_mem_limit(struct vmctx *ctx) +{ + + return (ctx->mem_limit); +} + +void +vm_set_mem_limit(struct vmctx *ctx, uint32_t limit) +{ + + ctx->mem_limit = limit; +} + +static int +setup_memory_segment(struct vmctx *ctx, uint64_t gpa, size_t len, char **addr) +{ + int error; + struct vm_memory_segment seg; + + /* + * Create and optionally map 'len' bytes of memory at guest + * physical address 'gpa' + */ + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + seg.len = len; + error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); + if (error == 0 && addr != NULL) { + *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, + ctx->fd, gpa); + } + return (error); +} + +int +vm_setup_memory(struct vmctx *ctx, uint64_t membase, size_t memsize, enum vm_mmap_style vms) +{ + int error; + + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(vms == VM_MMAP_ALL); + + ctx->vms = vms; + ctx->mem_base = membase; + + assert(memsize <= ctx->mem_limit); + ctx->mem_size = memsize; + + if (ctx->mem_size > 0) { + error = setup_memory_segment(ctx, ctx->mem_base, ctx->mem_size, + &ctx->mem_addr); + if (error) + return (error); + } + + return (0); +} + +void * +vm_map_ipa(struct vmctx *ctx, uint64_t iaddr, size_t len) +{ + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(ctx->vms == VM_MMAP_ALL); + + if (iaddr < ctx->mem_base) + return ((void *)(ctx->mem_addr + iaddr)); + else + return ((void *)(ctx->mem_addr + (iaddr - ctx->mem_base))); +} + + +int +vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + vmreg.regval = val; + + error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); + return (error); +} + +int +vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); + *ret_val = vmreg.regval; + return (error); +} + +int +vm_run(struct vmctx *ctx, int vcpu, uint64_t pc, struct vm_exit *vmexit) +{ + int error; + struct vm_run vmrun; + + bzero(&vmrun, sizeof(vmrun)); + vmrun.cpuid = vcpu; + vmrun.pc = pc; + + error = ioctl(ctx->fd, VM_RUN, &vmrun); + bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); + return (error); +} + +static struct { + const char *name; + int type; +} capstrmap[] = { + { "hlt_exit", VM_CAP_HALT_EXIT }, + { "mtrap_exit", VM_CAP_MTRAP_EXIT }, + { "pause_exit", VM_CAP_PAUSE_EXIT }, + { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST }, + { 0 } +}; + +int +vm_capability_name2type(const char *capname) +{ + int i; + + for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) { + if (strcmp(capstrmap[i].name, capname) == 0) + return (capstrmap[i].type); + } + + return (-1); +} 
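/*
 * Illustrative usage sketch, not part of this change: how a bhyve-style
 * consumer might drive the arm64 libvmmapi calls added in this file.
 * The function name, entry point, memory size, and the run-loop handling
 * are placeholder assumptions; the 0x80000000 base mirrors the
 * VM_GUEST_BASE_IPA constant from the new sys/arm64/include/vmm.h, and
 * the headers follow the usual bhyve include pattern.
 */
#include <sys/types.h>
#include <stdint.h>

#include <machine/vmm.h>	/* struct vm_exit (new arm64 vmm.h) */

#include <err.h>

#include <vmmapi.h>

static int
run_guest_example(const char *name, size_t memsize, uint64_t entry)
{
	struct vmctx *ctx;
	struct vm_exit vmexit;
	int error;

	if (vm_create(name) != 0)
		err(1, "vm_create");
	if ((ctx = vm_open(name)) == NULL)
		errx(1, "vm_open");

	/* Back the guest IPA space starting at the platform base address. */
	if (vm_setup_memory(ctx, 0x80000000UL, memsize, VM_MMAP_ALL) != 0)
		errx(1, "vm_setup_memory");

	if (vm_activate_cpu(ctx, 0) != 0)
		errx(1, "vm_activate_cpu");

	/* Run vcpu 0; a real loop would dispatch on the exit reason. */
	error = vm_run(ctx, 0, entry, &vmexit);

	vm_destroy(ctx);
	return (error);
}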
+ +const char * +vm_capability_type2name(int type) +{ + int i; + + for (i = 0; capstrmap[i].name != NULL; i++) { + if (capstrmap[i].type == type) + return (capstrmap[i].name); + } + + return (NULL); +} + +int +vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval) +{ + int error; + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + + error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); + *retval = vmcap.capval; + return (error); +} + +int +vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) +{ + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + vmcap.capval = val; + + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); +} + +uint64_t * +vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries) +{ + int error; + + static struct vm_stats vmstats; + + vmstats.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_STATS, &vmstats); + if (error == 0) { + if (ret_entries) + *ret_entries = vmstats.num_entries; + if (ret_tv) + *ret_tv = vmstats.tv; + return (vmstats.statbuf); + } else + return (NULL); +} + +const char * +vm_get_stat_desc(struct vmctx *ctx, int index) +{ + static struct vm_stat_desc statdesc; + + statdesc.index = index; + if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) + return (statdesc.desc); + else + return (NULL); +} + +int +vcpu_reset(struct vmctx *vmctx, int vcpu) +{ + return (ENXIO); +} + +int +vm_attach_vgic(struct vmctx *ctx, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size) +{ + struct vm_attach_vgic vav; + + bzero(&vav, sizeof(vav)); + vav.dist_start = dist_start; + vav.dist_size = dist_size; + vav.redist_start = redist_start; + vav.redist_size = redist_size; + + return (ioctl(ctx->fd, VM_ATTACH_VGIC, &vav)); +} + +int +vm_assert_irq(struct vmctx *ctx, uint32_t irq, uint32_t vcpuid) +{ + struct vm_irq vi; + + bzero(&vi, sizeof(vi)); + vi.irq = irq; + vi.vcpuid = vcpuid; + + return (ioctl(ctx->fd, VM_ASSERT_IRQ, &vi)); +} + +int +vm_deassert_irq(struct vmctx *ctx, uint32_t irq, uint32_t vcpuid) +{ + struct vm_irq vi; + + bzero(&vi, sizeof(vi)); + vi.irq = irq; + vi.vcpuid = vcpuid; + + return (ioctl(ctx->fd, VM_DEASSERT_IRQ, &vi)); +} + +int +vm_activate_cpu(struct vmctx *ctx, int vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); + return (error); +} Index: sys/arm/arm/generic_timer.h =================================================================== --- /dev/null +++ sys/arm/arm/generic_timer.h @@ -0,0 +1,44 @@ +/*- + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _ARM_GENERIC_TIMER_H_ +#define _ARM_GENERIC_TIMER_H_ + +#define GT_PHYS_SECURE 0 +#define GT_PHYS_NONSECURE 1 +#define GT_VIRT 2 +#define GT_HYP 3 + +int arm_tmr_setup_intr(int gt_type, driver_filter_t filter, + driver_intr_t handler, void *arg); +int arm_tmr_teardown_intr(int gt_type); + +#endif Index: sys/arm/arm/generic_timer.c =================================================================== --- sys/arm/arm/generic_timer.c +++ sys/arm/arm/generic_timer.c @@ -63,6 +63,10 @@ #include /* For arm_set_delay */ #endif +#if defined(__aarch64__) +#include /* For virt_enabled() */ +#endif + #ifdef FDT #include #include @@ -74,6 +78,8 @@ #include #endif +#include "generic_timer.h" + #define GT_CTRL_ENABLE (1 << 0) #define GT_CTRL_INT_MASK (1 << 1) #define GT_CTRL_INT_STAT (1 << 2) @@ -123,6 +129,8 @@ .tc_fill_vdso_timehands = arm_tmr_fill_vdso_timehands, }; +static device_t arm_tmr_dev; + #ifdef __arm__ #define get_el0(x) cp15_## x ##_get() #define get_el1(x) cp15_## x ##_get() @@ -314,6 +322,39 @@ return (FILTER_HANDLED); } +int +arm_tmr_setup_intr(int gt_type, driver_filter_t filter, driver_intr_t handler, + void *arg) +{ + if (gt_type != GT_PHYS_SECURE && + gt_type != GT_PHYS_NONSECURE && + gt_type != GT_VIRT && + gt_type != GT_HYP) + return (ENXIO); + + if (arm_tmr_sc->res[gt_type] == NULL) + return (ENXIO); + + return (bus_setup_intr(arm_tmr_dev, arm_tmr_sc->res[gt_type], + INTR_TYPE_CLK, filter, handler, arg, &arm_tmr_sc->ihl[gt_type])); +} + +int +arm_tmr_teardown_intr(int gt_type) +{ + if (gt_type != GT_PHYS_SECURE && + gt_type != GT_PHYS_NONSECURE && + gt_type != GT_VIRT && + gt_type != GT_HYP) + return (ENXIO); + + if (arm_tmr_sc->res[gt_type] == NULL) + return (ENXIO); + + return (bus_teardown_intr(arm_tmr_dev, arm_tmr_sc->res[gt_type], + arm_tmr_sc->ihl[gt_type])); +} + #ifdef FDT static int arm_tmr_fdt_probe(device_t dev) @@ -447,13 +488,26 @@ last_timer = 1; } +#ifdef __aarch64__ + sc->physical |= virt_enabled(); +#endif + arm_tmr_sc = sc; /* Setup secure, non-secure and virtual IRQs handler */ - for (i = first_timer; i <= last_timer; i++) { + for (i = GT_PHYS_SECURE; i <= GT_VIRT; i++) { /* If we do not have the interrupt, skip it. */ if (sc->res[i] == NULL) continue; +#if defined(__aarch64__) + if (i == 2 && virt_enabled()) { + /* + * Do not install an interrupt handler for the virtual + * timer. This will be used by the VM. 
+ */ + continue; + } +#endif error = bus_setup_intr(dev, sc->res[i], INTR_TYPE_CLK, arm_tmr_intr, NULL, sc, &sc->ihl[i]); if (error) { @@ -461,7 +515,6 @@ return (ENXIO); } } - /* Disable the virtual timer until we are ready */ if (sc->res[2] != NULL) arm_tmr_disable(false); @@ -488,6 +541,8 @@ arm_set_delay(arm_tmr_do_delay, sc); #endif + arm_tmr_dev = dev; + return (0); } Index: sys/arm/arm/gic.h =================================================================== --- sys/arm/arm/gic.h +++ sys/arm/arm/gic.h @@ -47,11 +47,18 @@ struct arm_gic_softc { device_t gic_dev; + bool is_root; void * gic_intrhand; struct gic_irqsrc * gic_irqs; -#define GIC_RES_DIST 0 -#define GIC_RES_CPU 1 - struct resource * gic_res[3]; +#define GIC_RES_DIST 0 +#define GIC_RES_CPU 1 + struct resource * gic_res[6]; + bus_space_tag_t gic_c_bst; + bus_space_tag_t gic_d_bst; + bus_space_handle_t gic_c_bsh; + bus_space_handle_t gic_d_bsh; + bus_space_tag_t gic_h_bst; + bus_space_handle_t gic_h_bsh; uint8_t ver; struct mtx mutex; uint32_t nirqs; Index: sys/arm/arm/gic.c =================================================================== --- sys/arm/arm/gic.c +++ sys/arm/arm/gic.c @@ -128,10 +128,14 @@ static struct resource_spec arm_gic_spec[] = { { SYS_RES_MEMORY, 0, RF_ACTIVE }, /* Distributor registers */ { SYS_RES_MEMORY, 1, RF_ACTIVE }, /* CPU Interrupt Intf. registers */ - { SYS_RES_IRQ, 0, RF_ACTIVE | RF_OPTIONAL }, /* Parent interrupt */ + { SYS_RES_MEMORY, 2, RF_ACTIVE | RF_OPTIONAL }, /* Virtual Interface Control */ + { SYS_RES_MEMORY, 3, RF_ACTIVE | RF_OPTIONAL }, /* Virtual CPU interface */ + { SYS_RES_IRQ, 0, RF_ACTIVE | RF_OPTIONAL }, /* vGIC maintenance interrupt or parent interrupt */ { -1, 0 } }; +extern char hypmode_enabled[]; + #if defined(__arm__) && defined(INVARIANTS) static int gic_debug_spurious = 1; #else @@ -156,6 +160,22 @@ #define gic_d_write_4(_sc, _reg, _val) \ bus_write_4((_sc)->gic_res[GIC_RES_DIST], (_reg), (_val)) +#define gic_h_read_4(_sc, _reg) \ + bus_space_read_4((_sc)->gic_h_bst, (_sc)->gic_h_bsh, (_reg)) +#define gic_h_write_4(_sc, _reg, _val) \ + bus_space_write_4((_sc)->gic_h_bst, (_sc)->gic_h_bsh, (_reg), (_val)) + +struct arm_gic_softc * +arm_gic_get_sc(void) +{ + return gic_sc; +} +uint32_t +arm_gic_get_lr_num(void) +{ + return (gic_h_read_4(gic_sc, GICH_VTR) & 0x3f) + 1; +} + static inline void gic_irq_unmask(struct arm_gic_softc *sc, u_int irq) { @@ -323,6 +343,27 @@ /* Initialize mutex */ mtx_init(&sc->mutex, "GIC lock", NULL, MTX_SPIN); + /* Distributor Interface */ + sc->gic_d_bst = rman_get_bustag(sc->gic_res[DISTRIBUTOR_RES_IDX]); + sc->gic_d_bsh = rman_get_bushandle(sc->gic_res[DISTRIBUTOR_RES_IDX]); + + /* CPU Interface */ + sc->gic_c_bst = rman_get_bustag(sc->gic_res[CPU_INTERFACE_RES_IDX]); + sc->gic_c_bsh = rman_get_bushandle(sc->gic_res[CPU_INTERFACE_RES_IDX]); + + /* Virtual Interface Control */ + if (sc->is_root) { + if (sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX] == NULL) { + device_printf(dev, "Cannot find Virtual Interface Control Registers. 
Disabling Hyp-Mode...\n"); + hypmode_enabled[0] = -1; + } else { + sc->gic_h_bst = rman_get_bustag(sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]); + sc->gic_h_bsh = rman_get_bushandle(sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]); + } + } else { + hypmode_enabled[0] = -1; + } + /* Disable interrupt forwarding to the CPU interface */ gic_d_write_4(sc, GICD_CTLR, 0x00); @@ -501,6 +542,33 @@ ("arm_gic_read_ivar: Invalid bus type %u", sc->gic_bus)); *result = sc->gic_bus; return (0); + case GIC_IVAR_VIRTUAL_INT_CTRL_RES: + *result = (uintptr_t)sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]; + return (0); + case GIC_IVAR_VIRTUAL_INT_CTRL_VADDR: + *result = (uintptr_t)rman_get_virtual(sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]); + return (0); + case GIC_IVAR_VIRTUAL_INT_CTRL_PADDR: + *result = (uintptr_t)rman_get_start(sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]); + return (0); + case GIC_IVAR_VIRTUAL_INT_CTRL_SIZE: + *result = rman_get_size(sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]); + return (0); + case GIC_IVAR_VIRTUAL_CPU_INT_PADDR: + *result = rman_get_start(sc->gic_res[VIRT_CPU_INTERFACE_RES_IDX]); + return (0); + case GIC_IVAR_VIRTUAL_CPU_INT_SIZE: + *result = rman_get_size(sc->gic_res[VIRT_CPU_INTERFACE_RES_IDX]); + return (0); + case GIC_IVAR_LR_NUM: + *result = (gic_h_read_4(gic_sc, GICH_VTR) & 0x3f) + 1; + return (0); + case GIC_IVAR_MAINTENANCE_INTR_RES: + if (sc->is_root) + *result = (uintptr_t)sc->gic_res[MAINTENANCE_INTR_RES_IDX]; + else + result = NULL; + return (0); } return (ENOENT); @@ -973,7 +1041,7 @@ if (CPU_ISSET(i, &cpus)) val |= arm_gic_map[i] << GICD_SGI_TARGET_SHIFT; - gic_d_write_4(sc, GICD_SGIR, val | gi->gi_irq); + gic_d_write_4(sc, GICD_SGIR(0), val | gi->gi_irq); } static int Index: sys/arm/arm/gic_common.h =================================================================== --- sys/arm/arm/gic_common.h +++ sys/arm/arm/gic_common.h @@ -31,8 +31,25 @@ #ifndef _GIC_COMMON_H_ #define _GIC_COMMON_H_ -#define GIC_IVAR_HW_REV 500 -#define GIC_IVAR_BUS 501 +#ifndef __ASSEMBLER__ + +#define DISTRIBUTOR_RES_IDX 0 +#define CPU_INTERFACE_RES_IDX 1 +#define VIRT_INTERFACE_CONTROL_RES_IDX 2 +#define VIRT_CPU_INTERFACE_RES_IDX 3 +#define MAINTENANCE_INTR_RES_IDX 4 +#define INTRNG_RES_IDX 5 + +#define GIC_IVAR_HW_REV 500 +#define GIC_IVAR_BUS 501 +#define GIC_IVAR_VIRTUAL_INT_CTRL_RES 502 +#define GIC_IVAR_VIRTUAL_INT_CTRL_VADDR 503 +#define GIC_IVAR_VIRTUAL_INT_CTRL_PADDR 505 +#define GIC_IVAR_VIRTUAL_INT_CTRL_SIZE 504 +#define GIC_IVAR_VIRTUAL_CPU_INT_PADDR 506 +#define GIC_IVAR_VIRTUAL_CPU_INT_SIZE 507 +#define GIC_IVAR_LR_NUM 508 +#define GIC_IVAR_MAINTENANCE_INTR_RES 509 /* GIC_IVAR_BUS values */ #define GIC_BUS_UNKNOWN 0 @@ -42,6 +59,19 @@ __BUS_ACCESSOR(gic, hw_rev, GIC, HW_REV, u_int); __BUS_ACCESSOR(gic, bus, GIC, BUS, u_int); +__BUS_ACCESSOR(gic, virtual_int_ctrl_res, GIC, VIRTUAL_INT_CTRL_RES, struct resource *); +__BUS_ACCESSOR(gic, virtual_int_ctrl_vaddr, GIC, VIRTUAL_INT_CTRL_VADDR, uint64_t); +__BUS_ACCESSOR(gic, virtual_int_ctrl_paddr, GIC, VIRTUAL_INT_CTRL_PADDR, uint64_t); +__BUS_ACCESSOR(gic, virtual_int_ctrl_size, GIC, VIRTUAL_INT_CTRL_SIZE, uint32_t); +__BUS_ACCESSOR(gic, virtual_cpu_int_paddr, GIC, VIRTUAL_CPU_INT_PADDR, uint32_t); +__BUS_ACCESSOR(gic, virtual_cpu_int_size, GIC, VIRTUAL_CPU_INT_SIZE, uint32_t); +__BUS_ACCESSOR(gic, lr_num, GIC, LR_NUM, uint32_t); +__BUS_ACCESSOR(gic, maintenance_intr_res, GIC, MAINTENANCE_INTR_RES, struct resource *); + +struct arm_gic_softc *arm_gic_get_sc(void); +uint32_t arm_gic_get_lr_num(void); + +#endif /*__ASSEMBLER__ */ 
/* Software Generated Interrupts */ #define GIC_FIRST_SGI 0 /* Irqs 0-15 are SGIs/IPIs. */ @@ -55,8 +85,9 @@ /* Common register values */ #define GICD_CTLR 0x0000 /* v1 ICDDCR */ #define GICD_TYPER 0x0004 /* v1 ICDICTR */ -#define GICD_TYPER_ITLINESNUM_MASK 0x1f -#define GICD_TYPER_I_NUM(n) ((((n) & 0x1F) + 1) * 32) +#define GICD_TYPER_ITLINESNUM_MASK (0x1f) +#define GICD_TYPER_I_NUM(n) \ + ((((n) & GICD_TYPER_ITLINESNUM_MASK) + 1) * 32) #define GICD_IIDR 0x0008 /* v1 ICDIIDR */ #define GICD_IIDR_PROD_SHIFT 24 #define GICD_IIDR_PROD_MASK 0xff000000 @@ -74,20 +105,33 @@ #define GICD_IIDR_IMPL_MASK 0x00000fff #define GICD_IIDR_IMPL(x) \ (((x) & GICD_IIDR_IMPL_MASK) >> GICD_IIDR_IMPL_SHIFT) -#define GICD_IGROUPR(n) (0x0080 + (((n) >> 5) * 4)) /* v1 ICDISER */ +#define GICD_TYPER2 0x000c +#define GICD_IGROUPR_BASE (0x0080) +#define GICD_IGROUPR(n) \ + (GICD_IGROUPR_BASE + (((n) >> 5) * 4)) /* v1 ICDISER */ #define GICD_I_PER_IGROUPRn 32 -#define GICD_ISENABLER(n) (0x0100 + (((n) >> 5) * 4)) /* v1 ICDISER */ +#define GICD_ISENABLER_BASE (0x0100) +#define GICD_ISENABLER(n) \ + (GICD_ISENABLER_BASE + (((n) >> 5) * 4)) /* v1 ICDISER */ #define GICD_I_MASK(n) (1ul << ((n) & 0x1f)) #define GICD_I_PER_ISENABLERn 32 -#define GICD_ICENABLER(n) (0x0180 + (((n) >> 5) * 4)) /* v1 ICDICER */ +#define GICD_ICENABLER_BASE (0x0180) +#define GICD_ICENABLER(n) \ + (GICD_ICENABLER_BASE + (((n) >> 5) * 4)) /* v1 ICDICER */ #define GICD_ISPENDR(n) (0x0200 + (((n) >> 5) * 4)) /* v1 ICDISPR */ #define GICD_ICPENDR(n) (0x0280 + (((n) >> 5) * 4)) /* v1 ICDICPR */ -#define GICD_ISACTIVER(n) (0x0300 + (((n) >> 5) * 4)) /* v1 ICDABR */ -#define GICD_ICACTIVER(n) (0x0380 + (((n) >> 5) * 4)) -#define GICD_IPRIORITYR(n) (0x0400 + (((n) >> 2) * 4)) /* v1 ICDIPR */ +#define GICD_ISACTIVER_BASE (0x0300) +#define GICD_ISACTIVER(n) (GICD_ISACTIVER_BASE + (((n) >> 5) * 4)) /* v1 ICDABR */ +#define GICD_ICACTIVER_BASE (0x0380) +#define GICD_ICACTIVER(n) (GICD_ICACTIVER_BASE + (((n) >> 5) * 4)) /* v1 ICDABR */ +#define GICD_IPRIORITYR_BASE (0x0400) +#define GICD_IPRIORITYR(n) \ + (GICD_IPRIORITYR_BASE + (((n) >> 2) * 4)) /* v1 ICDIPR */ #define GICD_I_PER_IPRIORITYn 4 #define GICD_ITARGETSR(n) (0x0800 + (((n) >> 2) * 4)) /* v1 ICDIPTR */ -#define GICD_ICFGR(n) (0x0C00 + (((n) >> 4) * 4)) /* v1 ICDICFR */ +#define GICD_ICFGR_BASE (0x0C00) +#define GICD_ICFGR(n) \ + (GICD_ICFGR_BASE + (((n) >> 4) * 4)) /* v1 ICDICFR */ #define GICD_I_PER_ICFGRn 16 /* First bit is a polarity bit (0 - low, 1 - high) */ #define GICD_ICFGR_POL_LOW (0 << 0) @@ -97,7 +141,34 @@ #define GICD_ICFGR_TRIG_LVL (0 << 1) #define GICD_ICFGR_TRIG_EDGE (1 << 1) #define GICD_ICFGR_TRIG_MASK 0x2 -#define GICD_SGIR 0x0F00 /* v1 ICDSGIR */ +#define GICD_SGIR(n) (0x0F00 + ((n) * 4)) /* v1 ICDSGIR */ #define GICD_SGI_TARGET_SHIFT 16 +/* GIC Hypervisor specific registers */ +#define GICH_HCR 0x0 +#define GICH_VTR 0x4 +#define GICH_VMCR 0x8 +#define GICH_VMCR_VMGRP1EN (1 << 1) +#define GICH_MISR 0x10 +#define GICH_EISR0 0x20 +#define GICH_EISR1 0x24 +#define GICH_ELSR0 0x30 +#define GICH_ELSR1 0x34 +#define GICH_APR 0xF0 +#define GICH_LR0 0x100 + +#define GICH_HCR_EN (1 << 0) +#define GICH_HCR_UIE (1 << 1) + +#define GICH_LR_VIRTID (0x3FF << 0) +#define GICH_LR_PHYSID_CPUID_SHIFT 10 +#define GICH_LR_PHYSID_CPUID (7 << GICH_LR_PHYSID_CPUID_SHIFT) +#define GICH_LR_STATE (3 << 28) +#define GICH_LR_PENDING (1 << 28) +#define GICH_LR_ACTIVE (1 << 29) +#define GICH_LR_EOI (1 << 19) + +#define GICH_MISR_EOI (1 << 0) +#define GICH_MISR_U (1 << 1) + #endif /* _GIC_COMMON_H_ */ 
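The GICH_LR_* field definitions above describe how a GICv2 hypervisor encodes one virtual interrupt per list register. As a hedged illustration (the helper below is hypothetical; only the macro names come from this patch's gic_common.h), a pending virtual interrupt could be composed like this:

static inline uint32_t
gich_lr_pending(uint32_t virtid, uint32_t src_cpuid)
{
	uint32_t lr;

	lr = virtid & GICH_LR_VIRTID;			/* virtual INTID */
	lr |= src_cpuid << GICH_LR_PHYSID_CPUID_SHIFT;	/* SGI source CPU */
	lr |= GICH_LR_PENDING;				/* state: pending */
	return (lr);
}

The value would then be written through the new gic_h_write_4() accessor in gic.c at an offset derived from GICH_LR0, with the virtual interface enabled by setting GICH_HCR_EN in GICH_HCR.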
Index: sys/arm/arm/gic_fdt.c =================================================================== --- sys/arm/arm/gic_fdt.c +++ sys/arm/arm/gic_fdt.c @@ -129,18 +129,25 @@ gic_fdt_attach(device_t dev) { struct arm_gic_fdt_softc *sc = device_get_softc(dev); - phandle_t pxref; - intptr_t xref; + phandle_t pxref = ofw_bus_find_iparent(ofw_bus_get_node(dev)); + intptr_t xref = OF_xref_from_node(ofw_bus_get_node(dev)); int err; + sc->base.is_root = false; + /* + * Controller is root if: + * - doesn't have interrupt parent + * - his interrupt parent is this controller + */ + if (pxref == 0 || xref == pxref) + sc->base.is_root = true; + sc->base.gic_bus = GIC_BUS_FDT; err = arm_gic_attach(dev); if (err != 0) return (err); - xref = OF_xref_from_node(ofw_bus_get_node(dev)); - /* * Now, when everything is initialized, it's right time to * register interrupt controller to interrupt framefork. @@ -150,13 +157,7 @@ goto cleanup; } - /* - * Controller is root if: - * - doesn't have interrupt parent - * - his interrupt parent is this controller - */ - pxref = ofw_bus_find_iparent(ofw_bus_get_node(dev)); - if (pxref == 0 || xref == pxref) { + if (sc->base.is_root) { if (intr_pic_claim_root(dev, xref, arm_gic_intr, sc, GIC_LAST_SGI - GIC_FIRST_SGI + 1) != 0) { device_printf(dev, "could not set PIC as a root\n"); Index: sys/arm64/arm64/gic_v3.c =================================================================== --- sys/arm64/arm64/gic_v3.c +++ sys/arm64/arm64/gic_v3.c @@ -106,6 +106,11 @@ static u_int sgi_first_unused = GIC_FIRST_SGI; #endif +static struct resource *maint_res; +static device_t gic_dev; +static int maint_rid; +static void *maint_cookie; + static device_method_t gic_v3_methods[] = { /* Device interface */ DEVMETHOD(device_detach, gic_v3_detach), @@ -401,12 +406,49 @@ return (0); } +void +gic_v3_alloc_maint_res(device_t dev) +{ + gic_dev = dev; + maint_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &maint_rid, + RF_ACTIVE); + if (!maint_res) + device_printf(dev, + "Could not allocate resource for maintenance interrupt\n"); +} + +int +gic_v3_setup_maint_intr(driver_filter_t filter, driver_intr_t handler, + void *arg) +{ + int flags; + + if (!maint_res) + return (EINVAL); + + flags = INTR_TYPE_MISC | INTR_MPSAFE; + return (bus_setup_intr(gic_dev, maint_res, flags, filter, handler, + arg, &maint_cookie)); +} + +int +gic_v3_teardown_maint_intr(void) +{ + if (!maint_res) + return (EINVAL); + + return (bus_teardown_intr(gic_dev, maint_res, maint_cookie)); +} + static int gic_v3_get_domain(device_t dev, device_t child, int *domain) { struct gic_v3_devinfo *di; di = device_get_ivars(child); + if (di == NULL) + return (0); + if (di->gic_domain < 0) return (ENOENT); @@ -1016,22 +1058,25 @@ struct resource *res; u_int cpuid; size_t us_left = 1000000; + uint32_t rwp; cpuid = PCPU_GET(cpuid); switch (xdist) { case DIST: res = sc->gic_dist; + rwp = GICD_CTLR_RWP; break; case REDIST: res = &sc->gic_redists.pcpu[cpuid]->res; + rwp = GICR_CTLR_RWP; break; default: KASSERT(0, ("%s: Attempt to wait for unknown RWP", __func__)); return; } - while ((bus_read_4(res, GICD_CTLR) & GICD_CTLR_RWP) != 0) { + while ((bus_read_4(res, GICD_CTLR) & rwp) != 0) { DELAY(1); if (us_left-- == 0) panic("GICD Register write pending for too long"); Index: sys/arm64/arm64/gic_v3_acpi.c =================================================================== --- sys/arm64/arm64/gic_v3_acpi.c +++ sys/arm64/arm64/gic_v3_acpi.c @@ -337,6 +337,8 @@ if (device_get_children(dev, &sc->gic_children, &sc->gic_nchildren) !=0) sc->gic_nchildren = 0; 
+ gic_v3_alloc_maint_res(dev); + return (0); error: Index: sys/arm64/arm64/gic_v3_fdt.c =================================================================== --- sys/arm64/arm64/gic_v3_fdt.c +++ sys/arm64/arm64/gic_v3_fdt.c @@ -190,6 +190,8 @@ if (device_get_children(dev, &sc->gic_children, &sc->gic_nchildren) != 0) sc->gic_nchildren = 0; + gic_v3_alloc_maint_res(dev); + return (err); error: @@ -213,12 +215,19 @@ static int gic_v3_fdt_print_child(device_t bus, device_t child) { - struct gic_v3_ofw_devinfo *di = device_get_ivars(child); - struct resource_list *rl = &di->di_rl; + struct gic_v3_ofw_devinfo *di; + struct resource_list *rl; int retval = 0; retval += bus_print_child_header(bus, child); + + di = device_get_ivars(child); + if (di == NULL) + goto footer; + rl = &di->di_rl; + retval += resource_list_print_type(rl, "mem", SYS_RES_MEMORY, "%#jx"); +footer: retval += bus_print_child_footer(bus, child); return (retval); @@ -299,6 +308,7 @@ size_cells = 2; OF_getencprop(parent, "#size-cells", &size_cells, sizeof(size_cells)); + /* Iterate through all GIC subordinates */ for (node = OF_child(parent); node > 0; node = OF_peer(node)) { /* Allocate and populate devinfo. */ Index: sys/arm64/arm64/gic_v3_reg.h =================================================================== --- sys/arm64/arm64/gic_v3_reg.h +++ sys/arm64/arm64/gic_v3_reg.h @@ -55,8 +55,9 @@ #define GICD_CTLR_G1 (1 << 0) #define GICD_CTLR_G1A (1 << 1) #define GICD_CTLR_ARE_NS (1 << 4) +#define GICD_CTLR_DS (1 << 6) +#define GICD_CTLR_E1NWF (1 << 7) #define GICD_CTLR_RWP (1 << 31) - /* GICD_TYPER */ #define GICD_TYPER_SECURITYEXTN (1 << 10) #define GICD_TYPER_MBIS (1 << 16) @@ -64,6 +65,10 @@ #define GICD_TYPER_DVIS (1 << 18) #define GICD_TYPER_IDBITS_SHIFT 19 #define GICD_TYPER_IDBITS(n) ((((n) >> 19) & 0x1F) + 1) +#define GICD_TYPER_SECURITYEXTN \ + (1 << 10) +#define GICD_TYPER_DVIS (1 << 18) +#define GICD_TYPER_LPIS (1 << 17) /* * Registers (v3) @@ -103,8 +108,8 @@ #define GICD_PIDR3 0xFFEC -/* - * Redistributor registers +/* + * Redistributor registers */ /* RD_base registers */ @@ -130,6 +135,10 @@ #define GICR_TYPER_AFF_MASK (0xfffffffful << GICR_TYPER_AFF_SHIFT) #define GICR_TYPER_AFF(x) \ (((x) & GICR_TYPER_AFF_MASK) >> GICR_TYPER_AFF_SHIFT) +#define GICR_TYPER_AFF0(x) ((x >> GICR_TYPER_AFF_SHIFT) & 0xff) +#define GICR_TYPER_AFF1(x) ((x >> (GICR_TYPER_AFF_SHIFT + 8)) & 0xff) +#define GICR_TYPER_AFF2(x) ((x >> (GICR_TYPER_AFF_SHIFT + 16)) & 0xff) +#define GICR_TYPER_AFF3(x) ((x >> (GICR_TYPER_AFF_SHIFT + 24)) & 0xff) #define GICR_STATUSR 0x0010 @@ -240,6 +249,7 @@ #define GICR_I_ENABLER_SGI_MASK (0x0000FFFF) #define GICR_I_ENABLER_PPI_MASK (0xFFFF0000) +#define GICR_IPRIORITYR_BASE (0x0400) #define GICR_I_PER_IPRIORITYn (GICD_I_PER_IPRIORITYn) #define GICR_ISPENDR0 0x0200 Index: sys/arm64/arm64/gic_v3_var.h =================================================================== --- sys/arm64/arm64/gic_v3_var.h +++ sys/arm64/arm64/gic_v3_var.h @@ -113,6 +113,10 @@ void gic_r_write_4(device_t, bus_size_t, uint32_t var); void gic_r_write_8(device_t, bus_size_t, uint64_t var); +void gic_v3_alloc_maint_res(device_t); +int gic_v3_setup_maint_intr(driver_filter_t, driver_intr_t, void *); +int gic_v3_teardown_maint_intr(void); + /* * GIC Distributor accessors. * Notice that only GIC sofc can be passed. 
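The three gic_v3_*maint_intr() helpers declared above let a stage-2 interrupt-controller emulation hook the GICv3 maintenance interrupt without knowing which resource the GIC driver allocated. A minimal usage sketch follows; vgic_v3_maint_intr() and its callers are assumptions for illustration only, not code from this patch.

static int
vgic_v3_maint_intr(void *arg)
{
	/* Fold completed list registers back into the emulated state here. */
	return (FILTER_HANDLED);
}

static int
vgic_v3_register_maint(void *arg)
{
	/* Fails with EINVAL when no maintenance IRQ resource was found. */
	return (gic_v3_setup_maint_intr(vgic_v3_maint_intr, NULL, arg));
}

static void
vgic_v3_unregister_maint(void)
{
	(void)gic_v3_teardown_maint_intr();
}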
Index: sys/arm64/arm64/hyp_stub.S =================================================================== --- /dev/null +++ sys/arm64/arm64/hyp_stub.S @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +__FBSDID("$FreeBSD$"); + + .text + +/* + * Install a new exception vector table with the base address supplied by the + * parameter in register x0. + */ +ENTRY(handle_stub_el1h_sync) + msr vbar_el2, x0 + eret +END(handle_hyp_stub) + +.macro vempty + .align 7 + 1: b 1b +.endm + +.macro vector name + .align 7 + b handle_\name +.endm + + .align 11 + .globl hyp_stub_vectors +hyp_stub_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* SError EL2t */ + + vempty /* Synchronous EL2h */ + vempty /* IRQ EL2h */ + vempty /* FIQ EL2h */ + vempty /* SError EL2h */ + + vector stub_el1h_sync /* Synchronous 64-bit EL1 */ + vempty /* IRQ 64-bit EL1 */ + vempty /* FIQ 64-bit EL1 */ + vempty /* SError 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* SError 32-bit EL1 */ Index: sys/arm64/arm64/locore.S =================================================================== --- sys/arm64/arm64/locore.S +++ sys/arm64/arm64/locore.S @@ -213,6 +213,11 @@ END(mpentry) #endif + .align 3 + .globl _C_LABEL(hypmode_enabled) +_C_LABEL(hypmode_enabled): + .zero 8 + /* * If we are started in EL2, configure the required hypervisor * registers and drop to EL1. @@ -224,8 +229,22 @@ b.eq 1f ret 1: + /* + * If the MMU is active, then it is using a page table where VA == PA. + * But the page table won't have entries for the hypervisor EL2 + * initialization code which is loaded into memory with the vmm module. + * + * So we disable the MMU in EL2 to make the vmm hypervisor code run + * successfully. 
+ */ + dsb sy + mrs x2, sctlr_el2 + bic x2, x2, SCTLR_M + msr sctlr_el2, x2 + isb + /* Configure the Hypervisor */ - mov x2, #(HCR_RW) + mov x2, #(HCR_RW & ~HCR_HCD) msr hcr_el2, x2 /* Load the Virtualization Process ID Register */ @@ -256,10 +275,18 @@ msr cntvoff_el2, xzr /* Hypervisor trap functions */ - adrp x2, hyp_vectors - add x2, x2, :lo12:hyp_vectors + adrp x2, hyp_stub_vectors msr vbar_el2, x2 + /* Use the host VTTBR_EL2 to tell the host and the guests apart */ + mov x2, #VTTBR_HOST + msr vttbr_el2, x2 + + /* Mark hypervisor mode as enabled */ + mov x1, #1 + adr x2, hypmode_enabled + str x1, [x2] + mov x2, #(PSR_F | PSR_I | PSR_A | PSR_D | PSR_M_EL1h) msr spsr_el2, x2 @@ -288,6 +315,10 @@ .quad SCTLR_RES1 LEND(drop_to_el1) +hcr: + /* Make sure the HVC instruction is not disabled */ + .quad (HCR_RW & ~HCR_HCD) + #define VECT_EMPTY \ .align 7; \ 1: b 1b @@ -733,6 +764,8 @@ ENTRY(abort) b abort + + .align 12 /* 4KiB aligned */ END(abort) .section .init_pagetable, "aw", %nobits Index: sys/arm64/arm64/pmap.c =================================================================== --- sys/arm64/arm64/pmap.c +++ sys/arm64/arm64/pmap.c @@ -407,6 +407,8 @@ static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); +static uint64_t pa_range_bits = 0; + /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at @@ -431,9 +433,19 @@ memcpy(d, s, PAGE_SIZE); } +#define pmap_l0_index(va) (((va) >> L0_SHIFT) & L0_ADDR_MASK) +#define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) +#define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) +#define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) + +#define STAGE2_L1_ADDR_MASK ((1UL << (pa_range_bits - L1_SHIFT)) - 1) +#define pmap_stage2_l1_index(va) (((va) >> L1_SHIFT) & STAGE2_L1_ADDR_MASK) + static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { + KASSERT(pmap->pm_stage != PM_STAGE2, + ("Level 0 table is invalid for PM_STAGE2 pmap")); return (&pmap->pm_l0[pmap_l0_index(va)]); } @@ -450,6 +462,9 @@ static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { + if (pmap->pm_stage == PM_STAGE2) + return (&pmap->pm_l0[pmap_stage2_l1_index(va)]); + pd_entry_t *l0; l0 = pmap_l0(pmap, va); @@ -459,6 +474,32 @@ return (pmap_l0_to_l1(l0, va)); } +static __inline vm_page_t +pmap_l1pg(pmap_t pmap, vm_offset_t va) +{ + if (pmap->pm_stage == PM_STAGE1) { + pd_entry_t *l0, tl0; + + l0 = pmap_l0(pmap, va); + tl0 = pmap_load(l0); + + return (PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK)); + } else { + vm_paddr_t pa, pa_offset; + + /* + * The offset will be the bits + * [pa_range_bits-1:L0_SHIFT] + */ + va = va & ((1 << pa_range_bits) - 1); + pa_offset = va >> L0_SHIFT; + pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0) + \ + (pa_offset << PAGE_SHIFT); + + return (PHYS_TO_VM_PAGE(pa)); + } +} + static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va) { @@ -523,18 +564,28 @@ { pd_entry_t *l0, *l1, *l2, desc; - l0 = pmap_l0(pmap, va); - desc = pmap_load(l0) & ATTR_DESCR_MASK; - if (desc != L0_TABLE) { - *level = -1; - return (NULL); - } + if (pmap->pm_stage == PM_STAGE1) { + l0 = pmap_l0(pmap, va); + desc = pmap_load(l0) & ATTR_DESCR_MASK; + if (desc != L0_TABLE) { + *level = -1; + return (NULL); + } - l1 = pmap_l0_to_l1(l0, va); - desc = pmap_load(l1) & ATTR_DESCR_MASK; - if (desc != L1_TABLE) { - *level = 0; - return (l0); + l1 = pmap_l0_to_l1(l0, va); + desc = 
pmap_load(l1) & ATTR_DESCR_MASK; + if (desc != L1_TABLE) { + *level = 0; + return (l0); + } + } else { + l1 = pmap_l1(pmap, va); + desc = pmap_load(l1) & ATTR_DESCR_MASK; + if (desc != L1_TABLE) { + /* For PM_STAGE2 mappings the first level is level 1 */ + *level = -1; + return (NULL); + } } l2 = pmap_l1_to_l2(l1, va); @@ -611,13 +662,18 @@ if (pmap->pm_l0 == NULL) return (false); - l0p = pmap_l0(pmap, va); - *l0 = l0p; + if (pmap->pm_stage == PM_STAGE1) { + l0p = pmap_l0(pmap, va); + *l0 = l0p; - if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) - return (false); + if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) + return (false); - l1p = pmap_l0_to_l1(l0p, va); + l1p = pmap_l0_to_l1(l0p, va); + } else { + *l0 = NULL; + l1p = pmap_l1(pmap, va); + } *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { @@ -930,6 +986,7 @@ pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { + uint64_t id_aa64mmfr0_el1; vm_offset_t freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t start_pa, pa, min_pa; @@ -1018,6 +1075,35 @@ physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); + id_aa64mmfr0_el1 = READ_SPECIALREG(id_aa64mmfr0_el1); + switch (ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1)) { + case ID_AA64MMFR0_PARange_4G: + pa_range_bits = 32; + break; + case ID_AA64MMFR0_PARange_64G: + pa_range_bits = 36; + break; + case ID_AA64MMFR0_PARange_1T: + pa_range_bits = 40; + break; + case ID_AA64MMFR0_PARange_4T: + pa_range_bits = 42; + break; + case ID_AA64MMFR0_PARange_16T: + pa_range_bits = 44; + break; + case ID_AA64MMFR0_PARange_256T: + pa_range_bits = 48; + break; + default: + /* + * Unknown PA range bits, will lead to a panic if a stage 2 + * pmap starting at level 1 is created. + */ + pa_range_bits = 0; + break; + } + cpu_tlb_flushID(); } @@ -1181,8 +1267,6 @@ { uint64_t r; - PMAP_ASSERT_STAGE1(pmap); - dsb(ishst); if (pmap == kernel_pmap) { r = atop(va); @@ -1200,8 +1284,6 @@ { uint64_t end, r, start; - PMAP_ASSERT_STAGE1(pmap); - dsb(ishst); if (pmap == kernel_pmap) { start = atop(sva); @@ -1650,10 +1732,12 @@ */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ - pd_entry_t *l0; + if (pmap->pm_stage == PM_STAGE1) { + pd_entry_t *l0; - l0 = pmap_l0(pmap, va); - pmap_clear(l0); + l0 = pmap_l0(pmap, va); + pmap_clear(l0); + } } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; @@ -1679,12 +1763,16 @@ pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ - pd_entry_t *l0, tl0; vm_page_t l1pg; + pd_entry_t *l0, tl0; - l0 = pmap_l0(pmap, va); - tl0 = pmap_load(l0); - l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); + if (pmap->pm_stage == PM_STAGE1) { + l0 = pmap_l0(pmap, va); + tl0 = pmap_load(l0); + l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); + } else { + l1pg = pmap_l1pg(pmap, va); + } pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); @@ -1761,12 +1849,48 @@ { vm_page_t m; + KASSERT((stage == PM_STAGE1 || stage == PM_STAGE2), + ("Invalid pmap stage %d", stage)); + KASSERT(!((stage == PM_STAGE2) && (pa_range_bits == 0)), + ("Unknown PARange bits")); + /* * allocate the l0 page */ - while ((m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) - vm_wait(NULL); + if (stage == PM_STAGE1) { + while ((m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) + vm_wait(NULL); + } else { + uint64_t npages; + uint64_t alignment; + + if 
(pa_range_bits <= L0_SHIFT) { + /* + * The level 1 translation table is not larger than a + * PM_STAGE1 level 1 table, use only one page. + */ + npages = 1; + alignment = PAGE_SIZE; + } else { + /* + * The level 1 translation table is larger than a + * regular PM_STAGE1 level 1 table, for every x bits + * that is larger we need 2^x pages and the table must + * be aligned at a 2^(x + 12) boundary. + * + * See Table D5-25 and Example D4-5 from the DDI0487B + * ARMv8 Architecture Manual for more information. + */ + npages = 1 << (pa_range_bits - L0_SHIFT); + alignment = 1 << (PAGE_SHIFT + pa_range_bits - L0_SHIFT); + } + while ((m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO, + npages, DMAP_MIN_PHYSADDR, DMAP_MAX_PHYSADDR, + alignment, 0, VM_MEMATTR_DEFAULT)) == NULL) + vm_wait(NULL); + } pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); @@ -1775,6 +1899,7 @@ pagezero(pmap->pm_l0); pmap->pm_root.rt_root = 0; + pmap->pm_stage = stage; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); @@ -1887,27 +2012,34 @@ pd_entry_t tl0; l1index = ptepindex - NUL2E; - l0index = l1index >> L0_ENTRIES_SHIFT; - - l0 = &pmap->pm_l0[l0index]; - tl0 = pmap_load(l0); - if (tl0 == 0) { - /* recurse for allocating page dir */ - if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, - lockp) == NULL) { - vm_page_unwire_noq(m); - vm_page_free_zero(m); - return (NULL); + if (pmap->pm_stage == PM_STAGE1) { + l0index = l1index >> L0_ENTRIES_SHIFT; + l0 = &pmap->pm_l0[l0index]; + tl0 = pmap_load(l0); + if (tl0 == 0) { + /* recurse for allocating page dir */ + if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + } else { + l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); + l1pg->ref_count++; } + + l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); + l1 = &l1[ptepindex & Ln_ADDR_MASK]; + KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0, + ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); } else { - l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); + l1pg = pmap_l1pg(pmap, l1index); l1pg->ref_count++; + l1 = &pmap->pm_l0[l1index & STAGE2_L1_ADDR_MASK]; } - l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); - l1 = &l1[ptepindex & Ln_ADDR_MASK]; - KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0, - ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); + pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); } else { vm_pindex_t l0index, l1index; @@ -1915,24 +2047,40 @@ pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; - l0index = l1index >> L0_ENTRIES_SHIFT; - - l0 = &pmap->pm_l0[l0index]; - tl0 = pmap_load(l0); - if (tl0 == 0) { - /* recurse for allocating page dir */ - if (_pmap_alloc_l3(pmap, NUL2E + l1index, - lockp) == NULL) { - vm_page_unwire_noq(m); - vm_page_free_zero(m); - return (NULL); - } + if (pmap->pm_stage == PM_STAGE1) { + l0index = l1index >> L0_ENTRIES_SHIFT; + l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); - l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); - l1 = &l1[l1index & Ln_ADDR_MASK]; + if (tl0 == 0) { + /* recurse for allocating page dir */ + if (_pmap_alloc_l3(pmap, NUL2E + l1index, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + tl0 = pmap_load(l0); + l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); + l1 = &l1[l1index & Ln_ADDR_MASK]; + } else { + l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); + l1 = 
&l1[l1index & Ln_ADDR_MASK]; + tl1 = pmap_load(l1); + if (tl1 == 0) { + /* recurse for allocating page dir */ + if (_pmap_alloc_l3(pmap, NUL2E + l1index, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + } else { + l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); + l2pg->ref_count++; + } + } } else { - l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); - l1 = &l1[l1index & Ln_ADDR_MASK]; + l1 = &pmap->pm_l0[l1index & STAGE2_L1_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ @@ -2127,9 +2275,27 @@ mtx_unlock_spin(&set->asid_set_mutex); } - m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); - vm_page_unwire_noq(m); - vm_page_free_zero(m); + if (pmap->pm_stage == PM_STAGE1) { + m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } else { + uint64_t i, page_cnt; + vm_paddr_t pa; + + if (pa_range_bits < L0_SHIFT) + page_cnt = 1; + else + page_cnt = 1 << (pa_range_bits - L0_SHIFT); + + pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0); + for (i = 0; i < page_cnt; i++) { + m = PHYS_TO_VM_PAGE(pa); + vm_page_unwire_noq(m); + vm_page_free_zero(m); + pa += PAGE_SIZE; + } + } } static int @@ -2496,7 +2662,7 @@ vm_page_t m; mtx_lock(&pv_chunks_mutex); - TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); @@ -2984,7 +3150,7 @@ * released. Otherwise, a concurrent * pmap_remove_all() on a physical page * could return while a stale TLB entry - * still provides access to that page. + * still provides access to that page. */ if (va != eva) { pmap_invalidate_range(pmap, va, @@ -3045,18 +3211,23 @@ if (pmap->pm_stats.resident_count == 0) break; - l0 = pmap_l0(pmap, sva); - if (pmap_load(l0) == 0) { - va_next = (sva + L0_SIZE) & ~L0_OFFSET; - if (va_next < sva) - va_next = eva; - continue; + if (pmap->pm_stage == PM_STAGE1) { + l0 = pmap_l0(pmap, sva); + if (pmap_load(l0) == 0) { + va_next = (sva + L0_SIZE) & ~L0_OFFSET; + if (va_next < sva) + va_next = eva; + continue; + } + + l1 = pmap_l0_to_l1(l0, sva); + } else { + l1 = pmap_l1(pmap, sva); } va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; - l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) continue; if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { @@ -3736,33 +3907,19 @@ new_l3 |= ATTR_S1_UXN; if (pmap != kernel_pmap) new_l3 |= ATTR_S1_nG; - } else { - /* - * Clear the access flag on executable mappings, this will be - * set later when the page is accessed. The fault handler is - * required to invalidate the I-cache. - * - * TODO: Switch to the valid flag to allow hardware management - * of the access flag. Much of the pmap code assumes the - * valid flag is set and fails to destroy the old page tables - * correctly if it is clear. 
- */ - if (prot & VM_PROT_EXECUTE) - new_l3 &= ~ATTR_AF; - } - if ((m->oflags & VPO_UNMANAGED) == 0) { - new_l3 |= ATTR_SW_MANAGED; - if ((prot & VM_PROT_WRITE) != 0) { - new_l3 |= ATTR_SW_DBM; - if ((flags & VM_PROT_WRITE) == 0) { - if (pmap->pm_stage == PM_STAGE1) - new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); - else - new_l3 &= - ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); + if ((m->oflags & VPO_UNMANAGED) == 0) { + new_l3 |= ATTR_SW_MANAGED; + if ((prot & VM_PROT_WRITE) != 0) { + new_l3 |= ATTR_SW_DBM; + if ((flags & VM_PROT_WRITE) == 0) + new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); } } + } else { + new_l3 = (pd_entry_t)(pa | ATTR_ST2_DEFAULT | L3_PAGE); } + if ((flags & PMAP_ENTER_WIRED) != 0) + new_l3 |= ATTR_SW_WIRED; CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); @@ -4676,7 +4833,7 @@ pmap_abort_ptp(dst_pmap, addr, dstmpte); goto out; } - /* Have we copied all of the valid mappings? */ + /* Have we copied all of the valid mappings? */ if (dstmpte->ref_count >= srcmpte->ref_count) break; } @@ -4976,7 +5133,7 @@ switch(lvl) { case 1: pte = pmap_l1_to_l2(pde, pv->pv_va); - tpte = pmap_load(pte); + tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("Attempting to remove an invalid " Index: sys/arm64/include/armreg.h =================================================================== --- sys/arm64/include/armreg.h +++ sys/arm64/include/armreg.h @@ -215,7 +215,7 @@ #define ISS_DATA_DFSC_TLB_CONFLICT (0x30 << 0) #define ESR_ELx_IL (0x01 << 25) #define ESR_ELx_EC_SHIFT 26 -#define ESR_ELx_EC_MASK (0x3f << 26) +#define ESR_ELx_EC_MASK (0x3f << ESR_ELx_EC_SHIFT) #define ESR_ELx_EXCEPTION(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) #define EXCP_UNKNOWN 0x00 /* Unkwn exception */ #define EXCP_TRAP_WFI_WFE 0x01 /* Trapped WFI or WFE */ @@ -226,10 +226,10 @@ #define EXCP_HVC 0x16 /* HVC trap */ #define EXCP_MSR 0x18 /* MSR/MRS trap */ #define EXCP_INSN_ABORT_L 0x20 /* Instruction abort, from lower EL */ -#define EXCP_INSN_ABORT 0x21 /* Instruction abort, from same EL */ +#define EXCP_INSN_ABORT 0x21 /* Instruction abort, from same EL */ #define EXCP_PC_ALIGN 0x22 /* PC alignment fault */ #define EXCP_DATA_ABORT_L 0x24 /* Data abort, from lower EL */ -#define EXCP_DATA_ABORT 0x25 /* Data abort, from same EL */ +#define EXCP_DATA_ABORT 0x25 /* Data abort, from same EL */ #define EXCP_SP_ALIGN 0x26 /* SP slignment fault */ #define EXCP_TRAP_FP 0x2c /* Trapped FP exception */ #define EXCP_SERROR 0x2f /* SError interrupt */ Index: sys/arm64/include/bitops.h =================================================================== --- /dev/null +++ sys/arm64/include/bitops.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) TODO + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ARM_BITOPS_H_ +#define _ARM_BITOPS_H_ + +#include + +#define for_each_set_bit(bit, addr, size) \ + for (bit_ffs((bitstr_t *)(addr), (size), (int *)&(bit)); \ + (bit) != -1; \ + bit_ffs_at((bitstr_t *)(addr), (bit) + 1, (size), (int *)&(bit))) + +/* same as for_each_set_bit() but use bit as value to start with */ +#define for_each_set_bit_from(bit, addr, size) \ + for (bit_ffs_at((bitstr_t *)(addr), (bit), (size), (int *)&(bit)); \ + (bit) != -1; \ + bit_ffs_at((bitstr_t *)(addr), (bit) + 1, (size), (int *)&(bit))) + +#define for_each_clear_bit(bit, addr, size) \ + for (bit_ffc((bitstr_t *)(addr), (size), (int *)&(bit)); \ + (bit) != -1; \ + bit_ffc_at((bitstr_t *)(addr), (bit) + 1, (size), (int *)&(bit))) + +/* same as for_each_clear_bit() but use bit as value to start with */ +#define for_each_clear_bit_from(bit, addr, size) \ + for (bit_ffc_at((bitstr_t *)(addr), (bit), (size), (int *)&(bit)); \ + (bit) != -1; \ + bit_ffc_at((bitstr_t *)(addr), (bit) + 1, (size), (int *)&(bit))) + +#endif /* _ARM_BITOPS_H_ */ Index: sys/arm64/include/cpu.h =================================================================== --- sys/arm64/include/cpu.h +++ sys/arm64/include/cpu.h @@ -117,6 +117,7 @@ #define CPU_IMPL_TO_MIDR(val) (((val) & 0xff) << 24) #define CPU_PART_TO_MIDR(val) (((val) & 0xfff) << 4) +#define CPU_ARCH_TO_MIDR(val) (((val) & 0xf) << 16) #define CPU_VAR_TO_MIDR(val) (((val) & 0xf) << 20) #define CPU_REV_TO_MIDR(val) (((val) & 0xf) << 0) Index: sys/arm64/include/hypervisor.h =================================================================== --- sys/arm64/include/hypervisor.h +++ sys/arm64/include/hypervisor.h @@ -201,4 +201,35 @@ #define VTTBR_VMID_SHIFT 48 #define VTTBR_HOST 0x0000000000000000 +/* VTCR_EL2 - Virtualization Translation Control Register */ +#define VTCR_EL2_RES1 (0x1 << 31) +#define VTCR_EL2_T0SZ_MASK 0x3f +#define VTCR_EL2_SL0_SHIFT 6 +#define VTCR_EL2_SL0_4K_LVL2 (0x0 << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_4K_LVL1 (0x1 << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_4K_LVL0 (0x2 << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_IRGN0_SHIFT 8 +#define VTCR_EL2_IRGN0_WBWA (0x1 << VTCR_EL2_IRGN0_SHIFT) +#define VTCR_EL2_ORGN0_SHIFT 10 +#define VTCR_EL2_ORGN0_WBWA (0x1 << VTCR_EL2_ORGN0_SHIFT) +#define VTCR_EL2_SH0_SHIFT 12 +#define VTCR_EL2_SH0_NS (0x0 << VTCR_EL2_SH0_SHIFT) +#define VTCR_EL2_SH0_OS (0x2 << VTCR_EL2_SH0_SHIFT) +#define VTCR_EL2_SH0_IS (0x3 << VTCR_EL2_SH0_SHIFT) +#define VTCR_EL2_TG0_SHIFT 14 +#define VTCR_EL2_TG0_4K (0x0 << VTCR_EL2_TG0_SHIFT) +#define VTCR_EL2_TG0_64K (0x1 << VTCR_EL2_TG0_SHIFT) +#define VTCR_EL2_TG0_16K (0x2 << VTCR_EL2_TG0_SHIFT) +#define VTCR_EL2_PS_SHIFT 16 +#define VTCR_EL2_PS_32BIT (0x0 << VTCR_EL2_PS_SHIFT) +#define VTCR_EL2_PS_36BIT (0x1 << VTCR_EL2_PS_SHIFT) +#define VTCR_EL2_PS_40BIT (0x2 << VTCR_EL2_PS_SHIFT) +#define VTCR_EL2_PS_42BIT (0x3 << VTCR_EL2_PS_SHIFT) +#define VTCR_EL2_PS_44BIT (0x4 << VTCR_EL2_PS_SHIFT) +#define VTCR_EL2_PS_48BIT (0x5 << VTCR_EL2_PS_SHIFT) + +/* HPFAR_EL2 - Hypervisor 
IPA Fault Address Register */ +#define HPFAR_EL2_FIPA_SHIFT 4 +#define HPFAR_EL2_FIPA_MASK 0xfffffffff0 + #endif /* !_MACHINE_HYPERVISOR_H_ */ Index: sys/arm64/include/pcpu.h =================================================================== --- sys/arm64/include/pcpu.h +++ sys/arm64/include/pcpu.h @@ -43,6 +43,7 @@ u_int pc_acpi_id; /* ACPI CPU id */ \ u_int pc_midr; /* stored MIDR value */ \ uint64_t pc_clock; \ + void *pc_vcpu; \ pcpu_bp_harden pc_bp_harden; \ pcpu_ssbd pc_ssbd; \ struct pmap *pc_curpmap; \ Index: sys/arm64/include/pmap.h =================================================================== --- sys/arm64/include/pmap.h +++ sys/arm64/include/pmap.h @@ -191,6 +191,7 @@ pd_entry_t **, pt_entry_t **); int pmap_fault(pmap_t, uint64_t, uint64_t); +int pmap_pinit_type(pmap_t, enum pmap_stage); struct pcb *pmap_switch(struct thread *, struct thread *); Index: sys/arm64/include/pte.h =================================================================== --- sys/arm64/include/pte.h +++ sys/arm64/include/pte.h @@ -99,6 +99,35 @@ #define ATTR_DESCR_TYPE_TABLE 2 #define ATTR_DESCR_TYPE_PAGE 2 #define ATTR_DESCR_TYPE_BLOCK 0 +/* Stage 2 translation Block and Page attributes */ +#define ATTR_ST2_AF ATTR_AF +#define ATTR_ST2_SH(x) ATTR_SH(x) +#define ATTR_ST2_SH_MASK ATTR_SH_MASK +#define ATTR_ST2_SH_NS ATTR_SH_NS /* Non-shareable */ +#define ATTR_ST2_SH_OS ATTR_SH_OS /* Outer-shareable */ +#define ATTR_ST2_SH_IS ATTR_SH_IS /* Inner-shareable */ +#define ATTR_ST2_S2AP(x) ((x) << 6) /* Data access permissions */ +#define ATTR_ST2_S2AP_NONE (0 << 1) +#define ATTR_ST2_S2AP_R0 (1 << 0) +#define ATTR_ST2_S2AP_W0 (1 << 1) +#define ATTR_ST2_S2AP_RW (3 << 0) +#define ATTR_ST2_MEMATTR(x) ((x) << 2) /* Memory attributes */ +#define ATTR_ST2_MEM_DEV (0 << 2) /* Device memory */ +#define ATTR_ST2_MEM_DEV_nGnRnE (0 << 0) +#define ATTR_ST2_MEM_DEV_nGnRE (1 << 0) +#define ATTR_ST2_MEM_DEV_nGRE (1 << 1) +#define ATTR_ST2_MEM_DEV_GRE (3 << 0) +#define ATTR_ST2_MEM_ONC (1 << 2) /* Outer Non-cacheable */ +#define ATTR_ST2_MEM_OWT (1 << 2) /* Outer Write-Through Cacheable */ +#define ATTR_ST2_MEM_OWB (3 << 2) /* Outer Write-Back Cacheable */ +#define ATTR_ST2_MEM_INC (1 << 0) /* Inner Non-cacheable */ +#define ATTR_ST2_MEM_IWT (1 << 1) /* Inner Write-Through Cacheable */ +#define ATTR_ST2_MEM_IWB (3 << 0) /* Inner Write-Back Cacheable */ + +#define ATTR_ST2_DEFAULT (ATTR_ST2_AF | ATTR_ST2_SH(ATTR_ST2_SH_IS) | \ + ATTR_ST2_S2AP(ATTR_ST2_S2AP_RW) | \ + ATTR_ST2_MEMATTR(ATTR_ST2_MEM_OWB | ATTR_ST2_MEM_IWB)) + /* Level 0 table, 512GiB per entry */ #define L0_SHIFT 39 Index: sys/arm64/include/vmm.h =================================================================== --- /dev/null +++ sys/arm64/include/vmm.h @@ -0,0 +1,430 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#include +#include +#include + +#include "pte.h" +#include "pmap.h" + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_X0, + VM_REG_GUEST_X1, + VM_REG_GUEST_X2, + VM_REG_GUEST_X3, + VM_REG_GUEST_X4, + VM_REG_GUEST_X5, + VM_REG_GUEST_X6, + VM_REG_GUEST_X7, + VM_REG_GUEST_X8, + VM_REG_GUEST_X9, + VM_REG_GUEST_X10, + VM_REG_GUEST_X11, + VM_REG_GUEST_X12, + VM_REG_GUEST_X13, + VM_REG_GUEST_X14, + VM_REG_GUEST_X15, + VM_REG_GUEST_X16, + VM_REG_GUEST_X17, + VM_REG_GUEST_X18, + VM_REG_GUEST_X19, + VM_REG_GUEST_X20, + VM_REG_GUEST_X21, + VM_REG_GUEST_X22, + VM_REG_GUEST_X23, + VM_REG_GUEST_X24, + VM_REG_GUEST_X25, + VM_REG_GUEST_X26, + VM_REG_GUEST_X27, + VM_REG_GUEST_X28, + VM_REG_GUEST_X29, + VM_REG_GUEST_LR, + VM_REG_GUEST_SP, + VM_REG_GUEST_ELR, + VM_REG_GUEST_SPSR, + VM_REG_ELR_EL2, + VM_REG_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + +#define VM_GUEST_BASE_IPA 0x80000000UL /* Guest kernel start ipa */ + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_exception; +struct vm_memory_segment; +struct vm_exit; +struct vm_run; +struct vm_object; +struct pmap; +struct hypctx; + +typedef int (*vmm_init_func_t)(int ipinum); +typedef int (*vmm_cleanup_func_t)(void); +typedef void (*vmm_resume_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct pmap *pmap, void *rendezvous_cookie, + void *suspend_cookie); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef void (*vmi_mmap_set_func_t)(void *arg, vm_offset_t va, + vm_offset_t pa, size_t len, + vm_prot_t prot); +typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *arg, vm_offset_t va); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); +typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); +typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void 
*vmi, struct vlapic *vlapic); + +struct vmm_ops { + /* Module-wide functions */ + vmm_init_func_t init; + vmm_cleanup_func_t cleanup; + vmm_resume_func_t resume; + /* VM specific functions */ + vmi_init_func_t vminit; + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_mmap_set_func_t vmmapset; + vmi_mmap_get_func_t vmmapget; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; +}; + +extern struct vmm_ops vmm_ops_arm; + +int vm_create(const char *name, struct vm **retvm); +void vm_destroy(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, uint64_t gpa, size_t len); +uint64_t vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t size); +int vm_gpabase2memseg(struct vm *vm, uint64_t gpabase, + struct vm_memory_segment *seg); +boolean_t vm_mem_allocated(struct vm *vm, uint64_t gpa); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_run(struct vm *vm, struct vm_run *vmrun); +void* vm_get_cookie(struct vm *vm); +uint16_t vm_get_maxcpus(struct vm *vm); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_activate_cpu(struct vm *vm, int vcpu); +int vm_attach_vgic(struct vm *vm, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size); +int vm_assert_irq(struct vm *vm, uint32_t irq, uint32_t vcpuid); +int vm_deassert_irq(struct vm *vm, uint32_t irq, uint32_t vcpuid); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); + +#ifdef _SYS__CPUSET_H_ +/* + * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. + * The rendezvous 'func(arg)' is not allowed to do anything that will + * cause the thread to be put to sleep. + * + * If the rendezvous is being initiated from a vcpu context then the + * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. + * + * The caller cannot hold any locks when initiating the rendezvous. + * + * The implementation of this API may cause vcpus other than those specified + * by 'dest' to be stalled. The caller should not rely on any vcpus making + * forward progress when the rendezvous is in progress. 
+ */ +typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); +void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, + vm_rendezvous_func_t func, void *arg); +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ + +extern uint64_t hypmode_enabled; +static __inline bool +virt_enabled() +{ + return (hypmode_enabled != 0); +} + +static __inline int +vcpu_rendezvous_pending(void *rendezvous_cookie) +{ + + return (*(uintptr_t *)rendezvous_cookie != 0); +} + +static __inline int +vcpu_suspended(void *suspend_cookie) +{ + + return (*(int *)suspend_cookie); +} + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, + bool from_idle); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +#ifdef _SYS_PROC_H_ +static int __inline +vcpu_should_yield(struct vm *vm, int vcpu) +{ + + if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) + return (1); + else if (curthread->td_owepreempt) + return (1); + else + return (0); +} +#endif + +void *vcpu_stats(struct vm *vm, int vcpu); +void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); + +/* + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. + * + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); + +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. 
+ */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +int vcpu_trace_exceptions(struct vm *vm, int vcpuid); +#endif /* _KERNEL */ + +#define VM_MAXCPU 1 + +#define VM_DIR_READ 0 +#define VM_DIR_WRITE 1 + +struct vie { + uint8_t access_size:4, sign_extend:1, dir:1, unused:2; + enum vm_reg_name reg; +}; + +struct vre { + uint32_t inst_syndrome; + uint8_t dir:1, unused:7; + enum vm_reg_name reg; +}; + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_MAX +}; +enum vm_exitcode { + VM_EXITCODE_BOGUS, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_REG_EMUL, + VM_EXITCODE_HVC, + VM_EXITCODE_SUSPENDED, + VM_EXITCODE_HYP, + VM_EXITCODE_WFI, + VM_EXITCODE_SPINUP_AP, + VM_EXITCODE_MAX +}; + +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; + uint64_t pc; + union { + /* + * ARM specific payload. + */ + struct { + uint32_t exception_nr; + uint32_t esr_el2; /* Exception Syndrome Register */ + uint64_t far_el2; /* Fault Address Register */ + uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ + } hyp; + struct { + struct vre vre; + } reg_emul; + struct { + uint64_t gpa; + int fault_type; + } paging; + struct { + uint64_t gpa; + struct vie vie; + } inst_emul; + + struct { + struct hypctx *hypctx; + } wfi; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int status; /* vmx inst status */ + /* + * 'exit_reason' and 'exit_qualification' are valid + * only if 'status' is zero. + */ + uint32_t exit_reason; + uint64_t exit_qualification; + /* + * 'inst_error' and 'inst_type' are valid + * only if 'status' is non-zero. + */ + int inst_type; + int inst_error; + } vmx; + /* + * SVM specific payload. + */ + struct { + uint64_t exitcode; + uint64_t exitinfo1; + uint64_t exitinfo2; + } svm; + struct { +#ifdef __aarch64__ +#else + uint32_t code; /* ecx value */ + uint64_t wval; +#endif + } msr; + struct { + int vcpu; + uint64_t rip; + uint64_t ctx_id; + } spinup_ap; + struct { + uint64_t rflags; + } hlt; + struct { + int vector; + } ioapic_eoi; + struct { + enum vm_suspend_how how; + } suspended; + struct vm_task_switch task_switch; + } u; +}; + +#endif /* _VMM_H_ */ Index: sys/arm64/include/vmm_dev.h =================================================================== --- /dev/null +++ sys/arm64/include/vmm_dev.h @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +int vmmdev_cleanup(void); +#endif + +struct vm_memory_segment { + uint64_t gpa; /* in */ + size_t len; + int wired; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_run { + int cpuid; + uint64_t pc; + struct vm_exit vm_exit; + +}; + +struct vm_exception { + int cpuid; + int vector; + uint32_t error_code; + int error_code_valid; + int restart_instruction; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + + +struct vm_suspend { + enum vm_suspend_how how; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_attach_vgic { + uint64_t dist_start; + size_t dist_size; + uint64_t redist_start; + size_t redist_size; +}; + +struct vm_irq { + uint32_t irq; + uint32_t vcpuid; +}; + +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 + +enum { + /* general routines */ + IOCNUM_ABIVERS = 0, + IOCNUM_RUN = 1, + IOCNUM_SET_CAPABILITY = 2, + IOCNUM_GET_CAPABILITY = 3, + IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, + + /* memory apis */ + IOCNUM_MAP_MEMORY = 10, + IOCNUM_GET_MEMORY_SEG = 11, + IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA = 13, + + /* register/state accessors */ + IOCNUM_SET_REGISTER = 20, + IOCNUM_GET_REGISTER = 21, + + /* statistics */ + IOCNUM_VM_STATS = 50, + IOCNUM_VM_STAT_DESC = 51, + + /* interrupt injection */ + IOCNUM_ASSERT_IRQ = 80, + IOCNUM_DEASSERT_IRQ = 81, + + /* vm_cpuset */ + IOCNUM_ACTIVATE_CPU = 90, + IOCNUM_GET_CPUSET = 91, + + /* vm_attach_vgic */ + IOCNUM_ATTACH_VGIC = 110, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define VM_MAP_MEMORY \ + _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) +#define VM_GET_MEMORY_SEG \ + _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_CAPABILITY 
\ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_ASSERT_IRQ \ + _IOW('v', IOCNUM_ASSERT_IRQ, struct vm_irq) +#define VM_DEASSERT_IRQ \ + _IOW('v', IOCNUM_DEASSERT_IRQ, struct vm_irq) +#define VM_GLA2GPA \ + _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_ACTIVATE_CPU \ + _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) +#define VM_GET_CPUS \ + _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_ATTACH_VGIC \ + _IOW('v', IOCNUM_ATTACH_VGIC, struct vm_attach_vgic) +#endif Index: sys/arm64/include/vmm_instruction_emul.h =================================================================== --- /dev/null +++ sys/arm64/include/vmm_instruction_emul.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Callback functions to read and write registers. + */ +typedef int (*reg_read_t)(void *vm, int cpuid, uint64_t *rval, void *arg); +typedef int (*reg_write_t)(void *vm, int cpuid, uint64_t wval, void *arg); + +/* + * Emulate the decoded 'vie' instruction when it contains a memory operation. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t mrr, mem_region_write_t mrw, void *mrarg); + +/* + * Emulate the decoded 'vre' instruction when it contains a register access. 
+ * + * The callbacks 'regread' and 'regwrite' emulate reads and writes to the + * register from 'vie'. 'regarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_register(void *vm, int vcpuid, struct vre *vre, reg_read_t regread, + reg_write_t regwrite, void *regarg); + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ Index: sys/arm64/vmm/arm64.h =================================================================== --- /dev/null +++ sys/arm64/vmm/arm64.h @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _VMM_ARM64_H_ +#define _VMM_ARM64_H_ + +#include +#include +#include +#include + +#include "mmu.h" +#include "io/vgic_v3.h" +#include "io/vtimer.h" + +struct hypctx { + struct reg regs; + + /* EL1 control registers */ + uint64_t actlr_el1; /* Auxiliary Control Register */ + uint64_t afsr0_el1; /* Auxiliary Fault Status Register 0 */ + uint64_t afsr1_el1; /* Auxiliary Fault Status Register 1 */ + uint64_t amair_el1; /* Auxiliary Memory Attribute Indirection Register */ + uint64_t contextidr_el1; /* Current Process Identifier */ + uint64_t cpacr_el1; /* Arhitectural Feature Access Control Register */ + uint64_t elr_el1; /* Exception Link Register */ + uint64_t esr_el1; /* Exception Syndrome Register */ + uint64_t far_el1; /* Fault Address Register */ + uint64_t fp; /* Frame Pointer */ + uint64_t mair_el1; /* Memory Attribute Indirection Register */ + uint64_t par_el1; /* Physical Address Register */ + uint64_t sctlr_el1; /* System Control Register */ + uint64_t sp_el0; /* Stack Pointer */ + uint64_t tcr_el1; /* Translation Control Register */ + uint64_t tpidr_el0; /* EL0 Software ID Register */ + uint64_t tpidrro_el0; /* Read-only Thread ID Register */ + uint64_t tpidr_el1; /* EL1 Software ID Register */ + uint64_t ttbr0_el1; /* Translation Table Base Register 0 */ + uint64_t ttbr1_el1; /* Translation Table Base Register 1 */ + uint64_t vbar_el1; /* Vector Base Address Register */ + uint32_t spsr_el1; /* Saved Program Status Register */ + + /* EL2 control registers */ + uint64_t cptr_el2; /* Architectural Feature Trap Register */ + uint64_t elr_el2; /* Exception Link Register */ + uint64_t hcr_el2; /* Hypervisor Configuration Register */ + uint64_t vpidr_el2; /* Virtualization Processor ID Register */ + uint64_t vmpidr_el2; /* Virtualization Multiprocessor ID Register */ + uint32_t spsr_el2; /* Saved Program Status Register */ + + uint32_t vcpu; + struct hyp *hyp; + struct { + uint64_t esr_el2; /* Exception Syndrome Register */ + uint64_t far_el2; /* Fault Address Register */ + uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ + } exit_info; + + struct vtimer_cpu vtimer_cpu; + struct vgic_v3_cpu_if vgic_cpu_if; + struct vgic_v3_redist vgic_redist; +#ifdef VFP + struct vfpstate vfpstate; +#endif +}; + +struct hyp { + pmap_t stage2_map; + struct hypctx ctx[VM_MAXCPU]; + struct vgic_mmio_region *vgic_mmio_regions; + size_t vgic_mmio_regions_num; + struct vgic_v3_dist vgic_dist; + struct vm *vm; + struct vtimer vtimer; + uint64_t vmid_generation; + uint64_t vttbr_el2; + bool vgic_attached; +}; + +uint64_t vmm_call_hyp(void *hyp_func_addr, ...); +void vmm_cleanup(void *hyp_stub_vectors); +uint64_t vmm_enter_guest(struct hypctx *hypctx); +uint64_t vmm_read_ich_vtr_el2(void); +uint64_t vmm_read_cnthctl_el2(void); +uint64_t vmm_read_tcr_el2(void); + +#define eprintf(fmt, ...) printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__) +//#define eprintf(fmt, ...) do {} while(0) + +#define VMID_GENERATION_MASK ((1UL<<8) - 1) +#define build_vttbr(vmid, ptaddr) \ + ((((vmid) & VMID_GENERATION_MASK) << VTTBR_VMID_SHIFT) | \ + (uint64_t)(ptaddr)) + +#define MPIDR_SMP_MASK (0x3 << 30) +#define MPIDR_AFF1_LEVEL(x) (((x) >> 2) << 8) +#define MPIDR_AFF0_LEVEL(x) (((x) & 0x3) << 0) + +/* + * Return true if the exception was caused by a translation fault in the stage 2 + * translation regime. The DFSC encoding for a translation fault has the format + * 0b0001LL, where LL (bits [1:0]) represents the level where the fault occured + * (page D7-2280 of the ARMv8 Architecture Manual). 
+ */ +#define ISS_DATA_DFSC_TF(esr_iss) \ + (!((esr_iss) & 0b111000) && ((esr_iss) & 0b000100)) +#define FAR_EL2_PAGE_OFFSET(x) ((x) & PAGE_MASK) + +#define DEBUG_ME 0 + +#define arm64_get_active_vcpu() ((struct hypctx *)PCPU_GET(vcpu)) + +#endif /* !_VMM_ARM64_H_ */ Index: sys/arm64/vmm/arm64.c =================================================================== --- /dev/null +++ sys/arm64/vmm/arm64.c @@ -0,0 +1,807 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mmu.h" +#include "arm64.h" +#include "hyp.h" +#include "reset.h" +#include "io/vgic_v3.h" +#include "io/vtimer.h" + +#define HANDLED 1 +#define UNHANDLED 0 + +#define UNUSED 0 + +MALLOC_DEFINE(M_HYP, "ARM VMM HYP", "ARM VMM HYP"); + +extern char hyp_init_vectors[]; +extern char hyp_vectors[]; +extern char hyp_code_start[]; +extern char hyp_code_end[]; +extern char hyp_stub_vectors[]; + +char *stack[MAXCPU]; +pmap_t hyp_pmap; + +static uint64_t vmid_generation = 0; +static struct mtx vmid_generation_mtx; + +static inline void +arm64_set_active_vcpu(struct hypctx *hypctx) +{ + PCPU_SET(vcpu, hypctx); +} + +static void arm64_set_vttbr(struct hyp *hyp) +{ + if (hyp->vmid_generation != 0 && + ((hyp->vmid_generation & ~VMID_GENERATION_MASK) != + (atomic_load_acq_64(&vmid_generation) & ~VMID_GENERATION_MASK))) + goto out; + + mtx_lock(&vmid_generation_mtx); + + /* Another VCPU has change the VMID already */ + if (hyp->vmid_generation && + ((hyp->vmid_generation & ~VMID_GENERATION_MASK) != + (vmid_generation & ~VMID_GENERATION_MASK))) { + mtx_unlock(&vmid_generation_mtx); + goto out; + } + + vmid_generation++; + if (!(vmid_generation & VMID_GENERATION_MASK)) + vmid_generation++; + + hyp->vmid_generation = vmid_generation; + mtx_unlock(&vmid_generation_mtx); +out: + hyp->vttbr_el2 = build_vttbr(hyp->vmid_generation, + vtophys(hyp->stage2_map->pm_l0)); +} + +static void +arm_init_vectors(void *arg) +{ + char *stack_top; + uint64_t tcr_el1, tcr_el2; + uint32_t sctlr_el2; + uint32_t vtcr_el2; + uint64_t id_aa64mmfr0_el1; + uint64_t pa_range_bits; + register_t daif; + + daif = intr_disable(); + + arm64_set_active_vcpu(NULL); + + /* + * Install the temporary vectors which will be responsible for + * initializing the VMM when we next trap into EL2. + * + * x0: the exception vector table responsible for hypervisor + * initialization on the next call. 
+ */ + vmm_call_hyp((void *)vtophys(hyp_init_vectors)); + + /* Create and map the hypervisor stack */ + stack_top = stack[PCPU_GET(cpuid)] + PAGE_SIZE; + + /* Configure address translation at EL2 */ + tcr_el1 = READ_SPECIALREG(tcr_el1); + tcr_el2 = TCR_EL2_RES1; + + /* Set physical address size */ + id_aa64mmfr0_el1 = READ_SPECIALREG(id_aa64mmfr0_el1); + pa_range_bits = ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1); + tcr_el2 |= (pa_range_bits & 0x7) << TCR_EL2_PS_SHIFT; + + /* Use the same address translation attributes as the host */ + tcr_el2 |= tcr_el1 & TCR_T0SZ_MASK; + tcr_el2 |= tcr_el1 & (0xff << TCR_IRGN0_SHIFT); + + /* + * Configure the system control register for EL2: + * + * SCTLR_EL2_M: MMU on + * SCTLR_EL2_C: Data cacheability not affected + * SCTLR_EL2_I: Instruction cacheability not affected + * SCTLR_EL2_A: Instruction alignment check + * SCTLR_EL2_SA: Stack pointer alignment check + * SCTLR_EL2_WXN: Treat writable memory as execute never + * ~SCTLR_EL2_EE: Data accesses are little-endian + */ + sctlr_el2 = SCTLR_EL2_RES1; + sctlr_el2 |= SCTLR_EL2_M | SCTLR_EL2_C | SCTLR_EL2_I; + sctlr_el2 |= SCTLR_EL2_A | SCTLR_EL2_SA; + sctlr_el2 |= SCTLR_EL2_WXN; + sctlr_el2 &= ~SCTLR_EL2_EE; + + /* + * Configure the Stage 2 translation control register: + * + * VTCR_IRGN0_WBWA: Translation table walks access inner cacheable + * normal memory + * VTCR_ORGN0_WBWA: Translation table walks access outer cacheable + * normal memory + * VTCR_EL2_TG0_4K: Stage 2 uses 4K pages + * VTCR_EL2_SL0_4K_LVL1: Stage 2 uses concatenated level 1 tables + * VTCR_EL2_SH0_IS: Memory associated with Stage 2 walks is inner + * shareable + */ + vtcr_el2 = VTCR_EL2_RES1; + vtcr_el2 = (pa_range_bits & 0x7) << VTCR_EL2_PS_SHIFT; + vtcr_el2 |= VTCR_EL2_IRGN0_WBWA | VTCR_EL2_ORGN0_WBWA; + vtcr_el2 |= VTCR_EL2_TG0_4K; + vtcr_el2 |= VTCR_EL2_SH0_IS; + if (pa_range_bits == ID_AA64MMFR0_PARange_1T) { + /* + * 40 bits of physical addresses, use concatenated level 1 + * tables + */ + vtcr_el2 |= 24 & VTCR_EL2_T0SZ_MASK; + vtcr_el2 |= VTCR_EL2_SL0_4K_LVL1; + } + + /* Special call to initialize EL2 */ + vmm_call_hyp((void *)vtophys(hyp_vectors), vtophys(hyp_pmap->pm_l0), + ktohyp(stack_top), tcr_el2, sctlr_el2, vtcr_el2); + + intr_restore(daif); +} + +static void +arm_cleanup_vectors(void *arg) +{ + register_t daif; + + /* + * vmm_cleanup() will disable the MMU. For the next few instructions, + * before the hardware disables the MMU, one of the following is + * possible: + * + * a. The instruction addresses are fetched with the MMU disabled, + * and they must represent the actual physical addresses. This will work + * because we call the vmm_cleanup() function by its physical address. + * + * b. The instruction addresses are fetched using the old translation + * tables. This will work because we have an identity mapping in place + * in the translation tables and vmm_cleanup() is called by its physical + * address. + */ + daif = intr_disable(); + vmm_call_hyp((void *)vtophys(vmm_cleanup), vtophys(hyp_stub_vectors)); + intr_restore(daif); + + arm64_set_active_vcpu(NULL); +} + +static int +arm_init(int ipinum) +{ + size_t hyp_code_len; + uint64_t ich_vtr_el2; + uint64_t cnthctl_el2; + int cpu; + register_t daif; + + if (!virt_enabled()) { + printf("arm_init: Processor doesn't have support for virtualization.\n"); + return (ENXIO); + } + + mtx_init(&vmid_generation_mtx, "vmid_generation_mtx", NULL, MTX_DEF); + + /* Create the mappings for the hypervisor translation table. 
*/ + hyp_pmap = malloc(sizeof(*hyp_pmap), M_HYP, M_WAITOK | M_ZERO); + hypmap_init(hyp_pmap, PM_STAGE1); + hyp_code_len = (size_t)hyp_code_end - (size_t)hyp_code_start; + hypmap_map(hyp_pmap, (vm_offset_t)hyp_code_start, hyp_code_len, VM_PROT_EXECUTE); + + /* We need an identity mapping for when we activate the MMU */ + hypmap_map_identity(hyp_pmap, (vm_offset_t)hyp_code_start, hyp_code_len, + VM_PROT_EXECUTE); + + /* Create a per-CPU hypervisor stack */ + CPU_FOREACH(cpu) { + stack[cpu] = malloc(PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO); + hypmap_map(hyp_pmap, (vm_offset_t)stack[cpu], PAGE_SIZE, + VM_PROT_READ | VM_PROT_WRITE); + } + + + smp_rendezvous(NULL, arm_init_vectors, NULL, NULL); + + daif = intr_disable(); + + ich_vtr_el2 = vmm_call_hyp((void *)ktohyp(vmm_read_ich_vtr_el2)); + vgic_v3_init(ich_vtr_el2); + + cnthctl_el2 = vmm_call_hyp((void *)ktohyp(vmm_read_cnthctl_el2)); + vtimer_init(cnthctl_el2); + + intr_restore(daif); + + return 0; +} + +static int +arm_cleanup(void) +{ + int cpu; + + smp_rendezvous(NULL, arm_cleanup_vectors, NULL, NULL); + + vtimer_cleanup(); + + hypmap_cleanup(hyp_pmap); + free(hyp_pmap, M_HYP); + for (cpu = 0; cpu < nitems(stack); cpu++) + free(stack[cpu], M_HYP); + + mtx_destroy(&vmid_generation_mtx); + + return (0); +} + +static void * +arm_vminit(struct vm *vm) +{ + struct hyp *hyp; + struct hypctx *hypctx; + bool last_vcpu; + int i; + + hyp = malloc(sizeof(struct hyp), M_HYP, M_WAITOK | M_ZERO); + hyp->vm = vm; + hyp->vgic_attached = false; + + hyp->stage2_map = malloc(sizeof(*hyp->stage2_map), + M_HYP, M_WAITOK | M_ZERO); + hypmap_init(hyp->stage2_map, PM_STAGE2); + arm64_set_vttbr(hyp); + + for (i = 0; i < VM_MAXCPU; i++) { + hypctx = &hyp->ctx[i]; + hypctx->vcpu = i; + hypctx->hyp = hyp; + + reset_vm_el01_regs(hypctx); + reset_vm_el2_regs(hypctx); + } + + vtimer_vminit(hyp); + vgic_v3_vminit(hyp); + for (i = 0; i < VM_MAXCPU; i++) { + hypctx = &hyp->ctx[i]; + vtimer_cpuinit(hypctx); + last_vcpu = (i == VM_MAXCPU - 1); + vgic_v3_cpuinit(hypctx, last_vcpu); + } + + hypmap_map(hyp_pmap, (vm_offset_t)hyp, sizeof(struct hyp), + VM_PROT_READ | VM_PROT_WRITE); + + return (hyp); +} + +static enum vm_reg_name +get_vm_reg_name(uint32_t reg_nr, uint32_t mode __attribute__((unused))) +{ + switch(reg_nr) { + case 0: + return VM_REG_GUEST_X0; + case 1: + return VM_REG_GUEST_X1; + case 2: + return VM_REG_GUEST_X2; + case 3: + return VM_REG_GUEST_X3; + case 4: + return VM_REG_GUEST_X4; + case 5: + return VM_REG_GUEST_X5; + case 6: + return VM_REG_GUEST_X6; + case 7: + return VM_REG_GUEST_X7; + case 8: + return VM_REG_GUEST_X8; + case 9: + return VM_REG_GUEST_X9; + case 10: + return VM_REG_GUEST_X10; + case 11: + return VM_REG_GUEST_X11; + case 12: + return VM_REG_GUEST_X12; + case 13: + return VM_REG_GUEST_X13; + case 14: + return VM_REG_GUEST_X14; + case 15: + return VM_REG_GUEST_X15; + case 16: + return VM_REG_GUEST_X16; + case 17: + return VM_REG_GUEST_X17; + case 18: + return VM_REG_GUEST_X18; + case 19: + return VM_REG_GUEST_X19; + case 20: + return VM_REG_GUEST_X20; + case 21: + return VM_REG_GUEST_X21; + case 22: + return VM_REG_GUEST_X22; + case 23: + return VM_REG_GUEST_X23; + case 24: + return VM_REG_GUEST_X24; + case 25: + return VM_REG_GUEST_X25; + case 26: + return VM_REG_GUEST_X26; + case 27: + return VM_REG_GUEST_X27; + case 28: + return VM_REG_GUEST_X28; + case 29: + return VM_REG_GUEST_X29; + case 30: + return VM_REG_GUEST_LR; + case 31: + return VM_REG_GUEST_SP; + case 32: + return VM_REG_GUEST_ELR; + case 33: + return VM_REG_GUEST_SPSR; + case 34: + 
return VM_REG_ELR_EL2; + default: + break; + } + + return (VM_REG_LAST); +} + +static inline void +arm64_print_hyp_regs(struct vm_exit *vme) +{ + printf("esr_el2: 0x%08x\n", vme->u.hyp.esr_el2); + printf("far_el2: 0x%016lx\n", vme->u.hyp.far_el2); + printf("hpfar_el2: 0x%016lx\n", vme->u.hyp.hpfar_el2); +} + +static void +arm64_gen_inst_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret) +{ + struct vie *vie; + uint32_t esr_sas, reg_num; + uint64_t page_off; + + /* + * Get bits [47:12] of the IPA from HPFAR_EL2. + * At this point the 'u.hyp' member will be replaced by 'u.inst_emul'. + */ + vme_ret->u.inst_emul.gpa = \ + (vme_ret->u.hyp.hpfar_el2) >> HPFAR_EL2_FIPA_SHIFT; + /* The IPA is the base address of a 4KB page, make bits [11:0] zero. */ + vme_ret->u.inst_emul.gpa = (vme_ret->u.inst_emul.gpa) << PAGE_SHIFT; + /* Bits [11:0] are the same as bits [11:0] from the virtual address. */ + page_off = FAR_EL2_PAGE_OFFSET(vme_ret->u.hyp.far_el2); + vme_ret->u.inst_emul.gpa = vme_ret->u.inst_emul.gpa + page_off; + + esr_sas = (esr_iss & ISS_DATA_SAS_MASK) >> ISS_DATA_SAS_SHIFT; + reg_num = (esr_iss & ISS_DATA_SRT_MASK) >> ISS_DATA_SRT_SHIFT; + + vie = &vme_ret->u.inst_emul.vie; + vie->access_size = 1 << esr_sas; + vie->sign_extend = (esr_iss & ISS_DATA_SSE) ? 1 : 0; + vie->dir = (esr_iss & ISS_DATA_WnR) ? VM_DIR_WRITE : VM_DIR_READ; + vie->reg = get_vm_reg_name(reg_num, UNUSED); +} + +static void +arm64_gen_reg_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret) +{ + uint32_t reg_num; + struct vre *vre; + + /* u.hyp member will be replaced by u.reg_emul */ + vre = &vme_ret->u.reg_emul.vre; + + vre->inst_syndrome = esr_iss; + /* ARMv8 Architecture Manual, p. D7-2273: 1 means read */ + vre->dir = (esr_iss & ISS_MSR_DIR) ? VM_DIR_READ : VM_DIR_WRITE; + reg_num = ISS_MSR_Rt(esr_iss); + vre->reg = get_vm_reg_name(reg_num, UNUSED); +} + +//static bool print_stuff = false; + +static int +handle_el1_sync_excp(struct hyp *hyp, int vcpu, struct vm_exit *vme_ret) +{ + uint32_t esr_ec, esr_iss; + + esr_ec = ESR_ELx_EXCEPTION(vme_ret->u.hyp.esr_el2); + esr_iss = vme_ret->u.hyp.esr_el2 & ESR_ELx_ISS_MASK; + + switch(esr_ec) { + case EXCP_UNKNOWN: + eprintf("Unknown exception from guest\n"); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + case EXCP_HVC: + vme_ret->exitcode = VM_EXITCODE_HVC; + break; + case EXCP_MSR: + arm64_gen_reg_emul_data(esr_iss, vme_ret); + vme_ret->exitcode = VM_EXITCODE_REG_EMUL; + break; + + case EXCP_DATA_ABORT_L: + /* Check if instruction syndrome is valid */ + if (!(esr_iss & ISS_DATA_ISV)) { + eprintf("Data abort with invalid instruction syndrome\n"); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + /* + * Check if the data abort was caused by a translation fault. + * Any other type of data fault will be treated as an error. 
+ */ + if (!(ISS_DATA_DFSC_TF(esr_iss))) { + eprintf("Data abort not on a stage 2 translation\n"); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + arm64_gen_inst_emul_data(esr_iss, vme_ret); + vme_ret->exitcode = VM_EXITCODE_INST_EMUL; + break; + + default: + eprintf("Unsupported synchronous exception from guest: 0x%x\n", + esr_ec); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + /* We don't don't do any instruction emulation here */ + return (UNHANDLED); +} + +static int +arm64_handle_world_switch(struct hyp *hyp, int vcpu, struct vm_exit *vme) +{ + int excp_type; + int handled; + + excp_type = vme->u.hyp.exception_nr; + switch (excp_type) { + case EXCP_TYPE_EL1_SYNC: + /* The exit code will be set by handle_el1_sync_excp(). */ + handled = handle_el1_sync_excp(hyp, vcpu, vme); + break; + + case EXCP_TYPE_EL1_IRQ: + case EXCP_TYPE_EL1_FIQ: + /* The host kernel will handle IRQs and FIQs. */ + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + + case EXCP_TYPE_EL1_ERROR: + case EXCP_TYPE_EL2_SYNC: + case EXCP_TYPE_EL2_IRQ: + case EXCP_TYPE_EL2_FIQ: + case EXCP_TYPE_EL2_ERROR: + eprintf("Unhandled exception type: %s\n", __STRING(excp_type)); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + + default: + eprintf("Unknown exception type: %d\n", excp_type); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + } + + return (handled); +} + +static int +arm_vmrun(void *arg, int vcpu, register_t pc, pmap_t pmap, + void *rendezvous_cookie, void *suspend_cookie) +{ + uint64_t excp_type; + int handled; + register_t daif; + struct hyp *hyp; + struct hypctx *hypctx; + struct vm *vm; + struct vm_exit *vme; + + hyp = (struct hyp *)arg; + vm = hyp->vm; + vme = vm_exitinfo(vm, vcpu); + + hypctx = &hyp->ctx[vcpu]; + hypctx->elr_el2 = (uint64_t)pc; + + for (;;) { + daif = intr_disable(); + /* + * TODO: What happens if a timer interrupt is asserted exactly + * here, but for the previous VM? + */ + arm64_set_active_vcpu(hypctx); + vgic_v3_sync_hwstate(hypctx); + excp_type = vmm_call_hyp((void *)ktohyp(vmm_enter_guest), + ktohyp(hypctx)); + intr_restore(daif); + + if (excp_type == EXCP_TYPE_MAINT_IRQ) + continue; + + vme->pc = hypctx->elr_el2; + vme->inst_length = INSN_SIZE; + vme->u.hyp.exception_nr = excp_type; + vme->u.hyp.esr_el2 = hypctx->exit_info.esr_el2; + vme->u.hyp.far_el2 = hypctx->exit_info.far_el2; + vme->u.hyp.hpfar_el2 = hypctx->exit_info.hpfar_el2; + + handled = arm64_handle_world_switch(hyp, vcpu, vme); + if (handled == UNHANDLED) + /* Exit loop to emulate instruction. */ + break; + else + /* Resume guest execution from the next instruction. */ + hypctx->elr_el2 += vme->inst_length; + } + + return (0); +} + +static void +arm_deactivate_pcpu(void *arg) +{ + struct hyp *hyp = arg; + int maxcpu; + int i; + + maxcpu = vm_get_maxcpus(hyp->vm); + for (i = 0; i < maxcpu; i++) + if (arm64_get_active_vcpu() == &hyp->ctx[i]) + arm64_set_active_vcpu(NULL); +} + +static void +arm_vmcleanup(void *arg) +{ + struct hyp *hyp = arg; + + smp_rendezvous(NULL, arm_deactivate_pcpu, NULL, hyp); + + vtimer_vmcleanup(arg); + vgic_v3_detach_from_vm(arg); + + /* Unmap the VM hyp struct from the hyp mode translation table */ + hypmap_map(hyp_pmap, (vm_offset_t)hyp, sizeof(struct hyp), + VM_PROT_NONE); + hypmap_cleanup(hyp->stage2_map); + free(hyp->stage2_map, M_HYP); + free(hyp, M_HYP); +} + +/* + * Return register value. 
Registers have different sizes and an explicit cast + * must be made to ensure proper conversion. + */ +static void * +hypctx_regptr(struct hypctx *hypctx, int reg) +{ + switch (reg) { + case VM_REG_GUEST_X0: + return (&hypctx->regs.x[0]); + case VM_REG_GUEST_X1: + return (&hypctx->regs.x[1]); + case VM_REG_GUEST_X2: + return (&hypctx->regs.x[2]); + case VM_REG_GUEST_X3: + return (&hypctx->regs.x[3]); + case VM_REG_GUEST_X4: + return (&hypctx->regs.x[4]); + case VM_REG_GUEST_X5: + return (&hypctx->regs.x[5]); + case VM_REG_GUEST_X6: + return (&hypctx->regs.x[6]); + case VM_REG_GUEST_X7: + return (&hypctx->regs.x[7]); + case VM_REG_GUEST_X8: + return (&hypctx->regs.x[8]); + case VM_REG_GUEST_X9: + return (&hypctx->regs.x[9]); + case VM_REG_GUEST_X10: + return (&hypctx->regs.x[10]); + case VM_REG_GUEST_X11: + return (&hypctx->regs.x[11]); + case VM_REG_GUEST_X12: + return (&hypctx->regs.x[12]); + case VM_REG_GUEST_X13: + return (&hypctx->regs.x[13]); + case VM_REG_GUEST_X14: + return (&hypctx->regs.x[14]); + case VM_REG_GUEST_X15: + return (&hypctx->regs.x[15]); + case VM_REG_GUEST_X16: + return (&hypctx->regs.x[16]); + case VM_REG_GUEST_X17: + return (&hypctx->regs.x[17]); + case VM_REG_GUEST_X18: + return (&hypctx->regs.x[18]); + case VM_REG_GUEST_X19: + return (&hypctx->regs.x[19]); + case VM_REG_GUEST_X20: + return (&hypctx->regs.x[20]); + case VM_REG_GUEST_X21: + return (&hypctx->regs.x[21]); + case VM_REG_GUEST_X22: + return (&hypctx->regs.x[22]); + case VM_REG_GUEST_X23: + return (&hypctx->regs.x[23]); + case VM_REG_GUEST_X24: + return (&hypctx->regs.x[24]); + case VM_REG_GUEST_X25: + return (&hypctx->regs.x[25]); + case VM_REG_GUEST_X26: + return (&hypctx->regs.x[26]); + case VM_REG_GUEST_X27: + return (&hypctx->regs.x[27]); + case VM_REG_GUEST_X28: + return (&hypctx->regs.x[28]); + case VM_REG_GUEST_X29: + return (&hypctx->regs.x[29]); + case VM_REG_GUEST_LR: + return (&hypctx->regs.lr); + case VM_REG_GUEST_SP: + return (&hypctx->regs.sp); + case VM_REG_GUEST_ELR: + return (&hypctx->regs.elr); + case VM_REG_GUEST_SPSR: + return (&hypctx->regs.spsr); + case VM_REG_ELR_EL2: + return (&hypctx->elr_el2); + default: + break; + } + return (NULL); +} + +static int +arm_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + void *regp; + int running, hostcpu; + struct hyp *hyp = arg; + + running = vcpu_is_running(hyp->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("arm_getreg: %s%d is running", vm_name(hyp->vm), vcpu); + + if ((regp = hypctx_regptr(&hyp->ctx[vcpu], reg)) != NULL) { + if (reg == VM_REG_GUEST_SPSR) + *retval = *(uint32_t *)regp; + else + *retval = *(uint64_t *)regp; + return (0); + } else { + return (EINVAL); + } +} + +static int +arm_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + void *regp; + struct hyp *hyp = arg; + int running, hostcpu; + + running = vcpu_is_running(hyp->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("hyp_setreg: %s%d is running", vm_name(hyp->vm), vcpu); + + if ((regp = hypctx_regptr(&hyp->ctx[vcpu], reg)) != NULL) { + if (reg == VM_REG_GUEST_SPSR) + *(uint32_t *)regp = (uint32_t)val; + else + *(uint64_t *)regp = val; + return (0); + } else { + return (EINVAL); + } +} + +static +void arm_restore(void) +{ + ; +} + +struct vmm_ops vmm_ops_arm = { + arm_init, + arm_cleanup, + arm_restore, + arm_vminit, + arm_vmrun, + arm_vmcleanup, + hypmap_set, + hypmap_get, + arm_getreg, + arm_setreg, + NULL, /* vmi_get_cap_t */ + NULL /* vmi_set_cap_t */ +}; Index: sys/arm64/vmm/hyp.h 
=================================================================== --- /dev/null +++ sys/arm64/vmm/hyp.h @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_HYP_H_ +#define _VMM_HYP_H_ + +/* + * The translation tables for the hypervisor mode will hold mappings for kernel + * virtual addresses and an identity mapping (VA == PA) necessary when + * enabling/disabling the MMU. + * + * When in EL2 exception level the translation table base register is TTBR0_EL2 + * and the virtual addresses generated by the CPU must be at the bottom of the + * memory, with the first 16 bits all set to zero: + * + * 0x0000ffffffffffff End hyp address space + * 0x0000000000000000 Start of hyp address space + * + * To run code in hyp mode we need to convert kernel virtual addresses to + * addreses that fit into this address space. + * + * The kernel virtual address range is: + * + * 0xffff007fffffffff End of KVA + * 0xffff000000000000 Kernel base address & start of KVA + * + * (see /sys/arm64/include/vmparam.h). + * + * We could convert the kernel virtual addresses to valid EL2 addresses by + * setting the first 16 bits to zero and thus mapping the kernel addresses in + * the bottom half of the EL2 address space, but then they might clash with the + * identity mapping addresses. Instead we map the kernel addresses in the upper + * half of the EL2 address space. + * + * The hypervisor address space will look like this: + * + * 0x0000807fffffffff End of KVA mapping + * 0x0000800000000000 Start of KVA mapping + * + * 0x00007fffffffffff End of identity mapping + * 0x0000000000000000 Start of identity mapping + * + * With the scheme we have 47 bits at our disposable for the identity map and + * another 47 bits for the kernel virtual addresses. For a maximum physical + * memory size of 128TB we are guaranteed to not have any clashes between + * addresses. 
+ */ +#define HYP_VM_MIN_ADDRESS 0x0000000000000000 +#define HYP_VM_MAX_ADDRESS 0x0000ffffffffffff + +#define HYP_KVA_OFFSET 0x0000800000000000 +#define HYP_KVA_MASK 0x0000ffffffffffff + +/* + * When taking asynchronous exceptions, or interrupts, with the exception of the + * SError interrupt, the exception syndrome register is not updated with the + * exception code. We need to differentiate between the different exception + * types taken to EL2. + */ +#define EXCP_TYPE_EL1_SYNC 0 +#define EXCP_TYPE_EL1_IRQ 1 +#define EXCP_TYPE_EL1_FIQ 2 +#define EXCP_TYPE_EL1_ERROR 3 + +#define EXCP_TYPE_EL2_SYNC 4 +#define EXCP_TYPE_EL2_IRQ 5 +#define EXCP_TYPE_EL2_FIQ 6 +#define EXCP_TYPE_EL2_ERROR 7 + +#define EXCP_TYPE_MAINT_IRQ 8 + +#define HYP_GET_VECTOR_TABLE -1 + +#endif /* !_VMM_HYP_H_ */ Index: sys/arm64/vmm/hyp.S =================================================================== --- /dev/null +++ sys/arm64/vmm/hyp.S @@ -0,0 +1,387 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + + +#include +#include +#include +#include +#include + +#include "hyp_macros.h" +#include "hyp.h" +#include "hyp_assym.h" + + .text + + .globl hyp_code_start + .globl hyp_code_end + + .align 12 +hyp_code_start: + + +ENTRY(vmm_call_hyp) + hvc #0 + ret +END(vmm_call_hyp) + + +.macro vempty + .align 7 + 1: b 1b +.endm + +.macro vector name + .align 7 + b handle_\name +.endm + + .align 11 + .globl hyp_init_vectors +hyp_init_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* Error EL2t */ + + vempty /* Synchronous EL2h */ + vempty /* IRQ EL2h */ + vempty /* FIQ EL2h */ + vempty /* Error EL2h */ + + vector hyp_init /* Synchronous 64-bit EL1 */ + vempty /* IRQ 64-bit EL1 */ + vempty /* FIQ 64-bit EL1 */ + vempty /* Error 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* Error 32-bit EL1 */ + + +/* + * Initialize the hypervisor mode with a new exception vector table, translation + * table and stack. + * + * Expecting: + * x0 - the hypervisor exception vectors + * x1 - translation tables physical address + * x2 - stack top virtual address + * x3 - TCR_EL2 value + * x4 - SCTLR_EL2 value + * x5 - VTCR_EL2 value + */ +ENTRY(handle_hyp_init) + /* Install the new exception vectors */ + msr vbar_el2, x0 + /* Set the stack top address */ + mov sp, x2 + /* Use the host VTTBR_EL2 to tell the host and the guests apart */ + mov x9, #VTTBR_HOST + msr vttbr_el2, x9 + /* Load the base address for the translation tables */ + msr ttbr0_el2, x1 + /* Invalidate the TLB */ + tlbi alle2 + /* Use the same memory attributes as EL1 */ + mrs x9, mair_el1 + msr mair_el2, x9 + /* Configure address translation */ + msr tcr_el2, x3 + isb + /* Set the system control register for EL2 */ + msr sctlr_el2, x4 + /* Set the Stage 2 translation control register */ + msr vtcr_el2, x5 + /* Return success */ + mov x0, #0 + /* MMU is up and running */ + eret +END(handle_hyp_init) + + + .align 11 + .globl hyp_vectors +hyp_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* Error EL2t */ + + vector el2_el2h_sync /* Synchronous EL2h */ + vector el2_el2h_irq /* IRQ EL2h */ + vector el2_el2h_fiq /* FIQ EL2h */ + vector el2_el2h_error /* Error EL2h */ + + vector el2_el1_sync64 /* Synchronous 64-bit EL1 */ + vector el2_el1_irq64 /* IRQ 64-bit EL1 */ + vector el2_el1_fiq64 /* FIQ 64-bit EL1 */ + vector el2_el1_error64 /* Error 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* Error 32-bit EL1 */ + + +.macro do_world_switch_to_host + .align 7 + SAVE_GUEST_REGS() +#ifdef VFP + /* + * Saving the guest VFP registers needs to come after saving the rest of + * the registers because the process dirties the regular registers. + */ + SAVE_GUEST_VFP_REGS() + LOAD_HOST_VFP_REGS() +#endif + LOAD_HOST_REGS() + SAVE_EXIT_INFO() + + /* Restore host VTTBR */ + mov x9, #VTTBR_HOST + msr vttbr_el2, x9 +.endm + + +.macro handle_el2_excp type + .align 7 + /* Save registers before modifying so we can restore them */ + str x9, [sp, #-16]! 
+ + /* Test if the exception happened when the host was running */ + mrs x9, vttbr_el2 + cmp x9, #VTTBR_HOST + beq 1f + + /* We got the exception while the guest was running */ + ldr x9, [sp], #16 + do_world_switch_to_host + b 2f +1: + /* We got the exception while the host was running */ + ldr x9, [sp], #16 +2: + mov x0, \type + eret +.endm + + +ENTRY(handle_el2_el2h_sync) + handle_el2_excp #EXCP_TYPE_EL2_SYNC +END(handle_el2_el2h_sync) + +ENTRY(handle_el2_el2h_irq) + handle_el2_excp #EXCP_TYPE_EL2_IRQ +END(handle_el2_el2h_sync) + +ENTRY(handle_el2_el2h_fiq) + handle_el2_excp #EXCP_TYPE_EL2_FIQ +END(handle_el2_el2h_sync) + +ENTRY(handle_el2_el2h_error) + handle_el2_excp #EXCP_TYPE_EL2_ERROR +END(handle_el2_el2h_sync) + + +ENTRY(handle_el2_el1_sync64) + /* Save registers before modifying so we can restore them */ + str x9, [sp, #-16]! + + /* Check for host hypervisor call */ + mrs x9, vttbr_el2 + cmp x9, #VTTBR_HOST + beq 1f + + /* Restore register */ + ldr x9, [sp], #16 + + /* Guest exception taken to EL2 */ + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_SYNC + b exit + +1: + /* Restore register */ + ldr x9, [sp], #16 + + cmp x0, #HYP_GET_VECTOR_TABLE + beq 2f + b call_function +2: + /* Return the vector table base address */ + mrs x0, vbar_el2 +exit: + eret +END(handle_el2_el1_sync64) + + +/* + * Call a function in EL2 context + * + * Expecting: + * x0 - function virtual address + * x1-x7 - function parameters + */ +ENTRY(call_function) + /* Save the function address before shuffling parameters */ + mov x9, x0 + + /* Shuffle function parameters */ + mov x0, x1 + mov x1, x2 + mov x2, x3 + mov x3, x4 + mov x4, x5 + mov x5, x6 + mov x6, x7 + + /* Call function */ + br x9 +END(call_function) + + +/* + * We only trap IRQ, FIQ and SError exceptions when a guest is running. Do a + * world switch to host to handle these exceptions. + */ + + +ENTRY(handle_el2_el1_irq64) + do_world_switch_to_host + str x9, [sp, #-16]! + mrs x9, ich_misr_el2 + cmp x9, xzr + beq 1f + mov x0, #EXCP_TYPE_MAINT_IRQ + b 2f +1: + mov x0, #EXCP_TYPE_EL1_IRQ +2: + ldr x9, [sp], #16 + eret +END(handle_el2_el1_irq) + +ENTRY(handle_el2_el1_fiq64) + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_FIQ + eret +END(handle_el2_el1_fiq64) + +ENTRY(handle_el2_el1_error64) + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_ERROR + eret +END(handle_el2_el1_error64) + + +/* + * Usage: + * void vmm_enter_guest(struct hypctx *hypctx) + * + * Expecting: + * x0 - hypctx address + */ +ENTRY(vmm_enter_guest) + /* Save hypctx address */ + msr tpidr_el2, x0 + + SAVE_HOST_REGS() +#ifdef VFP + SAVE_HOST_VFP_REGS() + /* + * Loading the guest VFP registers needs to come before loading the + * rest of the registers because this process dirties the regular + * registers. + */ + LOAD_GUEST_VFP_REGS() +#endif + LOAD_GUEST_REGS() + + /* Enter guest */ + eret +END(vmm_enter_guest) + + +/* + * Usage: + * void vmm_cleanup(void *hyp_stub_vectors) + * + * Expecting: + * x0 - physical address of hyp_stub_vectors + */ +ENTRY(vmm_cleanup) + /* Restore the stub vectors */ + msr vbar_el2, x0 + + /* Disable the MMU */ + dsb sy + mrs x2, sctlr_el2 + bic x2, x2, #SCTLR_EL2_M + msr sctlr_el2, x2 + + eret +END(vmm_cleanup) + +.macro read_reg name + mrs x0, \name +.endm + +/* + * Return the value of the ICH_VTR_EL2 register. + */ +ENTRY(vmm_read_ich_vtr_el2) + read_reg ich_vtr_el2 + eret +END(vmm_read_ich_vtr_el2) + +/* + * Return the value of the CNTHCTL_EL2 register. 
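Because the EL1 IRQ, FIQ and SError vectors cannot rely on ESR_EL2 to identify the exception, every handler above loads one of the EXCP_TYPE_* codes into x0 before returning to the host. A self-contained sketch of how a caller might interpret that return value is given below; the descriptions restate what the vectors above do, and the helper name is illustrative.

#include <stdint.h>

/* Exception type codes returned in x0 by the EL2 vectors (see hyp.h). */
#define EXCP_TYPE_EL1_SYNC	0
#define EXCP_TYPE_EL1_IRQ	1
#define EXCP_TYPE_EL1_FIQ	2
#define EXCP_TYPE_EL1_ERROR	3
#define EXCP_TYPE_MAINT_IRQ	8

static const char *
excp_type_name(uint64_t excp_type)
{
	switch (excp_type) {
	case EXCP_TYPE_EL1_SYNC:
		return ("synchronous exception taken from the guest");
	case EXCP_TYPE_EL1_IRQ:
		return ("host IRQ asserted while the guest was running");
	case EXCP_TYPE_EL1_FIQ:
		return ("host FIQ asserted while the guest was running");
	case EXCP_TYPE_EL1_ERROR:
		return ("SError taken while the guest was running");
	case EXCP_TYPE_MAINT_IRQ:
		return ("GIC maintenance interrupt during a guest exit");
	default:
		return ("unexpected exception type");
	}
}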
+ */ +ENTRY(vmm_read_cnthctl_el2) + read_reg cnthctl_el2 + eret +END(vmm_read_cnthctl_el2) + +/* + * Return the value of the TCR_EL2 register. + */ +ENTRY(vmm_read_tcr_el2) + read_reg tcr_el2 + eret +END(vmm_read_tcr_el2) + + + +hyp_code_end: Index: sys/arm64/vmm/hyp_genassym.c =================================================================== --- /dev/null +++ sys/arm64/vmm/hyp_genassym.c @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
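hyp_genassym.c below is never linked into the kernel; it exists so that the build can turn the ASSYM() entries into assembler constants (hyp_assym.h) that hyp.S uses to index struct hypctx. The HYPCTX_REGS_Xn values are simply the offset of the register array plus n * 8, which is only valid if the general-purpose registers are laid out as contiguous 64-bit slots. A small stand-alone sketch of that invariant, using a stand-in structure rather than the real struct hypctx from arm64.h:

#include <stddef.h>
#include <stdint.h>

/* Stand-in for the register block inside struct hypctx (illustrative). */
struct guest_regs {
	uint64_t x[30];		/* x0-x29 stored back to back */
	uint64_t lr;
	uint64_t sp;
	uint64_t elr;
	uint64_t spsr;
};

/* The "offset of the array plus n * 8" pattern relies on this layout. */
_Static_assert(offsetof(struct guest_regs, x[9]) ==
    offsetof(struct guest_regs, x[0]) + 9 * 8,
    "general-purpose registers must be contiguous 8-byte slots");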
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arm64.h" + +ASSYM(HYPCTX_REGS_X0, offsetof(struct hypctx, regs) + 0 * 8); +ASSYM(HYPCTX_REGS_X1, offsetof(struct hypctx, regs) + 1 * 8); +ASSYM(HYPCTX_REGS_X2, offsetof(struct hypctx, regs) + 2 * 8); +ASSYM(HYPCTX_REGS_X3, offsetof(struct hypctx, regs) + 3 * 8); +ASSYM(HYPCTX_REGS_X4, offsetof(struct hypctx, regs) + 4 * 8); +ASSYM(HYPCTX_REGS_X5, offsetof(struct hypctx, regs) + 5 * 8); +ASSYM(HYPCTX_REGS_X6, offsetof(struct hypctx, regs) + 6 * 8); +ASSYM(HYPCTX_REGS_X7, offsetof(struct hypctx, regs) + 7 * 8); +ASSYM(HYPCTX_REGS_X8, offsetof(struct hypctx, regs) + 8 * 8); +ASSYM(HYPCTX_REGS_X9, offsetof(struct hypctx, regs) + 9 * 8); +ASSYM(HYPCTX_REGS_X10, offsetof(struct hypctx, regs) + 10 * 8); +ASSYM(HYPCTX_REGS_X11, offsetof(struct hypctx, regs) + 11 * 8); +ASSYM(HYPCTX_REGS_X12, offsetof(struct hypctx, regs) + 12 * 8); +ASSYM(HYPCTX_REGS_X13, offsetof(struct hypctx, regs) + 13 * 8); +ASSYM(HYPCTX_REGS_X14, offsetof(struct hypctx, regs) + 14 * 8); +ASSYM(HYPCTX_REGS_X15, offsetof(struct hypctx, regs) + 15 * 8); +ASSYM(HYPCTX_REGS_X16, offsetof(struct hypctx, regs) + 16 * 8); +ASSYM(HYPCTX_REGS_X17, offsetof(struct hypctx, regs) + 17 * 8); +ASSYM(HYPCTX_REGS_X18, offsetof(struct hypctx, regs) + 18 * 8); +ASSYM(HYPCTX_REGS_X19, offsetof(struct hypctx, regs) + 19 * 8); +ASSYM(HYPCTX_REGS_X20, offsetof(struct hypctx, regs) + 20 * 8); +ASSYM(HYPCTX_REGS_X21, offsetof(struct hypctx, regs) + 21 * 8); +ASSYM(HYPCTX_REGS_X22, offsetof(struct hypctx, regs) + 22 * 8); +ASSYM(HYPCTX_REGS_X23, offsetof(struct hypctx, regs) + 23 * 8); +ASSYM(HYPCTX_REGS_X24, offsetof(struct hypctx, regs) + 24 * 8); +ASSYM(HYPCTX_REGS_X25, offsetof(struct hypctx, regs) + 25 * 8); +ASSYM(HYPCTX_REGS_X26, offsetof(struct hypctx, regs) + 26 * 8); +ASSYM(HYPCTX_REGS_X27, offsetof(struct hypctx, regs) + 27 * 8); +ASSYM(HYPCTX_REGS_X28, offsetof(struct hypctx, regs) + 28 * 8); +ASSYM(HYPCTX_REGS_X29, offsetof(struct hypctx, regs) + 29 * 8); +ASSYM(HYPCTX_REGS_LR, offsetof(struct hypctx, regs.lr)); +ASSYM(HYPCTX_REGS_SP, offsetof(struct hypctx, regs.sp)); +ASSYM(HYPCTX_REGS_ELR, offsetof(struct hypctx, regs.elr)); +ASSYM(HYPCTX_REGS_SPSR, offsetof(struct hypctx, regs.spsr)); + +ASSYM(HYPCTX_ACTLR_EL1, offsetof(struct hypctx, actlr_el1)); +ASSYM(HYPCTX_AMAIR_EL1, offsetof(struct hypctx, amair_el1)); +ASSYM(HYPCTX_ELR_EL1, offsetof(struct hypctx, elr_el1)); +ASSYM(HYPCTX_FAR_EL1, offsetof(struct hypctx, far_el1)); +ASSYM(HYPCTX_FP, offsetof(struct hypctx, fp)); +ASSYM(HYPCTX_MAIR_EL1, offsetof(struct hypctx, mair_el1)); +ASSYM(HYPCTX_PAR_EL1, offsetof(struct hypctx, par_el1)); +ASSYM(HYPCTX_SP_EL0, offsetof(struct hypctx, sp_el0)); +ASSYM(HYPCTX_TCR_EL1, offsetof(struct hypctx, tcr_el1)); +ASSYM(HYPCTX_TPIDR_EL0, offsetof(struct hypctx, tpidr_el0)); +ASSYM(HYPCTX_TPIDRRO_EL0, offsetof(struct hypctx, tpidrro_el0)); +ASSYM(HYPCTX_TPIDR_EL1, offsetof(struct hypctx, tpidr_el1)); +ASSYM(HYPCTX_TTBR0_EL1, offsetof(struct hypctx, ttbr0_el1)); +ASSYM(HYPCTX_TTBR1_EL1, offsetof(struct hypctx, ttbr1_el1)); +ASSYM(HYPCTX_VBAR_EL1, offsetof(struct hypctx, vbar_el1)); +ASSYM(HYPCTX_AFSR0_EL1, offsetof(struct hypctx, afsr0_el1)); +ASSYM(HYPCTX_AFSR1_EL1, offsetof(struct hypctx, afsr1_el1)); +ASSYM(HYPCTX_CONTEXTIDR_EL1, offsetof(struct hypctx, contextidr_el1)); +ASSYM(HYPCTX_CPACR_EL1, offsetof(struct hypctx, cpacr_el1)); +ASSYM(HYPCTX_ESR_EL1, offsetof(struct hypctx, esr_el1)); +ASSYM(HYPCTX_SCTLR_EL1, offsetof(struct hypctx, sctlr_el1)); 
+ASSYM(HYPCTX_SPSR_EL1, offsetof(struct hypctx, spsr_el1)); + +ASSYM(HYPCTX_ELR_EL2, offsetof(struct hypctx, elr_el2)); +ASSYM(HYPCTX_HCR_EL2, offsetof(struct hypctx, hcr_el2)); +ASSYM(HYPCTX_VPIDR_EL2, offsetof(struct hypctx, vpidr_el2)); +ASSYM(HYPCTX_VMPIDR_EL2, offsetof(struct hypctx, vmpidr_el2)); +ASSYM(HYPCTX_CPTR_EL2, offsetof(struct hypctx, cptr_el2)); +ASSYM(HYPCTX_SPSR_EL2, offsetof(struct hypctx, spsr_el2)); + +ASSYM(HYPCTX_HYP, offsetof(struct hypctx, hyp)); + +ASSYM(HYP_VTTBR_EL2, offsetof(struct hyp, vttbr_el2)); +ASSYM(HYP_VTIMER_CNTHCTL_EL2, offsetof(struct hyp, vtimer.cnthctl_el2)); +ASSYM(HYP_VTIMER_CNTVOFF_EL2, offsetof(struct hyp, vtimer.cntvoff_el2)); + +ASSYM(HYPCTX_EXIT_INFO_ESR_EL2, offsetof(struct hypctx, exit_info.esr_el2)); +ASSYM(HYPCTX_EXIT_INFO_FAR_EL2, offsetof(struct hypctx, exit_info.far_el2)); +ASSYM(HYPCTX_EXIT_INFO_HPFAR_EL2, offsetof(struct hypctx, exit_info.hpfar_el2)); + +ASSYM(HYPCTX_VGIC_ICH_LR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_lr_el2)); +ASSYM(HYPCTX_VGIC_ICH_LR_NUM, offsetof(struct hypctx, vgic_cpu_if.ich_lr_num)); +ASSYM(HYPCTX_VGIC_ICH_AP0R_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_ap0r_el2)); +ASSYM(HYPCTX_VGIC_ICH_AP0R_NUM, offsetof(struct hypctx, vgic_cpu_if.ich_ap0r_num)); +ASSYM(HYPCTX_VGIC_ICH_AP1R_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_ap1r_el2)); +ASSYM(HYPCTX_VGIC_ICH_AP1R_NUM, offsetof(struct hypctx, vgic_cpu_if.ich_ap1r_num)); +ASSYM(HYPCTX_VGIC_ICH_EISR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_eisr_el2)); +ASSYM(HYPCTX_VGIC_ICH_ELRSR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_elrsr_el2)); +ASSYM(HYPCTX_VGIC_ICH_HCR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_hcr_el2)); +ASSYM(HYPCTX_VGIC_ICH_MISR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_misr_el2)); +ASSYM(HYPCTX_VGIC_ICH_VMCR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_vmcr_el2)); +ASSYM(HYPCTX_VGIC_ICH_LR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_lr_el2)); + +ASSYM(HYPCTX_VTIMER_CPU_CNTKCTL_EL1, offsetof(struct hypctx, vtimer_cpu.cntkctl_el1)); +ASSYM(HYPCTX_VTIMER_CPU_CNTV_CVAL_EL0, offsetof(struct hypctx, vtimer_cpu.cntv_cval_el0)); +ASSYM(HYPCTX_VTIMER_CPU_CNTV_CTL_EL0, offsetof(struct hypctx, vtimer_cpu.cntv_ctl_el0)); + +#ifdef VFP +ASSYM(HYPCTX_VFPSTATE_Q0, offsetof(struct hypctx, vfpstate.vfp_regs) + 0 * 16); +ASSYM(HYPCTX_VFPSTATE_Q1, offsetof(struct hypctx, vfpstate.vfp_regs) + 1 * 16); +ASSYM(HYPCTX_VFPSTATE_Q2, offsetof(struct hypctx, vfpstate.vfp_regs) + 2 * 16); +ASSYM(HYPCTX_VFPSTATE_Q3, offsetof(struct hypctx, vfpstate.vfp_regs) + 3 * 16); +ASSYM(HYPCTX_VFPSTATE_Q4, offsetof(struct hypctx, vfpstate.vfp_regs) + 4 * 16); +ASSYM(HYPCTX_VFPSTATE_Q5, offsetof(struct hypctx, vfpstate.vfp_regs) + 5 * 16); +ASSYM(HYPCTX_VFPSTATE_Q6, offsetof(struct hypctx, vfpstate.vfp_regs) + 6 * 16); +ASSYM(HYPCTX_VFPSTATE_Q7, offsetof(struct hypctx, vfpstate.vfp_regs) + 7 * 16); +ASSYM(HYPCTX_VFPSTATE_Q8, offsetof(struct hypctx, vfpstate.vfp_regs) + 8 * 16); +ASSYM(HYPCTX_VFPSTATE_Q9, offsetof(struct hypctx, vfpstate.vfp_regs) + 9 * 16); +ASSYM(HYPCTX_VFPSTATE_Q10, offsetof(struct hypctx, vfpstate.vfp_regs) + 10 * 16); +ASSYM(HYPCTX_VFPSTATE_Q11, offsetof(struct hypctx, vfpstate.vfp_regs) + 11 * 16); +ASSYM(HYPCTX_VFPSTATE_Q12, offsetof(struct hypctx, vfpstate.vfp_regs) + 12 * 16); +ASSYM(HYPCTX_VFPSTATE_Q13, offsetof(struct hypctx, vfpstate.vfp_regs) + 13 * 16); +ASSYM(HYPCTX_VFPSTATE_Q14, offsetof(struct hypctx, vfpstate.vfp_regs) + 14 * 16); +ASSYM(HYPCTX_VFPSTATE_Q15, offsetof(struct hypctx, vfpstate.vfp_regs) + 15 * 16); +ASSYM(HYPCTX_VFPSTATE_Q16, 
offsetof(struct hypctx, vfpstate.vfp_regs) + 16 * 16); +ASSYM(HYPCTX_VFPSTATE_Q17, offsetof(struct hypctx, vfpstate.vfp_regs) + 17 * 16); +ASSYM(HYPCTX_VFPSTATE_Q18, offsetof(struct hypctx, vfpstate.vfp_regs) + 18 * 16); +ASSYM(HYPCTX_VFPSTATE_Q19, offsetof(struct hypctx, vfpstate.vfp_regs) + 19 * 16); +ASSYM(HYPCTX_VFPSTATE_Q20, offsetof(struct hypctx, vfpstate.vfp_regs) + 20 * 16); +ASSYM(HYPCTX_VFPSTATE_Q21, offsetof(struct hypctx, vfpstate.vfp_regs) + 21 * 16); +ASSYM(HYPCTX_VFPSTATE_Q22, offsetof(struct hypctx, vfpstate.vfp_regs) + 22 * 16); +ASSYM(HYPCTX_VFPSTATE_Q23, offsetof(struct hypctx, vfpstate.vfp_regs) + 23 * 16); +ASSYM(HYPCTX_VFPSTATE_Q24, offsetof(struct hypctx, vfpstate.vfp_regs) + 24 * 16); +ASSYM(HYPCTX_VFPSTATE_Q25, offsetof(struct hypctx, vfpstate.vfp_regs) + 25 * 16); +ASSYM(HYPCTX_VFPSTATE_Q26, offsetof(struct hypctx, vfpstate.vfp_regs) + 26 * 16); +ASSYM(HYPCTX_VFPSTATE_Q27, offsetof(struct hypctx, vfpstate.vfp_regs) + 27 * 16); +ASSYM(HYPCTX_VFPSTATE_Q28, offsetof(struct hypctx, vfpstate.vfp_regs) + 28 * 16); +ASSYM(HYPCTX_VFPSTATE_Q29, offsetof(struct hypctx, vfpstate.vfp_regs) + 29 * 16); +ASSYM(HYPCTX_VFPSTATE_Q30, offsetof(struct hypctx, vfpstate.vfp_regs) + 30 * 16); +ASSYM(HYPCTX_VFPSTATE_Q31, offsetof(struct hypctx, vfpstate.vfp_regs) + 31 * 16); + + +ASSYM(HYPCTX_VFPSTATE_FPCR, offsetof(struct hypctx, vfpstate.vfp_fpcr)); +ASSYM(HYPCTX_VFPSTATE_FPSR, offsetof(struct hypctx, vfpstate.vfp_fpsr)); +#endif Index: sys/arm64/vmm/hyp_macros.h =================================================================== --- /dev/null +++ sys/arm64/vmm/hyp_macros.h @@ -0,0 +1,690 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_HYP_MACROS_H_ +#define _VMM_HYP_MACROS_H_ + + +#define PUSH_SYS_REG_PAIR(reg0, reg1) \ + mrs x1, reg0; \ + mrs x2, reg1; \ + stp x2, x1, [sp, #-16]!; + + +#define PUSH_SYS_REG(reg) \ + mrs x1, reg; \ + str x1, [sp, #-16]!; + + +/* + * Push all the host registers before entering the guest. 
+ */ +#define SAVE_HOST_REGS() \ + /* Save the regular registers */ \ + stp x0, x1, [sp, #-16]!; \ + stp x2, x3, [sp, #-16]!; \ + stp x4, x5, [sp, #-16]!; \ + stp x6, x7, [sp, #-16]!; \ + stp x8, x9, [sp, #-16]!; \ + stp x10, x11, [sp, #-16]!; \ + stp x12, x13, [sp, #-16]!; \ + stp x14, x15, [sp, #-16]!; \ + stp x16, x17, [sp, #-16]!; \ + stp x18, x19, [sp, #-16]!; \ + stp x20, x21, [sp, #-16]!; \ + stp x22, x23, [sp, #-16]!; \ + stp x24, x25, [sp, #-16]!; \ + stp x26, x27, [sp, #-16]!; \ + stp x28, x29, [sp, #-16]!; \ + stp lr, fp, [sp, #-16]!; \ + \ + /* Push the system registers */ \ + PUSH_SYS_REG_PAIR(SP_EL0, SP_EL1); \ + PUSH_SYS_REG_PAIR(ACTLR_EL1, AMAIR_EL1); \ + PUSH_SYS_REG_PAIR(ELR_EL1, PAR_EL1); \ + PUSH_SYS_REG_PAIR(MAIR_EL1, TCR_EL1); \ + PUSH_SYS_REG_PAIR(TPIDR_EL0, TPIDRRO_EL0); \ + PUSH_SYS_REG_PAIR(TPIDR_EL1, TTBR0_EL1); \ + PUSH_SYS_REG_PAIR(TTBR1_EL1, VBAR_EL1); \ + PUSH_SYS_REG_PAIR(AFSR0_EL1, AFSR1_EL1); \ + PUSH_SYS_REG_PAIR(CONTEXTIDR_EL1, CPACR_EL1); \ + PUSH_SYS_REG_PAIR(ESR_EL1, FAR_EL1); \ + PUSH_SYS_REG_PAIR(SCTLR_EL1, SPSR_EL1); \ + PUSH_SYS_REG_PAIR(ELR_EL2, HCR_EL2); \ + PUSH_SYS_REG_PAIR(VPIDR_EL2, VMPIDR_EL2); \ + PUSH_SYS_REG_PAIR(CPTR_EL2, SPSR_EL2); \ + PUSH_SYS_REG_PAIR(ICH_HCR_EL2, ICH_VMCR_EL2); \ + PUSH_SYS_REG_PAIR(CNTHCTL_EL2, CNTKCTL_EL1); \ + PUSH_SYS_REG(CNTVOFF_EL2); + + +#define SAVE_HOST_VFP_REGS() \ + stp q0, q1, [sp, #-16 * 2]!; \ + stp q2, q3, [sp, #-16 * 2]!; \ + stp q4, q5, [sp, #-16 * 2]!; \ + stp q6, q7, [sp, #-16 * 2]!; \ + stp q8, q9, [sp, #-16 * 2]!; \ + stp q10, q11, [sp, #-16 * 2]!; \ + stp q12, q13, [sp, #-16 * 2]!; \ + stp q14, q15, [sp, #-16 * 2]!; \ + stp q16, q17, [sp, #-16 * 2]!; \ + stp q18, q19, [sp, #-16 * 2]!; \ + stp q20, q21, [sp, #-16 * 2]!; \ + stp q22, q23, [sp, #-16 * 2]!; \ + stp q24, q25, [sp, #-16 * 2]!; \ + stp q26, q27, [sp, #-16 * 2]!; \ + stp q28, q29, [sp, #-16 * 2]!; \ + stp q30, q31, [sp, #-16 * 2]!; \ + PUSH_SYS_REG_PAIR(FPCR, FPSR); + + +#define POP_SYS_REG_PAIR(reg0, reg1) \ + ldp x2, x1, [sp], #16; \ + msr reg1, x2; \ + msr reg0, x1; + + +#define LOAD_HOST_VFP_REGS() \ + POP_SYS_REG_PAIR(FPCR, FPSR); \ + ldp q30, q31, [sp], #16 * 2; \ + ldp q28, q29, [sp], #16 * 2; \ + ldp q26, q27, [sp], #16 * 2; \ + ldp q24, q25, [sp], #16 * 2; \ + ldp q22, q23, [sp], #16 * 2; \ + ldp q20, q21, [sp], #16 * 2; \ + ldp q18, q19, [sp], #16 * 2; \ + ldp q16, q17, [sp], #16 * 2; \ + ldp q14, q15, [sp], #16 * 2; \ + ldp q12, q13, [sp], #16 * 2; \ + ldp q10, q11, [sp], #16 * 2; \ + ldp q8, q9, [sp], #16 * 2; \ + ldp q6, q7, [sp], #16 * 2; \ + ldp q4, q5, [sp], #16 * 2; \ + ldp q2, q3, [sp], #16 * 2; \ + ldp q0, q1, [sp], #16 * 2; \ + + +#define POP_SYS_REG(reg) \ + ldr x1, [sp], #16; \ + msr reg, x1; + + +/* + * Restore all the host registers before entering the host. 
+ */ +#define LOAD_HOST_REGS() \ + /* Pop the system registers first */ \ + POP_SYS_REG(CNTVOFF_EL2); \ + POP_SYS_REG_PAIR(CNTHCTL_EL2, CNTKCTL_EL1); \ + POP_SYS_REG_PAIR(ICH_HCR_EL2, ICH_VMCR_EL2); \ + POP_SYS_REG_PAIR(CPTR_EL2, SPSR_EL2); \ + POP_SYS_REG_PAIR(VPIDR_EL2, VMPIDR_EL2); \ + POP_SYS_REG_PAIR(ELR_EL2, HCR_EL2); \ + POP_SYS_REG_PAIR(SCTLR_EL1, SPSR_EL1); \ + POP_SYS_REG_PAIR(ESR_EL1, FAR_EL1); \ + POP_SYS_REG_PAIR(CONTEXTIDR_EL1, CPACR_EL1); \ + POP_SYS_REG_PAIR(AFSR0_EL1, AFSR1_EL1); \ + POP_SYS_REG_PAIR(TTBR1_EL1, VBAR_EL1); \ + POP_SYS_REG_PAIR(TPIDR_EL1, TTBR0_EL1); \ + POP_SYS_REG_PAIR(TPIDR_EL0, TPIDRRO_EL0); \ + POP_SYS_REG_PAIR(MAIR_EL1, TCR_EL1); \ + POP_SYS_REG_PAIR(ELR_EL1, PAR_EL1); \ + POP_SYS_REG_PAIR(ACTLR_EL1, AMAIR_EL1); \ + POP_SYS_REG_PAIR(SP_EL0, SP_EL1); \ + \ + /* Pop the regular registers */ \ + ldp lr, fp, [sp], #16; \ + ldp x28, x29, [sp], #16; \ + ldp x26, x27, [sp], #16; \ + ldp x24, x25, [sp], #16; \ + ldp x22, x23, [sp], #16; \ + ldp x20, x21, [sp], #16; \ + ldp x18, x19, [sp], #16; \ + ldp x16, x17, [sp], #16; \ + ldp x14, x15, [sp], #16; \ + ldp x12, x13, [sp], #16; \ + ldp x10, x11, [sp], #16; \ + ldp x8, x9, [sp], #16; \ + ldp x6, x7, [sp], #16; \ + ldp x4, x5, [sp], #16; \ + ldp x2, x3, [sp], #16; \ + ldp x0, x1, [sp], #16; \ + + +#define SAVE_ARRAY_REG64(reg, dest, remaining) \ + cmp remaining, #0; \ + beq 9f; \ + mrs x7, reg; \ + str x7, [dest]; \ + add dest, dest, #8; \ + sub remaining, remaining, #1; + + +#define SAVE_LR_REGS() \ + /* Load the number of ICH_LR_EL2 regs from memory */ \ + mov x2, #HYPCTX_VGIC_ICH_LR_NUM; \ + ldr x3, [x0, x2]; \ + /* x1 holds the destination address */ \ + mov x1, #HYPCTX_VGIC_ICH_LR_EL2; \ + add x1, x0, x1; \ + SAVE_ARRAY_REG64(ich_lr0_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr1_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr2_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr3_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr4_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr5_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr6_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr7_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr8_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr9_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr10_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr11_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr12_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr13_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr14_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr15_el2, x1, x3); \ +9:; \ + ; + + +#define SAVE_ARRAY_REG32(reg, dest, remaining) \ + cmp remaining, #0; \ + beq 9f; \ + mrs x7, reg; \ + str w7, [dest]; \ + add dest, dest, #4; \ + sub remaining, remaining, #1; + + +#define SAVE_AP0R_REGS() \ + /* Load the number of ICH_AP0R_EL2 regs from memory */ \ + mov x2, #HYPCTX_VGIC_ICH_AP0R_NUM; \ + ldr x3, [x0, x2]; \ + /* x1 holds the destination address */ \ + mov x1, #HYPCTX_VGIC_ICH_AP0R_EL2; \ + add x1, x0, x1; \ + SAVE_ARRAY_REG32(ich_ap0r0_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap0r1_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap0r2_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap0r3_el2, x1, x3); \ +9:; \ + ; + + +#define SAVE_AP1R_REGS() \ + /* Load the number of ICH_AP1R_EL2 regs from memory */ \ + mov x2, #HYPCTX_VGIC_ICH_AP1R_NUM; \ + ldr x3, [x0, x2]; \ + /* x1 holds the destination address */ \ + mov x1, #HYPCTX_VGIC_ICH_AP1R_EL2; \ + add x1, x0, x1; \ + SAVE_ARRAY_REG32(ich_ap1r0_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap1r1_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap1r2_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap1r3_el2, x1, x3); \ +9:; \ + ; + + +/* + * The STR and LDR instructions take an offset 
between [-256, 255], but the + * hypctx register offset can be larger than that. To get around this limitation + * we use a temporary register to hold the offset. + */ +#define SAVE_SYS_REG64(prefix, reg) \ + mrs x1, reg; \ + mov x2, prefix ##_ ##reg; \ + str x1, [x0, x2]; + + +#define SAVE_SYS_REG32(prefix, reg) \ + mrs x1, reg; \ + mov x2, prefix ##_ ##reg; \ + str w1, [x0, x2]; + + +#define SAVE_REG(prefix, reg) \ + mov x1, prefix ##_ ##reg; \ + str reg, [x0, x1]; + +/* + * The STP and LDP instructions takes an immediate in the range of [-512, 504] + * when using the post-indexed addressing mode, but the hypctx register offset + * can be larger than that. To get around this limitation we compute the address + * by adding the hypctx base address with the struct member offset. + * + * Using STP/LDP to save/load register pairs to the corresponding struct hypctx + * variables works because the registers are declared as an array and they are + * stored in contiguous memory addresses. + */ + +#define SAVE_REG_PAIR(prefix, reg0, reg1) \ + mov x1, prefix ##_ ##reg0; \ + add x1, x0, x1; \ + stp reg0, reg1, [x1]; + + +/* + * We use x0 to load the hypctx address from TPIDR_EL2 and x1 and x2 as + * temporary registers to compute the hypctx member addresses. To save the guest + * values at first we push them on the stack, use these temporary registers to + * save the rest of the registers and at the end we pop the values from the + * stack and save them. + */ +#define SAVE_GUEST_X_REGS() \ + /* Push x0 */ \ + str x0, [sp, #-16]!; \ + /* Restore hypctx address */ \ + mrs x0, tpidr_el2; \ + /* Push x1 and x2 */ \ + stp x1, x2, [sp, #-16]!; \ + \ + /* Save the other registers */ \ + SAVE_REG_PAIR(HYPCTX_REGS, X3, X4); \ + SAVE_REG_PAIR(HYPCTX_REGS, X5, X6); \ + SAVE_REG_PAIR(HYPCTX_REGS, X7, X8); \ + SAVE_REG_PAIR(HYPCTX_REGS, X9, X10); \ + SAVE_REG_PAIR(HYPCTX_REGS, X11, X12); \ + SAVE_REG_PAIR(HYPCTX_REGS, X13, X14); \ + SAVE_REG_PAIR(HYPCTX_REGS, X15, X16); \ + SAVE_REG_PAIR(HYPCTX_REGS, X17, X18); \ + SAVE_REG_PAIR(HYPCTX_REGS, X19, X20); \ + SAVE_REG_PAIR(HYPCTX_REGS, X21, X22); \ + SAVE_REG_PAIR(HYPCTX_REGS, X23, X24); \ + SAVE_REG_PAIR(HYPCTX_REGS, X25, X26); \ + SAVE_REG_PAIR(HYPCTX_REGS, X27, X28); \ + SAVE_REG(HYPCTX_REGS, X29); \ + SAVE_REG(HYPCTX_REGS, LR); \ + \ + /* Pop and save x1 and x2 */ \ + ldp x1, x2, [sp], #16; \ + mov x3, #HYPCTX_REGS_X1; \ + add x3, x0, x3; \ + stp x1, x2, [x3]; \ + /* Pop and save x0 */ \ + ldr x1, [sp], #16; \ + mov x2, #HYPCTX_REGS_X0; \ + add x2, x2, x0; \ + str x1, [x2]; + + +/* + * Save all the guest registers. Start by saving the regular registers first + * because those will be used as temporary registers for accessing the hypctx + * member addresses. + * + * Expecting: + * TPIDR_EL2 - struct hypctx address + * + * After call: + * x0 - struct hypctx address + */ +#define SAVE_GUEST_REGS() \ + SAVE_GUEST_X_REGS(); \ + \ + SAVE_REG(HYPCTX, FP); \ + \ + SAVE_SYS_REG32(HYPCTX_VTIMER_CPU, CNTKCTL_EL1); \ + SAVE_SYS_REG64(HYPCTX_VTIMER_CPU, CNTV_CVAL_EL0); \ + SAVE_SYS_REG32(HYPCTX_VTIMER_CPU, CNTV_CTL_EL0);\ + \ + /* \ + * ICH_EISR_EL2, ICH_ELRSR_EL2 and ICH_MISR_EL2 are read-only and are \ + * saved because they are modified by the hardware as part of the \ + * interrupt virtualization process and we need to inspect them in \ + * the VGIC driver. 
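ICH_EISR_EL2 and ICH_ELRSR_EL2 are status registers: a set bit in EISR marks a list register whose interrupt the guest has EOI'd, and a set bit in ELRSR marks a list register that is empty and reusable. The copies saved here are what the VGIC code can inspect after the world switch. A hedged sketch of such a consumer is shown below; it assumes the struct vgic_v3_cpu_if fields and the ICH_LR_EL2_STATE_MASK constant used elsewhere in this patch, and the function itself is illustrative, not part of the change.

/*
 * Illustrative only: clear every list register the guest has EOI'd and
 * count how many list registers are free for the next batch of buffered
 * interrupts.
 */
static int
vgic_v3_fold_lr_state(struct vgic_v3_cpu_if *cpu_if)
{
	int i, lr_free;

	lr_free = 0;
	for (i = 0; i < cpu_if->ich_lr_num; i++) {
		if (cpu_if->ich_eisr_el2 & (1U << i))
			cpu_if->ich_lr_el2[i] &= ~ICH_LR_EL2_STATE_MASK;
		if (cpu_if->ich_elrsr_el2 & (1U << i))
			lr_free++;
	}

	return (lr_free);
}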
\ + */ \ + SAVE_SYS_REG32(HYPCTX_VGIC, ICH_EISR_EL2); \ + SAVE_SYS_REG32(HYPCTX_VGIC, ICH_ELRSR_EL2); \ + SAVE_SYS_REG32(HYPCTX_VGIC, ICH_MISR_EL2); \ + SAVE_SYS_REG32(HYPCTX_VGIC, ICH_HCR_EL2); \ + SAVE_SYS_REG32(HYPCTX_VGIC, ICH_VMCR_EL2); \ + \ + SAVE_LR_REGS(); \ + SAVE_AP0R_REGS(); \ + SAVE_AP1R_REGS(); \ + \ + /* Save the stack pointer. */ \ + mrs x1, sp_el1; \ + mov x2, #HYPCTX_REGS_SP; \ + str x1, [x0, x2]; \ + \ + SAVE_SYS_REG64(HYPCTX, ACTLR_EL1); \ + SAVE_SYS_REG64(HYPCTX, AFSR0_EL1); \ + SAVE_SYS_REG64(HYPCTX, AFSR1_EL1); \ + SAVE_SYS_REG64(HYPCTX, AMAIR_EL1); \ + SAVE_SYS_REG64(HYPCTX, CONTEXTIDR_EL1); \ + SAVE_SYS_REG64(HYPCTX, CPACR_EL1); \ + SAVE_SYS_REG64(HYPCTX, ELR_EL1); \ + SAVE_SYS_REG64(HYPCTX, ESR_EL1); \ + SAVE_SYS_REG64(HYPCTX, FAR_EL1); \ + SAVE_SYS_REG64(HYPCTX, MAIR_EL1); \ + SAVE_SYS_REG64(HYPCTX, PAR_EL1); \ + SAVE_SYS_REG64(HYPCTX, SCTLR_EL1); \ + SAVE_SYS_REG64(HYPCTX, SP_EL0); \ + SAVE_SYS_REG64(HYPCTX, TCR_EL1); \ + SAVE_SYS_REG64(HYPCTX, TPIDR_EL0); \ + SAVE_SYS_REG64(HYPCTX, TPIDRRO_EL0); \ + SAVE_SYS_REG64(HYPCTX, TPIDR_EL1); \ + SAVE_SYS_REG64(HYPCTX, TTBR0_EL1); \ + SAVE_SYS_REG64(HYPCTX, TTBR1_EL1); \ + SAVE_SYS_REG64(HYPCTX, VBAR_EL1); \ + \ + SAVE_SYS_REG32(HYPCTX, SPSR_EL1); \ + \ + SAVE_SYS_REG64(HYPCTX, CPTR_EL2); \ + SAVE_SYS_REG64(HYPCTX, ELR_EL2); \ + SAVE_SYS_REG64(HYPCTX, HCR_EL2); \ + SAVE_SYS_REG64(HYPCTX, VPIDR_EL2); \ + SAVE_SYS_REG64(HYPCTX, VMPIDR_EL2); \ + SAVE_SYS_REG32(HYPCTX, SPSR_EL2); + + +#define SAVE_GUEST_VFP_REGS() \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q0, Q1); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q2, Q3); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q4, Q5); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q6, Q7); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q8, Q9); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q10, Q11); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q12, Q13); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q14, Q15); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q16, Q17); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q18, Q19); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q20, Q21); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q22, Q23); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q24, Q25); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q26, Q27); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q28, Q29); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q30, Q31); \ + \ + SAVE_SYS_REG32(HYPCTX_VFPSTATE, FPCR); \ + SAVE_SYS_REG32(HYPCTX_VFPSTATE, FPSR); + + +/* See SAVE_SYS_REG */ +#define LOAD_SYS_REG64(prefix, reg) \ + mov x1, prefix ##_ ##reg; \ + ldr x2, [x0, x1]; \ + msr reg, x2; + + +#define LOAD_SYS_REG32(prefix, reg) \ + mov x1, prefix ##_ ##reg; \ + ldr w2, [x0, x1]; \ + msr reg, x2; + + +/* See SAVE_REG_PAIR */ +#define LOAD_REG_PAIR(prefix, reg0, reg1) \ + mov x1, prefix ##_ ##reg0; \ + add x1, x0, x1; \ + ldp reg0, reg1, [x1]; + + +#define LOAD_GUEST_VFP_REGS() \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q0, Q1); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q2, Q3); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q4, Q5); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q6, Q7); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q8, Q9); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q10, Q11); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q12, Q13); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q14, Q15); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q16, Q17); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q18, Q19); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q20, Q21); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q22, Q23); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q24, Q25); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q26, Q27); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q28, Q29); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q30, Q31); \ + \ + LOAD_SYS_REG32(HYPCTX_VFPSTATE, FPCR); \ + 
LOAD_SYS_REG32(HYPCTX_VFPSTATE, FPSR); + + +#define LOAD_REG(prefix, reg) \ + mov x1, prefix ##_ ##reg; \ + ldr reg, [x0, x1]; + + +/* + * We use x1 as a temporary register to store the hypctx member offset and x0 + * to hold the hypctx address. We load the guest x0 and x1 register values in + * registers x2 and x3, push x2 and x3 on the stack and then we restore x0 and + * x1. + */ +#define LOAD_GUEST_X_REGS() \ + mov x1, #HYPCTX_REGS_X0; \ + /* x1 now holds the address of hypctx reg x0 */ \ + add x1, x1, x0; \ + /* Make x2 = guest x0 and x3 = guest x1 */ \ + ldp x2, x3, [x1]; \ + stp x2, x3, [sp, #-16]!; \ + \ + /* Load the other registers */ \ + LOAD_REG_PAIR(HYPCTX_REGS, X2, X3); \ + LOAD_REG_PAIR(HYPCTX_REGS, X4, X5); \ + LOAD_REG_PAIR(HYPCTX_REGS, X6, X7); \ + LOAD_REG_PAIR(HYPCTX_REGS, X8, X9); \ + LOAD_REG_PAIR(HYPCTX_REGS, X10, X11); \ + LOAD_REG_PAIR(HYPCTX_REGS, X12, X13); \ + LOAD_REG_PAIR(HYPCTX_REGS, X14, X15); \ + LOAD_REG_PAIR(HYPCTX_REGS, X16, X17); \ + LOAD_REG_PAIR(HYPCTX_REGS, X18, X19); \ + LOAD_REG_PAIR(HYPCTX_REGS, X20, X21); \ + LOAD_REG_PAIR(HYPCTX_REGS, X22, X23); \ + LOAD_REG_PAIR(HYPCTX_REGS, X24, X25); \ + LOAD_REG_PAIR(HYPCTX_REGS, X26, X27); \ + LOAD_REG_PAIR(HYPCTX_REGS, X28, X29); \ + LOAD_REG(HYPCTX_REGS, LR); \ + \ + /* Pop guest x0 and x1 from the stack */ \ + ldp x0, x1, [sp], #16; \ + + +#define LOAD_ARRAY_REG64(reg, src, remaining) \ + cmp remaining, #0; \ + beq 9f; \ + ldr x2, [src]; \ + msr reg, x2; \ + add src, src, #8; \ + sub remaining, remaining, #1; + + +#define LOAD_LR_REGS(); \ + /* Load the number of ICH_LR_EL2 regs from memory */ \ + mov x2, #HYPCTX_VGIC_ICH_LR_NUM; \ + ldr x3, [x0, x2]; \ + mov x1, #HYPCTX_VGIC_ICH_LR_EL2; \ + /* x1 holds the load address */ \ + add x1, x0, x1; \ + LOAD_ARRAY_REG64(ich_lr0_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr1_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr2_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr3_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr4_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr5_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr6_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr7_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr8_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr9_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr10_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr11_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr12_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr13_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr14_el2, x1, x3); \ + LOAD_ARRAY_REG64(ich_lr15_el2, x1, x3); \ +9:; \ + ; + + +#define LOAD_ARRAY_REG32(reg, src, remaining) \ + cmp remaining, #0; \ + beq 9f; \ + ldr w2, [src]; \ + msr reg, x2; \ + add src, src, #4; \ + sub remaining, remaining, #1; + + +#define LOAD_AP0R_REGS(); \ + /* Load the number of ICH_AP0R_EL2 regs from memory */ \ + mov x2, #HYPCTX_VGIC_ICH_AP0R_NUM; \ + ldr x3, [x0, x2]; \ + /* x1 holds the load address */ \ + mov x1, #HYPCTX_VGIC_ICH_AP0R_EL2; \ + add x1, x0, x1; \ + LOAD_ARRAY_REG32(ich_ap0r0_el2, x1, x3); \ + LOAD_ARRAY_REG32(ich_ap0r1_el2, x1, x3); \ + LOAD_ARRAY_REG32(ich_ap0r2_el2, x1, x3); \ + LOAD_ARRAY_REG32(ich_ap0r3_el2, x1, x3); \ +9:; \ + ; + + +#define LOAD_AP1R_REGS(); \ + /* Load the number of ICH_AP1R_EL2 regs from memory */ \ + mov x2, #HYPCTX_VGIC_ICH_AP1R_NUM; \ + ldr x3, [x0, x2]; \ + /* x1 holds the load address */ \ + mov x1, #HYPCTX_VGIC_ICH_AP1R_EL2; \ + add x1, x0, x1; \ + LOAD_ARRAY_REG32(ich_ap1r0_el2, x1, x3); \ + LOAD_ARRAY_REG32(ich_ap1r1_el2, x1, x3); \ + LOAD_ARRAY_REG32(ich_ap1r2_el2, x1, x3); \ + LOAD_ARRAY_REG32(ich_ap1r3_el2, x1, x3); \ +9:; \ + ; + + + +#define KTOHYP_REG(reg) \ 
+ mov x7, HYP_KVA_MASK; \ + and reg, reg, x7; \ + mov x7, HYP_KVA_OFFSET; \ + orr reg, reg, x7; + + +/* Load a register from struct hyp *hyp member of hypctx. */ +#define LOAD_HYP_REG(prefix, reg) \ + /* Compute VA of hyp member in x1 */ \ + mov x1, #HYPCTX_HYP; \ + add x1, x1, x0; \ + /* Get hyp address in x2 */ \ + ldr x2, [x1]; \ + /* Transform hyp kernel VA into an EL2 VA */ \ + KTOHYP_REG(x2); \ + /* Get register offset inside struct hyp */ \ + mov x1, prefix ##_ ##reg; \ + /* Compute regster address */ \ + add x2, x2, x1; \ + /* Load the register */ \ + ldr x1, [x2]; \ + msr reg, x1; \ + + +/* + * Restore all the guest registers to their original values. + * + * Expecting: + * x0 - struct hypctx address + * + * After call: + * tpidr_el2 - struct hypctx address + */ +#define LOAD_GUEST_REGS() \ + LOAD_SYS_REG64(HYPCTX, ACTLR_EL1); \ + LOAD_SYS_REG64(HYPCTX, AFSR0_EL1); \ + LOAD_SYS_REG64(HYPCTX, AFSR1_EL1); \ + LOAD_SYS_REG64(HYPCTX, AMAIR_EL1); \ + LOAD_SYS_REG64(HYPCTX, CONTEXTIDR_EL1); \ + LOAD_SYS_REG64(HYPCTX, CPACR_EL1); \ + LOAD_SYS_REG64(HYPCTX, ELR_EL1); \ + LOAD_SYS_REG64(HYPCTX, ESR_EL1); \ + LOAD_SYS_REG64(HYPCTX, FAR_EL1); \ + LOAD_SYS_REG64(HYPCTX, MAIR_EL1); \ + LOAD_SYS_REG64(HYPCTX, PAR_EL1); \ + LOAD_SYS_REG64(HYPCTX, SCTLR_EL1); \ + LOAD_SYS_REG64(HYPCTX, SP_EL0); \ + LOAD_SYS_REG64(HYPCTX, TCR_EL1); \ + LOAD_SYS_REG64(HYPCTX, TPIDR_EL0); \ + LOAD_SYS_REG64(HYPCTX, TPIDRRO_EL0); \ + LOAD_SYS_REG64(HYPCTX, TPIDR_EL1); \ + LOAD_SYS_REG64(HYPCTX, TTBR0_EL1); \ + LOAD_SYS_REG64(HYPCTX, TTBR1_EL1); \ + LOAD_SYS_REG64(HYPCTX, VBAR_EL1); \ + LOAD_SYS_REG32(HYPCTX, SPSR_EL1); \ + \ + LOAD_SYS_REG64(HYPCTX, CPTR_EL2); \ + LOAD_SYS_REG64(HYPCTX, ELR_EL2); \ + LOAD_SYS_REG64(HYPCTX, HCR_EL2); \ + LOAD_SYS_REG64(HYPCTX, VPIDR_EL2); \ + LOAD_SYS_REG64(HYPCTX, VMPIDR_EL2); \ + LOAD_SYS_REG32(HYPCTX, SPSR_EL2); \ + \ + LOAD_SYS_REG32(HYPCTX_VGIC, ICH_HCR_EL2); \ + LOAD_SYS_REG32(HYPCTX_VGIC, ICH_VMCR_EL2); \ + \ + LOAD_SYS_REG32(HYPCTX_VTIMER_CPU, CNTKCTL_EL1); \ + LOAD_SYS_REG64(HYPCTX_VTIMER_CPU, CNTV_CVAL_EL0); \ + LOAD_SYS_REG32(HYPCTX_VTIMER_CPU, CNTV_CTL_EL0); \ + \ + LOAD_REG(HYPCTX, FP); \ + \ + LOAD_HYP_REG(HYP, VTTBR_EL2); \ + LOAD_HYP_REG(HYP_VTIMER, CNTHCTL_EL2); \ + LOAD_HYP_REG(HYP_VTIMER, CNTVOFF_EL2); \ + \ + LOAD_LR_REGS(); \ + LOAD_AP0R_REGS(); \ + LOAD_AP1R_REGS(); \ + \ + /* Load the guest EL1 stack pointer */ \ + mov x1, #HYPCTX_REGS_SP; \ + add x1, x1, x0; \ + ldr x2, [x1]; \ + msr sp_el1, x2; \ + \ + LOAD_GUEST_X_REGS(); \ + + +/* + * Save exit information + * + * Expecting: + * x0 - struct hypctx address + */ +#define SAVE_EXIT_INFO() \ + SAVE_SYS_REG64(HYPCTX_EXIT_INFO, ESR_EL2); \ + SAVE_SYS_REG64(HYPCTX_EXIT_INFO, FAR_EL2); \ + SAVE_SYS_REG64(HYPCTX_EXIT_INFO, HPFAR_EL2); \ + +#endif /* !_VMM_HYP_MACROS_H_ */ Index: sys/arm64/vmm/io/vgic_v3.h =================================================================== --- /dev/null +++ sys/arm64/vmm/io/vgic_v3.h @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_VGIC_V3_H_ +#define _VMM_VGIC_V3_H_ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#define VGIC_SGI_NUM (GIC_LAST_SGI - GIC_FIRST_SGI + 1) +#define VGIC_PPI_NUM (GIC_LAST_PPI - GIC_FIRST_PPI + 1) +#define VGIC_SPI_NUM (GIC_LAST_SPI - GIC_FIRST_SPI + 1) +#define VGIC_PRV_I_NUM (VGIC_SGI_NUM + VGIC_PPI_NUM) +#define VGIC_SHR_I_NUM (VGIC_SPI_NUM) + +#define VGIC_ICH_LR_NUM_MAX 16 +#define VGIC_ICH_AP0R_NUM_MAX 4 +#define VGIC_ICH_AP1R_NUM_MAX VGIC_ICH_AP0R_NUM_MAX + +#define ICC_SGI1R_EL1_OP0 0x3 +#define ICC_SGI1R_EL1_OP0_MASK (ICC_SGI1R_EL1_OP0 << ISS_MSR_OP0_SHIFT) +#define ICC_SGI1R_EL1_OP1 0x0 +#define ICC_SGI1R_EL1_OP1_MASK (ICC_SGI1R_EL1_OP1 << ISS_MSR_OP1_SHIFT) +#define ICC_SGI1R_EL1_CRn 0xc +#define ICC_SGI1R_EL1_CRn_MASK (ICC_SGI1R_EL1_CRn << ISS_MSR_CRn_SHIFT) +#define ICC_SGI1R_EL1_CRm 0xb +#define ICC_SGI1R_EL1_CRm_MASK (ICC_SGI1R_EL1_CRm << ISS_MSR_CRm_SHIFT) +#define ICC_SGI1R_EL1_OP2 0x5 +#define ICC_SGI1R_EL1_OP2_MASK (ICC_SGI1R_EL1_OP2 << ISS_MSR_OP2_SHIFT) + +#define ICC_SGI1R_EL1 \ + (ICC_SGI1R_EL1_OP0_MASK | ICC_SGI1R_EL1_OP1_MASK | \ + ICC_SGI1R_EL1_CRn_MASK | ICC_SGI1R_EL1_CRm_MASK | \ + ICC_SGI1R_EL1_OP2_MASK) + +#define ICC_SGI1R_EL1_TargetList_Bits 16 + +int vgic_v3_icc_sgi1r_el1_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vgic_v3_icc_sgi1r_el1_write(void *vm, int vcpuid, uint64_t rval, void *arg); + +/* Order matters, a lower value means a higher precedence */ +enum vgic_v3_irqtype { + VGIC_IRQ_MAXPRIO, + VGIC_IRQ_CLK, + VGIC_IRQ_VIRTIO, + VGIC_IRQ_MISC, + VGIC_IRQ_INVALID, +}; + +struct vgic_mmio_region { + vm_offset_t start; + vm_offset_t end; + mem_region_read_t read; + mem_region_write_t write; +}; + +struct vm; +struct vm_exit; +struct hyp; + +struct vgic_v3_dist { + struct mtx dist_mtx; + + uint64_t start; + size_t end; + size_t nirqs; + + uint32_t gicd_ctlr; /* Distributor Control Register */ + uint32_t gicd_typer; /* Interrupt Controller Type Register */ + uint32_t gicd_typer2; /* Interrupt Controller Type Register */ + uint32_t gicd_iidr; /* Implementer and Revision of the Distributor */ + uint32_t gicd_pidr2; /* Distributor Peripheral ID2 Register */ + /* Interrupt Configuration Registers. */ + uint32_t *gicd_icfgr; + /* Interrupt Priority Registers. */ + uint32_t *gicd_ipriorityr; + /* Interrupt Routing Registers. */ + uint64_t *gicd_irouter; + /* Interrupt Clear-Enable and Set-Enable Registers. 
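The gicd_ipriorityr array mirrors the architectural GICD_IPRIORITYR layout: one 8-bit priority per interrupt, four interrupts per 32-bit register, so interrupt n lives in word n / 4 at byte lane n % 4 (a shift of (n % 4) * 8 bits). A short self-contained sketch of the lookup; the helper name is illustrative, the packing is the one defined by the GICv3 architecture.

#include <stdint.h>

/* Fetch the 8-bit priority of interrupt "irq" from a GICD_IPRIORITYR shadow. */
static inline uint8_t
gicd_ipriorityr_get(const uint32_t *ipriorityr, uint32_t irq)
{
	uint32_t reg = ipriorityr[irq / 4];
	uint32_t shift = (irq % 4) * 8;

	return ((reg >> shift) & 0xff);
}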
*/ + uint32_t *gicd_ixenabler; + uint32_t *gicd_ixactiver; +}; + +#define aff_routing_en(distp) (distp->gicd_ctlr & GICD_CTLR_ARE_NS) + +struct vgic_v3_redist { + uint64_t start; + uint64_t end; + + uint64_t gicr_typer; /* Redistributor Type Register */ + uint32_t gicr_ctlr; /* Redistributor Control Regiser */ + uint32_t gicr_ixenabler0; + /* Interrupt Priority Registers. */ + uint32_t gicr_ipriorityr[VGIC_PRV_I_NUM / 4]; + /* Interupt Configuration Registers */ + uint32_t gicr_icfgr0, gicr_icfgr1; + uint32_t gicr_icactiver0; +}; + +struct vgic_v3_irq; +struct vgic_v3_cpu_if { + uint32_t ich_eisr_el2; /* End of Interrupt Status Register */ + uint32_t ich_elrsr_el2; /* Empty List register Status Register (ICH_ELRSR_EL2) */ + uint32_t ich_hcr_el2; /* Hyp Control Register */ + uint32_t ich_misr_el2; /* Maintenance Interrupt State Register */ + uint32_t ich_vmcr_el2; /* Virtual Machine Control Register */ + + /* + * The List Registers are part of the VM context and are modified on a + * world switch. They need to be allocated statically so they are + * mapped in the EL2 translation tables when struct hypctx is mapped. + */ + uint64_t ich_lr_el2[VGIC_ICH_LR_NUM_MAX]; + size_t ich_lr_num; + + /* + * We need a mutex for accessing the list registers because they are + * modified asynchronously by the virtual timer. + * + * Note that the mutex *MUST* be a spin mutex because an interrupt can + * be injected by a callout callback function, thereby modifying the + * list registers from a context where sleeping is forbidden. + */ + struct mtx lr_mtx; + + /* Active Priorities Registers for Group 0 and 1 interrupts */ + uint32_t ich_ap0r_el2[VGIC_ICH_AP0R_NUM_MAX]; + size_t ich_ap0r_num; + uint32_t ich_ap1r_el2[VGIC_ICH_AP1R_NUM_MAX]; + size_t ich_ap1r_num; + + struct vgic_v3_irq *irqbuf; + size_t irqbuf_size; + size_t irqbuf_num; +}; + +int vgic_v3_attach_to_vm(void *arg, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size); +void vgic_v3_detach_from_vm(void *arg); +void vgic_v3_init(uint64_t ich_vtr_el2); +void vgic_v3_vminit(void *arg); +void vgic_v3_cpuinit(void *arg, bool last_vcpu); +void vgic_v3_sync_hwstate(void *arg); + +void vgic_v3_mmio_init(struct hyp *hyp); +void vgic_v3_mmio_destroy(struct hyp *hyp); + +int vgic_v3_vcpu_pending_irq(void *arg); +int vgic_v3_inject_irq(void *arg, uint32_t irq, + enum vgic_v3_irqtype irqtype); +int vgic_v3_remove_irq(void *arg, uint32_t irq, bool ignore_state); + +void vgic_v3_group_toggle_enabled(bool enabled, struct hyp *hyp); +int vgic_v3_irq_toggle_enabled(uint32_t irq, bool enabled, + struct hyp *hyp, int vcpuid); + +DECLARE_CLASS(arm_vgic_driver); + +#endif /* !_VMM_VGIC_V3_H_ */ Index: sys/arm64/vmm/io/vgic_v3.c =================================================================== --- /dev/null +++ sys/arm64/vmm/io/vgic_v3.c @@ -0,0 +1,1010 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "vgic_v3.h" +#include "vgic_v3_reg.h" + +#define VGIC_V3_DEVNAME "vgic" +#define VGIC_V3_DEVSTR "ARM Virtual Generic Interrupt Controller v3" + +#define RES0 0UL + +#define IRQBUF_SIZE_MIN 32 +#define IRQBUF_SIZE_MAX (1 << 10) + +#define IRQ_SCHEDULED (GIC_LAST_SPI + 1) + +#define lr_pending(lr) \ + (ICH_LR_EL2_STATE(lr) == ICH_LR_EL2_STATE_PENDING) +#define lr_inactive(lr) \ + (ICH_LR_EL2_STATE(lr) == ICH_LR_EL2_STATE_INACTIVE) +#define lr_active(lr) \ + (ICH_LR_EL2_STATE(lr) == ICH_LR_EL2_STATE_ACTIVE) +#define lr_pending_active(lr) \ + (ICH_LR_EL2_STATE(lr) == ICH_LR_EL2_STATE_PENDING_ACTIVE) +#define lr_not_active(lr) (!lr_active(lr) && !lr_pending_active(lr)) + +#define lr_clear_irq(lr) ((lr) &= ~ICH_LR_EL2_STATE_MASK) + +MALLOC_DEFINE(M_VGIC_V3, "ARM VMM VGIC V3", "ARM VMM VGIC V3"); + +struct vgic_v3_virt_features { + uint8_t min_prio; + size_t ich_lr_num; + size_t ich_ap0r_num; + size_t ich_ap1r_num; +}; + +struct vgic_v3_ro_regs { + uint32_t gicd_icfgr0; + uint32_t gicd_pidr2; + uint32_t gicd_typer; +}; + +struct vgic_v3_irq { + uint32_t irq; + enum vgic_v3_irqtype irqtype; + uint8_t enabled; + uint8_t priority; +}; + +#define vip_to_lr(vip, lr) \ +do { \ + lr = ICH_LR_EL2_STATE_PENDING; \ + lr |= ICH_LR_EL2_GROUP1; \ + lr |= (uint64_t)vip->priority << ICH_LR_EL2_PRIO_SHIFT; \ + lr |= vip->irq; \ +} while (0) + +#define lr_to_vip(lr, vip) \ +do { \ + (vip)->irq = ICH_LR_EL2_VINTID(lr); \ + (vip)->priority = \ + (uint8_t)(((lr) & ICH_LR_EL2_PRIO_MASK) >> ICH_LR_EL2_PRIO_SHIFT); \ +} while (0) + +static struct vgic_v3_virt_features virt_features; +static struct vgic_v3_ro_regs ro_regs; + +static struct gic_v3_softc *gic_sc; + +void +vgic_v3_cpuinit(void *arg, bool last_vcpu) +{ + struct hypctx *hypctx = arg; + struct vgic_v3_cpu_if *cpu_if = &hypctx->vgic_cpu_if; + struct vgic_v3_redist *redist = &hypctx->vgic_redist; + uint64_t aff, vmpidr_el2; + int i; + + vmpidr_el2 = hypctx->vmpidr_el2; + KASSERT(vmpidr_el2 != 0, + ("Trying to init this CPU's vGIC before the vCPU")); + /* + * Get affinity for the current CPU. The guest CPU affinity is taken + * from VMPIDR_EL2. The Redistributor corresponding to this CPU is + * the Redistributor with the same affinity from GICR_TYPER. 
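The affinity value placed in GICR_TYPER here is the one the guest observes through VMPIDR_EL2, and it is later compared against GICD_IROUTER when routing SPIs. The two registers pack the four affinity bytes differently: GICR_TYPER stores Aff3..Aff0 in its upper 32 bits (Aff3 at bits 31:24 of that field), while GICD_IROUTER keeps Aff0-Aff2 in bits 23:0 and Aff3 in bits 39:32. A stand-alone sketch of both packings, assuming the architectural MPIDR field positions; the helper names are illustrative.

#include <stdint.h>

/* MPIDR_EL1 / VMPIDR_EL2 affinity fields (architectural positions). */
#define MPIDR_AFF0(x)	(((x) >> 0) & 0xff)
#define MPIDR_AFF1(x)	(((x) >> 8) & 0xff)
#define MPIDR_AFF2(x)	(((x) >> 16) & 0xff)
#define MPIDR_AFF3(x)	(((x) >> 32) & 0xff)

/* Affinity as stored in the upper word of GICR_TYPER (Aff3..Aff0). */
static inline uint32_t
mpidr_to_gicr_aff(uint64_t mpidr)
{
	return ((MPIDR_AFF3(mpidr) << 24) | (MPIDR_AFF2(mpidr) << 16) |
	    (MPIDR_AFF1(mpidr) << 8) | MPIDR_AFF0(mpidr));
}

/* The same affinity in GICD_IROUTER layout (Aff3 lives in bits 39:32). */
static inline uint64_t
mpidr_to_irouter_aff(uint64_t mpidr)
{
	return ((MPIDR_AFF3(mpidr) << 32) | (MPIDR_AFF2(mpidr) << 16) |
	    (MPIDR_AFF1(mpidr) << 8) | MPIDR_AFF0(mpidr));
}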
+ */ + aff = (CPU_AFF3(vmpidr_el2) << 24) | (CPU_AFF2(vmpidr_el2) << 16) | + (CPU_AFF1(vmpidr_el2) << 8) | CPU_AFF0(vmpidr_el2); + + /* Set up GICR_TYPER. */ + redist->gicr_typer = aff << GICR_TYPER_AFF_SHIFT; + /* Redistributor doesn't support virtual or physical LPIS. */ + redist->gicr_typer &= ~GICR_TYPER_VLPIS; + redist->gicr_typer &= ~GICR_TYPER_PLPIS; + + if (last_vcpu) + /* Mark the last Redistributor */ + redist->gicr_typer |= GICR_TYPER_LAST; + + /* + * Configure the Redistributor Control Register. + * + * ~GICR_CTLR_LPI_ENABLE: LPIs are disabled + */ + redist->gicr_ctlr = 0 & ~GICR_CTLR_LPI_ENABLE; + + mtx_init(&cpu_if->lr_mtx, "VGICv3 ICH_LR_EL2 lock", NULL, MTX_SPIN); + + /* + * Configure the Interrupt Controller Hyp Control Register. + * + * ICH_HCR_EL2_En: enable virtual CPU interface. + * + * Maintenance interrupts are disabled. + */ + cpu_if->ich_hcr_el2 = ICH_HCR_EL2_En; + + /* + * Configure the Interrupt Controller Virtual Machine Control Register. + * + * ICH_VMCR_EL2_VPMR: lowest priority mask for the VCPU interface + * ICH_VMCR_EL2_VBPR1_NO_PREEMPTION: disable interrupt preemption for + * Group 1 interrupts + * ICH_VMCR_EL2_VBPR0_NO_PREEMPTION: disable interrupt preemption for + * Group 0 interrupts + * ~ICH_VMCR_EL2_VEOIM: writes to EOI registers perform priority drop + * and interrupt deactivation. + * ICH_VMCR_EL2_VENG0: virtual Group 0 interrupts enabled. + * ICH_VMCR_EL2_VENG1: virtual Group 1 interrupts enabled. + */ + cpu_if->ich_vmcr_el2 = \ + (virt_features.min_prio << ICH_VMCR_EL2_VPMR_SHIFT) | \ + ICH_VMCR_EL2_VBPR1_NO_PREEMPTION | ICH_VMCR_EL2_VBPR0_NO_PREEMPTION; + cpu_if->ich_vmcr_el2 &= ~ICH_VMCR_EL2_VEOIM; + cpu_if->ich_vmcr_el2 |= ICH_VMCR_EL2_VENG0 | ICH_VMCR_EL2_VENG1; + + cpu_if->ich_lr_num = virt_features.ich_lr_num; + for (i = 0; i < cpu_if->ich_lr_num; i++) + cpu_if->ich_lr_el2[i] = 0UL; + + cpu_if->ich_ap0r_num = virt_features.ich_ap0r_num; + cpu_if->ich_ap1r_num = virt_features.ich_ap1r_num; + + cpu_if->irqbuf = malloc(IRQBUF_SIZE_MIN * sizeof(*cpu_if->irqbuf), + M_VGIC_V3, M_WAITOK | M_ZERO); + cpu_if->irqbuf_size = IRQBUF_SIZE_MIN; + cpu_if->irqbuf_num = 0; +} + +void +vgic_v3_vminit(void *arg) +{ + struct hyp *hyp = arg; + struct vgic_v3_dist *dist = &hyp->vgic_dist; + + /* + * Configure the Distributor control register. The register resets to an + * architecturally UNKNOWN value, so we reset to 0 to disable all + * functionality controlled by the register. + * + * The exception is GICD_CTLR.DS, which is RA0/WI when the Distributor + * supports one security state (ARM GIC Architecture Specification for + * GICv3 and GICv4, p. 4-464) + */ + dist->gicd_ctlr = GICD_CTLR_DS; + + dist->gicd_typer = ro_regs.gicd_typer; + dist->nirqs = GICD_TYPER_I_NUM(dist->gicd_typer); + dist->gicd_pidr2 = ro_regs.gicd_pidr2; + + mtx_init(&dist->dist_mtx, "VGICv3 Distributor lock", NULL, MTX_SPIN); +} + +int +vgic_v3_attach_to_vm(void *arg, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size) +{ + struct hyp *hyp = arg; + struct vgic_v3_dist *dist = &hyp->vgic_dist; + struct vgic_v3_redist *redist; + int i; + + /* Set the distributor address and size for trapping guest access. */ + dist->start = dist_start; + dist->end = dist_start + dist_size; + + for (i = 0; i < VM_MAXCPU; i++) { + redist = &hyp->ctx[i].vgic_redist; + /* Set the redistributor address and size. 
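vgic_v3_attach_to_vm() only records the guest-physical ranges of the Distributor and Redistributor frames so that guest accesses can be trapped and emulated; the actual addresses come from the VM configuration. A hedged usage sketch with placeholder GPAs and sizes follows; only the function signature is taken from vgic_v3.h, everything else is an example.

/* Illustrative only: example GPAs and sizes for the GICv3 frames. */
static int
example_attach_vgic(struct hyp *hyp)
{
	const uint64_t dist_start = 0x2f000000UL;	/* GICD frame base */
	const size_t dist_size = 0x10000;		/* 64KB */
	const uint64_t redist_start = 0x2f100000UL;	/* GICR frame base */
	const size_t redist_size = 0x20000;		/* 2 x 64KB per vCPU */

	return (vgic_v3_attach_to_vm(hyp, dist_start, dist_size,
	    redist_start, redist_size));
}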
*/ + redist->start = redist_start; + redist->end = redist_start + redist_size; + } + vgic_v3_mmio_init(hyp); + + hyp->vgic_attached = true; + + return (0); +} + +void +vgic_v3_detach_from_vm(void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vgic_v3_cpu_if *cpu_if; + int i; + + hyp = arg; + + for (i = 0; i < VM_MAXCPU; i++) { + hypctx = & hyp->ctx[i]; + cpu_if = &hypctx->vgic_cpu_if; + free(cpu_if->irqbuf, M_VGIC_V3); + } + + vgic_v3_mmio_destroy(hyp); +} + +int +vgic_v3_vcpu_pending_irq(void *arg) +{ + struct hypctx *hypctx = arg; + struct vgic_v3_cpu_if *cpu_if = &hypctx->vgic_cpu_if; + + return (cpu_if->irqbuf_num); +} + +/* Removes ALL instances of interrupt 'irq' */ +static int +vgic_v3_irqbuf_remove_nolock(uint32_t irq, struct vgic_v3_cpu_if *cpu_if) +{ + size_t dest = 0; + size_t from = cpu_if->irqbuf_num; + + while (dest < cpu_if->irqbuf_num) { + if (cpu_if->irqbuf[dest].irq == irq) { + for (from = dest + 1; from < cpu_if->irqbuf_num; from++) { + if (cpu_if->irqbuf[from].irq == irq) + continue; + cpu_if->irqbuf[dest++] = cpu_if->irqbuf[from]; + } + cpu_if->irqbuf_num = dest; + } else { + dest++; + } + } + + return (from - dest); +} + +int +vgic_v3_remove_irq(void *arg, uint32_t irq, bool ignore_state) +{ + struct hypctx *hypctx = arg; + struct vgic_v3_cpu_if *cpu_if = &hypctx->vgic_cpu_if; + struct vgic_v3_dist *dist = &hypctx->hyp->vgic_dist; + size_t i; + + if (irq >= dist->nirqs) { + eprintf("Malformed IRQ %u.\n", irq); + return (1); + } + + mtx_lock_spin(&cpu_if->lr_mtx); + + for (i = 0; i < cpu_if->ich_lr_num; i++) { + if (ICH_LR_EL2_VINTID(cpu_if->ich_lr_el2[i]) == irq && + (lr_not_active(cpu_if->ich_lr_el2[i]) || ignore_state)) + lr_clear_irq(cpu_if->ich_lr_el2[i]); + } + vgic_v3_irqbuf_remove_nolock(irq, cpu_if); + + mtx_unlock_spin(&cpu_if->lr_mtx); + + return (0); +} + +static struct vgic_v3_irq * +vgic_v3_irqbuf_add_nolock(struct vgic_v3_cpu_if *cpu_if) +{ + struct vgic_v3_irq *new_irqbuf, *old_irqbuf; + size_t new_size; + + if (cpu_if->irqbuf_num == cpu_if->irqbuf_size) { + /* Double the size of the buffered interrupts list */ + new_size = cpu_if->irqbuf_size << 1; + if (new_size > IRQBUF_SIZE_MAX) + return (NULL); + + new_irqbuf = NULL; + /* TODO: malloc sleeps here and causes a panic */ + while (new_irqbuf == NULL) + new_irqbuf = malloc(new_size * sizeof(*cpu_if->irqbuf), + M_VGIC_V3, M_NOWAIT | M_ZERO); + memcpy(new_irqbuf, cpu_if->irqbuf, + cpu_if->irqbuf_size * sizeof(*cpu_if->irqbuf)); + + old_irqbuf = cpu_if->irqbuf; + cpu_if->irqbuf = new_irqbuf; + cpu_if->irqbuf_size = new_size; + free(old_irqbuf, M_VGIC_V3); + } + + cpu_if->irqbuf_num++; + + return (&cpu_if->irqbuf[cpu_if->irqbuf_num - 1]); +} + +static bool +vgic_v3_int_target(uint32_t irq, struct hypctx *hypctx) +{ + struct vgic_v3_dist *dist = &hypctx->hyp->vgic_dist; + struct vgic_v3_redist *redist = &hypctx->vgic_redist; + uint64_t irouter; + uint64_t aff; + uint32_t irq_off, irq_mask; + int n; + + if (irq <= GIC_LAST_PPI) + return (true); + + /* XXX Affinity routing disabled not implemented */ + if (!aff_routing_en(dist)) + return (true); + + irq_off = irq % 32; + irq_mask = 1 << irq_off; + n = irq / 32; + + irouter = dist->gicd_irouter[irq]; + /* Check if 1-of-N routing is active */ + if (irouter & GICD_IROUTER_IRM) + /* Check if the VCPU is participating */ + return (redist->gicr_ctlr & GICR_CTLR_DPG1NS ? 
true : false); + + aff = redist->gicr_typer >> GICR_TYPER_AFF_SHIFT; + /* Affinity in format for comparison with irouter */ + aff = GICR_TYPER_AFF0(redist->gicr_typer) | \ + (GICR_TYPER_AFF1(redist->gicr_typer) << 8) | \ + (GICR_TYPER_AFF2(redist->gicr_typer) << 16) | \ + (GICR_TYPER_AFF3(redist->gicr_typer) << 32); + if ((irouter & aff) == aff) + return (true); + else + return (false); +} + +static uint8_t +vgic_v3_get_priority(uint32_t irq, struct hypctx *hypctx) +{ + struct vgic_v3_dist *dist = &hypctx->hyp->vgic_dist; + struct vgic_v3_redist *redist = &hypctx->vgic_redist; + size_t n; + uint32_t off, mask; + uint8_t priority; + + n = irq / 4; + off = n % 4; + mask = 0xff << off; + /* + * When affinity routing is enabled, the Redistributor is used for + * SGIs and PPIs and the Distributor for SPIs. When affinity routing + * is not enabled, the Distributor registers are used for all + * interrupts. + */ + if (aff_routing_en(dist) && (n <= 7)) + priority = (redist->gicr_ipriorityr[n] & mask) >> off; + else + priority = (dist->gicd_ipriorityr[n] & mask) >> off; + + return (priority); +} + +static bool +vgic_v3_intid_enabled(uint32_t irq, struct hypctx *hypctx) +{ + struct vgic_v3_dist *dist; + struct vgic_v3_redist *redist; + uint32_t irq_off, irq_mask; + int n; + + irq_off = irq % 32; + irq_mask = 1 << irq_off; + n = irq / 32; + + if (irq <= GIC_LAST_PPI) { + redist = &hypctx->vgic_redist; + if (!(redist->gicr_ixenabler0 & irq_mask)) + return (false); + } else { + dist = &hypctx->hyp->vgic_dist; + if (!(dist->gicd_ixenabler[n] & irq_mask)) + return (false); + } + + return (true); +} + +static inline bool +dist_group_enabled(struct vgic_v3_dist *dist) +{ + return ((dist->gicd_ctlr & GICD_CTLR_G1A) != 0); +} + +int +vgic_v3_inject_irq(void *arg, uint32_t irq, enum vgic_v3_irqtype irqtype) +{ + struct hypctx *hypctx = arg; + struct vgic_v3_dist *dist = &hypctx->hyp->vgic_dist; + struct vgic_v3_cpu_if *cpu_if = &hypctx->vgic_cpu_if; + struct vgic_v3_irq *new_irqbuf, *old_irqbuf; + struct vgic_v3_irq *vip; + int error; + int i; + size_t new_size; + uint8_t priority; + bool enabled; + + if (irq >= dist->nirqs || irqtype >= VGIC_IRQ_INVALID) { + eprintf("Malformed IRQ %u.\n", irq); + return (1); + } + + /* + * TODO: Not sure if this will be reached by ONE thread at a time + * It can interfere with timer interrupts. + */ + if (cpu_if->irqbuf_num == cpu_if->irqbuf_size && + irqtype != VGIC_IRQ_CLK) { + /* Double the size of the buffered interrupts list */ + new_size = cpu_if->irqbuf_size << 1; + if (new_size > IRQBUF_SIZE_MAX) { + eprintf("Error adding IRQ %u to the IRQ buffer.\n", irq); + error = 1; + goto out; + } + + new_irqbuf = NULL; + while (new_irqbuf == NULL) + new_irqbuf = malloc(new_size * sizeof(*cpu_if->irqbuf), + M_VGIC_V3, M_NOWAIT | M_ZERO); + memcpy(new_irqbuf, cpu_if->irqbuf, + cpu_if->irqbuf_size * sizeof(*cpu_if->irqbuf)); + + old_irqbuf = cpu_if->irqbuf; + cpu_if->irqbuf = new_irqbuf; + cpu_if->irqbuf_size = new_size; + free(old_irqbuf, M_VGIC_V3); + } + + error = 0; + mtx_lock_spin(&dist->dist_mtx); + + enabled = dist_group_enabled(&hypctx->hyp->vgic_dist) && + vgic_v3_intid_enabled(irq, hypctx) && + vgic_v3_int_target(irq, hypctx); + priority = vgic_v3_get_priority(irq, hypctx); + + mtx_lock_spin(&cpu_if->lr_mtx); + + /* + * If the guest is running behind timer interrupts, don't swamp it with + * one interrupt after another. 
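The buffered-interrupt list is the injection interface used by the rest of the VMM: a device model or the virtual timer asserts an interrupt with vgic_v3_inject_irq() and a later world switch moves it into a list register, while level-sensitive sources retract it again with vgic_v3_remove_irq(). A short usage sketch is given below; the function signatures and GT_VIRT_IRQ come from this patch, the wrapper names are illustrative.

/* Illustrative only: assert the guest's virtual timer PPI for this vCPU. */
static void
example_assert_vtimer(struct hypctx *hypctx)
{
	(void)vgic_v3_inject_irq(hypctx, GT_VIRT_IRQ, VGIC_IRQ_CLK);
}

/*
 * Illustrative only: deassert the level by removing buffered instances.
 * Passing false for ignore_state leaves list-register entries that are
 * already active untouched.
 */
static void
example_deassert_vtimer(struct hypctx *hypctx)
{
	(void)vgic_v3_remove_irq(hypctx, GT_VIRT_IRQ, false);
}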
However, if the timer interrupt is being + * serviced by the guest (it is in a state other than pending, either + * active or pending and active), then add it to the buffer to be + * injected later. Otherwise, the timer would stop working because we + * disable the timer in the host interrupt handler. + */ + if (irqtype == VGIC_IRQ_CLK) { + for (i = 0; i < cpu_if->ich_lr_num; i++) + if (ICH_LR_EL2_VINTID(cpu_if->ich_lr_el2[i]) == irq && + lr_pending(cpu_if->ich_lr_el2[i])) + goto out; + for (i = 0; i < cpu_if->irqbuf_num; i++) + if (cpu_if->irqbuf[i].irq == irq) + goto out; + } + + cpu_if->irqbuf_num++; + vip = &cpu_if->irqbuf[cpu_if->irqbuf_num - 1]; + + vip->irq = irq; + vip->irqtype = irqtype; + vip->enabled = enabled; + vip->priority = priority; + +out: + mtx_unlock_spin(&cpu_if->lr_mtx); + mtx_unlock_spin(&dist->dist_mtx); + + return (error); +} + +void +vgic_v3_group_toggle_enabled(bool enabled, struct hyp *hyp) +{ + struct hypctx *hypctx; + struct vgic_v3_cpu_if *cpu_if; + struct vgic_v3_irq *vip; + int i, j; + + for (i = 0; i < VM_MAXCPU; i++) { + hypctx = &hyp->ctx[i]; + cpu_if = &hypctx->vgic_cpu_if; + + mtx_lock_spin(&cpu_if->lr_mtx); + + for (j = 0; j < cpu_if->irqbuf_num; j++) { + vip = &cpu_if->irqbuf[j]; + if (!enabled) + vip->enabled = 0; + else if (vgic_v3_intid_enabled(vip->irq, hypctx)) + vip->enabled = 1; + } + + mtx_unlock_spin(&cpu_if->lr_mtx); + } +} + +static int +vgic_v3_irq_toggle_enabled_vcpu(uint32_t irq, bool enabled, + struct vgic_v3_cpu_if *cpu_if) +{ + int i; + + mtx_lock_spin(&cpu_if->lr_mtx); + + if (enabled) { + /* + * Enable IRQs that were injected when the interrupt ID was + * disabled + */ + for (i = 0; i < cpu_if->irqbuf_num; i++) + if (cpu_if->irqbuf[i].irq == irq) + cpu_if->irqbuf[i].enabled = true; + } else { + /* Remove the disabled IRQ from the LR regs if it is pending */ + for (i = 0; i < cpu_if->ich_lr_num; i++) + if (lr_pending(cpu_if->ich_lr_el2[i]) && + ICH_LR_EL2_VINTID(cpu_if->ich_lr_el2[i]) == irq) + lr_clear_irq(cpu_if->ich_lr_el2[i]); + + /* Remove the IRQ from the interrupt buffer */ + vgic_v3_irqbuf_remove_nolock(irq, cpu_if); + } + + mtx_unlock_spin(&cpu_if->lr_mtx); + + return (0); +} + +int +vgic_v3_irq_toggle_enabled(uint32_t irq, bool enabled, + struct hyp *hyp, int vcpuid) +{ + struct vgic_v3_cpu_if *cpu_if; + int error; + int i; + + if (irq <= GIC_LAST_PPI) { + cpu_if = &hyp->ctx[vcpuid].vgic_cpu_if; + return (vgic_v3_irq_toggle_enabled_vcpu(irq, enabled, cpu_if)); + } else { + /* TODO: Update irqbuf for all VCPUs, not just VCPU 0 */ + for (i = 0; i < VM_MAXCPU; i++) { + cpu_if = &hyp->ctx[i].vgic_cpu_if; + error = vgic_v3_irq_toggle_enabled_vcpu(irq, enabled, cpu_if); + if (error) + return (error); + } + } + + return (0); +} + +static int +irqbuf_highest_priority(struct vgic_v3_cpu_if *cpu_if, int start, int end, + struct hypctx *hypctx) +{ + uint32_t irq; + int i, max_idx; + uint8_t priority, max_priority; + uint8_t vpmr; + + vpmr = (cpu_if->ich_vmcr_el2 & ICH_VMCR_EL2_VPMR_MASK) >> \ + ICH_VMCR_EL2_VPMR_SHIFT; + + max_idx = -1; + max_priority = 0xff; + for (i = start; i < end; i++) { + irq = cpu_if->irqbuf[i].irq; + /* Check that the interrupt hasn't been already scheduled */ + if (irq == IRQ_SCHEDULED) + continue; + + if (!dist_group_enabled(&hypctx->hyp->vgic_dist)) + continue; + if (!vgic_v3_int_target(irq, hypctx)) + continue; + + priority = cpu_if->irqbuf[i].priority; + if (priority >= vpmr) + continue; + + if (max_idx == -1) { + max_idx = i; + max_priority = priority; + } else if (priority > max_priority) { + max_idx = 
i; + max_priority = priority; + } else if (priority == max_priority && + cpu_if->irqbuf[i].irqtype < cpu_if->irqbuf[max_idx].irqtype) { + max_idx = i; + max_priority = priority; + } + } + + return (max_idx); +} + +static inline bool +cpu_if_group_enabled(struct vgic_v3_cpu_if *cpu_if) +{ + return ((cpu_if->ich_vmcr_el2 & ICH_VMCR_EL2_VENG1) != 0); +} + +static inline int +irqbuf_next_enabled(struct vgic_v3_irq *irqbuf, int start, int end, + struct hypctx *hypctx, struct vgic_v3_cpu_if *cpu_if) +{ + int i; + + if (!cpu_if_group_enabled(cpu_if)) + return (-1); + + for (i = start; i < end; i++) + if (irqbuf[i].enabled) + break; + + if (i < end) + return (i); + else + return (-1); +} + +static inline int +vgic_v3_lr_next_empty(uint32_t ich_elrsr_el2, int start, int end) +{ + int i; + + for (i = start; i < end; i++) + if (ich_elrsr_el2 & (1U << i)) + break; + + if (i < end) + return (i); + else + return (-1); +} + +/* + * There are two cases in which the virtual timer interrupt is in the list + * registers: + * + * 1. The virtual interrupt is active. The guest is executing the interrupt + * handler, and the timer fired after it programmed the new alarm time but + * before the guest had the chance to write to the EOIR1 register. + * + * 2. The virtual interrupt is pending and active. The timer interrupt is level + * sensitive. The guest wrote to the EOR1 register, but the write hasn't yet + * propagated to the timer. + * + * Injecting the interrupt in these cases would mean that another timer + * interrupt is asserted as soon as the guest writes to the EOIR1 register (or + * very shortly thereafter, in the pending and active scenario). This can lead + * to the guest servicing timer interrupts one after the other and doing + * nothing else. So do not inject a timer interrupt while one is active pending. + * The buffered timer interrupts will be injected after the next world switch in + * this case. + */ +static bool +clk_irq_in_lr(struct vgic_v3_cpu_if *cpu_if) +{ + uint64_t lr; + int i; + + for (i = 0; i < cpu_if->ich_lr_num; i++) { + lr = cpu_if->ich_lr_el2[i]; + if (ICH_LR_EL2_VINTID(lr) == GT_VIRT_IRQ && + (lr_active(lr) || lr_pending_active(lr))) + return (true); + } + + return (false); +} + +static void +vgic_v3_irqbuf_to_lr(struct hypctx *hypctx, struct vgic_v3_cpu_if *cpu_if, + bool by_priority) +{ + struct vgic_v3_irq *vip; + int irqbuf_idx; + int lr_idx; + bool clk_present; + + clk_present = clk_irq_in_lr(cpu_if); + + irqbuf_idx = 0; + lr_idx = 0; + for (;;) { + if (by_priority) + irqbuf_idx = irqbuf_highest_priority(cpu_if, + irqbuf_idx, cpu_if->irqbuf_num, hypctx); + else + irqbuf_idx = irqbuf_next_enabled(cpu_if->irqbuf, + irqbuf_idx, cpu_if->irqbuf_num, hypctx, cpu_if); + if (irqbuf_idx == -1) + break; + + lr_idx = vgic_v3_lr_next_empty(cpu_if->ich_elrsr_el2, + lr_idx, cpu_if->ich_lr_num); + if (lr_idx == -1) + break; + + vip = &cpu_if->irqbuf[irqbuf_idx]; + if (vip->irqtype == VGIC_IRQ_CLK && clk_present) { + /* Skip injecting timer interrupt. */ + irqbuf_idx++; + continue; + } + + vip_to_lr(vip, cpu_if->ich_lr_el2[lr_idx]); + vip->irq = IRQ_SCHEDULED; + irqbuf_idx++; + lr_idx++; + } + + /* Remove all interrupts that were just scheduled. 
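Each entry that was copied to a list register above had its irq field overwritten with the IRQ_SCHEDULED sentinel, so removing that sentinel value drops exactly those entries from the buffer.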
*/ + vgic_v3_irqbuf_remove_nolock(IRQ_SCHEDULED, cpu_if); +} + +void +vgic_v3_sync_hwstate(void *arg) +{ + struct hypctx *hypctx; + struct vgic_v3_cpu_if *cpu_if; + int lr_free; + int i; + bool by_priority; + bool en_underflow_intr; + + hypctx = arg; + cpu_if = &hypctx->vgic_cpu_if; + + /* + * All Distributor writes have been executed at this point, do not + * protect Distributor reads with a mutex. + * + * This is callled with all interrupts disabled, so there is no need for + * a List Register spinlock either. + */ + mtx_lock_spin(&cpu_if->lr_mtx); + + /* Exit early if there are no buffered interrupts */ + if (cpu_if->irqbuf_num == 0) { + cpu_if->ich_hcr_el2 &= ~ICH_HCR_EL2_UIE; + goto out; + } + + /* Test if all buffered interrupts can fit in the LR regs */ + lr_free = 0; + for (i = 0; i < cpu_if->ich_lr_num; i++) + if (cpu_if->ich_elrsr_el2 & (1U << i)) + lr_free++; + + by_priority = (lr_free <= cpu_if->ich_lr_num); + vgic_v3_irqbuf_to_lr(hypctx, cpu_if, by_priority); + + lr_free = 0; + for (i = 0; i < cpu_if->ich_lr_num; i++) + if (cpu_if->ich_elrsr_el2 & (1U << i)) + lr_free++; + + en_underflow_intr = false; + if (cpu_if->irqbuf_num > 0) + for (i = 0; i < cpu_if->irqbuf_num; i++) + if (cpu_if->irqbuf[i].irqtype != VGIC_IRQ_CLK) { + en_underflow_intr = true; + break; + } + if (en_underflow_intr) { + cpu_if->ich_hcr_el2 |= ICH_HCR_EL2_UIE; + } else { + cpu_if->ich_hcr_el2 &= ~ICH_HCR_EL2_UIE; + } + +out: + mtx_unlock_spin(&cpu_if->lr_mtx); +} + +static void +vgic_v3_get_ro_regs() +{ + /* GICD_ICFGR0 configures SGIs and it is read-only. */ + ro_regs.gicd_icfgr0 = gic_d_read(gic_sc, 4, GICD_ICFGR(0)); + + /* + * Configure the GIC type register for the guest. + * + * ~GICD_TYPER_SECURITYEXTN: disable security extensions. + * ~GICD_TYPER_DVIS: direct injection for virtual LPIs not supported. + * ~GICD_TYPER_LPIS: LPIs not supported. + */ + ro_regs.gicd_typer = gic_d_read(gic_sc, 4, GICD_TYPER); + ro_regs.gicd_typer &= ~GICD_TYPER_SECURITYEXTN; + ro_regs.gicd_typer &= ~GICD_TYPER_DVIS; + ro_regs.gicd_typer &= ~GICD_TYPER_LPIS; + + /* + * XXX. Guest reads of GICD_PIDR2 should return the same ArchRev as + * specified in the guest FDT. + */ + ro_regs.gicd_pidr2 = gic_d_read(gic_sc, 4, GICD_PIDR2); +} + +void +vgic_v3_init(uint64_t ich_vtr_el2) { + uint32_t pribits, prebits; + + KASSERT(gic_sc != NULL, ("GIC softc is NULL")); + + vgic_v3_get_ro_regs(); + + pribits = ICH_VTR_EL2_PRIBITS(ich_vtr_el2); + switch (pribits) { + case 5: + virt_features.min_prio = 0xf8; + case 6: + virt_features.min_prio = 0xfc; + case 7: + virt_features.min_prio = 0xfe; + case 8: + virt_features.min_prio = 0xff; + } + + prebits = ICH_VTR_EL2_PREBITS(ich_vtr_el2); + switch (prebits) { + case 5: + virt_features.ich_ap0r_num = 1; + virt_features.ich_ap1r_num = 1; + case 6: + virt_features.ich_ap0r_num = 2; + virt_features.ich_ap1r_num = 2; + case 7: + virt_features.ich_ap0r_num = 4; + virt_features.ich_ap1r_num = 4; + } + + virt_features.ich_lr_num = ICH_VTR_EL2_LISTREGS(ich_vtr_el2); +} + +static int +vgic_v3_maint_intr(void *arg) +{ + printf("MAINTENANCE INTERRUPT\n"); + + return (FILTER_HANDLED); +} + +/* + * TODO: Look at how gic_v3_fdt.c adds the gic driver. + * + * 1. In probe they set the device description. + * 2. In attach they create children devices for the GIC (in + * gic_v3_ofw_bus_attach). + * 3. There is no identify function being called. + * + * On the other hand, in man 9 DEVICE_IDENTIFY it is stated that a new device + * instance is created by the identify function. 
+ */ + +static void +arm_vgic_identify(driver_t *driver, device_t parent) +{ + device_t dev; + + if (strcmp(device_get_name(parent), "gic") == 0) { + dev = device_find_child(parent, VGIC_V3_DEVNAME, -1); + if (!dev) + dev = device_add_child(parent, VGIC_V3_DEVNAME, -1); + gic_sc = device_get_softc(parent); + } +} + +static int +arm_vgic_probe(device_t dev) +{ + device_t parent; + + parent = device_get_parent(dev); + if (strcmp(device_get_name(parent), "gic") == 0) { + device_set_desc(dev, VGIC_V3_DEVSTR); + return (BUS_PROBE_DEFAULT); + } + + return (ENXIO); +} + +static int +arm_vgic_attach(device_t dev) +{ + int error; + + error = gic_v3_setup_maint_intr(vgic_v3_maint_intr, NULL, NULL); + if (error) + device_printf(dev, "Could not setup maintenance interrupt\n"); + + return (0); +} + +static int +arm_vgic_detach(device_t dev) +{ + int error; + + error = gic_v3_teardown_maint_intr(); + if (error) + device_printf(dev, "Could not teardown maintenance interrupt\n"); + + gic_sc = NULL; + + return (0); +} + +static device_method_t arm_vgic_methods[] = { + DEVMETHOD(device_identify, arm_vgic_identify), + DEVMETHOD(device_probe, arm_vgic_probe), + DEVMETHOD(device_attach, arm_vgic_attach), + DEVMETHOD(device_detach, arm_vgic_detach), + DEVMETHOD_END +}; + +DEFINE_CLASS_1(vgic, arm_vgic_driver, arm_vgic_methods, 0, gic_v3_driver); + +static devclass_t arm_vgic_devclass; +DRIVER_MODULE(vgic, gic, arm_vgic_driver, arm_vgic_devclass, 0, 0); Index: sys/arm64/vmm/io/vgic_v3_mmio.c =================================================================== --- /dev/null +++ sys/arm64/vmm/io/vgic_v3_mmio.c @@ -0,0 +1,1206 @@ +#include +#include +#include + +#include +#include +#include + +#include "vgic_v3.h" + +#define DEBUG 0 + +#define GICR_FRAME_RD 0 +#define GICR_FRAME_SGI GICR_RD_BASE_SIZE +#define GICR_FRAMES_END (GICR_FRAME_SGI + GICR_SGI_BASE_SIZE) + +#define RES0 (0UL) +#define RES1 (~0UL) + +#define redist_simple_read(src, destp, vm, vcpuid) \ +do { \ + struct hyp *hyp = vm_get_cookie(vm); \ + struct vgic_v3_redist *redist = &hyp->ctx[vcpuid].vgic_redist; \ + *destp = redist->src; \ +} while (0); + +#define redist_simple_write(src, dest, vm, vcpuid) \ +do { \ + struct hyp *hyp = vm_get_cookie(vm); \ + struct vgic_v3_redist *redist = &hyp->ctx[vcpuid].vgic_redist; \ + redist->dest = src; \ +} while (0); + +#define reg32_idx(ipa, region) (((ipa) - (region).start) / 4) +#define reg64_idx(ipa, region) (((ipa) - (region).start) / 8) + +#define reg_changed(new, old, mask) (((new) & (mask)) != ((old) & (mask))) + +/* The names should always be in ascending order of memory address */ +enum vgic_mmio_region_name { + /* Distributor registers */ + VGIC_GICD_CTLR, + VGIC_GICD_TYPER, + VGIC_GICD_IIDR, + VGIC_GICD_TYPER2, + VGIC_GICD_IGROUPR, + VGIC_GICD_ISENABLER, + VGIC_GICD_ICENABLER, + VGIC_GICD_ISACTIVER, + VGIC_GICD_ICACTIVER, + VGIC_GICD_IPRIORITYR, + VGIC_GICD_ICFGR, + VGIC_GICD_IROUTER, + VGIC_GICD_PIDR2, + /* Redistributor registers */ + VGIC_GICR_CTLR, + VGIC_GICR_TYPER, + VGIC_GICR_WAKER, + VGIC_GICR_PIDR2, + VGIC_GICR_IGROUPR0, + VGIC_GICR_ISENABLER0, + VGIC_GICR_ICENABLER0, + VGIC_GICR_ICACTIVER0, + VGIC_GICR_IPRIORITYR, + VGIC_GICR_ICFGR0, + VGIC_GICR_ICFGR1, + VGIC_MMIO_REGIONS_NUM, +}; +/* + * Necessary for calculating the number of Distributor and Redistributor + * regions emulated. 
+ */ +#define FIRST_REDIST_MMIO_REGION VGIC_GICR_CTLR + +MALLOC_DEFINE(M_VGIC_V3_MMIO, "ARM VMM VGIC DIST MMIO", "ARM VMM VGIC DIST MMIO"); + +static int +dist_ctlr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_ctlr; + mtx_unlock_spin(&dist->dist_mtx); + + /* Writes are never pending */ + *rval &= ~GICD_CTLR_RWP; + + *retu = false; + return (0); +} + +static int +dist_ctlr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + /* GICD_CTLR.DS is RAO/WI when only one security state is supported. */ + wval |= GICD_CTLR_DS; + + mtx_lock_spin(&dist->dist_mtx); + + if (reg_changed(wval, dist->gicd_ctlr, GICD_CTLR_G1A)) { + if (!(wval & GICD_CTLR_G1A)) + vgic_v3_group_toggle_enabled(false, hyp); + else + vgic_v3_group_toggle_enabled(true, hyp); + } + dist->gicd_ctlr = wval; + + mtx_unlock_spin(&dist->dist_mtx); + + *retu = false; + return (0); +} + +static int +dist_typer_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + *rval = dist->gicd_typer; + + *retu = false; + return (0); +} + +static int +dist_typer_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + eprintf("Warning: Attempted write to read-only register GICD_TYPER.\n"); + + *retu = false; + return (0); +} + +static int +dist_iidr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + *rval = (0x42 << GICD_IIDR_PROD_SHIFT); + *rval |= (1 << GICD_IIDR_REV_SHIFT); + *rval |= (0x43b << GICD_IIDR_IMPL_SHIFT); + + *retu = false; + return (0); +} + +static int +dist_iidr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + *retu = false; + return (0); +} + +static int +dist_typer2_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + *rval = RES0; + + *retu = false; + return (0); +} + +static int +dist_typer2_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + *retu = false; + return (0); +} + +/* Only group 1 interrupts are supported. Treat IGROUPR as RA0/WI. */ +static int +dist_igroupr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + int n; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_IGROUPR]); + /* + * GIC Architecture specification, p 8-477: "For SGIs and PPIs: When + * ARE is 1 for the Security state of an interrupt, the field for that + * interrupt is RES0 and an implementation is permitted to make the + * field RAZ/WI in this case". + */ + if (n == 0 && aff_routing_en(dist)) { + *rval = RES0; + } else { + *rval = RES1; + } + + *retu = false; + return (0); +} + +/* Only group 1 interrupts are supported. Treat IGROUPR as RA0/WI. 
*/ +static int +dist_igroupr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + *retu = false; + return (0); +} + +static void +mmio_update_int_enabled(uint32_t new_ixenabler, uint32_t old_ixenabler, + uint32_t irq, struct hyp *hyp, int vcpuid) +{ + uint32_t irq_mask; + int error; + int i; + bool enabled; + + irq_mask = 0x1; + for (i = 0; i < 32; i++) { + if (reg_changed(new_ixenabler, old_ixenabler, irq_mask)) { + enabled = ((new_ixenabler & irq_mask) != 0); + error = vgic_v3_irq_toggle_enabled(irq, enabled, + hyp, vcpuid); + if (error) + eprintf("Warning: error while toggling IRQ %u\n", irq); + } + irq++; + irq_mask <<= 1; + } +} + +static int +dist_ixenabler_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + void *arg, enum vgic_mmio_region_name name) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + size_t n; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[name]); + /* + * GIC Architecture specification, p 8-471: "When ARE is 1 for the + * Security state of an interrupt, the field for that interrupt is RES0 + * and an implementation is permitted to* make the field RAZ/WI in this + * case". + */ + if (n == 0 && aff_routing_en(dist)) { + *rval = RES0; + goto out; + } + + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_ixenabler[n]; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_ixenabler_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + void *arg, enum vgic_mmio_region_name name) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + uint32_t old_ixenabler; + size_t n; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[name]); + /* See dist_ixenabler_read() */ + if (n == 0 && aff_routing_en(dist)) + /* Ignore writes */ + goto out; + + mtx_lock_spin(&dist->dist_mtx); + + old_ixenabler = dist->gicd_ixenabler[n]; + if (name == VGIC_GICD_ICENABLER) + dist->gicd_ixenabler[n] &= ~wval; + else + dist->gicd_ixenabler[n] |= wval; + mmio_update_int_enabled(dist->gicd_ixenabler[n], old_ixenabler, n * 32, + hyp, vcpuid); + + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_isenabler_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + return (dist_ixenabler_read(vm, vcpuid, fault_ipa, rval, arg, + VGIC_GICD_ISENABLER)); +} + +static int +dist_isenabler_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + return (dist_ixenabler_write(vm, vcpuid, fault_ipa, wval, arg, + VGIC_GICD_ISENABLER)); +} + +static int +dist_icenabler_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + return (dist_ixenabler_read(vm, vcpuid, fault_ipa, rval, arg, + VGIC_GICD_ICENABLER)); +} + +static int +dist_icenabler_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + return (dist_ixenabler_write(vm, vcpuid, fault_ipa, wval, arg, + VGIC_GICD_ICENABLER)); +} + +static int +dist_isactiver_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + *retu = false; + + return (0); +} + +static int +dist_isactiver_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + *retu = false; + + return (0); +} + +static int 
+dist_icactiver_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + *retu = false; + + return (0); +} + +static int +dist_icactiver_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + *retu = false; + + return (0); +} + +/* XXX: Registers are byte accessible. */ +static int +dist_ipriorityr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + size_t n; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_IPRIORITYR]); + /* + * GIC Architecture specification, p 8-483: when affinity + * routing is enabled, GICD_IPRIORITYR is RAZ/WI for + * n = 0 to 7. + */ + if (aff_routing_en(dist) && n <= 7) { + *rval = RES0; + goto out; + } + + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_ipriorityr[n]; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); + +} + +static int +dist_ipriorityr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + size_t n; + + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_IPRIORITYR]); + /* See dist_ipriorityr_read() */ + if (aff_routing_en(dist) && n <= 7) + /* Ignore writes */ + goto out; + + mtx_lock_spin(&dist->dist_mtx); + dist->gicd_ipriorityr[n] = wval; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_icfgr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + size_t n; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_ICFGR]); + /* + * ARM GIC Architecture Specification, p 8-472: "For SGIs, + * Int_config fields are RO, meaning that GICD_ICFGR0 is RO." + */ + if (n == 0) { + *rval = RES0; + goto out; + } + + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_icfgr[n]; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); + +} + +static int +dist_icfgr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + size_t n; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_ICFGR]); + if (n == 0) + /* Ignore writes */ + goto out; + + mtx_lock_spin(&dist->dist_mtx); + dist->gicd_icfgr[n] = wval; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_irouter_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + size_t n; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg64_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_IROUTER]); + /* GIC Architecture Manual, p 8-485: registers 0 to 31 are reserved */ + if (n <= 31) { + eprintf("Warning: Read from register GICD_IROUTER%zu\n", n); + *rval = RES0; + goto out; + } + + /* + * GIC Architecture Manual, p 8-485: when affinity routing is not + * enabled, the registers are RAZ/WI. 
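+ * Until the guest enables affinity routing these reads return zero and the
+ * corresponding writes are dropped.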
+ */ + if (!aff_routing_en(dist)) { + *rval = RES0; + goto out; + } + + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_irouter[n]; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_irouter_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + size_t n; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg64_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_IROUTER]); + if (n <= 31) { + eprintf("Warning: Write to register GICD_IROUTER%zu\n", n); + goto out; + } + + /* See dist_irouter_read() */ + if (!aff_routing_en(dist)) + /* Ignore writes */ + goto out; + + mtx_lock_spin(&dist->dist_mtx); + dist->gicd_irouter[n] = wval; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_pidr2_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + *rval = dist->gicd_pidr2; + + *retu = false; + return (0); +} + +static int +dist_pidr2_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + eprintf("Warning: Attempted write to read-only register GICD_PIDR2.\n"); + + *retu = false; + return (0); +} + +static int +redist_ctlr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + redist_simple_read(gicr_ctlr, rval, vm, vcpuid); + /* Writes are never pending */ + *rval &= ~GICR_CTLR_RWP & ~GICR_CTLR_UWP; + +#if (DEBUG > 0) + eprintf("\n"); +#endif + + *retu = false; + return (0); +} + +static int +redist_ctlr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + redist_simple_write(wval, gicr_ctlr, vm, vcpuid); + +#if (DEBUG > 0) + eprintf("\n"); +#endif + + *retu = false; + return (0); +} + +static int +redist_typer_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + redist_simple_read(gicr_typer, rval, vm, vcpuid); + +#if (DEBUG > 0) + eprintf("\n"); +#endif + + *retu = false; + return (0); +} + +static int +redist_typer_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + eprintf("Warning: Attempted write to read-only register GICR_TYPER.\n"); + + *retu = false; + return (0); +} + +static int +redist_waker_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + /* Redistributor is always awake */ + *rval = 0 & ~GICR_WAKER_PS & ~GICR_WAKER_CA; + +#if (DEBUG > 0) + eprintf("\n"); +#endif + + *retu = false; + return (0); +} + +static int +redist_waker_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + /* Ignore writes */ +#if (DEBUG > 0) + eprintf("\n"); +#endif + + *retu = false; + return (0); +} + +/* Only group 1 interrupts are supported. Treat IGROUPR0 as RA0/WI. */ +static int +redist_igroupr0_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + *rval = RES1; + *retu = false; + return (0); +} + +/* Only group 1 interrupts are supported. Treat IGROUPR0 as RA0/WI. 
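An all-zero write would ask for the SGIs and PPIs to become Group 0 interrupts, which is unsupported, so it is logged and otherwise ignored.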
*/ +static int +redist_igroupr0_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + if (wval == 0UL) + printf("Warning: Interrupts marked as group 0, ignoring\n"); + + *retu = false; + return (0); +} + +static int +redist_ixenabler_read(void *vm, int vcpuid, uint64_t *rval, void *arg, + enum vgic_mmio_region_name reg) +{ + struct hyp *hyp; + struct vgic_v3_redist *redist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + redist = &hyp->ctx[vcpuid].vgic_redist; + + *rval = redist->gicr_ixenabler0; + + *retu = false; + return (0); +} + +static int +redist_ixenabler_write(void *vm, int vcpuid, uint64_t wval, void *arg, + enum vgic_mmio_region_name reg) +{ + struct hyp *hyp; + struct vgic_v3_redist *redist; + uint32_t old_ixenabler0, new_ixenabler0; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + redist = &hyp->ctx[vcpuid].vgic_redist; + + old_ixenabler0 = redist->gicr_ixenabler0; + if (reg == VGIC_GICR_ICENABLER0) + new_ixenabler0 = old_ixenabler0 & ~wval; + else + new_ixenabler0 = old_ixenabler0 | wval; + mmio_update_int_enabled(new_ixenabler0, old_ixenabler0, 0, hyp, vcpuid); + redist->gicr_ixenabler0 = new_ixenabler0; + + *retu = false; + return (0); +} + + +static int +redist_isenabler0_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ +#if (DEBUG > 0) + eprintf("\n"); +#endif + return (redist_ixenabler_read(vm, vcpuid, rval, arg, + VGIC_GICR_ISENABLER0)); +} + +static int +redist_isenabler0_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ +#if (DEBUG > 0) + eprintf("\n"); +#endif + return (redist_ixenabler_write(vm, vcpuid, wval, arg, + VGIC_GICR_ISENABLER0)); +} + +static int +redist_icenabler0_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ +#if (DEBUG > 0) + eprintf("\n"); +#endif + return (redist_ixenabler_read(vm, vcpuid, rval, arg, + VGIC_GICR_ICENABLER0)); +} + +static int +redist_icenabler0_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ +#if (DEBUG > 0) + eprintf("\n"); +#endif + return (redist_ixenabler_write(vm, vcpuid, wval, arg, + VGIC_GICR_ICENABLER0)); +} + +static int +redist_icactiver0_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + *retu = false; + + return (0); +} + +static int +redist_icactiver0_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + *retu = false; + + return (0); +} + +static int +redist_ipriorityr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_redist *redist; + size_t n; + bool *retu = arg; + +#if (DEBUG > 0) + eprintf("\n"); +#endif + + hyp = vm_get_cookie(vm); + redist = &hyp->ctx[vcpuid].vgic_redist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICR_IPRIORITYR]); + *rval = redist->gicr_ipriorityr[n]; + + *retu = false; + return (0); +} + +static int +redist_ipriorityr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_redist *redist; + size_t n; + bool *retu = arg; + +#if (DEBUG > 0) + eprintf("\n"); +#endif + + hyp = vm_get_cookie(vm); + redist = &hyp->ctx[vcpuid].vgic_redist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICR_IPRIORITYR]); + redist->gicr_ipriorityr[n] = wval; + + *retu = false; + return (0); +} + +static int 
+redist_pidr2_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + /* GICR_PIDR2 has the same value as GICD_PIDR2 */ + *rval = dist->gicd_pidr2; +#if (DEBUG > 0) + eprintf("\n"); +#endif + + *retu = false; + return (0); +} + +static int +redist_pidr2_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + eprintf("Warning: Attempted write to read-only register GICR_PIDR2.\n"); + + *retu = false; + return (0); +} + +static int +redist_icfgr0_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + redist_simple_read(gicr_icfgr0, rval, vm, vcpuid); + + *retu = false; + return (0); +} + +static int +redist_icfgr0_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + redist_simple_write(wval, gicr_icfgr0, vm, vcpuid); + + *retu = false; + return (0); +} + +static int +redist_icfgr1_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + redist_simple_read(gicr_icfgr0, rval, vm, vcpuid); + + *retu = false; + return (0); +} + +static int +redist_icfgr1_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + redist_simple_write(wval, gicr_icfgr0, vm, vcpuid); + + *retu = false; + return (0); +} + +#define alloc_registers(regs, num, size) \ +do { \ + size = n * sizeof(*regs); \ + regs = malloc(size, M_VGIC_V3_MMIO, M_WAITOK | M_ZERO); \ +} while (0) + +#define div_round_up(n, div) (((n) + (div) - 1) / (div)) + +static inline void +init_mmio_region(struct hyp *hyp, size_t regidx, vm_offset_t start, + size_t size, mem_region_read_t read_fn, mem_region_write_t write_fn) +{ + static int i = 0; + + hyp->vgic_mmio_regions[i] = (struct vgic_mmio_region) { + .start = start, + .end = start + size, + .read = read_fn, + .write = write_fn, + }; + + i++; +} + +static void +dist_mmio_init_regions(struct vgic_v3_dist *dist, struct hyp *hyp) +{ + size_t n; + size_t region_size; + + init_mmio_region(hyp, VGIC_GICD_CTLR, dist->start + GICD_CTLR, + sizeof(dist->gicd_ctlr), dist_ctlr_read, dist_ctlr_write); + init_mmio_region(hyp, VGIC_GICD_TYPER, dist->start + GICD_TYPER, + sizeof(dist->gicd_typer), dist_typer_read, dist_typer_write); + init_mmio_region(hyp, VGIC_GICD_IIDR, dist->start + GICD_IIDR, + sizeof(dist->gicd_iidr), dist_iidr_read, dist_iidr_write); + init_mmio_region(hyp, VGIC_GICD_TYPER2, dist->start + GICD_TYPER2, + sizeof(dist->gicd_typer2), dist_typer2_read, dist_typer2_write); + + n = div_round_up(dist->nirqs, 32); + init_mmio_region(hyp, VGIC_GICD_IGROUPR, dist->start + GICD_IGROUPR_BASE, + n * sizeof(uint32_t), dist_igroupr_read, dist_igroupr_write); + + /* ARM GIC Architecture Specification, page 8-471. 
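GICD_TYPER.ITLinesNumber is the number of 32-bit enable registers minus one, so ITLinesNumber + 1 registers back both ISENABLER and ICENABLER.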
*/ + n = (dist->gicd_typer & GICD_TYPER_ITLINESNUM_MASK) + 1; + alloc_registers(dist->gicd_ixenabler, n , region_size); + init_mmio_region(hyp, VGIC_GICD_ISENABLER, dist->start + GICD_ISENABLER_BASE, + region_size, dist_isenabler_read, dist_isenabler_write); + init_mmio_region(hyp, VGIC_GICD_ICENABLER, dist->start + GICD_ICENABLER_BASE, + region_size, dist_icenabler_read, dist_icenabler_write); + + alloc_registers(dist->gicd_ixactiver, n , region_size); + init_mmio_region(hyp, VGIC_GICD_ISACTIVER, dist->start + GICD_ISACTIVER_BASE, + region_size, dist_isactiver_read, dist_isactiver_write); + init_mmio_region(hyp, VGIC_GICD_ICACTIVER, dist->start + GICD_ICACTIVER_BASE, + region_size, dist_icactiver_read, dist_icactiver_write); + + /* ARM GIC Architecture Specification, page 8-483. */ + n = 8 * ((dist->gicd_typer & GICD_TYPER_ITLINESNUM_MASK) + 1); + alloc_registers(dist->gicd_ipriorityr, n, region_size); + init_mmio_region(hyp, VGIC_GICD_IPRIORITYR, dist->start + GICD_IPRIORITYR_BASE, + region_size, dist_ipriorityr_read, dist_ipriorityr_write); + + n = div_round_up(dist->nirqs, 16); + alloc_registers(dist->gicd_icfgr, n, region_size); + init_mmio_region(hyp, VGIC_GICD_ICFGR, dist->start + GICD_ICFGR_BASE, + region_size, dist_icfgr_read, dist_icfgr_write); + + /* ARM GIC Architecture Specification, page 8-485. */ + n = 32 * (dist->gicd_typer & GICD_TYPER_ITLINESNUM_MASK + 1) - 1; + alloc_registers(dist->gicd_irouter, n, region_size); + init_mmio_region(hyp, VGIC_GICD_IROUTER, dist->start + GICD_IROUTER(0), + region_size, dist_irouter_read, dist_irouter_write); + + init_mmio_region(hyp, VGIC_GICD_PIDR2, dist->start + GICD_PIDR2, + sizeof(dist->gicd_pidr2), dist_pidr2_read, dist_pidr2_write); +} + +static void +redist_mmio_init_regions(struct hyp *hyp, int vcpuid) +{ + struct vgic_v3_redist *redist; + vm_offset_t start; + + redist = &hyp->ctx[vcpuid].vgic_redist; + start = redist->start + GICR_FRAME_RD + GICR_CTLR + GICR_FRAMES_END * vcpuid; + /* + hyp->vgic_mmio_regions[VGIC_GICR_CTLR] = (struct vgic_mmio_region) { + .start = start, + .end = start + sizeof(redist->gicr_ctlr), + .read = redist_ctlr_read, + .write = redist_ctlr_write, + }; + */ + init_mmio_region(hyp, VGIC_GICR_CTLR, start, sizeof(redist->gicr_ctlr), + redist_ctlr_read, redist_ctlr_write); + + start = redist->start + GICR_FRAME_RD + GICR_TYPER + GICR_FRAMES_END * vcpuid; + init_mmio_region(hyp, VGIC_GICR_TYPER, start, sizeof(redist->gicr_typer), + redist_typer_read, redist_typer_write); + + start = redist->start + GICR_FRAME_RD + GICR_WAKER + GICR_FRAMES_END * vcpuid; + init_mmio_region(hyp, VGIC_GICR_WAKER, start, 4, redist_waker_read, + redist_waker_write); + + start = redist->start + GICR_FRAME_RD + GICR_PIDR2 + GICR_FRAMES_END * vcpuid; + init_mmio_region(hyp, VGIC_GICR_PIDR2, start, 4, redist_pidr2_read, + redist_pidr2_write); + + start = redist->start + GICR_FRAME_SGI + GICR_IGROUPR0 + GICR_FRAMES_END * vcpuid; + init_mmio_region(hyp, VGIC_GICR_IGROUPR0, start, + sizeof(uint32_t), redist_igroupr0_read, redist_igroupr0_write); + + start = redist->start + GICR_FRAME_SGI + GICR_ISENABLER0 + GICR_FRAMES_END * vcpuid; + init_mmio_region(hyp, VGIC_GICR_ISENABLER0, start, + sizeof(redist->gicr_ixenabler0), redist_isenabler0_read, + redist_isenabler0_write); + + start = redist->start + GICR_FRAME_SGI + GICR_ICACTIVER0 + GICR_FRAMES_END * vcpuid; + init_mmio_region(hyp, VGIC_GICR_ICACTIVER0, start, + sizeof(redist->gicr_icactiver0), redist_icactiver0_read, + redist_icactiver0_write); + + start = redist->start + GICR_FRAME_SGI + 
GICR_ICENABLER0 + GICR_FRAMES_END * vcpuid; + init_mmio_region(hyp, VGIC_GICR_ICENABLER0, start, + sizeof(redist->gicr_ixenabler0), redist_icenabler0_read, + redist_icenabler0_write); + + start = redist->start + GICR_FRAME_SGI + GICR_IPRIORITYR_BASE + GICR_FRAMES_END * vcpuid; + init_mmio_region(hyp, VGIC_GICR_IPRIORITYR, start, + sizeof(redist->gicr_ipriorityr), redist_ipriorityr_read, + redist_ipriorityr_write); + + start = redist->start + GICR_FRAME_SGI + GICR_ICFGR0 + GICR_FRAMES_END * vcpuid; + init_mmio_region(hyp, VGIC_GICR_ICFGR0, start, + sizeof(redist->gicr_icfgr0), redist_icfgr0_read, redist_icfgr0_write); + + start = redist->start + GICR_FRAME_SGI + GICR_ICFGR1 + GICR_FRAMES_END * vcpuid; + init_mmio_region(hyp, VGIC_GICR_ICFGR1, start, + sizeof(redist->gicr_icfgr1), redist_icfgr1_read, redist_icfgr1_write); +} + +void +vgic_v3_mmio_init(struct hyp *hyp) +{ + struct vgic_v3_dist *dist = &hyp->vgic_dist; + int redist_region_num, dist_region_num, region_num; + int ncpus = VM_MAXCPU; + int i; + + dist_region_num = FIRST_REDIST_MMIO_REGION; + redist_region_num = \ + (VGIC_MMIO_REGIONS_NUM - FIRST_REDIST_MMIO_REGION); + region_num = dist_region_num + redist_region_num; + + hyp->vgic_mmio_regions = \ + malloc(region_num * sizeof(*hyp->vgic_mmio_regions), + M_VGIC_V3_MMIO, M_WAITOK | M_ZERO); + hyp->vgic_mmio_regions_num = region_num; + + dist_mmio_init_regions(dist, hyp); + + /* TODO: Do it for all VCPUs */ + for (i = 0; i < ncpus; i++) + redist_mmio_init_regions(hyp, i); +} + +void +vgic_v3_mmio_destroy(struct hyp *hyp) +{ + struct vgic_v3_dist *dist = &hyp->vgic_dist; + + if (!hyp->vgic_mmio_regions) + return; + free(hyp->vgic_mmio_regions, M_VGIC_V3_MMIO); + + free(dist->gicd_ixenabler, M_VGIC_V3_MMIO); + free(dist->gicd_ipriorityr, M_VGIC_V3_MMIO); + free(dist->gicd_icfgr, M_VGIC_V3_MMIO); + free(dist->gicd_irouter, M_VGIC_V3_MMIO); +} + +int +vgic_v3_icc_sgi1r_el1_read(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + bool *retu = arg; + + retu = false; + + return (0); +} + +int +vgic_v3_icc_sgi1r_el1_write(void *vm, int vcpuid, uint64_t wval, void *arg) +{ + struct hyp *hyp; + bool *retu = arg; + int intid, vcpu; + cpuset_t active_cpus; + + hyp = vm_get_cookie(vm); + active_cpus = vm_active_cpus(vm); + + if ((wval & ICC_SGI1R_EL1_IRM) == 0) { + intid = (wval >> ICC_SGI1R_EL1_SGIID_SHIFT) & + ICC_SGI1R_EL1_SGIID_MASK; + + for (vcpu = 0; vcpu < ICC_SGI1R_EL1_TargetList_Bits; vcpu++) { + if (CPU_ISSET(vcpu, &active_cpus) && vcpu != vcpuid) { + vgic_v3_inject_irq(&hyp->ctx[vcpu], intid, + VGIC_IRQ_MISC); + } + } + } else { + /* TODO Interrupts routed to all PEs, excluding "self" */ + } + + retu = false; + + return (0); +} Index: sys/arm64/vmm/io/vgic_v3_reg.h =================================================================== --- /dev/null +++ sys/arm64/vmm/io/vgic_v3_reg.h @@ -0,0 +1,97 @@ +#ifndef _VGIC_V3_REG_H_ +#define _VGIC_V3_REG_H_ + +/* Interrupt Controller End of Interrupt Status Register */ +#define ICH_EISR_EL2_STATUS_MASK 0xffff +#define ICH_EISR_EL2_EOI_NOT_HANDLED(lr) ((1 << lr) & ICH_EISR_EL2_STATUS_MASK) + +/* Interrupt Controller Empty List Register Status Register */ +#define ICH_ELRSR_EL2_STATUS_MASK 0xffff +#define ICH_ELRSR_EL2_LR_EMPTY(x) ((1 << x) & ICH_ELRSR_EL2_STATUS_MASK) + +/* Interrupt Controller Hyp Control Register */ +#define ICH_HCR_EL2_EOICOUNT_SHIFT 27 +#define ICH_HCR_EL2_EOICOUNT_MASK (0x1f << ICH_HCR_EL2_EOICOUNT_SHIFT) +#define ICH_HCR_EL2_TDIR (1 << 14) /* Trap non-secure EL1 writes to IC{C, V}_DIR_EL1 */ +#define ICH_HCR_EL2_TSEI (1 << 
14) /* Trap System Error Interupts (SEI) to EL2 */ +#define ICH_HCR_EL2_TALL1 (1 << 12) /* Trap non-secure EL1 accesses to IC{C, V}_* for Group 1 interrupts */ +#define ICH_HCR_EL2_TALL0 (1 << 11) /* Trap non-secure EL1 accesses to IC{C, V}_* for Group 0 interrupts */ +#define ICH_HCR_EL2_TC (1 << 10) /* Trap non-secure EL1 accesses to common IC{C, V}_* registers */ +#define ICH_HCR_EL2_VGRP1DIE (1 << 7) /* VM Group 1 Disabled Interrupt Enable */ +#define ICH_HCR_EL2_VGRP1EIE (1 << 6) /* VM Group 1 Enabled Interrupt Enable */ +#define ICH_HCR_EL2_VGRP0DIE (1 << 5) /* VM Group 0 Disabled Interrupt Enable */ +#define ICH_HCR_EL2_VGRP0EIE (1 << 4) /* VM Group 0 Enabled Interrupt Enable */ +#define ICH_HCR_EL2_NPIE (1 << 3) /* No Pending Interrupt Enable */ +#define ICH_HCR_EL2_LRENPIE (1 << 2) /* List Register Entry Not Present Interrupt Enable */ +#define ICH_HCR_EL2_UIE (1 << 1) /* Underflow Interrupt Enable */ +#define ICH_HCR_EL2_En (1 << 0) /* Global enable for the virtual CPU interface */ + +/* Interrupt Controller List Registers */ +#define ICH_LR_EL2_VINTID_MASK 0xffffffff +#define ICH_LR_EL2_VINTID(x) ((x) & ICH_LR_EL2_VINTID_MASK) +#define ICH_LR_EL2_PINTID_SHIFT 32 +#define ICH_LR_EL2_PINTID_MASK (0x3fUL << ICH_LR_EL2_PINTID_SHIFT) +#define ICH_LR_EL2_PRIO_SHIFT 48 +#define ICH_LR_EL2_PRIO_MASK (0xffUL << ICH_LR_EL2_PRIO_SHIFT) +#define ICH_LR_EL2_GROUP_SHIFT 60 +#define ICH_LR_EL2_GROUP1 (1UL << ICH_LR_EL2_GROUP_SHIFT) +#define ICH_LR_EL2_HW (1UL << 61) +#define ICH_LR_EL2_STATE_SHIFT 62 +#define ICH_LR_EL2_STATE_MASK (0x3UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE(x) ((x) & ICH_LR_EL2_STATE_MASK) +#define ICH_LR_EL2_STATE_INACTIVE (0x0UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE_PENDING (0x1UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE_ACTIVE (0x2UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE_PENDING_ACTIVE (0x3UL << ICH_LR_EL2_STATE_SHIFT) + +/* Interrupt Controller Maintenance Interrupt State Register */ +#define ICH_MISR_EL2_VGRP1D (1 << 7) /* vPE Group 1 Disabled */ +#define ICH_MISR_EL2_VGRP1E (1 << 6) /* vPE Group 1 Enabled */ +#define ICH_MISR_EL2_VGRP0D (1 << 5) /* vPE Group 0 Disabled */ +#define ICH_MISR_EL2_VGRP0E (1 << 4) /* vPE Group 0 Enabled */ +#define ICH_MISR_EL2_NP (1 << 3) /* No Pending */ +#define ICH_MISR_EL2_LRENP (1 << 2) /* List Register Entry Not Present */ +#define ICH_MISR_EL2_U (1 << 1) /* Underflow */ +#define ICH_MISR_EL2_EOI (1 << 0) /* End Of Interrupt */ + +/* Interrupt Controller Virtual Machine Control Register */ +#define ICH_VMCR_EL2_VPMR_SHIFT 24 +#define ICH_VMCR_EL2_VPMR_MASK (0xff << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VPMR_PRIO_LOWEST (0xff << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VPMR_PRIO_HIGHEST (0x00 << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VBPR0_SHIFT 21 +#define ICH_VMCR_EL2_VBPR0_MASK (0x7 << ICH_VMCR_EL2_VBPR0_SHIFT) +#define ICH_VMCR_EL2_VBPR0_NO_PREEMPTION \ + (0x7 << ICH_VMCR_EL2_VBPR0_SHIFT) +#define ICH_VMCR_EL2_VBPR1_SHIFT 18 +#define ICH_VMCR_EL2_VBPR1_MASK (0x7 << ICH_VMCR_EL2_VBPR1_SHIFT) +#define ICH_VMCR_EL2_VBPR1_NO_PREEMPTION \ + (0x7 << ICH_VMCR_EL2_VBPR1_SHIFT) +#define ICH_VMCR_EL2_VEOIM (1 << 9) /* Virtual EOI mode */ +#define ICH_VMCR_EL2_VCBPR (1 << 4) /* Virtual Common binary Point Register */ +#define ICH_VMCR_EL2_VFIQEN (1 << 3) /* Virtual FIQ enable */ +#define ICH_VMCR_EL2_VACKCTL (1 << 2) /* Virtual AckCtl */ +#define ICH_VMCR_EL2_VENG1 (1 << 1) /* Virtual Group 1 Interrupt Enable */ +#define ICH_VMCR_EL2_VENG0 (1 << 0) /* Virtual 
Group 0 Interrupt Enable */ + +/* Interrupt Controller VGIC Type Register */ +#define ICH_VTR_EL2_PRIBITS_SHIFT 29 +#define ICH_VTR_EL2_PRIBITS_MASK (0x7 << ICH_VTR_EL2_PRIBITS_SHIFT) +#define ICH_VTR_EL2_PRIBITS(x) \ + ((((x) & ICH_VTR_EL2_PRIBITS_MASK) >> ICH_VTR_EL2_PRIBITS_SHIFT) + 1) +#define ICH_VTR_EL2_PREBITS_SHIFT 26 +#define ICH_VTR_EL2_PREBITS_MASK (0x7 << ICH_VTR_EL2_PREBITS_SHIFT) +#define ICH_VTR_EL2_PREBITS(x) \ + (((x) & ICH_VTR_EL2_PREBITS_MASK) >> ICH_VTR_EL2_PREBITS_SHIFT) +#define ICH_VTR_EL2_SEIS (1 << 22) /* System Error Interrupt (SEI) Support */ +#define ICH_VTR_EL2_A3V (1 << 21) /* Affinity 3 Valid */ +#define ICH_VTR_EL2_NV4 (1 << 20) /* Direct injection of virtual interrupts. RES1 for GICv3 */ +#define ICH_VTR_EL2_TDS (1 << 19) /* Implementation supports ICH_HCR_EL2.TDIR */ +#define ICH_VTR_EL2_LISTREGS_MASK 0x1f +/* + * ICH_VTR_EL2.ListRegs holds the number of list registers, minus one. Add one + * to get the actual number of list registers. + */ +#define ICH_VTR_EL2_LISTREGS(x) (((x) & ICH_VTR_EL2_LISTREGS_MASK) + 1) + +#endif /* !_VGIC_V3_REG_H_ */ Index: sys/arm64/vmm/io/vtimer.h =================================================================== --- /dev/null +++ sys/arm64/vmm/io/vtimer.h @@ -0,0 +1,112 @@ +/*- + * Copyright (c) 2017 The FreeBSD Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_VTIMER_H_ +#define _VMM_VTIMER_H_ + +#define GT_PHYS_NS_IRQ 30 +#define GT_VIRT_IRQ 27 + +#define CNTP_CTL_EL0_OP0 0b11 +#define CNTP_CTL_EL0_OP2 0b001 +#define CNTP_CTL_EL0_OP1 0b011 +#define CNTP_CTL_EL0_CRn 0b1110 +#define CNTP_CTL_EL0_CRm 0b0010 +#define ISS_CNTP_CTL_EL0 \ + (CNTP_CTL_EL0_OP0 << ISS_MSR_OP0_SHIFT | \ + CNTP_CTL_EL0_OP2 << ISS_MSR_OP2_SHIFT | \ + CNTP_CTL_EL0_OP1 << ISS_MSR_OP1_SHIFT | \ + CNTP_CTL_EL0_CRn << ISS_MSR_CRn_SHIFT | \ + CNTP_CTL_EL0_CRm << ISS_MSR_CRm_SHIFT) + +#define CNTP_CVAL_EL0_OP0 0b11 +#define CNTP_CVAL_EL0_OP1 0b011 +#define CNTP_CVAL_EL0_OP2 0b010 +#define CNTP_CVAL_EL0_CRn 0b1110 +#define CNTP_CVAL_EL0_CRm 0b0010 +#define ISS_CNTP_CVAL_EL0 \ + (CNTP_CVAL_EL0_OP0 << ISS_MSR_OP0_SHIFT | \ + CNTP_CVAL_EL0_OP2 << ISS_MSR_OP2_SHIFT | \ + CNTP_CVAL_EL0_OP1 << ISS_MSR_OP1_SHIFT | \ + CNTP_CVAL_EL0_CRn << ISS_MSR_CRn_SHIFT | \ + CNTP_CVAL_EL0_CRm << ISS_MSR_CRm_SHIFT) + +#define CNTP_TVAL_EL0_OP0 0b11 +#define CNTP_TVAL_EL0_OP1 0b011 +#define CNTP_TVAL_EL0_OP2 0b000 +#define CNTP_TVAL_EL0_CRn 0b1110 +#define CNTP_TVAL_EL0_CRm 0b0010 +#define ISS_CNTP_TVAL_EL0 \ + (CNTP_TVAL_EL0_OP0 << ISS_MSR_OP0_SHIFT | \ + CNTP_TVAL_EL0_OP2 << ISS_MSR_OP2_SHIFT | \ + CNTP_TVAL_EL0_OP1 << ISS_MSR_OP1_SHIFT | \ + CNTP_TVAL_EL0_CRn << ISS_MSR_CRn_SHIFT | \ + CNTP_TVAL_EL0_CRm << ISS_MSR_CRm_SHIFT) + +struct vtimer +{ + uint64_t cnthctl_el2; + uint64_t cntvoff_el2; +}; + +struct vtimer_cpu +{ + struct callout callout; + uint32_t cntkctl_el1; + /* + * Emulated registers: + * + * CNTP_CTL_EL0: Counter-timer Physical Timer Control Register + * CNTP_CVAL_EL0: Counter-timer Physical Timer CompareValue Register + */ + uint64_t cntp_cval_el0; + uint32_t cntp_ctl_el0; + /* + * The virtual machine has full access to the virtual timer. The + * following registers are part of the VM context for the current CPU: + * + * CNTV_CTL_EL0: Counter-timer Virtuel Timer Control Register + * CNTV_CVAL_EL0: Counter-timer Virtual Timer CompareValue Register + */ + uint64_t cntv_cval_el0; + uint32_t cntv_ctl_el0; +}; + +int vtimer_init(uint64_t cnthctl_el2); +void vtimer_vminit(void *arg); +void vtimer_cpuinit(void *arg); +void vtimer_vmcleanup(void *arg); +void vtimer_cleanup(void); + +int vtimer_phys_ctl_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vtimer_phys_ctl_write(void *vm, int vcpuid, uint64_t wval, void *arg); +int vtimer_phys_cval_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vtimer_phys_cval_write(void *vm, int vcpuid, uint64_t wval, void *arg); +int vtimer_phys_tval_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vtimer_phys_tval_write(void *vm, int vcpuid, uint64_t wval, void *arg); +#endif Index: sys/arm64/vmm/io/vtimer.c =================================================================== --- /dev/null +++ sys/arm64/vmm/io/vtimer.c @@ -0,0 +1,407 @@ +/*- + * Copyright (c) 2017 The FreeBSD Foundation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "vgic_v3.h" +#include "vtimer.h" + +#define RES1 0xffffffffffffffffUL + +#define timer_enabled(ctl) \ + (!((ctl) & CNTP_CTL_IMASK) && ((ctl) & CNTP_CTL_ENABLE)) + +static uint64_t cnthctl_el2_reg; +static uint32_t tmr_frq; + +#define timer_condition_met(ctl) ((ctl) & CNTP_CTL_ISTATUS) + +static int +vtimer_virtual_timer_intr(void *arg) +{ + struct hypctx *hypctx; + uint32_t cntv_ctl; + + /* + * TODO everything here is very strange. The relantionship between the + * hardware value and the value in memory is not clear at all. + */ + + hypctx = arm64_get_active_vcpu(); + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + + if (!hypctx) { + /* vm_destroy() was called. */ + eprintf("No active vcpu\n"); + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + goto out; + } + if (!timer_enabled(cntv_ctl)) { + eprintf("Timer not enabled\n"); + goto out; + } + if (!timer_condition_met(cntv_ctl)) { + eprintf("Timer condition not met\n"); + goto out; + } + + vgic_v3_inject_irq(hypctx, GT_VIRT_IRQ, VGIC_IRQ_CLK); + + hypctx->vtimer_cpu.cntv_ctl_el0 &= ~CNTP_CTL_ENABLE; + cntv_ctl = hypctx->vtimer_cpu.cntv_ctl_el0; + +out: + /* + * Disable the timer interrupt. This will prevent the interrupt from + * being reasserted as soon as we exit the handler and getting stuck + * in an infinite loop. + * + * This is safe to do because the guest disabled the timer, and then + * enables it as part of the interrupt handling routine. + */ + cntv_ctl &= ~CNTP_CTL_ENABLE; + WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl); + + return (FILTER_HANDLED); +} + +int +vtimer_init(uint64_t cnthctl_el2) +{ + int error; + + cnthctl_el2_reg = cnthctl_el2; + /* + * The guest *MUST* use the same timer frequency as the host. The + * register CNTFRQ_EL0 is accessible to the guest and a different value + * in the guest dts file might have unforseen consequences. + */ + tmr_frq = READ_SPECIALREG(cntfrq_el0); + + error = arm_tmr_setup_intr(GT_VIRT, vtimer_virtual_timer_intr, NULL, NULL); + if (error) { + printf("WARNING: arm_tmr_setup_intr() error: %d\n", error); + printf("WARNING: Expect reduced performance\n"); + } + + return (0); +} + +void +vtimer_vminit(void *arg) +{ + struct hyp *hyp; + uint64_t now; + + hyp = (struct hyp *)arg; + /* + * Configure the Counter-timer Hypervisor Control Register for the VM. 
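+ * The physical timer is emulated by vtimer_phys_*_read/write, so guest
+ * accesses to it must trap to EL2, while reads of the physical counter are
+ * passed through.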
+ * + * ~CNTHCTL_EL1PCEN: trap access to CNTP_{CTL, CVAL, TVAL}_EL0 from EL1 + * CNTHCTL_EL1PCTEN: don't trap access to CNTPCT_EL0 + */ + hyp->vtimer.cnthctl_el2 = cnthctl_el2_reg & ~CNTHCTL_EL1PCEN; + hyp->vtimer.cnthctl_el2 |= CNTHCTL_EL1PCTEN; + + now = READ_SPECIALREG(cntpct_el0); + hyp->vtimer.cntvoff_el2 = now; + + return; +} + +void +vtimer_cpuinit(void *arg) +{ + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + + hypctx = (struct hypctx *)arg; + vtimer_cpu = &hypctx->vtimer_cpu; + /* + * Configure physical timer interrupts for the VCPU. + * + * CNTP_CTL_IMASK: mask interrupts + * ~CNTP_CTL_ENABLE: disable the timer + */ + vtimer_cpu->cntp_ctl_el0 = CNTP_CTL_IMASK & ~CNTP_CTL_ENABLE; + /* + * Callout function is MP_SAFE because the VGIC uses a spin + * mutex when modifying the list registers. + */ + callout_init(&vtimer_cpu->callout, 1); +} + +void +vtimer_vmcleanup(void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer *vtimer; + struct vtimer_cpu *vtimer_cpu; + uint32_t cntv_ctl; + int i; + + hyp = arg; + vtimer = &hyp->vtimer; + + hypctx = arm64_get_active_vcpu(); + if (!hypctx) { + /* The active VM was destroyed, stop the timer. */ + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + cntv_ctl &= ~CNTP_CTL_ENABLE; + WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl); + } + + for (i = 0; i < VM_MAXCPU; i++) { + vtimer_cpu = &hyp->ctx[i].vtimer_cpu; + callout_drain(&vtimer_cpu->callout); + } +} + +void +vtimer_cleanup(void) +{ + int error; + + error = arm_tmr_teardown_intr(GT_VIRT); + if (error) + printf("WARNING: arm_tmr_teardown_intr() error: %d\n", error); + +} + +static void +vtimer_inject_irq_callout_func(void *context) +{ + struct hypctx *hypctx; + + hypctx = context; + vgic_v3_inject_irq(hypctx, GT_PHYS_NS_IRQ, VGIC_IRQ_CLK); +} + + +static void +vtimer_schedule_irq(struct vtimer_cpu *vtimer_cpu, struct hypctx *hypctx) +{ + sbintime_t time; + uint64_t cntpct_el0; + uint64_t diff; + + cntpct_el0 = READ_SPECIALREG(cntpct_el0); + if (vtimer_cpu->cntp_cval_el0 < cntpct_el0) { + /* Timer set in the past, trigger interrupt */ + vgic_v3_inject_irq(hypctx, GT_PHYS_NS_IRQ, VGIC_IRQ_CLK); + } else { + diff = vtimer_cpu->cntp_cval_el0 - cntpct_el0; + time = diff * SBT_1S / tmr_frq; + callout_reset_sbt(&vtimer_cpu->callout, time, 0, + vtimer_inject_irq_callout_func, hypctx, 0); + } +} + +static void +vtimer_remove_irq(struct hypctx *hypctx) +{ + struct vtimer_cpu *vtimer_cpu; + + vtimer_cpu = &hypctx->vtimer_cpu; + + callout_drain(&vtimer_cpu->callout); + /* + * The interrupt needs to be deactivated here regardless of the callout + * function having been executed. The timer interrupt can be masked with + * the CNTP_CTL_EL0.IMASK bit instead of reading the IAR register. + * Masking the interrupt doesn't remove it from the list registers. + */ + vgic_v3_remove_irq(hypctx, GT_PHYS_NS_IRQ, true); +} + +/* + * Timer emulation functions. + * + * The guest dts is configured to use the physical timer because the Generic + * Timer can only trap physical timer accesses. This is why we always read the + * physical counter value when programming the time for the timer interrupt in + * the guest. 
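+ * vtimer_schedule_irq() compares CNTP_CVAL_EL0 with the current CNTPCT_EL0
+ * value and either injects the interrupt immediately or arms a callout for the
+ * remaining time.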
+ */ + +int +vtimer_phys_ctl_read(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + struct vtimer_cpu *vtimer_cpu; + uint64_t cntpct_el0; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + vtimer_cpu = &hyp->ctx[vcpuid].vtimer_cpu; + + cntpct_el0 = READ_SPECIALREG(cntpct_el0); + if (vtimer_cpu->cntp_cval_el0 < cntpct_el0) + /* Timer condition met */ + *rval = vtimer_cpu->cntp_ctl_el0 | CNTP_CTL_ISTATUS; + else + *rval = vtimer_cpu->cntp_ctl_el0 & ~CNTP_CTL_ISTATUS; + + *retu = false; + return (0); +} + +int +vtimer_phys_ctl_write(void *vm, int vcpuid, uint64_t wval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + uint64_t ctl_el0; + bool timer_toggled_on; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + hypctx = &hyp->ctx[vcpuid]; + vtimer_cpu = &hypctx->vtimer_cpu; + + timer_toggled_on = false; + ctl_el0 = vtimer_cpu->cntp_ctl_el0; + + if (!timer_enabled(ctl_el0) && timer_enabled(wval)) + timer_toggled_on = true; + + vtimer_cpu->cntp_ctl_el0 = wval; + + if (timer_toggled_on) + vtimer_schedule_irq(vtimer_cpu, hypctx); + + *retu = false; + return (0); +} + +int +vtimer_phys_cval_read(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + struct vtimer_cpu *vtimer_cpu; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + vtimer_cpu = &hyp->ctx[vcpuid].vtimer_cpu; + + *rval = vtimer_cpu->cntp_cval_el0; + + *retu = false; + return (0); +} + +int +vtimer_phys_cval_write(void *vm, int vcpuid, uint64_t wval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + hypctx = &hyp->ctx[vcpuid]; + vtimer_cpu = &hypctx->vtimer_cpu; + + vtimer_cpu->cntp_cval_el0 = wval; + + if (timer_enabled(vtimer_cpu->cntp_ctl_el0)) { + vtimer_remove_irq(hypctx); + vtimer_schedule_irq(vtimer_cpu, hypctx); + } + + *retu = false; + return (0); +} + +int +vtimer_phys_tval_read(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + struct vtimer_cpu *vtimer_cpu; + uint32_t cntpct_el0; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + vtimer_cpu = &hyp->ctx[vcpuid].vtimer_cpu; + + if (!(vtimer_cpu->cntp_ctl_el0 & CNTP_CTL_ENABLE)) { + /* + * ARMv8 Architecture Manual, p. D7-2702: the result of reading + * TVAL when the timer is disabled is UNKNOWN. I have chosen to + * return the maximum value possible on 32 bits which means the + * timer will fire very far into the future. 
+ */ + *rval = (uint32_t)RES1; + } else { + cntpct_el0 = READ_SPECIALREG(cntpct_el0); + *rval = vtimer_cpu->cntp_cval_el0 - cntpct_el0; + } + + *retu = false; + return (0); +} + +int +vtimer_phys_tval_write(void *vm, int vcpuid, uint64_t wval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + uint64_t cntpct_el0; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + hypctx = &hyp->ctx[vcpuid]; + vtimer_cpu = &hypctx->vtimer_cpu; + + cntpct_el0 = READ_SPECIALREG(cntpct_el0); + vtimer_cpu->cntp_cval_el0 = (int32_t)wval + cntpct_el0; + + if (timer_enabled(vtimer_cpu->cntp_ctl_el0)) { + vtimer_remove_irq(hypctx); + vtimer_schedule_irq(vtimer_cpu, hypctx); + } + + *retu = false; + return (0); +} Index: sys/arm64/vmm/mmu.h =================================================================== --- /dev/null +++ sys/arm64/vmm/mmu.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_MMU_H_ +#define _VMM_MMU_H_ + +#include +#include + +#include "hyp.h" + +#define ktohyp(kva) (((vm_offset_t)(kva) & HYP_KVA_MASK) | \ + HYP_KVA_OFFSET) +#define ipatok(ipa, hypmap) (PHYS_TO_DMAP(pmap_extract(hypmap, (ipa)))) +#define gtoipa(gva) ((gva) - KERNBASE + VM_GUEST_BASE_IPA) + +#define page_aligned(x) (((vm_offset_t)(x) & PAGE_MASK) == 0) + +void hypmap_init(pmap_t map, enum pmap_stage pm_stage); +void hypmap_map(pmap_t map, vm_offset_t va, size_t len, + vm_prot_t prot); +void hypmap_map_identity(pmap_t map, vm_offset_t va, size_t len, + vm_prot_t prot); +void hypmap_set(void *arg, vm_offset_t va, vm_offset_t pa, + size_t len, vm_prot_t prot); +vm_paddr_t hypmap_get(void *arg, vm_offset_t va); +void hypmap_cleanup(pmap_t map); + +#endif Index: sys/arm64/vmm/mmu.c =================================================================== --- /dev/null +++ sys/arm64/vmm/mmu.c @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mmu.h" +#include "arm64.h" + +MALLOC_DECLARE(M_HYP); + +void +hypmap_init(pmap_t map, enum pmap_stage pm_stage) +{ + mtx_init(&map->pm_mtx, "hypmap_pm_mtx", NULL, MTX_DEF); + pmap_pinit_stage(map, pm_stage, 4); +} + +void +hypmap_map(pmap_t map, vm_offset_t va, size_t len, vm_prot_t prot) +{ + vm_offset_t va_end, hypva; + vm_page_t dummy_page; + + dummy_page = malloc(sizeof(*dummy_page), M_HYP, M_WAITOK | M_ZERO); + dummy_page->oflags = VPO_UNMANAGED; + dummy_page->md.pv_memattr = VM_MEMATTR_DEFAULT; + + /* + * Add the physical pages which correspond to the specified virtual + * addresses.The virtual addresses span contiguous virtual pages, but + * they might not reside in contiguous physical pages. + */ + va_end = va + len - 1; + va = trunc_page(va); + while (va < va_end) { + dummy_page->phys_addr = vtophys(va); + hypva = (va >= VM_MIN_KERNEL_ADDRESS) ? 
ktohyp(va) : va; + pmap_enter(map, hypva, dummy_page, prot, PMAP_ENTER_WIRED, 0); + va += PAGE_SIZE; + } + + free(dummy_page, M_HYP); +} + +void +hypmap_map_identity(pmap_t map, vm_offset_t va, size_t len, + vm_prot_t prot) +{ + vm_offset_t va_end; + vm_page_t dummy_page; + + dummy_page = malloc(sizeof(*dummy_page), M_HYP, M_WAITOK | M_ZERO); + dummy_page->oflags = VPO_UNMANAGED; + dummy_page->md.pv_memattr = VM_MEMATTR_DEFAULT; + + /* + * The virtual addresses span contiguous virtual pages, but they might + * not reside in contiguous physical pages. For each virtual page we + * get the physical page address and use that for the mapping. + */ + va_end = va + len - 1; + va = trunc_page(va); + while (va < va_end) { + dummy_page->phys_addr = vtophys(va); + pmap_enter(map, dummy_page->phys_addr, dummy_page, + prot, PMAP_ENTER_WIRED, 0); + va += PAGE_SIZE; + } + + free(dummy_page, M_HYP); +} + +/* + * Map 'len' bytes starting at virtual address 'va' to 'len' bytes + * starting at physical address 'pa' + */ +void +hypmap_set(void *arg, vm_offset_t va, vm_offset_t pa, size_t len, + vm_prot_t prot) +{ + vm_offset_t va_end, hypva; + vm_page_t dummy_page; + struct hyp *hyp; + pmap_t map; + + hyp = (struct hyp *)arg; + map = hyp->stage2_map; + + dummy_page = malloc(sizeof(*dummy_page), M_HYP, M_WAITOK | M_ZERO); + dummy_page->oflags = VPO_UNMANAGED; + dummy_page->md.pv_memattr = VM_MEMATTR_DEFAULT; + + va_end = va + len - 1; + va = trunc_page(va); + dummy_page->phys_addr = trunc_page(pa); + while (va < va_end) { + hypva = (va >= VM_MIN_KERNEL_ADDRESS) ? ktohyp(va) : va; + pmap_enter(map, hypva, dummy_page, prot, PMAP_ENTER_WIRED, 0); + va += PAGE_SIZE; + dummy_page->phys_addr += PAGE_SIZE; + } + + free(dummy_page, M_HYP); +} + +/* + * Return the physical address associated with virtual address 'va' + */ +vm_paddr_t +hypmap_get(void *arg, vm_offset_t va) +{ + struct hyp *hyp; + pmap_t map; + + hyp = (struct hyp *)arg; + map = hyp->stage2_map; + + return pmap_extract(map, va); +} + +/* + * Remove all the mappings from the hyp translation tables + */ +void +hypmap_cleanup(pmap_t map) +{ + pmap_remove(map, HYP_VM_MIN_ADDRESS, HYP_VM_MAX_ADDRESS); + mtx_destroy(&map->pm_mtx); + pmap_release(map); +} Index: sys/arm64/vmm/psci.h =================================================================== --- /dev/null +++ sys/arm64/vmm/psci.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _PSCI_H_ +#define _PSCI_H_ + +#include "arm64.h" + +int psci_handle_call(struct vm *vm, int vcpuid, struct vm_exit *vme, + bool *retu); + +#endif Index: sys/arm64/vmm/psci.c =================================================================== --- /dev/null +++ sys/arm64/vmm/psci.c @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include + +#include + +#include "arm64.h" +#include "psci.h" + +#define PSCI_VERSION_0_2 0x2 + +static int +psci_version(struct hypctx *hypctx, bool *retu) +{ + + hypctx->regs.x[0] = PSCI_VERSION_0_2; + + *retu = false; + return (0); +} + +static int +psci_migrate_type(struct hypctx *hypctx, bool *retu) +{ + + hypctx->regs.x[0] = PSCI_VERSION_0_2; + + *retu = false; + return (0); +} + +static int +psci_system_off(struct vm_exit *vme, bool *retu) +{ + vme->u.suspended.how = VM_SUSPEND_POWEROFF; + vme->exitcode = VM_EXITCODE_SUSPENDED; + + *retu = true; + return (0); +} + +int +psci_handle_call(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu) +{ + struct hyp *hyp; + struct hypctx *hypctx; + uint64_t func_id; + uint32_t esr_el2, esr_iss; + int error; + + hyp = vm_get_cookie(vm); + hypctx = &hyp->ctx[vcpuid]; + + esr_el2 = hypctx->exit_info.esr_el2; + esr_iss = esr_el2 & ESR_ELx_ISS_MASK; + + if (esr_iss != 0) { + eprintf("Malformed HVC instruction with immediate: 0x%x\n", + esr_iss); + error = 1; + goto out; + } + + func_id = hypctx->regs.x[0]; + switch (func_id) { + case PSCI_FNID_VERSION: + error = psci_version(hypctx, retu); + break; + case PSCI_FNID_SYSTEM_OFF: + error = psci_system_off(vme, retu); + break; + case PSCI_FNID_CPU_ON: + vme->exitcode = VM_EXITCODE_SPINUP_AP; + vme->u.spinup_ap.vcpu = hypctx->regs.x[1]; + vme->u.spinup_ap.rip = hypctx->regs.x[2]; + vme->u.spinup_ap.ctx_id = hypctx->regs.x[3]; + *retu = true; + error = 0; + break; + case PSCI_FNID_MIGRATE_INFO_TYPE: + error = psci_migrate_type(hypctx, retu); + break; + case PSCI_FNID_MIGRATE_INFO_UP_CPU: + hypctx->regs.x[0] = PSCI_VERSION_0_2; + error = 0; + *retu = false; + break; + default: + eprintf("Unimplemented PSCI function: 0x%016lx\n", func_id); + hypctx->regs.x[0] = PSCI_RETVAL_NOT_SUPPORTED; + error = 1; + } + +out: + return (error); +} Index: sys/arm64/vmm/reset.h =================================================================== --- /dev/null +++ sys/arm64/vmm/reset.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _VMM_RESET_H_ +#define _VMM_RESET_H_ + +void reset_vm_el01_regs(void *vcpu); +void reset_vm_el2_regs(void *vcpu); + +#endif Index: sys/arm64/vmm/reset.c =================================================================== --- /dev/null +++ sys/arm64/vmm/reset.c @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include + +#include "arm64.h" +#include "reset.h" + +/* + * Make the architecturally UNKNOWN value 0. As a bonus, we don't have to + * manually set all those RES0 fields. 
+ */ +#define ARCH_UNKNOWN 0 +#define set_arch_unknown(reg) (memset(&(reg), ARCH_UNKNOWN, sizeof(reg))) + +void +reset_vm_el01_regs(void *vcpu) +{ + struct hypctx *el2ctx; + + el2ctx = vcpu; + + set_arch_unknown(el2ctx->regs); + + set_arch_unknown(el2ctx->actlr_el1); + set_arch_unknown(el2ctx->afsr0_el1); + set_arch_unknown(el2ctx->afsr1_el1); + set_arch_unknown(el2ctx->amair_el1); + set_arch_unknown(el2ctx->contextidr_el1); + set_arch_unknown(el2ctx->cpacr_el1); + set_arch_unknown(el2ctx->elr_el1); + set_arch_unknown(el2ctx->esr_el1); + set_arch_unknown(el2ctx->far_el1); + set_arch_unknown(el2ctx->mair_el1); + set_arch_unknown(el2ctx->par_el1); + + /* + * Guest starts with: + * ~SCTLR_M: MMU off + * ~SCTLR_C: data cache off + * SCTLR_CP15BEN: memory barrier instruction enable from EL0; RAO/WI + * ~SCTLR_I: instruction cache off + */ + el2ctx->sctlr_el1 = SCTLR_RES1; + el2ctx->sctlr_el1 &= ~SCTLR_M & ~SCTLR_C & ~SCTLR_I; + el2ctx->sctlr_el1 |= SCTLR_CP15BEN; + + set_arch_unknown(el2ctx->sp_el0); + set_arch_unknown(el2ctx->tcr_el1); + set_arch_unknown(el2ctx->tpidr_el0); + set_arch_unknown(el2ctx->tpidr_el1); + set_arch_unknown(el2ctx->tpidrro_el0); + set_arch_unknown(el2ctx->ttbr0_el1); + set_arch_unknown(el2ctx->ttbr1_el1); + set_arch_unknown(el2ctx->vbar_el1); + set_arch_unknown(el2ctx->spsr_el1); +} + +void +reset_vm_el2_regs(void *vcpu) +{ + struct hypctx *el2ctx; + uint64_t cpu_aff; + + el2ctx = vcpu; + + /* + * Set the Hypervisor Configuration Register: + * + * HCR_RW: use AArch64 for EL1 + * HCR_BSU_IS: barrier instructions apply to the inner shareable + * domain + * HCR_SWIO: turn set/way invalidate into set/way clean and + * invalidate + * HCR_FB: broadcast maintenance operations + * HCR_AMO: route physical SError interrupts to EL2 + * HCR_IMO: route physical IRQ interrupts to EL2 + * HCR_FMO: route physical FIQ interrupts to EL2 + * HCR_VM: use stage 2 translation + */ + el2ctx->hcr_el2 = HCR_RW | HCR_BSU_IS | HCR_SWIO | HCR_FB | \ + HCR_VM | HCR_AMO | HCR_IMO | HCR_FMO; + + el2ctx->vmpidr_el2 = VMPIDR_EL2_RES1; + /* The guest will detect a multi-core, single-threaded CPU */ + el2ctx->vmpidr_el2 &= ~VMPIDR_EL2_U & ~VMPIDR_EL2_MT; + /* Only 24 bits of affinity, for a grand total of 16,777,216 cores. */ + cpu_aff = el2ctx->vcpu & (CPU_AFF0_MASK | CPU_AFF1_MASK | CPU_AFF2_MASK); + el2ctx->vmpidr_el2 |= cpu_aff; + + /* Use the same CPU identification information as the host */ + el2ctx->vpidr_el2 = CPU_IMPL_TO_MIDR(CPU_IMPL_ARM); + el2ctx->vpidr_el2 |= CPU_VAR_TO_MIDR(0); + el2ctx->vpidr_el2 |= CPU_ARCH_TO_MIDR(0xf); + el2ctx->vpidr_el2 |= CPU_PART_TO_MIDR(CPU_PART_FOUNDATION); + el2ctx->vpidr_el2 |= CPU_REV_TO_MIDR(0); + + /* + * Don't trap accesses to CPACR_EL1, trace, SVE, Advanced SIMD + * and floating point functionality to EL2. + */ + el2ctx->cptr_el2 = CPTR_RES1; + /* + * Disable interrupts in the guest. The guest OS will re-enable + * them. + */ + el2ctx->spsr_el2 = PSR_D | PSR_A | PSR_I | PSR_F; + /* Use the EL1 stack when taking exceptions to EL1 */ + el2ctx->spsr_el2 |= PSR_M_EL1h; +} Index: sys/arm64/vmm/vmm.c =================================================================== --- /dev/null +++ sys/arm64/vmm/vmm.c @@ -0,0 +1,918 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vmm_stat.h" +#include "vmm_mem.h" +#include "arm64.h" +#include "mmu.h" +#include "psci.h" + +#include "io/vgic_v3.h" +#include "io/vtimer.h" + +#define BSP 0 /* the boostrap processor */ + +struct vcpu { + int flags; + enum vcpu_state state; + struct mtx mtx; + int hostcpu; /* host cpuid this vcpu last ran on */ + int vcpuid; + void *stats; + struct vm_exit exitinfo; + uint64_t nextpc; /* (x) next instruction to execute */ +}; + +#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) +#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) + +struct mem_seg { + uint64_t gpa; + size_t len; + bool wired; + vm_object_t object; +}; +#define VM_MAX_MEMORY_SEGMENTS 2 + +struct vm { + void *cookie; + struct vcpu vcpu[VM_MAXCPU]; + int num_mem_segs; + struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; + char name[VM_MAX_NAMELEN]; + /* + * Set of active vcpus. + * An active vcpu is one that has been started implicitly (BSP) or + * explicitly (AP) by sending it a startup ipi. + */ + cpuset_t active_cpus; + uint16_t maxcpus; +}; + +static bool vmm_initialized = false; + +static struct vmm_ops *ops = NULL; + +#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) + +#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) +#define VMRUN(vmi, vcpu, pc, pmap, rvc, sc) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, pc, pmap, rvc, sc) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMMMAP_SET(vmi, ipa, pa, len, prot) \ + (ops != NULL ? \ + (*ops->vmmapset)(vmi, ipa, pa, len, prot) : ENXIO) +#define VMMMAP_GET(vmi, gpa) \ + (ops != NULL ? 
(*ops->vmmapget)(vmi, gpa) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) + +#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) +#define fpu_stop_emulating() clts() + +static int vm_handle_wfi(struct vm *vm, int vcpuid, + struct vm_exit *vme, bool *retu); + +static MALLOC_DEFINE(M_VMM, "vmm", "vmm"); + +/* statistics */ +static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. + */ +static int halt_detection_enabled = 1; +SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, + &halt_detection_enabled, 0, + "Halt VM if all vcpus execute HLT with interrupts disabled"); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +static int trace_guest_exceptions; +SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, + &trace_guest_exceptions, 0, + "Trap into hypervisor on all guest exceptions and reflect them back"); + +static void +vcpu_cleanup(struct vm *vm, int i, bool destroy) +{ +// struct vcpu *vcpu = &vm->vcpu[i]; +} + +static void +vcpu_init(struct vm *vm, uint32_t vcpu_id, bool create) +{ + struct vcpu *vcpu; + + KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, + ("cpus_init: invalid vcpu %d", vcpu_id)); + + vcpu = &vm->vcpu[vcpu_id]; + + if (create) { + KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " + "initialized", vcpu_id)); + vcpu_lock_init(vcpu); + vcpu->hostcpu = NOCPU; + vcpu->vcpuid = vcpu_id; + } +} + +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= VM_MAXCPU) + panic("vm_exitinfo: invalid cpuid %d", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + +static int +vmm_init(void) +{ + ops = &vmm_ops_arm; + + return (VMM_INIT(0)); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = true; + break; + case MOD_UNLOAD: + error = vmmdev_cleanup(); + if (error == 0 && vmm_initialized) { + error = VMM_CLEANUP(); + if (error) + vmm_initialized = false; + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - HYP initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). + */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +int +vm_create(const char *name, struct vm **retvm) +{ + struct vm *vm; + int i; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. 
+ */ + if (!vmm_initialized) + return (ENXIO); + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (EINVAL); + + vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->maxcpus = VM_MAXCPU; + vm->cookie = VMINIT(vm); + + CPU_ZERO(&vm->active_cpus); + for (i = 0; i < vm->maxcpus; i++) + vcpu_init(vm, i, true); + + *retvm = vm; + return (0); +} + +static void +vm_cleanup(struct vm *vm, bool destroy) +{ + VMCLEANUP(vm->cookie); +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); + free(vm, M_VMM); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +#include +#include + +static caddr_t +search_by_type(const char *type, caddr_t preload_metadata) +{ + caddr_t curp, lname; + uint32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + lname = NULL; + for (;;) { + hdr = (uint32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* remember the start of each record */ + if (hdr[0] == MODINFO_NAME) + lname = curp; + + /* Search for a MODINFO_TYPE field */ + if ((hdr[0] == MODINFO_TYPE) && + !strcmp(type, curp + sizeof(uint32_t) * 2)) + return(lname); + + /* skip to next field */ + next = sizeof(uint32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +static int +vm_handle_reg_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct hyp *hyp; + struct vm_exit *vme; + struct vre *vre; + reg_read_t rread; + reg_write_t rwrite; + uint32_t iss_reg; + int error; + + hyp = (struct hyp *)vm->cookie; + vme = vm_exitinfo(vm, vcpuid); + vre = &vme->u.reg_emul.vre; + + iss_reg = vre->inst_syndrome & ISS_MSR_REG_MASK; + switch (iss_reg) { + case ISS_CNTP_CTL_EL0: + rread = vtimer_phys_ctl_read; + rwrite = vtimer_phys_ctl_write; + break; + case ISS_CNTP_CVAL_EL0: + rread = vtimer_phys_cval_read; + rwrite = vtimer_phys_cval_write; + break; + case ISS_CNTP_TVAL_EL0: + rread = vtimer_phys_tval_read; + rwrite = vtimer_phys_tval_write; + break; + case ICC_SGI1R_EL1: + rread = vgic_v3_icc_sgi1r_el1_read; + rwrite = vgic_v3_icc_sgi1r_el1_write; + break; + default: + goto out_user; + } + + error = vmm_emulate_register(vm, vcpuid, vre, rread, rwrite, retu); + + return (error); + +out_user: + *retu = true; + return (0); +} + +static int +vm_mmio_region_match(const void *key, const void *memb) +{ + const uint64_t *addr = key; + const struct vgic_mmio_region *vmr = memb; + + if (*addr < vmr->start) + return (-1); + else if (*addr >= vmr->start && *addr < vmr->end) + return (0); + else + return (1); +} + +static int +vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct vm_exit *vme; + struct vie *vie; + struct hyp *hyp = vm->cookie; + uint64_t fault_ipa; + struct vgic_mmio_region *vmr; + int error, i; + + if (!hyp->vgic_attached) + goto out_user; + + vme = vm_exitinfo(vm, vcpuid); + vie = &vme->u.inst_emul.vie; + + fault_ipa = vme->u.inst_emul.gpa; + + vmr = NULL; + for (i = 0; i < hyp->vgic_mmio_regions_num; i++) + if (fault_ipa >= hyp->vgic_mmio_regions[i].start && fault_ipa <= hyp->vgic_mmio_regions[i].end) + vmr = &hyp->vgic_mmio_regions[i]; + if (!vmr) + goto out_user; + + error = vmm_emulate_instruction(vm, vcpuid, fault_ipa, vie, + vmr->read, vmr->write, retu); + + return (error); + +out_user: + *retu = true; + return (0); +} + +static int +vm_handle_poweroff(struct vm *vm, int vcpuid) +{ + return (0); +} + +static int +vm_handle_psci_call(struct vm *vm, int vcpuid, bool *retu) +{ + struct vm_exit *vme; + enum vm_suspend_how how; + 
int error; + + vme = vm_exitinfo(vm, vcpuid); + + error = psci_handle_call(vm, vcpuid, vme, retu); + if (error) + goto out; + + if (vme->exitcode == VM_EXITCODE_SUSPENDED) { + how = vme->u.suspended.how; + switch (how) { + case VM_SUSPEND_POWEROFF: + vm_handle_poweroff(vm, vcpuid); + break; + default: + /* Nothing to do */ + ; + } + } + +out: + return (error); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + int error, vcpuid; + register_t pc; + struct vm_exit *vme; + bool retu; + void *rvc, *sc; + + vcpuid = vmrun->cpuid; + pc = vmrun->pc; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + rvc = sc = NULL; +restart: + critical_enter(); + error = VMRUN(vm->cookie, vcpuid, pc, NULL, rvc, sc); + critical_exit(); + + vme = vm_exitinfo(vm, vcpuid); + if (error == 0) { + retu = false; + switch (vme->exitcode) { + case VM_EXITCODE_INST_EMUL: + pc = vme->pc + vme->inst_length; + error = vm_handle_inst_emul(vm, vcpuid, &retu); + break; + + case VM_EXITCODE_REG_EMUL: + pc = vme->pc + vme->inst_length; + error = vm_handle_reg_emul(vm, vcpuid, &retu); + break; + + case VM_EXITCODE_HVC: + /* + * The HVC instruction saves the address for the + * next instruction as the return address. + */ + pc = vme->pc; + /* + * The PSCI call can change the exit information in the + * case of suspend/reset/poweroff/cpu off/cpu on. + */ + error = psci_handle_call(vm, vcpuid, vme, &retu); + break; + + case VM_EXITCODE_WFI: + pc = vme->pc + vme->inst_length; + error = vm_handle_wfi(vm, vcpuid, vme, &retu); + break; + + default: + /* Handle in userland */ + retu = true; + break; + } + } + + if (error == 0 && retu == false) + goto restart; + + /* Copy the exit information */ + bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); + + return (error); +} + +int +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EBUSY); + + CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); + return (0); + +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} + +static int +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, + bool from_idle) +{ + int error; + + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. 
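+ * A transition initiated from the ioctl path therefore sleeps below until + * the vcpu's current owner returns it to VCPU_IDLE.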
+ */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error) + return (EBUSY); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE) + wakeup(&vcpu->state); + + return (0); +} + +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + int error; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vcpu, newstate, from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) +{ + struct vcpu *vcpu; + enum vcpu_state state; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +uint64_t +vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t len) +{ + uint64_t nextpage; + + nextpage = trunc_page(gpa + PAGE_SIZE); + if (len > nextpage - gpa) + panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%zu", gpa, len); + + return (VMMMAP_GET(vm->cookie, gpa)); +} + +int +vm_gpabase2memseg(struct vm *vm, uint64_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + *seg = vm->mem_segs[i]; + return (0); + } + } + return (-1); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error) + return (error); + + vcpu = &vm->vcpu[vcpuid]; + vcpu->nextpc = val; + + return(0); +} + +void * +vm_get_cookie(struct vm *vm) +{ + return vm->cookie; +} + +uint16_t +vm_get_maxcpus(struct vm *vm) +{ + return (vm->maxcpus); +} + +static void +vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) +{ + size_t len; + uint64_t hpa; + + len = 0; + while (len < seg->len) { + hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE); + if (hpa == (uint64_t)-1) { + panic("vm_free_mem_segs: cannot free hpa " + "associated with gpa 0x%016lx", seg->gpa + len); + } + + vmm_mem_free(hpa, PAGE_SIZE); + + len += PAGE_SIZE; 
+ } + + bzero(seg, sizeof(struct vm_memory_segment)); +} + +/* + * Return true if 'gpa' is available for allocation, false otherwise + */ +static bool +vm_ipa_available(struct vm *vm, uint64_t ipa) +{ + uint64_t ipabase, ipalimit; + int i; + + if (!page_aligned(ipa)) + panic("vm_ipa_available: ipa (0x%016lx) not page aligned", ipa); + + for (i = 0; i < vm->num_mem_segs; i++) { + ipabase = vm->mem_segs[i].gpa; + ipalimit = ipabase + vm->mem_segs[i].len; + if (ipa >= ipabase && ipa < ipalimit) + return (false); + } + + return (true); +} + +/* + * Allocate 'len' bytes for the virtual machine starting at address 'ipa' + */ +int +vm_malloc(struct vm *vm, uint64_t ipa, size_t len) +{ + struct vm_memory_segment *seg; + int error, available, allocated; + uint64_t ipa2; + vm_paddr_t pa; + + if (!page_aligned(ipa) != 0 || !page_aligned(len) || len == 0) + return (EINVAL); + + available = allocated = 0; + ipa2 = ipa; + while (ipa2 < ipa + len) { + if (vm_ipa_available(vm, ipa2)) + available++; + else + allocated++; + ipa2 += PAGE_SIZE; + } + + /* + * If there are some allocated and some available pages in the address + * range then it is an error. + */ + if (allocated != 0 && available != 0) + return (EINVAL); + + /* + * If the entire address range being requested has already been + * allocated then there isn't anything more to do. + */ + if (allocated != 0 && available == 0) + return (0); + + if (vm->num_mem_segs == VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + + seg = &vm->mem_segs[vm->num_mem_segs]; + error = 0; + seg->gpa = ipa; + seg->len = 0; + while (seg->len < len) { + pa = vmm_mem_alloc(PAGE_SIZE); + if (pa == 0) { + error = ENOMEM; + break; + } + VMMMAP_SET(vm->cookie, ipa, pa, PAGE_SIZE, VM_PROT_ALL); + + seg->len += PAGE_SIZE; + ipa += PAGE_SIZE; + } + vm->num_mem_segs++; + + return (0); +} + +int +vm_attach_vgic(struct vm *vm, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size) +{ + int error; + + error = vgic_v3_attach_to_vm(vm->cookie, dist_start, dist_size, + redist_start, redist_size); + + return (error); +} + +int +vm_assert_irq(struct vm *vm, uint32_t irq, uint32_t vcpuid) +{ + struct hyp *hyp = (struct hyp *)vm->cookie; + int error; + + /* TODO: this is crap, send the vcpuid as an argument to vm_assert_irq */ + error = vgic_v3_inject_irq(&hyp->ctx[vcpuid], irq, VGIC_IRQ_VIRTIO); + + return (error); +} + +int +vm_deassert_irq(struct vm *vm, uint32_t irq, uint32_t vcpuid) +{ + struct hyp *hyp = (struct hyp *)vm->cookie; + int error; + + error = vgic_v3_remove_irq(&hyp->ctx[vcpuid], irq, false); + + return (error); +} + +static int +vm_handle_wfi(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu) +{ + struct vcpu *vcpu; + struct hypctx *hypctx; + bool intr_disabled; + + vcpu = &vm->vcpu[vcpuid]; + hypctx = vme->u.wfi.hypctx; + intr_disabled = !(hypctx->regs.spsr & PSR_I); + + vcpu_lock(vcpu); + while (1) { + if (!intr_disabled && vgic_v3_vcpu_pending_irq(hypctx)) + break; + + if (vcpu_should_yield(vm, vcpuid)) + break; + + vcpu_set_state_locked(vcpu, VCPU_SLEEPING, false); + msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz); + vcpu_set_state_locked(vcpu, VCPU_FROZEN, false); + } + vcpu_unlock(vcpu); + + *retu = false; + return (0); +} Index: sys/arm64/vmm/vmm_dev.c =================================================================== --- /dev/null +++ sys/arm64/vmm/vmm_dev.c @@ -0,0 +1,407 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 
Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +struct vmmdev_softc { + struct vm *vm; /* vm instance cookie */ + struct cdev *cdev; + SLIST_ENTRY(vmmdev_softc) link; + int flags; +}; +#define VSC_LINKED 0x01 + +static SLIST_HEAD(, vmmdev_softc) head; + +static struct mtx vmmdev_mtx; + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); + +SYSCTL_DECL(_hw_vmm); + +static struct vmmdev_softc * +vmmdev_lookup(const char *name) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(name, vm_name(sc->vm)) == 0) + break; + } + + return (sc); +} + +static struct vmmdev_softc * +vmmdev_lookup2(struct cdev *cdev) +{ + + return (cdev->si_drv1); +} + +static int +vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) +{ + int error = 0; + + return (error); +} + +static int +vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + int error, vcpu, state_changed; + struct vmmdev_softc *sc; + struct vm_run *vmrun; + struct vm_memory_segment *seg; + struct vm_register *vmreg; + struct vm_activate_cpu *vac; + struct vm_attach_vgic *vav; + struct vm_irq *vi; + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) + return (ENXIO); + + error = 0; + vcpu = -1; + state_changed = 0; + + /* + * Some VMM ioctls can operate only on vcpus that are not running. + */ + switch (cmd) { + case VM_RUN: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + /* + * XXX fragile, handle with care + * Assumes that the first field of the ioctl data is the vcpu. + */ + vcpu = *(int *)data; + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto done; + } + + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + if (error) + goto done; + + state_changed = 1; + break; + + case VM_MAP_MEMORY: + case VM_ATTACH_VGIC: + /* + * ioctls that operate on the entire virtual machine must + * prevent all vcpus from running. 
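+ * Each vcpu is frozen in turn; if one cannot be frozen, the vcpus that + * were already frozen are returned to VCPU_IDLE and the ioctl fails.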
+ */ + error = 0; + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + if (error) + break; + } + + if (error) { + vcpu--; + while (vcpu >= 0) { + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + vcpu--; + } + goto done; + } + + state_changed = 2; + break; + case VM_ASSERT_IRQ: + vi =(struct vm_irq *)data; + error = vm_assert_irq(sc->vm, vi->irq, vi->vcpuid); + break; + case VM_DEASSERT_IRQ: + vi = (struct vm_irq *)data; + error = vm_deassert_irq(sc->vm, vi->irq, vi->vcpuid); + break; + default: + break; + } + + switch(cmd) { + case VM_RUN: + vmrun = (struct vm_run *)data; + error = vm_run(sc->vm, vmrun); + break; + case VM_MAP_MEMORY: + seg = (struct vm_memory_segment *)data; + error = vm_malloc(sc->vm, seg->gpa, seg->len); + break; + case VM_GET_MEMORY_SEG: + seg = (struct vm_memory_segment *)data; + seg->len = 0; + (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); + error = 0; + break; + case VM_GET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, + &vmreg->regval); + break; + case VM_SET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, + vmreg->regval); + break; + case VM_ACTIVATE_CPU: + vac = (struct vm_activate_cpu *)data; + error = vm_activate_cpu(sc->vm, vac->vcpuid); + break; + case VM_ATTACH_VGIC: + vav = (struct vm_attach_vgic *)data; + error = vm_attach_vgic(sc->vm, vav->dist_start, vav->dist_size, + vav->redist_start, vav->redist_size); + break; + default: + error = ENOTTY; + break; + } + + if (state_changed == 1) { + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + } else if (state_changed == 2) { + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + } + +done: + /* Make sure that no handler returns a bogus value like ERESTART */ + KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); + return (error); +} + +static int +vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr, + int nprot, vm_memattr_t *memattr) +{ + int error; + struct vmmdev_softc *sc; + + error = -1; + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup2(cdev); + if (sc != NULL && !(nprot & PROT_EXEC)) { + *paddr = (vm_paddr_t)vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE); + if (*paddr != (vm_paddr_t)-1) + error = 0; + } + + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static void +vmmdev_destroy(void *arg) +{ + + struct vmmdev_softc *sc = arg; + + if (sc->cdev != NULL) + destroy_dev(sc->cdev); + + if (sc->vm != NULL) + vm_destroy(sc->vm); + + if ((sc->flags & VSC_LINKED) != 0) { + mtx_lock(&vmmdev_mtx); + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + mtx_unlock(&vmmdev_mtx); + } + + free(sc, M_VMMDEV); +} + +static int +sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) +{ + int error; + char buf[VM_MAX_NAMELEN]; + struct vmmdev_softc *sc; + struct cdev *cdev; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + if (sc == NULL || sc->cdev == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + + /* + * The 'cdev' will be destroyed asynchronously when 'si_threadcount' + * goes down to 0 so we should not do it again in the callback. 
+ */ + cdev = sc->cdev; + sc->cdev = NULL; + mtx_unlock(&vmmdev_mtx); + + /* + * Schedule the 'cdev' to be destroyed: + * + * - any new operations on this 'cdev' will return an error (ENXIO). + * + * - when the 'si_threadcount' dwindles down to zero the 'cdev' will + * be destroyed and the callback will be invoked in a taskqueue + * context. + */ + destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); + + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_destroy, "A", NULL); + +static struct cdevsw vmmdevsw = { + .d_name = "vmmdev", + .d_version = D_VERSION, + .d_ioctl = vmmdev_ioctl, + .d_mmap = vmmdev_mmap, + .d_read = vmmdev_rw, + .d_write = vmmdev_rw, +}; + +static int +sysctl_vmm_create(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vm *vm; + struct cdev *cdev; + struct vmmdev_softc *sc, *sc2; + char buf[VM_MAX_NAMELEN]; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + mtx_unlock(&vmmdev_mtx); + if (sc != NULL) + return (EEXIST); + + error = vm_create(buf, &vm); + if (error != 0) + return (error); + + sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); + sc->vm = vm; + + /* + * Lookup the name again just in case somebody sneaked in when we + * dropped the lock. + */ + mtx_lock(&vmmdev_mtx); + sc2 = vmmdev_lookup(buf); + if (sc2 == NULL) { + SLIST_INSERT_HEAD(&head, sc, link); + sc->flags |= VSC_LINKED; + } + mtx_unlock(&vmmdev_mtx); + + if (sc2 != NULL) { + vmmdev_destroy(sc); + return (EEXIST); + } + + error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, + UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); + if (error != 0) { + vmmdev_destroy(sc); + return (error); + } + + mtx_lock(&vmmdev_mtx); + sc->cdev = cdev; + sc->cdev->si_drv1 = sc; + mtx_unlock(&vmmdev_mtx); + + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_create, "A", NULL); + +void +vmmdev_init(void) +{ + mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); +} + +int +vmmdev_cleanup(void) +{ + int error; + + if (SLIST_EMPTY(&head)) + error = 0; + else + error = EBUSY; + + return (error); +} Index: sys/arm64/vmm/vmm_instruction_emul.c =================================================================== --- /dev/null +++ sys/arm64/vmm/vmm_instruction_emul.c @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef _KERNEL +#include +#include +#include +#include + +#include + +#include + +#else +#include +#include +#include +#include +#include + +#include + +#include +#include +#endif + +#include + +int +vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + uint64_t val; + int error; + + if (vie->dir == VM_DIR_READ) { + error = memread(vm, vcpuid, gpa, &val, vie->access_size, memarg); + if (error) + goto out; + error = vm_set_register(vm, vcpuid, vie->reg, val); + } else { + error = vm_get_register(vm, vcpuid, vie->reg, &val); + if (error) + goto out; + error = memwrite(vm, vcpuid, gpa, val, vie->access_size, memarg); + } + +out: + return (error); +} + +int +vmm_emulate_register(void *vm, int vcpuid, struct vre *vre, reg_read_t regread, + reg_write_t regwrite, void *regarg) +{ + uint64_t val; + int error; + + if (vre->dir == VM_DIR_READ) { + error = regread(vm, vcpuid, &val, regarg); + if (error) + goto out; + error = vm_set_register(vm, vcpuid, vre->reg, val); + } else { + error = vm_get_register(vm, vcpuid, vre->reg, &val); + if (error) + goto out; + error = regwrite(vm, vcpuid, val, regarg); + } + +out: + return (error); +} Index: sys/arm64/vmm/vmm_mem.h =================================================================== --- /dev/null +++ sys/arm64/vmm/vmm_mem.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +int vmm_mem_init(void); +vm_paddr_t vmm_mem_alloc(size_t size); +void vmm_mem_free(vm_paddr_t start, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); + +#endif Index: sys/arm64/vmm/vmm_mem.c =================================================================== --- /dev/null +++ sys/arm64/vmm/vmm_mem.c @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "vmm_mem.h" + +SYSCTL_DECL(_hw_vmm); + +static u_long pages_allocated; +SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD, + &pages_allocated, 0, "4KB pages allocated"); + +static void +update_pages_allocated(int howmany) +{ + pages_allocated += howmany; /* XXX locking? 
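The increment is not atomic and no lock is held here, so concurrent allocations from different VMs can race.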
*/ +} + +int +vmm_mem_init(void) +{ + + return (0); +} + +vm_paddr_t +vmm_mem_alloc(size_t size) +{ + + int flags; + vm_page_t m; + vm_paddr_t pa; + + if (size != PAGE_SIZE) + panic("vmm_mem_alloc: invalid allocation size %zu", size); + + flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | + VM_ALLOC_ZERO; + + while (1) { + /* + * XXX need policy to determine when to back off the allocation + */ + m = vm_page_alloc(NULL, 0, flags); + if (m == NULL) + vm_wait(NULL); + else + break; + } + + pa = VM_PAGE_TO_PHYS(m); + + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + + m->valid = VM_PAGE_BITS_ALL; + update_pages_allocated(1); + + return (pa); +} + +void +vmm_mem_free(vm_paddr_t base, size_t length) +{ + vm_page_t m; + + if (base & PAGE_MASK) { + panic("vmm_mem_free: base 0x%0lx must be aligned on a " + "0x%0x boundary\n", base, PAGE_SIZE); + } + + if (length != PAGE_SIZE) + panic("vmm_mem_free: invalid length %zu", length); + + m = PHYS_TO_VM_PAGE(base); + vm_page_unwire_noq(m); + vm_page_free(m); + + update_pages_allocated(-1); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (ptoa(Maxmem)); +} Index: sys/arm64/vmm/vmm_stat.h =================================================================== --- /dev/null +++ sys/arm64/vmm/vmm_stat.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ + +enum vmm_stat_scope { + VMM_STAT_SCOPE_ANY, + VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */ + VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ +}; + +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu, + struct vmm_stat_type *stat); + +struct vmm_stat_type { + int index; /* position in the stats buffer */ + int nelems; /* standalone or array */ + const char *desc; /* description of statistic */ + vmm_stat_func_t func; + enum vmm_stat_scope scope; +}; + +void vmm_stat_register(void *arg); + +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ + struct vmm_stat_type type[1] = { \ + { -1, nelems, desc, func, scope } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) + +#define VMM_STAT_DECLARE(type) \ + extern struct vmm_stat_type type[1] + +#define VMM_STAT(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) +#define VMM_STAT_INTEL(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL) +#define VMM_STAT_AMD(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) + +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + +#define VMM_STAT_ARRAY(type, nelems, desc) \ + VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) + +void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +int vmm_stat_desc_copy(int index, char *buf, int buflen); + +static void __inline +vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] += x; +#endif +} + +static void __inline +vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t val) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] = val; +#endif +} + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_incr(vm, vcpu, vst, 0, x); +#endif +} + +static void __inline +vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_set(vm, vcpu, vst, 0, val); +#endif +} + +VMM_STAT_DECLARE(VCPU_MIGRATIONS); +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_EXTINT); +VMM_STAT_DECLARE(VMEXIT_HLT); +VMM_STAT_DECLARE(VMEXIT_CR_ACCESS); +VMM_STAT_DECLARE(VMEXIT_RDMSR); +VMM_STAT_DECLARE(VMEXIT_WRMSR); +VMM_STAT_DECLARE(VMEXIT_MTRAP); +VMM_STAT_DECLARE(VMEXIT_PAUSE); +VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); +VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); +VMM_STAT_DECLARE(VMEXIT_INOUT); +VMM_STAT_DECLARE(VMEXIT_CPUID); +VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); +VMM_STAT_DECLARE(VMEXIT_INST_EMUL); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_ASTPENDING); +VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); +VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +#endif Index: 
sys/arm64/vmm/vmm_stat.c =================================================================== --- /dev/null +++ sys/arm64/vmm/vmm_stat.c @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include + +#include +#include "vmm_stat.h" + +/* + * 'vst_num_elems' is the total number of addressable statistic elements + * 'vst_num_types' is the number of unique statistic types + * + * It is always true that 'vst_num_elems' is greater than or equal to + * 'vst_num_types'. This is because a stat type may represent more than + * one element (for e.g. VMM_STAT_ARRAY). 
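 *
 * As an illustration (the name below is made up, not a statistic this
 * file defines), a declaration such as
 *
 *	VMM_STAT_ARRAY(VMEXIT_PER_VCPU, VM_MAXCPU, "per-vcpu exit count");
 *
 * registers one stat type that occupies VM_MAXCPU consecutive elements
 * of the stats buffer, whereas each plain VMM_STAT() at the bottom of
 * this file occupies a single element.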
+ */ +static int vst_num_elems, vst_num_types; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t)) + +void +vmm_stat_register(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { + printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vst_num_elems; + vst_num_elems += vst->nelems; + + vsttab[vst_num_types++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + struct vmm_stat_type *vst; + uint64_t *stats; + int i; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + /* Let stats functions update their counters */ + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (vst->func != NULL) + (*vst->func)(vm, vcpu, vst); + } + + /* Copy over the stats */ + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vst_num_elems; i++) + buf[i] = stats[i]; + *num_stats = vst_num_elems; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + + return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); +} + +void +vmm_stat_init(void *vp) +{ + + bzero(vp, vst_size); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +int +vmm_stat_desc_copy(int index, char *buf, int bufsize) +{ + int i; + struct vmm_stat_type *vst; + + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (index >= vst->index && index < vst->index + vst->nelems) { + if (vst->nelems > 1) { + snprintf(buf, bufsize, "%s[%d]", + vst->desc, index - vst->index); + } else { + strlcpy(buf, vst->desc, bufsize); + } + return (0); /* found it */ + } + } + + return (EINVAL); +} + +/* global statistics */ +VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); +VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); +VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); +VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); +VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); +VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); +VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); +VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); +VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); +VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); +VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); +VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); +VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); +VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); +VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); Index: sys/conf/files.arm64 =================================================================== --- sys/conf/files.arm64 +++ sys/conf/files.arm64 @@ -55,6 +55,7 @@ arm64/arm64/identcpu.c standard arm64/arm64/in_cksum.c optional inet | inet6 arm64/arm64/locore.S standard no-obj +arm64/arm64/hyp_stub.S standard arm64/arm64/machdep.c standard 
arm64/arm64/machdep_boot.c standard arm64/arm64/mem.c standard Index: sys/dev/psci/psci.h =================================================================== --- sys/dev/psci/psci.h +++ sys/dev/psci/psci.h @@ -32,6 +32,7 @@ #include #include +#ifdef _KERNEL typedef int (*psci_initfn_t)(device_t dev, int default_version); typedef int (*psci_callfn_t)(register_t, register_t, register_t, register_t, register_t, register_t, register_t, register_t, @@ -52,6 +53,7 @@ return (psci_callfn(a, b, c, d, 0, 0, 0, 0, NULL)); } +#endif /* * PSCI return codes. Index: sys/dts/Makefile =================================================================== --- sys/dts/Makefile +++ sys/dts/Makefile @@ -1,5 +1,5 @@ # $FreeBSD$ -SUBDIR=arm mips powerpc +SUBDIR=arm arm64 mips powerpc .include Index: sys/modules/Makefile =================================================================== --- sys/modules/Makefile +++ sys/modules/Makefile @@ -620,6 +620,9 @@ _enetc_mdio= enetc_mdio _felix= felix _rockchip= rockchip +.if ${MK_BHYVE} != "no" || defined(ALL_MODULES) +_vmm= vmm +.endif .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "arm" Index: sys/modules/vmm/Makefile =================================================================== --- sys/modules/vmm/Makefile +++ sys/modules/vmm/Makefile @@ -4,90 +4,10 @@ KMOD= vmm -SRCS= opt_acpi.h opt_bhyve_snapshot.h opt_ddb.h -SRCS+= device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h vnode_if.h -DPSRCS+= vmx_assym.h svm_assym.h -DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc +SRCS= opt_acpi.h opt_ddb.h device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h -CFLAGS+= -DVMM_KEEP_STATS -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/io -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd +CFLAGS+= -DVMM_KEEP_STATS -DSMP -# generic vmm support -.PATH: ${SRCTOP}/sys/amd64/vmm -SRCS+= vmm.c \ - vmm_dev.c \ - vmm_host.c \ - vmm_instruction_emul.c \ - vmm_ioport.c \ - vmm_lapic.c \ - vmm_mem.c \ - vmm_stat.c \ - vmm_util.c \ - x86.c - -.PATH: ${SRCTOP}/sys/amd64/vmm/io -SRCS+= iommu.c \ - ppt.c \ - vatpic.c \ - vatpit.c \ - vhpet.c \ - vioapic.c \ - vlapic.c \ - vpmtmr.c \ - vrtc.c - -# intel-specific files -.PATH: ${SRCTOP}/sys/amd64/vmm/intel -SRCS+= ept.c \ - vmcs.c \ - vmx_msr.c \ - vmx_support.S \ - vmx.c \ - vtd.c - -# amd-specific files -.PATH: ${SRCTOP}/sys/amd64/vmm/amd -SRCS+= vmcb.c \ - amdiommu.c \ - ivhd_if.c \ - ivhd_if.h \ - svm.c \ - svm_support.S \ - npt.c \ - ivrs_drv.c \ - amdvi_hw.c \ - svm_msr.c - -.if ${KERN_OPTS:MBHYVE_SNAPSHOT} != "" -SRCS+= vmm_snapshot.c -.endif - -CLEANFILES= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o - -OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h -OBJS_DEPEND_GUESS.svm_support.o+= svm_assym.h - -vmx_assym.h: vmx_genassym.o - sh ${SYSDIR}/kern/genassym.sh vmx_genassym.o > ${.TARGET} - -svm_assym.h: svm_genassym.o - sh ${SYSDIR}/kern/genassym.sh svm_genassym.o > ${.TARGET} - -vmx_support.o: - ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ - ${.IMPSRC} -o ${.TARGET} - -svm_support.o: - ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ - ${.IMPSRC} -o ${.TARGET} - -vmx_genassym.o: offset.inc - ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} - -svm_genassym.o: offset.inc - ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} +.include .include Index: sys/modules/vmm/Makefile.amd64 =================================================================== --- /dev/null +++ sys/modules/vmm/Makefile.amd64 @@ -0,0 +1,89 @@ +# $FreeBSD$ + +KMOD= vmm + 
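# amd64-specific portion of the vmm(4) module build.  The source lists,
# CFLAGS and genassym rules below are carried over from the previous
# single-architecture sys/modules/vmm/Makefile (removed above) so that
# the top-level Makefile can stay architecture-neutral.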
+SRCS= opt_acpi.h opt_bhyve_snapshot.h opt_ddb.h +SRCS+= device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h vnode_if.h +DPSRCS+= vmx_assym.h svm_assym.h +DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc + +CFLAGS+= -DVMM_KEEP_STATS +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/io +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd + +# generic vmm support +.PATH: ${SRCTOP}/sys/amd64/vmm +SRCS+= vmm.c \ + vmm_dev.c \ + vmm_host.c \ + vmm_instruction_emul.c \ + vmm_ioport.c \ + vmm_lapic.c \ + vmm_mem.c \ + vmm_stat.c \ + vmm_util.c \ + x86.c + +.PATH: ${SRCTOP}/sys/amd64/vmm/io +SRCS+= iommu.c \ + ppt.c \ + vatpic.c \ + vatpit.c \ + vhpet.c \ + vioapic.c \ + vlapic.c \ + vpmtmr.c \ + vrtc.c + +# intel-specific files +.PATH: ${SRCTOP}/sys/amd64/vmm/intel +SRCS+= ept.c \ + vmcs.c \ + vmx_msr.c \ + vmx_support.S \ + vmx.c \ + vtd.c + +# amd-specific files +.PATH: ${SRCTOP}/sys/amd64/vmm/amd +SRCS+= vmcb.c \ + amdiommu.c \ + ivhd_if.c \ + ivhd_if.h \ + svm.c \ + svm_support.S \ + npt.c \ + ivrs_drv.c \ + amdvi_hw.c \ + svm_msr.c + +.if ${KERN_OPTS:MBHYVE_SNAPSHOT} != "" +SRCS+= vmm_snapshot.c +.endif + +CLEANFILES= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o + +OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h +OBJS_DEPEND_GUESS.svm_support.o+= svm_assym.h + +vmx_assym.h: vmx_genassym.o + sh ${SYSDIR}/kern/genassym.sh vmx_genassym.o > ${.TARGET} + +svm_assym.h: svm_genassym.o + sh ${SYSDIR}/kern/genassym.sh svm_genassym.o > ${.TARGET} + +vmx_support.o: + ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ + ${.IMPSRC} -o ${.TARGET} + +svm_support.o: + ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ + ${.IMPSRC} -o ${.TARGET} + +vmx_genassym.o: offset.inc + ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} + +svm_genassym.o: offset.inc + ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} Index: sys/modules/vmm/Makefile.arm64 =================================================================== --- /dev/null +++ sys/modules/vmm/Makefile.arm64 @@ -0,0 +1,30 @@ +DPSRCS+= hyp_assym.h +DPSRCS+= hyp_genassym.c + +CFLAGS+= -I${.CURDIR}/../../arm64/vmm -I${.CURDIR}/../../arm64/include + +# generic vmm support +.PATH: ${.CURDIR}/../../arm64/vmm +SRCS+= vmm.c \ + vmm_dev.c \ + vmm_instruction_emul.c \ + vmm_mem.c \ + mmu.c \ + vmm_stat.c \ + arm64.c \ + psci.c \ + reset.c \ + hyp.S + +.PATH: ${.CURDIR}/../../arm64/vmm/io +SRCS+= vgic_v3.c \ + vgic_v3_mmio.c \ + vtimer.c + +CLEANFILES= hyp_assym.h hyp_genassym.o + +hyp_assym.h: hyp_genassym.o + sh ${SYSDIR}/kern/genassym.sh hyp_genassym.o > ${.TARGET} + +hyp_genassym.o: + ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} Index: usr.sbin/Makefile.arm64 =================================================================== --- usr.sbin/Makefile.arm64 +++ usr.sbin/Makefile.arm64 @@ -4,3 +4,9 @@ SUBDIR+= acpi .endif SUBDIR+= ofwdump + +.if ${MK_BHYVE} != "no" +SUBDIR+= bhyve +SUBDIR+= bhyveload +SUBDIR+= bhyvectl +.endif Index: usr.sbin/bhyve/Makefile =================================================================== --- usr.sbin/bhyve/Makefile +++ usr.sbin/bhyve/Makefile @@ -3,134 +3,7 @@ # .include -CFLAGS+=-I${.CURDIR}/../../contrib/lib9p -CFLAGS+=-I${SRCTOP}/sys -.PATH: ${SRCTOP}/sys/cam/ctl -PROG= bhyve -PACKAGE= bhyve - -MAN= bhyve.8 bhyve_config.5 - -BHYVE_SYSDIR?=${SRCTOP} - -SRCS= \ - atkbdc.c \ - acpi.c \ - audio.c \ - bhyvegc.c \ - bhyverun.c \ - block_if.c \ - bootrom.c \ - config.c \ - console.c \ - ctl_util.c \ - ctl_scsi_all.c \ - fwctl.c \ - gdb.c \ - 
hda_codec.c \ - inout.c \ - ioapic.c \ - kernemu_dev.c \ - mem.c \ - mevent.c \ - mptbl.c \ - net_backends.c \ - net_utils.c \ - pci_ahci.c \ - pci_e82545.c \ - pci_emul.c \ - pci_hda.c \ - pci_fbuf.c \ - pci_hostbridge.c \ - pci_irq.c \ - pci_lpc.c \ - pci_nvme.c \ - pci_passthru.c \ - pci_virtio_9p.c \ - pci_virtio_block.c \ - pci_virtio_console.c \ - pci_virtio_input.c \ - pci_virtio_net.c \ - pci_virtio_rnd.c \ - pci_virtio_scsi.c \ - pci_uart.c \ - pci_xhci.c \ - pctestdev.c \ - pm.c \ - post.c \ - ps2kbd.c \ - ps2mouse.c \ - rfb.c \ - rtc.c \ - smbiostbl.c \ - sockstream.c \ - task_switch.c \ - uart_emul.c \ - usb_emul.c \ - usb_mouse.c \ - virtio.c \ - vga.c \ - vmgenc.c \ - xmsr.c \ - spinup_ap.c \ - iov.c - -.if ${MK_BHYVE_SNAPSHOT} != "no" -SRCS+= snapshot.c -.endif - -CFLAGS.kernemu_dev.c+= -I${SRCTOP}/sys/amd64 - -.PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm -SRCS+= vmm_instruction_emul.c - -LIBADD= vmmapi md nv pthread z util sbuf cam 9p - -.if ${MK_CASPER} != "no" -LIBADD+= casper -LIBADD+= cap_pwd -LIBADD+= cap_grp -# Temporary disable capsicum, until we integrate checkpoint code with it. -#CFLAGS+=-DWITH_CASPER -.endif - -.if ${MK_BHYVE_SNAPSHOT} != "no" -LIBADD+= ucl xo -.endif - -.if ${MK_INET_SUPPORT} != "no" -CFLAGS+=-DINET -.endif -.if ${MK_INET6_SUPPORT} != "no" -CFLAGS+=-DINET6 -.endif -.if ${MK_NETGRAPH_SUPPORT} != "no" -CFLAGS+=-DNETGRAPH -LIBADD+= netgraph -.endif -.if ${MK_OPENSSL} == "no" -CFLAGS+=-DNO_OPENSSL -.else -LIBADD+= crypto -.endif - -CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 -CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii -CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller -.if ${MK_BHYVE_SNAPSHOT} != "no" -CFLAGS+= -I${SRCTOP}/contrib/libucl/include - -# Temporary disable capsicum, until we integrate checkpoint code with it. -CFLAGS+= -DWITHOUT_CAPSICUM - -CFLAGS+= -DBHYVE_SNAPSHOT -.endif - -.ifdef GDB_LOG -CFLAGS+=-DGDB_LOG -.endif - -WARNS?= 2 +.include .include Index: usr.sbin/bhyve/Makefile.amd64 =================================================================== --- /dev/null +++ usr.sbin/bhyve/Makefile.amd64 @@ -0,0 +1,132 @@ +# +# $FreeBSD$ +# + +CFLAGS+=-I${.CURDIR}/../../contrib/lib9p +CFLAGS+=-I${SRCTOP}/sys +.PATH: ${SRCTOP}/sys/cam/ctl + +PROG= bhyve +PACKAGE= bhyve + +MAN= bhyve.8 bhyve_config.5 + +BHYVE_SYSDIR?=${SRCTOP} + +SRCS= \ + atkbdc.c \ + acpi.c \ + audio.c \ + bhyvegc.c \ + bhyverun.c \ + block_if.c \ + bootrom.c \ + config.c \ + console.c \ + ctl_util.c \ + ctl_scsi_all.c \ + fwctl.c \ + gdb.c \ + hda_codec.c \ + inout.c \ + ioapic.c \ + kernemu_dev.c \ + mem.c \ + mevent.c \ + mptbl.c \ + net_backends.c \ + net_utils.c \ + pci_ahci.c \ + pci_e82545.c \ + pci_emul.c \ + pci_hda.c \ + pci_fbuf.c \ + pci_hostbridge.c \ + pci_irq.c \ + pci_lpc.c \ + pci_nvme.c \ + pci_passthru.c \ + pci_virtio_9p.c \ + pci_virtio_block.c \ + pci_virtio_console.c \ + pci_virtio_net.c \ + pci_virtio_rnd.c \ + pci_virtio_scsi.c \ + pci_uart.c \ + pci_xhci.c \ + pctestdev.c \ + pm.c \ + post.c \ + ps2kbd.c \ + ps2mouse.c \ + rfb.c \ + rtc.c \ + smbiostbl.c \ + sockstream.c \ + task_switch.c \ + uart_emul.c \ + usb_emul.c \ + usb_mouse.c \ + virtio.c \ + vga.c \ + vmgenc.c \ + xmsr.c \ + spinup_ap.c \ + iov.c + +.if ${MK_BHYVE_SNAPSHOT} != "no" +SRCS+= snapshot.c +.endif + +CFLAGS.kernemu_dev.c+= -I${SRCTOP}/sys/amd64 + +.PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm +SRCS+= vmm_instruction_emul.c + +LIBADD= vmmapi md nv pthread z util sbuf cam 9p + +.if ${MK_CASPER} != "no" +LIBADD+= casper +LIBADD+= cap_pwd +LIBADD+= cap_grp +# Temporary disable capsicum, until we integrate 
checkpoint code with it. +#CFLAGS+=-DWITH_CASPER +.endif + +.if ${MK_BHYVE_SNAPSHOT} != "no" +LIBADD+= ucl xo +.endif + +.if ${MK_INET_SUPPORT} != "no" +CFLAGS+=-DINET +.endif +.if ${MK_INET6_SUPPORT} != "no" +CFLAGS+=-DINET6 +.endif +.if ${MK_NETGRAPH_SUPPORT} != "no" +CFLAGS+=-DNETGRAPH +LIBADD+= netgraph +.endif +.if ${MK_OPENSSL} == "no" +CFLAGS+=-DNO_OPENSSL +.else +LIBADD+= crypto +.endif + +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller +.if ${MK_BHYVE_SNAPSHOT} != "no" +CFLAGS+= -I${SRCTOP}/contrib/libucl/include + +# Temporary disable capsicum, until we integrate checkpoint code with it. +CFLAGS+= -DWITHOUT_CAPSICUM + +CFLAGS+= -DBHYVE_SNAPSHOT +.endif + +.ifdef GDB_LOG +CFLAGS+=-DGDB_LOG +.endif + +WARNS?= 2 Index: usr.sbin/bhyve/Makefile.arm64 =================================================================== --- /dev/null +++ usr.sbin/bhyve/Makefile.arm64 @@ -0,0 +1,53 @@ +# +# $FreeBSD$ +# + +CFLAGS+=-I${SRCTOP}/sys +.PATH: ${SRCTOP}/sys/cam/ctl + +PROG= bhyve +PACKAGE= bhyve + +MAN= bhyve.8 + +BHYVE_SYSDIR?=${SRCTOP} +BHYVE_SRCTOP?=${.CURDIR} + +SRCS= \ + iov.c \ + mevent.c \ + sockstream.c + +CFLAGS+= -DWITHOUT_CAPSICUM +.include "${BHYVE_SRCTOP}/arm64/Makefile.inc" +.include "${BHYVE_SRCTOP}/mmio/Makefile.inc" + +LIBADD= vmmapi md pthread + +.if ${MK_INET_SUPPORT} != "no" +CFLAGS+=-DINET +.endif +.if ${MK_INET6_SUPPORT} != "no" +CFLAGS+=-DINET6 +.endif +.if ${MK_NETGRAPH_SUPPORT} != "no" +CFLAGS+=-DNETGRAPH +LIBADD+= netgraph +.endif +.if ${MK_OPENSSL} == "no" +CFLAGS+=-DNO_OPENSSL +.endif + +.PATH: ${BHYVE_SYSDIR}/sys/arm64/vmm +SRCS+= vmm_instruction_emul.c + +CFLAGS+= -I${BHYVE_SRCTOP} +CFLAGS+= -I${BHYVE_SRCTOP}/arm64 +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/virtio +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/virtio/console + +.ifdef GDB_LOG +CFLAGS+=-DGDB_LOG +.endif + +WARNS?= 2 Index: usr.sbin/bhyve/arm64/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/Makefile.inc @@ -0,0 +1,17 @@ +# +# $FreeBSD$ +# +.PATH: ${BHYVE_SRCTOP}/arm64/ +SRCS+= \ + arm64/bhyverun.c \ + arm64/bootrom.c \ + arm64/mem.c \ + arm64/reset.c + +.PATH: ${BHYVE_SYSDIR}/sys/${BHYVE_ARCH}/vmm + +MK_MAN=no + +BHYVE_BUS= mmio + +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/virtio/mmio Index: usr.sbin/bhyve/arm64/bhyverun.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/bhyverun.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/usr.sbin/bhyve/arm/bhyverun.h 4 2017-04-18 20:28:32Z mihai.carabas $ + */ + +#ifndef _FBSDRUN_H_ +#define _FBSDRUN_H_ + +#ifndef CTASSERT /* Allow lint to override */ +#define CTASSERT(x) _CTASSERT(x, __LINE__) +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] +#endif + +struct vmctx; +extern int guest_ncpus; +extern char *vmname; + +void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); + +void fbsdrun_addcpu(struct vmctx *ctx, int oldcpu, int cpu, uint64_t rip); +int fbsdrun_muxed(void); +int fbsdrun_vmexit_on_hlt(void); +int fbsdrun_vmexit_on_pause(void); +int fbsdrun_disable_x2apic(void); +int fbsdrun_virtio_msix(void); +#endif Index: usr.sbin/bhyve/arm64/bhyverun.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/bhyverun.c @@ -0,0 +1,541 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bhyverun.h" +#include "../mmio/mmio_emul.h" +#include "../mmio/mmio_irq.h" +#include "mem.h" +#include "mevent.h" +#include "bootrom.h" + +/* Exit codes. 
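 * These are the bhyve process exit codes and follow the convention the
 * existing bhyve(8) uses: 0 means the guest rebooted (so a supervising
 * script can simply restart it), 1 powered off, 2 halted, and 4 an
 * internal error.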
*/ +#define EXIT_REBOOT 0 +#define EXIT_POWEROFF 1 +#define EXIT_HALT 2 +#define EXIT_ERROR 4 + +#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ + +#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */ +#define VMEXIT_CONTINUE 1 /* continue from next instruction */ +#define VMEXIT_RESTART 2 /* restart current instruction */ +#define VMEXIT_ABORT 3 /* abort the vm run loop */ +#define VMEXIT_RESET 4 /* guest machine has reset */ + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +#define GIC_V3_DIST_START 0x2f000000UL +#define GIC_V3_DIST_SIZE 0x10000UL +#define GIC_V3_REDIST_START 0x2f100000UL +#define GIC_V3_REDIST_SIZE 0x200000UL + +#define FILE_LEN 256 + +typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); + +char *vmname; + +int guest_ncpus; + +int raw_stdio = 0; + +static int foundcpus; + +static char *progname; +static const int BSP = 0; + +static cpuset_t cpumask; + +static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t pc); + +struct vm_exit vmexit[VM_MAXCPU]; + +struct bhyvestats { + uint64_t vmexit_bogus; + uint64_t vmexit_inst_emul; +} stats; + +struct mt_vmm_info { + pthread_t mt_thr; + struct vmctx *mt_ctx; + int mt_vcpu; +} mt_vmm_info[VM_MAXCPU]; + +static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; + +static void +usage(int code) +{ + + fprintf(stderr, + "Usage: %s [-bh] [-c vcpus] [-p pincpu] [-s ] [-l bootrom]" + "\n" + " -c: # cpus (default 1)\n" + " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n" + " -s: device emulation config\n" + " -l: bootrom file\n" + " -h: help\n", + progname); + + exit(code); +} + +static int +pincpu_parse(const char *opt) +{ + int vcpu, pcpu; + + if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { + fprintf(stderr, "invalid format: %s\n", opt); + return (-1); + } + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", + vcpu, VM_MAXCPU - 1); + return (-1); + } + + if (pcpu < 0 || pcpu >= CPU_SETSIZE) { + fprintf(stderr, "hostcpu '%d' outside valid range from " + "0 to %d\n", pcpu, CPU_SETSIZE - 1); + return (-1); + } + + if (vcpumap[vcpu] == NULL) { + if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { + perror("malloc"); + return (-1); + } + CPU_ZERO(vcpumap[vcpu]); + } + CPU_SET(pcpu, vcpumap[vcpu]); + return (0); +} + +void * +paddr_guest2host(struct vmctx *ctx, uintptr_t iaddr, size_t len) +{ + + return (vm_map_ipa(ctx, iaddr, len)); +} + +int +fbsdrun_virtio_msix(void) +{ + + return 0; +} + +static void * +fbsdrun_start_thread(void *param) +{ + char tname[MAXCOMLEN + 1]; + struct mt_vmm_info *mtp; + int vcpu; + + mtp = param; + vcpu = mtp->mt_vcpu; + + snprintf(tname, sizeof(tname), "%s vcpu %d", vmname, vcpu); + pthread_set_name_np(mtp->mt_thr, tname); + + vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].pc); + + /* not reached */ + return (NULL); +} + +void +fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int vcpu, uint64_t pc) +{ + int error; + + assert(fromcpu == BSP); + + error = vm_activate_cpu(ctx, vcpu); + if (error != 0) + err(EX_OSERR, "could not activate CPU %d", vcpu); + + CPU_SET_ATOMIC(vcpu, &cpumask); + foundcpus++; + + /* + * Set up the vmexit struct to allow execution to start + * at the given RIP + */ + vmexit[vcpu].pc = pc; + vmexit[vcpu].inst_length = 0; + + mt_vmm_info[vcpu].mt_ctx = ctx; + mt_vmm_info[vcpu].mt_vcpu = vcpu; + + error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL, + fbsdrun_start_thread, &mt_vmm_info[vcpu]); + assert(error == 0); +} + +static int +fbsdrun_get_next_cpu(int curcpu) +{ + + /* + * Get the next 
available CPU. Assumes they arrive + * in ascending order with no gaps. + */ + return ((curcpu + 1) % foundcpus); +} + +static int +vmexit_hyp(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tHYP\n"); + fprintf(stderr, "\tpc\t\t0x%016lx\n", vmexit->pc); + fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); + + return (VMEXIT_ABORT); +} + +static int +vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_bogus++; + + return (VMEXIT_RESTART); +} + +static int +vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + int err; + struct vie *vie; + + stats.vmexit_inst_emul++; + + vie = &vmexit->u.inst_emul.vie; + err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, vie); + + if (err) { + if (err == ESRCH) { + fprintf(stderr, "Unhandled memory access to 0x%lx\n", + vmexit->u.inst_emul.gpa); + } + + fprintf(stderr, "Failed to emulate instruction at 0x%lx\n", vmexit->pc); + return (VMEXIT_ABORT); + } + return (VMEXIT_CONTINUE); +} + +static int +vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + enum vm_suspend_how how; + + how = vmexit->u.suspended.how; + + switch (how) { + case VM_SUSPEND_POWEROFF: + exit(EXIT_POWEROFF); + case VM_SUSPEND_RESET: + exit(EXIT_REBOOT); + case VM_SUSPEND_HALT: + exit(EXIT_HALT); + case VM_SUSPEND_TRIPLEFAULT: + /* Not implemented yet. */ + exit(EXIT_ERROR); + default: + fprintf(stderr, "vmexit_suspend: invalid or unimplemented reason %d\n", how); + exit(100); + } + +} + +static int +vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + int error; + int newcpu = vmexit->u.spinup_ap.vcpu; + uint64_t pc = vmexit->u.spinup_ap.rip; + uint64_t ctx_id = vmexit->u.spinup_ap.ctx_id; + + assert(newcpu != 0); + if (guest_ncpus == 1 && newcpu >= guest_ncpus) { + error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_X0, + PSCI_RETVAL_DENIED); + assert(error == 0); + goto out; + } + + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_X0, ctx_id); + assert(error == 0); + + error = vm_set_register(ctx, newcpu, VM_REG_ELR_EL2, pc); + assert(error == 0); + + fbsdrun_addcpu(ctx, BSP, newcpu, pc); + + error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_X0, + PSCI_RETVAL_SUCCESS); + assert(error == 0); + +out: + return (VMEXIT_CONTINUE); +} + +static vmexit_handler_t handler[VM_EXITCODE_MAX] = { + [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, + [VM_EXITCODE_REG_EMUL] = vmexit_hyp, + [VM_EXITCODE_SUSPENDED] = vmexit_suspend, + [VM_EXITCODE_HYP] = vmexit_hyp, + [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, +}; + +static void +vm_loop(struct vmctx *ctx, int vcpu, uint64_t pc) +{ + int error, rc, prevcpu; + enum vm_exitcode exitcode; + + if (vcpumap[vcpu] != NULL) { + error = pthread_setaffinity_np(pthread_self(), + sizeof(cpuset_t), vcpumap[vcpu]); + assert(error == 0); + } + + while (1) { + + error = vm_run(ctx, vcpu, pc, &vmexit[vcpu]); + + if (error != 0) { + /* + * It is possible that 'vmmctl' or some other process + * has transitioned the vcpu to CANNOT_RUN state right + * before we tried to transition it to RUNNING. + * + * This is expected to be temporary so just retry. 
+ */ + if (errno == EBUSY) + continue; + else + break; + } + + prevcpu = vcpu; + + exitcode = vmexit[vcpu].exitcode; + if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { + fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", + exitcode); + exit(4); + } + + rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); + + switch (rc) { + case VMEXIT_CONTINUE: + pc = vmexit[vcpu].pc + vmexit[vcpu].inst_length; + break; + case VMEXIT_RESTART: + pc = vmexit[vcpu].pc; + break; + case VMEXIT_RESET: + exit(0); + default: + exit(4); + } + } + fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); +} + +static int +num_vcpus_allowed(struct vmctx *ctx) +{ + return (VM_MAXCPU); +} + +int +main(int argc, char *argv[]) +{ + int c, error; + int max_vcpus; + struct vmctx *ctx; + uint64_t pc; + uint64_t memory_base_address, mem_size; + char bootrom_file[FILE_LEN]; + bool bootrom; + + bootrom = false; + memory_base_address = VM_GUEST_BASE_IPA; + mem_size = 128 * MB; + progname = basename(argv[0]); + guest_ncpus = 1; + + while ((c = getopt(argc, argv, "bhcp:s:e:m:l:")) != -1) { + switch (c) { + case 'e': + error = vm_parse_memsize(optarg, &memory_base_address); + if (error) { + fprintf(stderr, "Invalid memaddr '%s'\n", optarg); + exit(1); + } + break; + case 'p': + if (pincpu_parse(optarg) != 0) { + errx(EX_USAGE, "invalid vcpu pinning " + "configuration '%s'", optarg); + } + break; + case 'c': + guest_ncpus = VM_MAXCPU; + break; + case 'm': + error = vm_parse_memsize(optarg, &mem_size); + if (error) { + fprintf(stderr, "Invalid memsize '%s'\n", optarg); + exit(1); + } + break; + case 's': + if (mmio_parse_opts(optarg) != 0) + exit(1); + break; + case 'l': + bootrom = true; + strncpy(bootrom_file, optarg, FILE_LEN); + break; + case 'h': + usage(0); + default: + usage(4); + } + } + argc -= optind; + argv += optind; + + if (argc != 1) + usage(4); + + vmname = argv[0]; + + if (bootrom == true) { + error = vm_create(vmname); + if (error != 0) { + fprintf(stderr, "Failed to create vm\n"); + exit(1); + } + } + + /* The VM must be created by bhyveload first. 
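 * When a bootrom was supplied with -l, the VM was instead created just
 * above via vm_create(); in either case vm_open() only attaches to the
 * already existing /dev/vmm/<name> instance.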
*/ + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(1); + } + + max_vcpus = num_vcpus_allowed(ctx); + if (guest_ncpus > max_vcpus) { + fprintf(stderr, "%d vCPUs requested but only %d available\n", + guest_ncpus, max_vcpus); + exit(1); + } + + error = vm_setup_memory(ctx, memory_base_address, mem_size, VM_MMAP_ALL); + if (error != 0) { + fprintf(stderr, "Unable to setup memory (%d)\n", error); + exit(1); + } + + init_mem(); + mmio_irq_init(ctx); + + if (init_mmio(ctx) != 0) { + fprintf(stderr, "Failed to initialize device emulation\n"); + exit(1); + } + + if (bootrom == true) { + pc = memory_base_address; + error = bootrom_loadrom(ctx, bootrom_file, &pc); + if (error) { + fprintf(stderr, "Error loading bootrom\n"); + exit(1); + } + + error = vm_attach_vgic(ctx, GIC_V3_DIST_START, GIC_V3_DIST_SIZE, + GIC_V3_REDIST_START, GIC_V3_REDIST_SIZE); + if (error) { + fprintf(stderr, "Error attaching VGIC to the virtual machine\n"); + exit(1); + } + + vm_set_register(ctx, BSP, VM_REG_ELR_EL2, pc); + } + + error = vm_get_register(ctx, BSP, VM_REG_ELR_EL2, &pc); + assert(error == 0); + /* + * Add CPU 0 + */ + fbsdrun_addcpu(ctx, BSP, BSP, pc); + + /* + * Head off to the main event dispatch loop + */ + mevent_dispatch(); + + exit(1); +} Index: usr.sbin/bhyve/arm64/bootrom.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/bootrom.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _BOOTROM_H_ +#define _BOOTROM_H_ + +#include +#include +#include +#include + +struct vmctx; + + +int bootrom_alloc(struct vmctx *ctx, uint64_t *gpa, size_t len, char **base); +int bootrom_loadrom(struct vmctx *ctx, const char *romfile, uint64_t *gpa); +#endif Index: usr.sbin/bhyve/arm64/bootrom.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/bootrom.c @@ -0,0 +1,120 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Neel Natu + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "bhyverun.h" +#include "bootrom.h" +#include "debug.h" + +int +bootrom_alloc(struct vmctx *ctx, uint64_t *gpa, size_t len, char **base) +{ + if (len == 0) { + warnx("ROM size %zu is invalid", len); + return (EINVAL); + } + + len = roundup2(len, PAGE_SIZE); + *base = vm_map_ipa(ctx, *gpa, len); + + printf("%s: gpa=%#lx base=%#lx\n", __func__, *gpa, (uint64_t)*base); + + return (0); +} + +int +bootrom_loadrom(struct vmctx *ctx, const char *romfile, uint64_t *gpa) +{ + struct stat sbuf; + ssize_t rlen; + char *base; + int fd, i, rv; + + rv = -1; + fd = open(romfile, O_RDONLY); + if (fd < 0) { + EPRINTLN("Error opening bootrom \"%s\": %s", + romfile, strerror(errno)); + goto done; + } + + if (fstat(fd, &sbuf) < 0) { + EPRINTLN("Could not fstat bootrom file \"%s\": %s", + romfile, strerror(errno)); + goto done; + } + + /* Map the bootrom into the guest address space */ + if (bootrom_alloc(ctx, gpa, sbuf.st_size, &base) != 0) + goto done; + + /* Read 'romfile' into the guest address space */ + for (i = 0; i < sbuf.st_size / PAGE_SIZE; i++) { + rlen = read(fd, base + i * PAGE_SIZE, PAGE_SIZE); + if (rlen != PAGE_SIZE) { + perror("read"); + EPRINTLN("Incomplete read of page %d of bootrom " + "file %s: %ld bytes", i, romfile, rlen); + goto done; + } + } + + if (sbuf.st_size % PAGE_SIZE != 0) { + rlen = read(fd, base + i * PAGE_SIZE, sbuf.st_size % PAGE_SIZE); + if (rlen != sbuf.st_size % PAGE_SIZE) { + perror("read"); + EPRINTLN("Incomplete read of page %d of bootrom " + "file %s: %ld bytes", i, romfile, rlen); + goto done; + } + } + + printf("Finished reading bootrom\n"); + rv = 0; +done: + if (fd >= 0) + close(fd); + return (rv); +} Index: usr.sbin/bhyve/arm64/mem.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/mem.h @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/usr.sbin/bhyve/arm/mem.h 38 2017-06-13 13:34:14Z darius.mihai $ + */ + +#ifndef _MEM_H_ +#define _MEM_H_ + +#include + +struct vmctx; + +typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2); + +struct mem_range { + const char *name; + int flags; + mem_func_t handler; + void *arg1; + long arg2; + uint64_t base; + uint64_t size; +}; +#define MEM_F_READ 0x1 +#define MEM_F_WRITE 0x2 +#define MEM_F_RW 0x3 + +void init_mem(void); +int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, void *vie); +int register_mem(struct mem_range *memp); +int register_mem_fallback(struct mem_range *memp); +int unregister_mem(struct mem_range *memp); + +#endif /* _MEM_H_ */ Index: usr.sbin/bhyve/arm64/mem.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/mem.c @@ -0,0 +1,271 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/usr.sbin/bhyve/arm/mem.c 4 2017-04-18 20:28:32Z mihai.carabas $ + */ + +/* + * Memory ranges are represented with an RB tree. 
On insertion, the range + * is checked for overlaps. On lookup, the key has the same base and limit + * so it can be searched within the range. + */ + +#include +__FBSDID("$FreeBSD: src/usr.sbin/bhyve/arm/mem.c 4 2017-04-18 20:28:32Z mihai.carabas $"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mem.h" + +struct mmio_rb_range { + RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ + struct mem_range mr_param; + uint64_t mr_base; + uint64_t mr_end; +}; + +struct mmio_rb_tree; +RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback; + +/* + * Per-vCPU cache. Since most accesses from a vCPU will be to + * consecutive addresses in a range, it makes sense to cache the + * result of a lookup. + */ +static struct mmio_rb_range *mmio_hint[VM_MAXCPU]; + +static pthread_rwlock_t mmio_rwlock; + +static int +mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b) +{ + if (a->mr_end < b->mr_base) + return (-1); + else if (a->mr_base > b->mr_end) + return (1); + return (0); +} + +static int +mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr, + struct mmio_rb_range **entry) +{ + struct mmio_rb_range find, *res; + + find.mr_base = find.mr_end = addr; + + res = RB_FIND(mmio_rb_tree, rbt, &find); + + if (res != NULL) { + *entry = res; + return (0); + } + + return (ENOENT); +} + +static int +mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) +{ + struct mmio_rb_range *overlap; + + overlap = RB_INSERT(mmio_rb_tree, rbt, new); + + if (overlap != NULL) { +#ifdef RB_DEBUG + printf("overlap detected: new %lx:%lx, tree %lx:%lx\n", + new->mr_base, new->mr_end, + overlap->mr_base, overlap->mr_end); +#endif + + return (EEXIST); + } + + return (0); +} + +#if 0 +static void +mmio_rb_dump(struct mmio_rb_tree *rbt) +{ + struct mmio_rb_range *np; + + pthread_rwlock_rdlock(&mmio_rwlock); + RB_FOREACH(np, mmio_rb_tree, rbt) { + printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, + np->mr_param.name); + } + pthread_rwlock_unlock(&mmio_rwlock); +} +#endif + +RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +static int +mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size, + rval, mr->arg1, mr->arg2); + return (error); +} + +static int +mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size, + &wval, mr->arg1, mr->arg2); + return (error); +} + +int +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, void *vie) +{ + struct mmio_rb_range *entry; + int err; + + pthread_rwlock_rdlock(&mmio_rwlock); + /* + * First check the per-vCPU cache + */ + if (mmio_hint[vcpu] && + paddr >= mmio_hint[vcpu]->mr_base && + paddr <= mmio_hint[vcpu]->mr_end) { + entry = mmio_hint[vcpu]; + } else + entry = NULL; + + if (entry == NULL) { + if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) { + /* Update the per-vCPU cache */ + mmio_hint[vcpu] = entry; + } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { + pthread_rwlock_unlock(&mmio_rwlock); + return (ESRCH); + } + } + + assert(entry != NULL); + assert(NULL == NULL); + err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, + mem_read, mem_write, &entry->mr_param); + + pthread_rwlock_unlock(&mmio_rwlock); + + 
return (err); +} + +static int +register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) +{ + struct mmio_rb_range *entry, *mrp; + int err; + + err = 0; + + mrp = malloc(sizeof(struct mmio_rb_range)); + + if (mrp != NULL) { + mrp->mr_param = *memp; + mrp->mr_base = memp->base; + mrp->mr_end = memp->base + memp->size - 1; + pthread_rwlock_wrlock(&mmio_rwlock); + if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) + err = mmio_rb_add(rbt, mrp); + pthread_rwlock_unlock(&mmio_rwlock); + if (err) + free(mrp); + } else + err = ENOMEM; + + return (err); +} + +int +register_mem(struct mem_range *memp) +{ + + return (register_mem_int(&mmio_rb_root, memp)); +} + +int +register_mem_fallback(struct mem_range *memp) +{ + + return (register_mem_int(&mmio_rb_fallback, memp)); +} + +int +unregister_mem(struct mem_range *memp) +{ + struct mem_range *mr; + struct mmio_rb_range *entry = NULL; + int err, i; + + pthread_rwlock_wrlock(&mmio_rwlock); + err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); + if (err == 0) { + mr = &entry->mr_param; + assert(mr->name == memp->name); + assert(mr->base == memp->base && mr->size == memp->size); + RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry); + + /* flush Per-vCPU cache */ + for (i=0; i < VM_MAXCPU; i++) { + if (mmio_hint[i] == entry) + mmio_hint[i] = NULL; + } + } + pthread_rwlock_unlock(&mmio_rwlock); + + if (entry) + free(entry); + + return (err); +} + +void +init_mem(void) +{ + RB_INIT(&mmio_rb_root); + RB_INIT(&mmio_rb_fallback); + pthread_rwlock_init(&mmio_rwlock, NULL); +} Index: usr.sbin/bhyve/arm64/mevent_test.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/mevent_test.c @@ -0,0 +1,256 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/usr.sbin/bhyve/arm/mevent_test.c 4 2017-04-18 20:28:32Z mihai.carabas $ + */ + +/* + * Test program for the micro event library. Set up a simple TCP echo + * service. 
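 * The service listens on TEST_PORT (4321, see below) and can be poked
 * with e.g. "nc localhost 4321"; the first accepted connection also
 * starts the timer whose statistics are printed once TEVSZ samples have
 * been collected.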
+ * + * cc mevent_test.c mevent.c -lpthread + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mevent.h" + +#define TEST_PORT 4321 + +static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER; + +static struct mevent *tevp; + +char *vmname = "test vm"; + + +#define MEVENT_ECHO + +/* Number of timer events to capture */ +#define TEVSZ 4096 +uint64_t tevbuf[TEVSZ]; + +static void +timer_print(void) +{ + uint64_t min, max, diff, sum, tsc_freq; + size_t len; + int j; + + min = UINT64_MAX; + max = 0; + sum = 0; + + len = sizeof(tsc_freq); + sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0); + + for (j = 1; j < TEVSZ; j++) { + /* Convert a tsc diff into microseconds */ + diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq; + sum += diff; + if (min > diff) + min = diff; + if (max < diff) + max = diff; + } + + printf("timers done: usecs, min %ld, max %ld, mean %ld\n", min, max, + sum/(TEVSZ - 1)); +} + +static void +timer_callback(int fd, enum ev_type type, void *param) +{ + static int i; + + if (i >= TEVSZ) + abort(); + + tevbuf[i++] = rdtsc(); + + if (i == TEVSZ) { + mevent_delete(tevp); + timer_print(); + } +} + + +#ifdef MEVENT_ECHO +struct esync { + pthread_mutex_t e_mt; + pthread_cond_t e_cond; +}; + +static void +echoer_callback(int fd, enum ev_type type, void *param) +{ + struct esync *sync = param; + + pthread_mutex_lock(&sync->e_mt); + pthread_cond_signal(&sync->e_cond); + pthread_mutex_unlock(&sync->e_mt); +} + +static void * +echoer(void *param) +{ + struct esync sync; + struct mevent *mev; + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + pthread_mutex_init(&sync.e_mt, NULL); + pthread_cond_init(&sync.e_cond, NULL); + + pthread_mutex_lock(&sync.e_mt); + + mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); + if (mev == NULL) { + printf("Could not allocate echoer event\n"); + exit(1); + } + + while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { + len = read(fd, buf, sizeof(buf)); + if (len > 0) { + write(fd, buf, len); + write(0, buf, len); + } else { + break; + } + } + + mevent_delete_close(mev); + + pthread_mutex_unlock(&sync.e_mt); + pthread_mutex_destroy(&sync.e_mt); + pthread_cond_destroy(&sync.e_cond); + + return (NULL); +} + +#else + +static void * +echoer(void *param) +{ + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + while ((len = read(fd, buf, sizeof(buf))) > 0) { + write(1, buf, len); + } + + return (NULL); +} +#endif /* MEVENT_ECHO */ + +static void +acceptor_callback(int fd, enum ev_type type, void *param) +{ + pthread_mutex_lock(&accept_mutex); + pthread_cond_signal(&accept_condvar); + pthread_mutex_unlock(&accept_mutex); +} + +static void * +acceptor(void *param) +{ + struct sockaddr_in sin; + pthread_t tid; + int news; + int s; + static int first; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(TEST_PORT); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + exit(1); + } + + if (listen(s, 1) < 0) { + perror("listen"); + exit(1); + } + + (void) mevent_add(s, EVF_READ, acceptor_callback, NULL); + + pthread_mutex_lock(&accept_mutex); + + while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) { + news = accept(s, NULL, NULL); + if (news < 0) { + perror("accept error"); + } else { + static 
int first = 1; + + if (first) { + /* + * Start a timer + */ + first = 0; + tevp = mevent_add(1, EVF_TIMER, timer_callback, + NULL); + } + + printf("incoming connection, spawning thread\n"); + pthread_create(&tid, NULL, echoer, + (void *)(uintptr_t)news); + } + } + + return (NULL); +} + +main() +{ + pthread_t tid; + + pthread_create(&tid, NULL, acceptor, NULL); + + mevent_dispatch(); +} Index: usr.sbin/bhyve/arm64/reset.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/reset.h @@ -0,0 +1,12 @@ +#ifndef _RESET_H_ +#define _RESET_H_ + +#define RESET_MAGIC 0xDEAD9731 + +#endif /* _RESET_H_ */ +#ifndef _RESET_H_ +#define _RESET_H_ + +#define RESET_MAGIC 0xDEAD9731 + +#endif /* _RESET_H_ */ Index: usr.sbin/bhyve/arm64/reset.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/reset.c @@ -0,0 +1,32 @@ +#include +#include + +#include "mem.h" +#include "reset.h" +#include "vmmapi.h" + +#define RESET_PORT 0x1c090100 + +static int +reset_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) +{ + vm_destroy(ctx); + + return (RESET_MAGIC); +} + +struct mem_range resetport ={ + "reset", + 0, + reset_handler, + NULL, + 0, + RESET_PORT, + sizeof(int) +}; + +void +init_reset(void) +{ + register_mem(&resetport); +} Index: usr.sbin/bhyve/block_if.c =================================================================== --- usr.sbin/bhyve/block_if.c +++ usr.sbin/bhyve/block_if.c @@ -58,7 +58,10 @@ #include #include + +#ifdef BHYVE_SNAPSHOT #include +#endif #include "bhyverun.h" #include "config.h" Index: usr.sbin/bhyve/mmio/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/Makefile.inc @@ -0,0 +1,23 @@ +# +# $FreeBSD$ +# + +.PATH: ${BHYVE_SRCTOP}/mmio/ +SRCS+= \ + mmio/block_if.c \ + mmio/mmio_uart.c \ + mmio/mmio_virtio.c \ + mmio/mmio_virtio_block.c \ + mmio/mmio_virtio_console.c \ + mmio/mmio_virtio_net.c \ + mmio/mmio_virtio_rnd.c \ + mmio/mmio_emul.c \ + mmio/mmio_irq.c \ + mmio/net_backends.c \ + mmio/net_utils.c \ + mmio/pl011.c \ + mmio/uart_backend.c + + + +CFLAGS+= -I${BHYVE_SRCTOP}/mmio Index: usr.sbin/bhyve/mmio/block_if.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/block_if.h @@ -0,0 +1,89 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * The block API to be used by bhyve block-device emulations. The routines + * are thread safe, with no assumptions about the context of the completion + * callback - it may occur in the caller's context, or asynchronously in + * another thread. + */ + +#ifndef _BLOCK_IF_H_ +#define _BLOCK_IF_H_ + +#include +#include + +struct vm_snapshot_meta; + + +/* + * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in + * a single request. BLOCKIF_RING_MAX is the maxmimum number of + * pending requests that can be queued. + */ +#define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */ +#define BLOCKIF_RING_MAX 128 + +struct blockif_req { + int br_iovcnt; + off_t br_offset; + ssize_t br_resid; + void (*br_callback)(struct blockif_req *req, int err); + void *br_param; + struct iovec br_iov[BLOCKIF_IOV_MAX]; +}; + +struct blockif_ctxt; +struct blockif_ctxt *blockif_open(const char *optstr, const char *ident); +off_t blockif_size(struct blockif_ctxt *bc); +void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, + uint8_t *s); +int blockif_sectsz(struct blockif_ctxt *bc); +void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); +int blockif_queuesz(struct blockif_ctxt *bc); +int blockif_is_ro(struct blockif_ctxt *bc); +int blockif_candelete(struct blockif_ctxt *bc); +int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_close(struct blockif_ctxt *bc); +#ifdef BHYVE_SNAPSHOT +void blockif_pause(struct blockif_ctxt *bc); +void blockif_resume(struct blockif_ctxt *bc); +int blockif_snapshot_req(struct blockif_req *br, + struct vm_snapshot_meta *meta); +int blockif_snapshot(struct blockif_ctxt *bc, + struct vm_snapshot_meta *meta); +#endif + +#endif /* _BLOCK_IF_H_ */ Index: usr.sbin/bhyve/mmio/block_if.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/block_if.c @@ -0,0 +1,991 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Peter Grehan + * All rights reserved. + * Copyright 2020 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef BHYVE_SNAPSHOT +#include +#endif + +#include "bhyverun.h" +#include "debug.h" +#include "mevent.h" +#include "block_if.h" + +#define BLOCKIF_SIG 0xb109b109 + +#define BLOCKIF_NUMTHR 8 +#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) + +enum blockop { + BOP_READ, + BOP_WRITE, + BOP_FLUSH, + BOP_DELETE +}; + +enum blockstat { + BST_FREE, + BST_BLOCK, + BST_PEND, + BST_BUSY, + BST_DONE +}; + +struct blockif_elem { + TAILQ_ENTRY(blockif_elem) be_link; + struct blockif_req *be_req; + enum blockop be_op; + enum blockstat be_status; + pthread_t be_tid; + off_t be_block; +}; + +struct blockif_ctxt { + int bc_magic; + int bc_fd; + int bc_ischr; + int bc_isgeom; + int bc_candelete; + int bc_rdonly; + off_t bc_size; + int bc_sectsz; + int bc_psectsz; + int bc_psectoff; + int bc_closing; + int bc_paused; + int bc_work_count; + pthread_t bc_btid[BLOCKIF_NUMTHR]; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; + pthread_cond_t bc_paused_cond; + pthread_cond_t bc_work_done_cond; + + /* Request elements and free/pending/busy queues */ + TAILQ_HEAD(, blockif_elem) bc_freeq; + TAILQ_HEAD(, blockif_elem) bc_pendq; + TAILQ_HEAD(, blockif_elem) bc_busyq; + struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; +}; + +static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; + +struct blockif_sig_elem { + pthread_mutex_t bse_mtx; + pthread_cond_t bse_cond; + int bse_pending; + struct blockif_sig_elem *bse_next; +}; + +static struct blockif_sig_elem *blockif_bse_head; + +static int +blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + struct blockif_elem *be, *tbe; + off_t off; + int i; + + be = TAILQ_FIRST(&bc->bc_freeq); + assert(be != NULL); + assert(be->be_status == BST_FREE); + TAILQ_REMOVE(&bc->bc_freeq, be, be_link); + be->be_req = breq; + be->be_op = op; + switch (op) { + case BOP_READ: + case BOP_WRITE: + case BOP_DELETE: + off = breq->br_offset; + for (i = 0; i < breq->br_iovcnt; i++) + off += breq->br_iov[i].iov_len; + break; + default: + off = OFF_MAX; + } + be->be_block = off; + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + if (tbe == NULL) { + TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + } + if (tbe == NULL) + be->be_status = BST_PEND; + else + be->be_status = BST_BLOCK; + TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); + return (be->be_status == BST_PEND); +} + 
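+/*
+ * Illustrative usage sketch (the names below are hypothetical): a block-device
+ * emulation is expected to open its backing store once and then submit
+ * asynchronous requests, e.g.:
+ *
+ *	struct blockif_ctxt *bc = blockif_open("disk.img,ro", "vtbd0");
+ *	struct blockif_req br;
+ *
+ *	br.br_iov[0].iov_base = buf;
+ *	br.br_iov[0].iov_len = 512;
+ *	br.br_iovcnt = 1;
+ *	br.br_offset = 0;
+ *	br.br_resid = 512;
+ *	br.br_callback = my_done;	// invoked when the I/O completes
+ *	br.br_param = my_cookie;
+ *	if (blockif_read(bc, &br) == E2BIG)
+ *		// queue full: retry after an outstanding request completes
+ *
+ * blockif_enqueue() above defers a new request (BST_BLOCK) while another
+ * queued request ends at the offset where the new one starts, so
+ * back-to-back I/O to consecutive offsets completes in submission order.
+ */
+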
+static int +blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) +{ + struct blockif_elem *be; + + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_status == BST_PEND) + break; + assert(be->be_status == BST_BLOCK); + } + if (be == NULL) + return (0); + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + be->be_status = BST_BUSY; + be->be_tid = t; + TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); + *bep = be; + return (1); +} + +static void +blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) +{ + struct blockif_elem *tbe; + + if (be->be_status == BST_DONE || be->be_status == BST_BUSY) + TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + else + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_req->br_offset == be->be_block) + tbe->be_status = BST_PEND; + } + be->be_tid = 0; + be->be_status = BST_FREE; + be->be_req = NULL; + TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); +} + +static int +blockif_flush_bc(struct blockif_ctxt *bc) +{ + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + return (errno); + } else if (fsync(bc->bc_fd)) + return (errno); + + return (0); +} + +static void +blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) +{ + struct blockif_req *br; + off_t arg[2]; + ssize_t clen, len, off, boff, voff; + int i, err; + + br = be->be_req; + if (br->br_iovcnt <= 1) + buf = NULL; + err = 0; + switch (be->be_op) { + case BOP_READ: + if (buf == NULL) { + if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + if (pread(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(br->br_iov[i].iov_base + voff, + buf + boff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + off += len; + br->br_resid -= len; + } + break; + case BOP_WRITE: + if (bc->bc_rdonly) { + err = EROFS; + break; + } + if (buf == NULL) { + if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(buf + boff, + br->br_iov[i].iov_base + voff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + if (pwrite(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + off += len; + br->br_resid -= len; + } + break; + case BOP_FLUSH: + err = blockif_flush_bc(bc); + break; + case BOP_DELETE: + if (!bc->bc_candelete) + err = EOPNOTSUPP; + else if (bc->bc_rdonly) + err = EROFS; + else if (bc->bc_ischr) { + arg[0] = br->br_offset; + arg[1] = br->br_resid; + if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) + err = errno; + else + br->br_resid = 0; + } else + err = EOPNOTSUPP; + break; + default: + err = EINVAL; + break; + } + + be->be_status = BST_DONE; + + (*br->br_callback)(br, err); +} + +static void * +blockif_thr(void *arg) +{ + struct blockif_ctxt *bc; + struct blockif_elem *be; + pthread_t t; + uint8_t *buf; + + bc = arg; + if (bc->bc_isgeom) + buf = malloc(MAXPHYS); + else + buf = NULL; 
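+
+	/*
+	 * Note on the loop below: each worker advertises itself as busy
+	 * (bc_work_count), drains pending requests unless the interface is
+	 * paused, and wakes any pausing thread via bc_work_done_cond once no
+	 * worker is active.  GEOM-backed contexts get the private MAXPHYS
+	 * bounce buffer allocated above, which blockif_proc() uses to turn
+	 * multi-iovec requests into plain pread()/pwrite() calls of up to
+	 * MAXPHYS bytes each.
+	 */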
+ t = pthread_self(); + + pthread_mutex_lock(&bc->bc_mtx); + for (;;) { + bc->bc_work_count++; + + /* We cannot process work if the interface is paused */ + while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) { + pthread_mutex_unlock(&bc->bc_mtx); + blockif_proc(bc, be, buf); + pthread_mutex_lock(&bc->bc_mtx); + blockif_complete(bc, be); + } + + bc->bc_work_count--; + + /* If none of the workers are busy, notify the main thread */ + if (bc->bc_work_count == 0) + pthread_cond_broadcast(&bc->bc_work_done_cond); + + /* Check ctxt status here to see if exit requested */ + if (bc->bc_closing) + break; + + /* Make all worker threads wait here if the device is paused */ + while (bc->bc_paused) + pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx); + + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); + } + pthread_mutex_unlock(&bc->bc_mtx); + + if (buf) + free(buf); + pthread_exit(NULL); + return (NULL); +} + +static void +blockif_sigcont_handler(int signal, enum ev_type type, void *arg) +{ + struct blockif_sig_elem *bse; + + for (;;) { + /* + * Process the entire list even if not intended for + * this thread. + */ + do { + bse = blockif_bse_head; + if (bse == NULL) + return; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)bse, + (uintptr_t)bse->bse_next)); + + pthread_mutex_lock(&bse->bse_mtx); + bse->bse_pending = 0; + pthread_cond_signal(&bse->bse_cond); + pthread_mutex_unlock(&bse->bse_mtx); + } +} + +static void +blockif_init(void) +{ + mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); + (void) signal(SIGCONT, SIG_IGN); +} + +struct blockif_ctxt * +blockif_open(const char *optstr, const char *ident) +{ + char tname[MAXCOMLEN + 1]; + char name[MAXPATHLEN]; + char *nopt, *xopts, *cp; + struct blockif_ctxt *bc; + struct stat sbuf; + struct diocgattr_arg arg; + off_t size, psectsz, psectoff; + int extra, fd, i, sectsz; + int nocache, sync, ro, candelete, geom, ssopt, pssopt; + int nodelete; + +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; +#endif + + pthread_once(&blockif_once, blockif_init); + + fd = -1; + ssopt = 0; + nocache = 0; + sync = 0; + ro = 0; + nodelete = 0; + + /* + * The first element in the optstring is always a pathname. + * Optional elements follow + */ + nopt = xopts = strdup(optstr); + while (xopts != NULL) { + cp = strsep(&xopts, ","); + if (cp == nopt) /* file or device pathname */ + continue; + else if (!strcmp(cp, "nocache")) + nocache = 1; + else if (!strcmp(cp, "nodelete")) + nodelete = 1; + else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) + sync = 1; + else if (!strcmp(cp, "ro")) + ro = 1; + else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) + ; + else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) + pssopt = ssopt; + else { + EPRINTLN("Invalid device option \"%s\"", cp); + goto err; + } + } + + extra = 0; + if (nocache) + extra |= O_DIRECT; + if (sync) + extra |= O_SYNC; + + fd = open(nopt, (ro ? 
O_RDONLY : O_RDWR) | extra); + if (fd < 0 && !ro) { + /* Attempt a r/w fail with a r/o open */ + fd = open(nopt, O_RDONLY | extra); + ro = 1; + } + + if (fd < 0) { + warn("Could not open backing file: %s", nopt); + goto err; + } + + if (fstat(fd, &sbuf) < 0) { + warn("Could not stat backing file %s", nopt); + goto err; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, + CAP_WRITE); + if (ro) + cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); + + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Deal with raw devices + */ + size = sbuf.st_size; + sectsz = DEV_BSIZE; + psectsz = psectoff = 0; + candelete = geom = 0; + if (S_ISCHR(sbuf.st_mode)) { + if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || + ioctl(fd, DIOCGSECTORSIZE, §sz)) { + perror("Could not fetch dev blk/sector size"); + goto err; + } + assert(size != 0); + assert(sectsz != 0); + if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) + ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); + strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); + arg.len = sizeof(arg.value.i); + if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) + candelete = arg.value.i; + if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) + geom = 1; + } else + psectsz = sbuf.st_blksize; + +#ifndef WITHOUT_CAPSICUM + if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (ssopt != 0) { + if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || + ssopt > pssopt) { + EPRINTLN("Invalid sector size %d/%d", + ssopt, pssopt); + goto err; + } + + /* + * Some backend drivers (e.g. cd0, ada0) require that the I/O + * size be a multiple of the device's sector size. + * + * Validate that the emulated sector size complies with this + * requirement. 
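+		 * For example, sectorsize=4096 is accepted on a 512-byte
+		 * native device (4096 is a multiple of 512), while
+		 * sectorsize=512 on a 4096-byte native device is rejected.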
+ */ + if (S_ISCHR(sbuf.st_mode)) { + if (ssopt < sectsz || (ssopt % sectsz) != 0) { + EPRINTLN("Sector size %d incompatible " + "with underlying device sector size %d", + ssopt, sectsz); + goto err; + } + } + + sectsz = ssopt; + psectsz = pssopt; + psectoff = 0; + } + + bc = calloc(1, sizeof(struct blockif_ctxt)); + if (bc == NULL) { + perror("calloc"); + goto err; + } + + bc->bc_magic = BLOCKIF_SIG; + bc->bc_fd = fd; + bc->bc_ischr = S_ISCHR(sbuf.st_mode); + bc->bc_isgeom = geom; + bc->bc_candelete = candelete; + bc->bc_rdonly = ro; + bc->bc_size = size; + bc->bc_sectsz = sectsz; + bc->bc_psectsz = psectsz; + bc->bc_psectoff = psectoff; + pthread_mutex_init(&bc->bc_mtx, NULL); + pthread_cond_init(&bc->bc_cond, NULL); + bc->bc_paused = 0; + bc->bc_work_count = 0; + pthread_cond_init(&bc->bc_paused_cond, NULL); + pthread_cond_init(&bc->bc_work_done_cond, NULL); + TAILQ_INIT(&bc->bc_freeq); + TAILQ_INIT(&bc->bc_pendq); + TAILQ_INIT(&bc->bc_busyq); + for (i = 0; i < BLOCKIF_MAXREQ; i++) { + bc->bc_reqs[i].be_status = BST_FREE; + TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); + } + + for (i = 0; i < BLOCKIF_NUMTHR; i++) { + pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); + snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); + pthread_set_name_np(bc->bc_btid[i], tname); + } + + return (bc); +err: + if (fd >= 0) + close(fd); + free(nopt); + return (NULL); +} + +static int +blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + int err; + + err = 0; + + pthread_mutex_lock(&bc->bc_mtx); + if (!TAILQ_EMPTY(&bc->bc_freeq)) { + /* + * Enqueue and inform the block i/o thread + * that there is work available + */ + if (blockif_enqueue(bc, breq, op)) + pthread_cond_signal(&bc->bc_cond); + } else { + /* + * Callers are not allowed to enqueue more than + * the specified blockif queue limit. Return an + * error to indicate that the queue length has been + * exceeded. + */ + err = E2BIG; + } + pthread_mutex_unlock(&bc->bc_mtx); + + return (err); +} + +int +blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_READ)); +} + +int +blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_WRITE)); +} + +int +blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_FLUSH)); +} + +int +blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_DELETE)); +} + +int +blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + struct blockif_elem *be; + + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + /* XXX: not waiting while paused */ + + /* + * Check pending requests. + */ + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_req == breq) + break; + } + if (be != NULL) { + /* + * Found it. + */ + blockif_complete(bc, be); + pthread_mutex_unlock(&bc->bc_mtx); + + return (0); + } + + /* + * Check in-flight requests. + */ + TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { + if (be->be_req == breq) + break; + } + if (be == NULL) { + /* + * Didn't find it. + */ + pthread_mutex_unlock(&bc->bc_mtx); + return (EINVAL); + } + + /* + * Interrupt the processing thread to force it return + * prematurely via it's normal callback path. 
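+	 * Mechanism: push a blockif_sig_elem onto the lock-free
+	 * blockif_bse_head list and deliver SIGCONT to the worker with
+	 * pthread_kill(); the mevent-registered blockif_sigcont_handler()
+	 * then walks the list and wakes each waiter, and the loop re-checks
+	 * be_status until the worker has finished the request through its
+	 * callback.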
+ */ + while (be->be_status == BST_BUSY) { + struct blockif_sig_elem bse, *old_head; + + pthread_mutex_init(&bse.bse_mtx, NULL); + pthread_cond_init(&bse.bse_cond, NULL); + + bse.bse_pending = 1; + + do { + old_head = blockif_bse_head; + bse.bse_next = old_head; + } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, + (uintptr_t)old_head, + (uintptr_t)&bse)); + + pthread_kill(be->be_tid, SIGCONT); + + pthread_mutex_lock(&bse.bse_mtx); + while (bse.bse_pending) + pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); + pthread_mutex_unlock(&bse.bse_mtx); + } + + pthread_mutex_unlock(&bc->bc_mtx); + + /* + * The processing thread has been interrupted. Since it's not + * clear if the callback has been invoked yet, return EBUSY. + */ + return (EBUSY); +} + +int +blockif_close(struct blockif_ctxt *bc) +{ + void *jval; + int i; + + assert(bc->bc_magic == BLOCKIF_SIG); + + /* + * Stop the block i/o thread + */ + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_closing = 1; + pthread_mutex_unlock(&bc->bc_mtx); + pthread_cond_broadcast(&bc->bc_cond); + for (i = 0; i < BLOCKIF_NUMTHR; i++) + pthread_join(bc->bc_btid[i], &jval); + + /* XXX Cancel queued i/o's ??? */ + + /* + * Release resources + */ + bc->bc_magic = 0; + close(bc->bc_fd); + free(bc); + + return (0); +} + +/* + * Return virtual C/H/S values for a given block. Use the algorithm + * outlined in the VHD specification to calculate values. + */ +void +blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) +{ + off_t sectors; /* total sectors of the block dev */ + off_t hcyl; /* cylinders times heads */ + uint16_t secpt; /* sectors per track */ + uint8_t heads; + + assert(bc->bc_magic == BLOCKIF_SIG); + + sectors = bc->bc_size / bc->bc_sectsz; + + /* Clamp the size to the largest possible with CHS */ + if (sectors > 65535UL*16*255) + sectors = 65535UL*16*255; + + if (sectors >= 65536UL*16*63) { + secpt = 255; + heads = 16; + hcyl = sectors / secpt; + } else { + secpt = 17; + hcyl = sectors / secpt; + heads = (hcyl + 1023) / 1024; + + if (heads < 4) + heads = 4; + + if (hcyl >= (heads * 1024) || heads > 16) { + secpt = 31; + heads = 16; + hcyl = sectors / secpt; + } + if (hcyl >= (heads * 1024)) { + secpt = 63; + heads = 16; + hcyl = sectors / secpt; + } + } + + *c = hcyl / heads; + *h = heads; + *s = secpt; +} + +/* + * Accessors + */ +off_t +blockif_size(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_size); +} + +int +blockif_sectsz(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_sectsz); +} + +void +blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + *size = bc->bc_psectsz; + *off = bc->bc_psectoff; +} + +int +blockif_queuesz(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (BLOCKIF_MAXREQ - 1); +} + +int +blockif_is_ro(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_rdonly); +} + +int +blockif_candelete(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_candelete); +} + +#ifdef BHYVE_SNAPSHOT +void +blockif_pause(struct blockif_ctxt *bc) +{ + assert(bc != NULL); + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_paused = 1; + + /* The interface is paused. 
Wait for workers to finish their work */ + while (bc->bc_work_count) + pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx); + pthread_mutex_unlock(&bc->bc_mtx); + + if (blockif_flush_bc(bc)) + fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n", + __func__); +} + +void +blockif_resume(struct blockif_ctxt *bc) +{ + assert(bc != NULL); + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_paused = 0; + /* resume the threads waiting for paused */ + pthread_cond_broadcast(&bc->bc_paused_cond); + /* kick the threads after restore */ + pthread_cond_broadcast(&bc->bc_cond); + pthread_mutex_unlock(&bc->bc_mtx); +} + +int +blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta) +{ + int i; + struct iovec *iov; + int ret; + + SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done); + + /* + * XXX: The callback and parameter must be filled by the virtualized + * device that uses the interface, during its init; we're not touching + * them here. + */ + + /* Snapshot the iovecs. */ + for (i = 0; i < br->br_iovcnt; i++) { + iov = &br->br_iov[i]; + + SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done); + + /* We assume the iov is a guest-mapped address. */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len, + false, meta, ret, done); + } + +done: + return (ret); +} + +int +blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta) +{ + int ret; + + if (bc->bc_paused == 0) { + fprintf(stderr, "%s: Snapshot failed: " + "interface not paused.\r\n", __func__); + return (ENXIO); + } + + pthread_mutex_lock(&bc->bc_mtx); + + SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done); + +done: + pthread_mutex_unlock(&bc->bc_mtx); + return (ret); +} +#endif Index: usr.sbin/bhyve/mmio/mmio_emul.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_emul.h @@ -0,0 +1,116 @@ +#ifndef _EMUL_H_ +#define _EMUL_H_ + +#include + +#include + +struct vmctx; +struct mmio_devinst; + +// TODO suggestive naming +struct mmio_devemu { + char *de_emu; /* Device emulation name */ + + /* Instance creation */ + int (*de_init)(struct vmctx *ctx, struct mmio_devinst *di, + char *opts); + + /* Read / Write callbacks */ + void (*de_write)(struct vmctx *ctx, int vcpu, + struct mmio_devinst *di, int baridx, + uint64_t offset, int size, uint64_t val); + + uint64_t (*de_read)(struct vmctx *ctx, int vcpu, + struct mmio_devinst *di, int baridx, + uint64_t offset, int size); +}; + +#define MMIO_EMUL_SET(x) DATA_SET(mmio_set, x); +#define DI_NAMESZ 40 +#define MMIO_REGMAX 0xff +#define MMIO_REGNUM (MMIO_REGMAX + 1) + +struct devinst_addr { + uint64_t baddr; + uint64_t size; +}; + +enum lintr_stat { + IDLE, + ASSERTED, + PENDING +}; + +// TODO suggestive naming +struct mmio_devinst { + struct mmio_devemu *pi_d; /* Back ref to device */ + struct vmctx *pi_vmctx; /* Owner VM context */ + /* 
unused for mmio device emulation; may be used as uniquifiers */ + int pi_slot, di_func; + + char pi_name[DI_NAMESZ]; /* Instance name */ + + struct { + enum lintr_stat state; + int64_t irq; + pthread_mutex_t lock; + } di_lintr; + + void *pi_arg; /* Private data */ + + u_char pi_cfgregs[MMIO_REGNUM];/* Config regsters */ + + struct devinst_addr addr; /* Address info */ +}; + +int mmio_parse_opts(const char *args); +int mmio_alloc_mem(struct mmio_devinst *di); +int init_mmio(struct vmctx *ctx); +void mmio_lintr_request(struct mmio_devinst *di); +void mmio_lintr_assert(struct mmio_devinst *di); +void mmio_lintr_deassert(struct mmio_devinst *di); + +static __inline void +mmio_set_cfgreg8(struct mmio_devinst *di, size_t offset, uint32_t val) +{ + assert(offset <= MMIO_REGMAX); + *(uint32_t *)(di->pi_cfgregs + offset) = val; +} + +static __inline void +mmio_set_cfgreg16(struct mmio_devinst *di, size_t offset, uint32_t val) +{ + assert(offset <= (MMIO_REGMAX - 1) && (offset & 1) == 0); + *(uint32_t *)(di->pi_cfgregs + offset) = val; +} + +static __inline void +mmio_set_cfgreg32(struct mmio_devinst *di, size_t offset, uint32_t val) +{ + assert(offset <= (MMIO_REGMAX - 3) && (offset & 3) == 0); + *(uint32_t *)(di->pi_cfgregs + offset) = val; +} + +static __inline uint8_t +mmio_get_cfgreg8(struct mmio_devinst *di, size_t offset) +{ + assert(offset <= MMIO_REGMAX); + return (*(uint32_t *)(di->pi_cfgregs + offset)); +} + +static __inline uint16_t +mmio_get_cfgreg16(struct mmio_devinst *di, size_t offset) +{ + assert(offset <= (MMIO_REGMAX - 1) && (offset & 1) == 0); + return (*(uint32_t *)(di->pi_cfgregs + offset)); +} + +static __inline uint32_t +mmio_get_cfgreg32(struct mmio_devinst *di, size_t offset) +{ + assert(offset <= (MMIO_REGMAX - 3) && (offset & 3) == 0); + return (*(uint32_t *)(di->pi_cfgregs + offset)); +} + +#endif /* _EMUL_H_ */ Index: usr.sbin/bhyve/mmio/mmio_emul.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_emul.c @@ -0,0 +1,440 @@ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "arm64/mem.h" +#include "mmio_emul.h" +#include "mmio_irq.h" + +#define DEVEMU_MEMLIMIT 0xFD00000000UL +#define DEVEMU_MEMBASE 0xD000000000UL +#define MEM_ROUNDUP (1 << 20) +#ifndef max +# define max(A, B) ((A) > (B) ? 
(A) : (B)) +#endif + +static uint64_t mmio_membase; + +SET_DECLARE(mmio_set, struct mmio_devemu); + +static struct mmio_devemu *mmio_finddef(const char *name); +static void mmio_lintr_route(struct mmio_devinst *di); +static void mmio_lintr_update(struct mmio_devinst *di); + +static struct mmio_emul_info { + uint64_t size; /* address size */ + uint64_t baddr; /* address */ + int64_t irq; /* device interrupt number */ + char *name; /* device name */ + char *arg; /* device arguments */ + struct mmio_emul_info *next; /* pointer for linked list */ + struct mmio_devinst *di; /* pointer to device instance */ +} *mmio_emul_info_head = NULL; + +/* + * MMIO options are in the form: + * + * @#:[,] + * + * - size is the number of bytes required for the device mmio + * - base_addr is the base address for the MMIO mapped device; + * - irq specifies the device interrupt number the value MUST be a DECIMAL + * integer; if the device does not use interrupts, use -1 + * - emul is a string describing the type of device - e.g., virtio-net; + * - config is an optional string, depending on the device, that is used + * for configuration + * + * Examples of use: + * 0x200@0x100000#25:virtio-net,tap0 + * 0x100@0x200000#-1:dummy + */ +static void +mmio_parse_opts_usage(const char *args) +{ + fprintf(stderr, "Invalid mmio arguments \"%s\"\r\n", args); +} + +/* + * checks if two memory regions overlap + * checks are not required if one of the pointers is null + */ +static int +mmio_mem_overlap(uint64_t pa, uint64_t sa, uint64_t pb, uint64_t sb) +{ +#define IN_INTERVAL(lower, value, upper) \ + (((lower) < (value)) && ((value) < (upper))) + + if ((pa == 0) || (pb == 0)) + return 0; + + if (IN_INTERVAL(pa, pb, pa + sa) && + IN_INTERVAL(pb, pa, pb + sb)) + return 1; + + return 0; + +#undef IN_INTERVAL +} + +int +mmio_parse_opts(const char *args) +{ + char *emul, *config, *str; + uint64_t size, baddr; + int64_t irq; + int error; + struct mmio_emul_info *dif; + + error = -1; + emul = config = NULL; + baddr = 0, size = 0; + str = strdup(args); + + if ((emul = strchr(str, ':')) != NULL) { + *emul++ = '\0'; + + /* @# */ + if (sscanf(str, "%jx@%jx#%jd", &size, &baddr, &irq) != 3 && + sscanf(str, "%jx@%jx#%jd", &size, &baddr, &irq) != 3) { + mmio_parse_opts_usage(str); + goto parse_error; + } + } else { + mmio_parse_opts_usage(str); + goto parse_error; + } + + if ((config = strchr(emul, ',')) != NULL) + *config++ = '\0'; + + /* + * check if the required address can be obtained; + * if an address has not been requested, ignore the checks + * (however, an address will have to be later identified) + */ + if (baddr != 0) { + for (dif = mmio_emul_info_head; dif != NULL; dif = dif->next) + if (mmio_mem_overlap(dif->baddr, dif->size, + baddr, size)) + break; + + if (dif != NULL) { + fprintf(stderr, "The requested address 0x%jx is " + "already bound or overlapping\r\n", baddr); + error = EINVAL; + goto parse_error; + } + } + + dif = calloc(1, sizeof(struct mmio_emul_info)); + if (dif == NULL) { + error = ENOMEM; + goto parse_error; + } + + dif->next = mmio_emul_info_head; + mmio_emul_info_head = dif; + + dif->size = size; + dif->baddr = baddr; + dif->irq = irq; + if ((emul != NULL) && (strlen(emul)) > 0) + dif->name = strdup(emul); + else + dif->name = NULL; + if ((config != NULL) && (strlen(config)) > 0) + dif->arg = strdup(config); + else + dif->arg = NULL; + + error = 0; + +parse_error: + free(str); + + return error; +} + +static int +mmio_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, 
void *arg1, long arg2) +{ + struct mmio_devinst *di = arg1; + struct mmio_devemu *de = di->pi_d; + uint64_t offset; + int bidx = (int) arg2; + + assert(di->addr.baddr <= addr && + addr + size <= di->addr.baddr + di->addr.size); + + offset = addr - di->addr.baddr; + + if (dir == MEM_F_WRITE) { + if (size == 8) { + (*de->de_write)(ctx, vcpu, di, bidx, offset, + 4, *val & 0xffffffff); + (*de->de_write)(ctx, vcpu, di, bidx, offset + 4, + 4, *val >> 32); + } else { + (*de->de_write)(ctx, vcpu, di, bidx, offset, + size, *val); + } + } else { + if (size == 8) { + *val = (*de->de_read)(ctx, vcpu, di, bidx, + offset, 4); + *val |= (*de->de_read)(ctx, vcpu, di, bidx, + offset + 4, 4) << 32; + } else { + *val = (*de->de_read)(ctx, vcpu, di, bidx, + offset, size); + } + } + + return (0); +} + +static void +modify_mmio_registration(struct mmio_devinst *di, int registration) +{ + int error; + struct mem_range mr; + + bzero(&mr, sizeof(struct mem_range)); + mr.name = di->pi_name; + mr.base = di->addr.baddr; + mr.size = di->addr.size; + if (registration) { + mr.flags = MEM_F_RW; + mr.handler = mmio_mem_handler; + mr.arg1 = di; + mr.arg2 = 0; + error = register_mem(&mr); + } else { + error = unregister_mem(&mr); + } + + assert(error == 0); +} + +static void +register_mmio(struct mmio_devinst *di) +{ + return modify_mmio_registration(di, 1); +} + +static void +unregister_mmio(struct mmio_devinst *di) +{ + return modify_mmio_registration(di, 0); +} + +/* + * Update the MMIO address that is decoded + */ +static void +update_mem_address(struct mmio_devinst *di, uint64_t addr) +{ + /* TODO: check if the decoding is running */ + unregister_mmio(di); + + di->addr.baddr = addr; + + register_mmio(di); +} + +static int +mmio_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, + uint64_t *addr) +{ + uint64_t base; + + assert((size & (size - 1)) == 0); /* must be a power of 2 */ + + base = roundup2(*baseptr, size); + + if (base + size <= limit) { + *addr = base; + *baseptr = base + size; + return (0); + } else + return (-1); +} + +int +mmio_alloc_mem(struct mmio_devinst *di) +{ + int error; + uint64_t *baseptr, limit, addr, size; + + baseptr = &di->addr.baddr; + size = di->addr.size; + limit = DEVEMU_MEMLIMIT; + + if ((size & (size - 1)) != 0) + /* Round up to a power of 2 */ + size = 1UL << flsl(size); + + error = mmio_alloc_resource(baseptr, limit, size, &addr); + if (error != 0) + return (error); + + di->addr.baddr = addr; + + register_mmio(di); + + return (0); +} + +static struct mmio_devemu * +mmio_finddev(char *name) +{ + struct mmio_devemu **dpp, *dp; + + SET_FOREACH(dpp, mmio_set) { + dp = *dpp; + if (!strcmp(dp->de_emu, name)) + return (dp); + } + + return (NULL); +} + +static int +mmio_init(struct vmctx *ctx, struct mmio_devemu *de, struct mmio_emul_info *dif) +{ + struct mmio_devinst *di; + int error; + + di = calloc(1, sizeof(struct mmio_devinst)); + if (di == NULL) + return (ENOMEM); + + di->pi_d = de; + di->pi_vmctx = ctx; + snprintf(di->pi_name, DI_NAMESZ, "%s-mmio", de->de_emu); + di->di_lintr.state = IDLE; + di->di_lintr.irq = dif->irq; + pthread_mutex_init(&di->di_lintr.lock, NULL); + di->addr.baddr = dif->baddr; + di->addr.size = dif->size; + /* some devices (e.g., virtio-net) use these as uniquifiers; irq number + * should be unique and sufficient */ + di->pi_slot = dif->irq; + di->di_func = dif->irq; + + error = (*de->de_init)(ctx, di, dif->arg); + + if (error == 0) { + dif->di = di; + } else { + fprintf(stderr, "Device \"%s\": initialization failed\r\n", + di->pi_name); + 
fprintf(stderr, "Device arguments were: %s\r\n", dif->arg); + free(di); + } + + return (error); +} + +static void +init_mmio_error(const char *name) +{ + struct mmio_devemu **mdpp, *mdp; + + fprintf(stderr, "Device \"%s\" does not exist\r\n", name); + fprintf(stderr, "The following devices are available:\r\n"); + + SET_FOREACH(mdpp, mmio_set) { + mdp = *mdpp; + fprintf(stderr, "\t%s\r\n", mdp->de_emu); + } +} + +int init_mmio(struct vmctx *ctx) +{ + struct mmio_devemu *de; + struct mmio_emul_info *dif; + int error; + + mmio_membase = DEVEMU_MEMBASE; + + for (dif = mmio_emul_info_head; dif != NULL; dif = dif->next) { + if (dif->name == NULL) + continue; + + de = mmio_finddev(dif->name); + if (de == NULL) { + init_mmio_error(dif->name); + return (1); + } + + error = mmio_init(ctx, de, dif); + if (error != 0) + return (error); + + /* + * as specified in the amd64 implementation, add some + * slop to the memory resources decoded, in order to + * give the guest some flexibility to reprogram the addresses + */ + mmio_membase += MEM_ROUNDUP; + mmio_membase = roundup2(mmio_membase, MEM_ROUNDUP); + } + + /* activate the interrupts */ + for (dif = mmio_emul_info_head; dif != NULL; dif = dif->next) + if (dif->di != NULL) + mmio_lintr_route(dif->di); + + /* TODO: register fallback handlers? */ + + return (0); +} + +void +mmio_lintr_request(struct mmio_devinst *di) +{ + /* do nothing */ +} + +static void +mmio_lintr_route(struct mmio_devinst *di) +{ + /* do nothing */ +} + +void +mmio_lintr_assert(struct mmio_devinst *di) +{ + pthread_mutex_lock(&di->di_lintr.lock); + if (di->di_lintr.state == IDLE) { + di->di_lintr.state = ASSERTED; + mmio_irq_assert(di); + } + pthread_mutex_unlock(&di->di_lintr.lock); +} + +void +mmio_lintr_deassert(struct mmio_devinst *di) +{ + pthread_mutex_lock(&di->di_lintr.lock); + if (di->di_lintr.state == ASSERTED) { + mmio_irq_deassert(di); + di->di_lintr.state = IDLE; + } else if (di->di_lintr.state == PENDING) { + di->di_lintr.state = IDLE; + } + pthread_mutex_unlock(&di->di_lintr.lock); +} + +/* TODO: Add dummy? 
*/ Index: usr.sbin/bhyve/mmio/mmio_irq.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_irq.h @@ -0,0 +1,12 @@ +#ifndef __MMIO_IRQ_H__ +#define __MMIO_IRQ_H__ + +struct mmio_devinst; + +void mmio_irq_init(struct vmctx *ctx); +void mmio_irq_reserve(int irq); +void mmio_irq_use(int irq); +void mmio_irq_assert(struct mmio_devinst *di); +void mmio_irq_deassert(struct mmio_devinst *di); + +#endif Index: usr.sbin/bhyve/mmio/mmio_irq.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_irq.c @@ -0,0 +1,113 @@ +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include + +#include "mmio_emul.h" +#include "mmio_irq.h" +#include "mmio_virtio.h" + +/* IRQ count to disable IRQ */ +#define IRQ_DISABLED 0xff + +static struct mmio_irq { + uint32_t use_count; /* number of binds */ + uint32_t active_count; /* number of asserts */ + uint32_t active; /* irq active */ + pthread_mutex_t lock; +} irqs[50]; + +void +mmio_irq_reserve(int irq) +{ + assert(irq >= 0 && irq < nitems(irqs)); + assert(irqs[irq].active == 0 || irqs[irq].active == IRQ_DISABLED); + irqs[irq].active = IRQ_DISABLED; +} + +void +mmio_irq_use(int irq) { + assert(irq >= 0 && irq < nitems(irqs)); + assert(irqs[irq].active != IRQ_DISABLED); + irqs[irq].active++; +} + +void +mmio_irq_init(struct vmctx *ctx) +{ + int i; + + for (i = 0; i < nitems(irqs); ++i) { + irqs[i].use_count = 0; + irqs[i].active_count = 0; + irqs[i].active = 0; + pthread_mutex_init(&irqs[i].lock, NULL); + } +} + +void +mmio_irq_assert(struct mmio_devinst *di) +{ + struct mmio_irq *irq; + uint32_t irq_status; + + assert(di->di_lintr.irq <= nitems(irqs)); + if (di->di_lintr.irq < 0) + return; + + irq = &irqs[di->di_lintr.irq]; + + pthread_mutex_lock(&irq->lock); + irq->active_count++; + + pthread_mutex_lock(&di->di_lintr.lock); + + irq_status = mmio_get_cfgreg32(di, VIRTIO_MMIO_INTERRUPT_STATUS); + irq_status |= VIRTIO_MMIO_INT_VRING; + mmio_set_cfgreg32(di, VIRTIO_MMIO_INTERRUPT_STATUS, irq_status); + + if (irq->active_count == 1) + vm_assert_irq(di->pi_vmctx, di->di_lintr.irq, 0); + + pthread_mutex_unlock(&di->di_lintr.lock); + + pthread_mutex_unlock(&irq->lock); +} + +void +mmio_irq_deassert(struct mmio_devinst *di) +{ + struct mmio_irq *irq; + uint32_t irq_status; + + assert(di->di_lintr.irq <= nitems(irqs)); + if (di->di_lintr.irq < 0) + return; + + irq = &irqs[di->di_lintr.irq]; + + pthread_mutex_lock(&irq->lock); + irq->active_count--; + + pthread_mutex_lock(&di->di_lintr.lock); + + irq_status = mmio_get_cfgreg32(di, VIRTIO_MMIO_INTERRUPT_STATUS); + irq_status &= ~VIRTIO_MMIO_INT_VRING; + mmio_set_cfgreg32(di, VIRTIO_MMIO_INTERRUPT_STATUS, irq_status); + +#if 0 + /* MMIO devices do not require deassertions */ + if (irq->active_count == 0) + vm_deassert_irq(di->di_vmctx, di->di_lintr.irq); +#endif + + pthread_mutex_unlock(&di->di_lintr.lock); + + pthread_mutex_unlock(&irq->lock); +} Index: usr.sbin/bhyve/mmio/mmio_uart.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_uart.c @@ -0,0 +1,112 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright TODO + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include + +#include +#include + +#include "bhyverun.h" +#include "debug.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" +#include "uart_emul.h" + +static void +mmio_uart_intr_assert(void *arg, uint32_t irq) +{ + struct vmctx *ctx = arg; + + vm_assert_irq(ctx, irq, 0); +} + +static void +mmio_uart_intr_deassert(void *arg, uint32_t irq) +{ + struct vmctx *ctx = arg; + + vm_deassert_irq(ctx, irq, 0); +} + +static int +mmio_uart_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + struct uart_softc *sc; + + sc = uart_init(mmio_uart_intr_assert, mmio_uart_intr_deassert, ctx); + if (uart_set_backend(sc, "stdio") != 0) { + EPRINTLN("Unable to initialize backend '%s' for " + "mmio_uart", "stdio"); + return (-1); + } + + pi->pi_arg = sc; + sc->irqno = pi->di_lintr.irq; + + mmio_alloc_mem(pi); + + return (0); +} + +static void +mmio_uart_write(struct vmctx *ctx, int vcpu, struct mmio_devinst *di, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct uart_softc *sc = di->pi_arg; + long reg = offset >> 2; + + uart_write(sc, reg, value); +} + +uint64_t +mmio_uart_read(struct vmctx *ctx, int vcpu, struct mmio_devinst *di, + int baridx, uint64_t offset, int size) +{ + struct uart_softc *sc = di->pi_arg; + long reg = offset >> 2; + + return uart_read(sc, reg); +} + +struct mmio_devemu mmio_uart = { + .de_emu = "mmio-uart", + .de_init = mmio_uart_init, + .de_write = mmio_uart_write, + .de_read = mmio_uart_read +}; +MMIO_EMUL_SET(mmio_uart); Index: usr.sbin/bhyve/mmio/mmio_virtio.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio.h @@ -0,0 +1,484 @@ +/*- + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_H_ +#define _VIRTIO_H_ + +#include + +/* + * These are derived from several virtio specifications. + * + * Some useful links: + * https://github.com/rustyrussell/virtio-spec + * http://people.redhat.com/pbonzini/virtio-spec.pdf + */ + +/* + * A virtual device has zero or more "virtual queues" (virtqueue). + * Each virtqueue uses at least two 4096-byte pages, laid out thus: + * + * +-----------------------------------------------+ + * | "desc": descriptors, 16 bytes each | + * | ----------------------------------------- | + * | "avail": 2 uint16; uint16; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * | "used": 2 x uint16; elems; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * + * The number that appears here is always a power of two and is + * limited to no more than 32768 (as it must fit in a 16-bit field). + * If is sufficiently large, the above will occupy more than + * two pages. In any case, all pages must be physically contiguous + * within the guest's physical address space. + * + * The 16-byte "desc" descriptors consist of a 64-bit guest + * physical address , a 32-bit length , a 16-bit + * , and a 16-bit field (all in guest byte order). + * + * There are three flags that may be set : + * NEXT descriptor is chained, so use its "next" field + * WRITE descriptor is for host to write into guest RAM + * (else host is to read from guest RAM) + * INDIRECT descriptor address field is (guest physical) + * address of a linear array of descriptors + * + * Unless INDIRECT is set, is the number of bytes that may + * be read/written from guest physical address . If + * INDIRECT is set, WRITE is ignored and provides the length + * of the indirect descriptors (and must be a multiple of + * 16). Note that NEXT may still be set in the main descriptor + * pointing to the indirect, and should be set in each indirect + * descriptor that uses the next descriptor (these should generally + * be numbered sequentially). However, INDIRECT must not be set + * in the indirect descriptors. Upon reaching an indirect descriptor + * without a NEXT bit, control returns to the direct descriptors. + * + * Except inside an indirect, each value must be in the + * range [0 .. N) (i.e., the half-open interval). (Inside an + * indirect, each must be in the range [0 .. /16).) + * + * The "avail" data structures reside in the same pages as the + * "desc" structures since both together are used by the device to + * pass information to the hypervisor's virtual driver. 
These + * begin with a 16-bit field and 16-bit index , then + * have 16-bit values, followed by one final 16-bit + * field . The entries are simply indices + * indices into the descriptor ring (and thus must meet the same + * constraints as each value). However, is counted + * up from 0 (initially) and simply wraps around after 65535; it + * is taken mod to find the next available entry. + * + * The "used" ring occupies a separate page or pages, and contains + * values written from the virtual driver back to the guest OS. + * This begins with a 16-bit and 16-bit , then there + * are "vring_used" elements, followed by a 16-bit . + * The "vring_used" elements consist of a 32-bit and a + * 32-bit (vu_tlen below). The is simply the index of + * the head of a descriptor chain the guest made available + * earlier, and the is the number of bytes actually written, + * e.g., in the case of a network driver that provided a large + * receive buffer but received only a small amount of data. + * + * The two event fields, and , in the + * avail and used rings (respectively -- note the reversal!), are + * always provided, but are used only if the virtual device + * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature + * negotiation. Similarly, both rings provide a flag -- + * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in + * their field, indicating that the guest does not need an + * interrupt, or that the hypervisor driver does not need a + * notify, when descriptors are added to the corresponding ring. + * (These are provided only for interrupt optimization and need + * not be implemented.) + */ + +#define VIRTIO_MMIO_MAGIC_VALUE 0x000 +#define VIRTIO_MMIO_VERSION 0x004 +#define VIRTIO_MMIO_DEVICE_ID 0x008 +#define VIRTIO_MMIO_VENDOR_ID 0x00c +#define VIRTIO_MMIO_HOST_FEATURES 0x010 +#define VIRTIO_MMIO_HOST_FEATURES_SEL 0x014 +#define VIRTIO_MMIO_GUEST_FEATURES 0x020 +#define VIRTIO_MMIO_GUEST_FEATURES_SEL 0x024 +#define VIRTIO_MMIO_GUEST_PAGE_SIZE 0x028 +#define VIRTIO_MMIO_QUEUE_SEL 0x030 +#define VIRTIO_MMIO_QUEUE_NUM_MAX 0x034 +#define VIRTIO_MMIO_QUEUE_NUM 0x038 +#define VIRTIO_MMIO_QUEUE_ALIGN 0x03c +#define VIRTIO_MMIO_QUEUE_PFN 0x040 +#define VIRTIO_MMIO_QUEUE_NOTIFY 0x050 +#define VIRTIO_MMIO_INTERRUPT_STATUS 0x060 +#define VIRTIO_MMIO_INTERRUPT_ACK 0x064 +#define VIRTIO_MMIO_STATUS 0x070 +#define VIRTIO_MMIO_CONFIG 0x100 +#define VIRTIO_MMIO_INT_VRING (1 << 0) +#define VIRTIO_MMIO_INT_CONFIG (1 << 1) +#define VIRTIO_MMIO_VRING_ALIGN 4096 + +#define VRING_ALIGN 4096 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +struct virtio_desc { /* AKA vring_desc */ + uint64_t vd_addr; /* guest physical address */ + uint32_t vd_len; /* length of scatter/gather seg */ + uint16_t vd_flags; /* VRING_F_DESC_* */ + uint16_t vd_next; /* next desc if F_NEXT */ +} __packed; + +struct virtio_used { /* AKA vring_used_elem */ + uint32_t vu_idx; /* head of used descriptor chain */ + uint32_t vu_tlen; /* length written-to */ +} __packed; + +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +struct vring_avail { + uint16_t va_flags; /* VRING_AVAIL_F_* */ + uint16_t va_idx; /* counts to 65535, then cycles */ + uint16_t va_ring[]; /* size N, reported in QNUM value */ +/* uint16_t va_used_event; -- after N ring entries */ +} __packed; + +#define VRING_USED_F_NO_NOTIFY 1 +struct vring_used { + uint16_t vu_flags; /* VRING_USED_F_* */ + uint16_t vu_idx; /* counts to 65535, then cycles */ + struct virtio_used vu_ring[]; /* size N */ +/* 
uint16_t vu_avail_event; -- after N ring entries */ +} __packed; + +/* + * The address of any given virtual queue is determined by a single + * Page Frame Number register. The guest writes the PFN into the + * PCI config space. However, a device that has two or more + * virtqueues can have a different PFN, and size, for each queue. + * The number of queues is determinable via the PCI config space + * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means + * queue #0, 1 means queue#1, etc. Once a queue is selected, the + * remaining PFN and QNUM registers refer to that queue. + * + * QNUM is a read-only register containing a nonzero power of two + * that indicates the (hypervisor's) queue size. Or, if reading it + * produces zero, the hypervisor does not have a corresponding + * queue. (The number of possible queues depends on the virtual + * device. The block device has just one; the network device + * provides either two -- 0 = receive, 1 = transmit -- or three, + * with 2 = control.) + * + * PFN is a read/write register giving the physical page address of + * the virtqueue in guest memory (the guest must allocate enough space + * based on the hypervisor's provided QNUM). + * + * QNOTIFY is effectively write-only: when the guest writes a queue + * number to the register, the hypervisor should scan the specified + * virtqueue. (Reading QNOTIFY currently always gets 0). + */ + +/* + * PFN register shift amount + */ +#define VRING_PFN 12 + +/* + * Virtio device types + * + * XXX Should really be merged with defines + */ +#define VIRTIO_TYPE_NET 1 +#define VIRTIO_TYPE_BLOCK 2 +#define VIRTIO_TYPE_CONSOLE 3 +#define VIRTIO_TYPE_ENTROPY 4 +#define VIRTIO_TYPE_BALLOON 5 +#define VIRTIO_TYPE_IOMEMORY 6 +#define VIRTIO_TYPE_RPMSG 7 +#define VIRTIO_TYPE_SCSI 8 +#define VIRTIO_TYPE_9P 9 + +/* experimental IDs start at 65535 and work down */ + +/* + * PCI vendor/device IDs + */ +#define VIRTIO_VENDOR 0x1AF4 +#define VIRTIO_DEV_NET 0x1000 +#define VIRTIO_DEV_BLOCK 0x1001 +#define VIRTIO_DEV_CONSOLE 0x1003 +#define VIRTIO_DEV_RANDOM 0x1005 + +#define VIRTIO_MMIO_MAGIC_NUM 0x74726976 +#define VIRTIO_MMIO_VERSION_NUM 0x1 + +/* + * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, + * but a guest writing 0 to this register means "please reset". + */ +#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */ +#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */ +#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */ +#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */ + +/* + * Bits in VTCFG_R_ISR. These apply only if not using MSI-X. + * + * (We don't [yet?] ever use CONF_CHANGED.) + */ +#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ +#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ + +#define VIRTIO_MSI_NO_VECTOR 0xFFFF + +/* + * Feature flags. + * Note: bits 0 through 23 are reserved to each device type. 
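+ * (For instance, the net device defines VIRTIO_NET_F_MAC as bit 5 of its
+ * device-specific range, whereas the transport flags below start at bit 24.)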
+ */ +#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28) +#define VIRTIO_RING_F_EVENT_IDX (1 << 29) + +/* From section 2.3, "Virtqueue Configuration", of the virtio specification */ +static inline size_t +vring_size(u_int qsz, uint32_t align) +{ + size_t size; + + /* constant 3 below = va_flags, va_idx, va_used_event */ + size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz); + size = roundup2(size, align); + + /* constant 3 below = vu_flags, vu_idx, vu_avail_event */ + size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz; + size = roundup2(size, align); + + return (size); +} + +struct vmctx; +struct mmio_devinst; +struct vqueue_info; + +/* + * A virtual device, with some number (possibly 0) of virtual + * queues and some size (possibly 0) of configuration-space + * registers private to the device. The virtio_softc should come + * at the front of each "derived class", so that a pointer to the + * virtio_softc is also a pointer to the more specific, derived- + * from-virtio driver's softc. + * + * Note: inside each hypervisor virtio driver, changes to these + * data structures must be locked against other threads, if any. + * Except for PCI config space register read/write, we assume each + * driver does the required locking, but we need a pointer to the + * lock (if there is one) for PCI config space read/write ops. + * + * When the guest reads or writes the device's config space, the + * generic layer checks for operations on the special registers + * described above. If the offset of the register(s) being read + * or written is past the CFG area (CFG0 or CFG1), the request is + * passed on to the virtual device, after subtracting off the + * generic-layer size. (So, drivers can just use the offset as + * an offset into "struct config", for instance.) + * + * (The virtio layer also makes sure that the read or write is to/ + * from a "good" config offset, hence vc_cfgsize, and on BAR #0. + * However, the driver must verify the read or write size and offset + * and that no one is writing a readonly register.) + * + * The BROKED flag ("this thing done gone and broked") is for future + * use. + */ +#define VIRTIO_USE_MSIX 0x01 +#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ +#define VIRTIO_BROKED 0x08 /* ??? 
*/ + +struct virtio_softc { + struct virtio_consts *vs_vc; /* constants (see below) */ + int vs_flags; /* VIRTIO_* flags from above */ + pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ + struct mmio_devinst *vs_di; /* device instance */ + uint32_t vs_negotiated_caps; /* negotiated capabilities */ + uint32_t vs_align; /* virtual queue alignment */ + struct vqueue_info *vs_queues; /* one per vc_nvq */ + int vs_curq; /* current queue */ + int irq; /* interrupt */ + uint8_t vs_status; /* value from last status write */ + uint32_t vs_guest_page_size; /* size of guest page in bytes */ +}; + +#define VS_LOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_lock(vs->vs_mtx); \ +} while (0) + +#define VS_UNLOCK(vs) \ +do { \ + if (vs->vs_mtx) \ + pthread_mutex_unlock(vs->vs_mtx); \ +} while (0) + +struct virtio_consts { + const char *vc_name; /* name of driver (for diagnostics) */ + int vc_nvq; /* number of virtual queues */ + size_t vc_cfgsize; /* size of dev-specific config regs */ + void (*vc_reset)(void *); /* called on virtual device reset */ + void (*vc_qnotify)(void *, struct vqueue_info *); + /* called on QNOTIFY if no VQ notify */ + int (*vc_cfgread)(void *, int, int, uint32_t *); + /* called to read config regs */ + int (*vc_cfgwrite)(void *, int, int, uint32_t); + /* called to write config regs */ + void (*vc_apply_features)(void *, uint64_t); + /* called to apply negotiated features */ + uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ +}; + +/* + * Data structure allocated (statically) per virtual queue. + * + * Drivers may change vq_qsize after a reset. When the guest OS + * requests a device reset, the hypervisor first calls + * vs->vs_vc->vc_reset(); then the data structure below is + * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). + * + * The remaining fields should only be fussed-with by the generic + * code. + * + * Note: the addresses of vq_desc, vq_avail, and vq_used are all + * computable from each other, but it's a lot simpler if we just + * keep a pointer to each one. The event indices are similarly + * (but more easily) computable, and this time we'll compute them: + * they're just XX_ring[N]. + */ +#define VQ_ALLOC 0x01 /* set once we have a pfn */ +#define VQ_BROKED 0x02 /* ??? */ +struct vqueue_info { + uint16_t vq_qsize; /* size of this queue (a power of 2) */ + void (*vq_notify)(void *, struct vqueue_info *); + /* called instead of vc_notify, if not NULL */ + + struct virtio_softc *vq_vs; /* backpointer to softc */ + uint16_t vq_num; /* we're the num'th queue in the softc */ + + uint16_t vq_flags; /* flags (see above) */ + uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */ + uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */ + + uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ + + volatile struct virtio_desc *vq_desc; /* descriptor array */ + volatile struct vring_avail *vq_avail; /* the "avail" ring */ + volatile struct vring_used *vq_used; /* the "used" ring */ +}; +/* as noted above, these are sort of backwards, name-wise */ +#define VQ_AVAIL_EVENT_IDX(vq) \ + (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize]) +#define VQ_USED_EVENT_IDX(vq) \ + ((vq)->vq_avail->va_ring[(vq)->vq_qsize]) + +/* + * Is this ring ready for I/O? + */ +static inline int +vq_ring_ready(struct vqueue_info *vq) +{ + + return (vq->vq_flags & VQ_ALLOC); +} + +/* + * Are there "available" descriptors? (This does not count + * how many, just returns True if there are some.) 
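+ *
+ * A typical queue-notify handler (illustrative sketch only) drains the ring:
+ *
+ *	struct iovec iov[8];
+ *	uint16_t idx;
+ *	int n;
+ *
+ *	while (vq_has_descs(vq)) {
+ *		n = vq_getchain(vq, &idx, iov, 8, NULL);
+ *		// ... perform the I/O described by the n segments ...
+ *		vq_relchain(vq, idx, len);	// len: bytes written back
+ *	}
+ *	vq_endchains(vq, 1);	// used everything; interrupt if needed
+ *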
+ */ +static inline int +vq_has_descs(struct vqueue_info *vq) +{ + + return (vq_ring_ready(vq) && vq->vq_last_avail != + vq->vq_avail->va_idx); +} + +/* + * Deliver an interrupt to guest on the given virtual queue + * (if possible, or a generic MSI interrupt if not using MSI-X). + */ +static inline void +vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) +{ + VS_LOCK(vs); + mmio_lintr_assert(vs->vs_di); + VS_UNLOCK(vs); +} + +static inline void +vq_kick_enable(struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; + /* + * Full memory barrier to make sure the store to vu_flags + * happens before the load from va_idx, which results from + * a subsequent call to vq_has_descs(). + */ + atomic_thread_fence_seq_cst(); +} + +static inline void +vq_kick_disable(struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; +} + +struct iovec; +void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct mmio_devinst *di, + struct vqueue_info *queues); +int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); +void vi_reset_dev(struct virtio_softc *); +void vi_set_io_res(struct virtio_softc *, int); + +int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, + struct iovec *iov, int n_iov, uint16_t *flags); +void vq_retchains(struct vqueue_info *vq, uint16_t n_chains); +void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); +void vq_endchains(struct vqueue_info *vq, int used_all_avail); + +uint64_t vi_mmio_read(struct vmctx *ctx, int vcpu, struct mmio_devinst *di, + int baridx, uint64_t offset, int size); +void vi_mmio_write(struct vmctx *ctx, int vcpu, struct mmio_devinst *di, + int baridx, uint64_t offset, int size, uint64_t value); +void vi_devemu_init(struct mmio_devinst *di, uint32_t type); +#endif /* _VIRTIO_H_ */ Index: usr.sbin/bhyve/mmio/mmio_virtio.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio.c @@ -0,0 +1,707 @@ +/*- + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include + +#include "bhyverun.h" +#include "mmio_emul.h" +#include "mmio_virtio.h" +#include "virtio_ids.h" + +static int debug_virtio = 0; + +#define DPRINTF(fmt, ...) if (debug_virtio) printf(fmt, ##__VA_ARGS__) +#define CFG_RW_DBG(offset, value) \ + DPRINTF("{device} | %-60s | %-35s | %-30s (%jx): value = %jx\r\n", \ + __FILE__, __func__, #offset, (uintmax_t)offset, (uintmax_t)value); + +/* + * Functions for dealing with generalized "virtual devices" as + * defined by + */ + +/* + * In case we decide to relax the "virtio softc comes at the + * front of virtio-based device softc" constraint, let's use + * this to convert. + */ +#define DEV_SOFTC(vs) ((void *)(vs)) + +/* + * Link a virtio_softc to its constants, the device softc, and + * the PCI emulation. + */ +void +vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct mmio_devinst *di, + struct vqueue_info *queues) +{ + int i; + + /* vs and dev_softc addresses must match */ + assert((void *)vs == dev_softc); + vs->vs_vc = vc; + vs->vs_di = di; + di->pi_arg = vs; + + vs->vs_queues = queues; + for (i = 0; i < vc->vc_nvq; i++) { + queues[i].vq_vs = vs; + queues[i].vq_num = i; + } +} + +/* + * Reset device (device-wide). This erases all queues, i.e., + * all the queues become invalid (though we don't wipe out the + * internal pointers, we just clear the VQ_ALLOC flag). + * + * It resets negotiated features to "none". + */ +void +vi_reset_dev(struct virtio_softc *vs) +{ + struct vqueue_info *vq; + int i, nvq; + + if (vs->vs_mtx) + assert(pthread_mutex_isowned_np(vs->vs_mtx)); + + nvq = vs->vs_vc->vc_nvq; + for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { + vq->vq_flags = 0; + vq->vq_last_avail = 0; + vq->vq_save_used = 0; + vq->vq_pfn = 0; + } + vs->vs_negotiated_caps = 0; + vs->vs_curq = 0; + /* vs->vs_status = 0; -- redundant */ + mmio_lintr_deassert(vs->vs_di); +} + +void +vi_set_io_res(struct virtio_softc *vs, int barnum) +{ + mmio_alloc_mem(vs->vs_di); +} + +/* + * Initialize interrupts for MMIO + */ +int +vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) +{ + /* activate interrupts */ + mmio_lintr_request(vs->vs_di); + + return (0); +} + +/* + * Initialize the currently-selected virtio queue (vs->vs_curq). + * The guest just gave us a page frame number, from which we can + * calculate the addresses of the queue. + */ +void +vi_vq_init(struct virtio_softc *vs, uint32_t pfn) +{ + struct vqueue_info *vq; + uint64_t phys; + size_t size; + char *base; + + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_pfn = pfn; + phys = (uint64_t)pfn * vs->vs_guest_page_size; + size = vring_size(vq->vq_qsize, vs->vs_align); + base = paddr_guest2host(vs->vs_di->pi_vmctx, phys, size); + + /* First page(s) are descriptors... */ + vq->vq_desc = (struct virtio_desc *)base; + base += vq->vq_qsize * sizeof(struct virtio_desc); + + /* ... immediately followed by "avail" ring (entirely uint16_t's) */ + vq->vq_avail = (struct vring_avail *)base; + base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + + /* Then it's rounded up to the next page... */ + base = (char *)roundup2((uintptr_t)base, vs->vs_align); + + /* ... and the last page(s) are the used ring. */ + vq->vq_used = (struct vring_used *)base; + + /* Mark queue as allocated, and start at 0 when we use it. 
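[Reviewer note: a worked example of the ring layout computed by vi_vq_init(), assuming the usual 16-byte struct virtio_desc and 8-byte struct virtio_used, with vq_qsize = 128, vs_align = 4096 and a 4 KB guest page size; the numbers are illustrative.]

    // guest writes QUEUE_PFN = 0x12345  -> phys = 0x12345 * 4096
    // desc  : offset 0,     128 * 16          = 2048 bytes
    // avail : offset 2048,  (2 + 128 + 1) * 2 =  262 bytes
    //         roundup2(2310, 4096)            = 4096
    // used  : offset 4096,  2 * 3 + 128 * 8   = 1030 bytes
    // total : vring_size(128, 4096)           = 8192 bytes (two pages)

So the ring occupies two guest pages, with the used ring starting exactly on the second page boundary, as vring_size() predicts.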
*/ + vq->vq_flags = VQ_ALLOC; + vq->vq_last_avail = 0; + vq->vq_save_used = 0; +} + +/* + * Helper inline for vq_getchain(): record the i'th "real" + * descriptor. + */ +static inline void +_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, + struct iovec *iov, int n_iov, uint16_t *flags) { + + if (i >= n_iov) + return; + iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); + iov[i].iov_len = vd->vd_len; + if (flags != NULL) + flags[i] = vd->vd_flags; +} +#define VQ_MAX_DESCRIPTORS 512 /* see below */ + +/* + * Examine the chain of descriptors starting at the "next one" to + * make sure that they describe a sensible request. If so, return + * the number of "real" descriptors that would be needed/used in + * acting on this request. This may be smaller than the number of + * available descriptors, e.g., if there are two available but + * they are two separate requests, this just returns 1. Or, it + * may be larger: if there are indirect descriptors involved, + * there may only be one descriptor available but it may be an + * indirect pointing to eight more. We return 8 in this case, + * i.e., we do not count the indirect descriptors, only the "real" + * ones. + * + * Basically, this vets the vd_flags and vd_next field of each + * descriptor and tells you how many are involved. Since some may + * be indirect, this also needs the vmctx (in the pci_devinst + * at vs->vs_di) so that it can find indirect descriptors. + * + * As we process each descriptor, we copy and adjust it (guest to + * host address wise, also using the vmtctx) into the given iov[] + * array (of the given size). If the array overflows, we stop + * placing values into the array but keep processing descriptors, + * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. + * So you, the caller, must not assume that iov[] is as big as the + * return value (you can process the same thing twice to allocate + * a larger iov array if needed, or supply a zero length to find + * out how much space is needed). + * + * If you want to verify the WRITE flag on each descriptor, pass a + * non-NULL "flags" pointer to an array of "uint16_t" of the same size + * as n_iov and we'll copy each vd_flags field after unwinding any + * indirects. + * + * If some descriptor(s) are invalid, this prints a diagnostic message + * and returns -1. If no descriptors are ready now it simply returns 0. + * + * You are assumed to have done a vq_ring_ready() if needed (note + * that vq_has_descs() does one). + */ +int +vq_getchain(struct vqueue_info *vq, uint16_t *pidx, + struct iovec *iov, int n_iov, uint16_t *flags) +{ + int i; + u_int ndesc, n_indir; + u_int idx, next; + volatile struct virtio_desc *vdir, *vindir, *vp; + struct vmctx *ctx; + struct virtio_softc *vs; + const char *name; + + vs = vq->vq_vs; + name = vs->vs_vc->vc_name; + + /* + * Note: it's the responsibility of the guest not to + * update vq->vq_avail->va_idx until all of the descriptors + * the guest has written are valid (including all their + * vd_next fields and vd_flags). + * + * Compute (last_avail - va_idx) in integers mod 2**16. This is + * the number of descriptors the device has made available + * since the last time we updated vq->vq_last_avail. + * + * We just need to do the subtraction as an unsigned int, + * then trim off excess bits. 
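[Reviewer note: a quick numeric check of the mod-2**16 arithmetic used just below; the values are illustrative. Suppose vq_last_avail has reached 65534 and the guest has since posted three more chains, so va_idx has wrapped around to 1:]

    uint16_t ndesc = (uint16_t)((u_int)1 - 65534);   // == 3

The unsigned subtraction followed by truncation to 16 bits yields 3, the number of chains not yet consumed, even though the raw index wrapped.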
+ */ + idx = vq->vq_last_avail; + ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); + if (ndesc == 0) + return (0); + if (ndesc > vq->vq_qsize) { + /* XXX need better way to diagnose issues */ + fprintf(stderr, + "%s: ndesc (%u) out of range, driver confused?\r\n", + name, (u_int)ndesc); + return (-1); + } + + /* + * Now count/parse "involved" descriptors starting from + * the head of the chain. + * + * To prevent loops, we could be more complicated and + * check whether we're re-visiting a previously visited + * index, but we just abort if the count gets excessive. + */ + ctx = vs->vs_di->pi_vmctx; + *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; + vq->vq_last_avail++; + for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { + if (next >= vq->vq_qsize) { + fprintf(stderr, + "%s: descriptor index %u out of range, " + "driver confused?\r\n", + name, next); + return (-1); + } + vdir = &vq->vq_desc[next]; + if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { + _vq_record(i, vdir, ctx, iov, n_iov, flags); + i++; + } else if ((vs->vs_vc->vc_hv_caps & + VIRTIO_RING_F_INDIRECT_DESC) == 0) { + fprintf(stderr, + "%s: descriptor has forbidden INDIRECT flag, " + "driver confused?\r\n", + name); + return (-1); + } else { + n_indir = vdir->vd_len / 16; + if ((vdir->vd_len & 0xf) || n_indir == 0) { + fprintf(stderr, + "%s: invalid indir len 0x%x, " + "driver confused?\r\n", + name, (u_int)vdir->vd_len); + return (-1); + } + vindir = paddr_guest2host(ctx, + vdir->vd_addr, vdir->vd_len); + /* + * Indirects start at the 0th, then follow + * their own embedded "next"s until those run + * out. Each one's indirect flag must be off + * (we don't really have to check, could just + * ignore errors...). + */ + next = 0; + for (;;) { + vp = &vindir[next]; + if (vp->vd_flags & VRING_DESC_F_INDIRECT) { + fprintf(stderr, + "%s: indirect desc has INDIR flag," + " driver confused?\r\n", + name); + return (-1); + } + _vq_record(i, vp, ctx, iov, n_iov, flags); + if (++i > VQ_MAX_DESCRIPTORS) + goto loopy; + if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + next = vp->vd_next; + if (next >= n_indir) { + fprintf(stderr, + "%s: invalid next %u > %u, " + "driver confused?\r\n", + name, (u_int)next, n_indir); + return (-1); + } + } + } + if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) + return (i); + } +loopy: + fprintf(stderr, + "%s: descriptor loop? count > %d - driver confused?\r\n", + name, i); + return (-1); +} + +/* + * Return the currently-first request chain back to the available queue. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_retchains(struct vqueue_info *vq, uint16_t n_chains) +{ + + vq->vq_last_avail -= n_chains; +} + +/* + * Return specified request chain to the guest, setting its I/O length + * to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +{ + uint16_t uidx, mask; + volatile struct vring_used *vuh; + volatile struct virtio_used *vue; + + /* + * Notes: + * - mask is N-1 where N is a power of 2 so computes x % N + * - vuh points to the "used" data shared with guest + * - vue points to the "used" ring entry we want to update + * - head is the same value we compute in vq_iovecs(). + * + * (I apologize for the two fields named vu_idx; the + * virtio spec calls the one that vue points to, "id"...) 
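[Reviewer note: for reference, the typical device-side consumption loop built from these primitives looks roughly like the sketch below; it is not code from this patch, and my_handle_one_request is hypothetical.]

    while (vq_has_descs(vq)) {
            struct iovec iov[8];
            uint16_t idx, flags[8];
            int n = vq_getchain(vq, &idx, iov, 8, flags);
            if (n <= 0)
                    break;                   // nothing usable right now
            uint32_t iolen = my_handle_one_request(iov, n, flags);
            vq_relchain(vq, idx, iolen);     // hand the chain back to the guest
    }
    vq_endchains(vq, 1);                     // interrupt the guest if needed

This is the pattern the vtblk, vtcon and vtnet emulations below follow.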
+ */ + mask = vq->vq_qsize - 1; + vuh = vq->vq_used; + + uidx = vuh->vu_idx; + vue = &vuh->vu_ring[uidx++ & mask]; + vue->vu_idx = idx; + vue->vu_tlen = iolen; + vuh->vu_idx = uidx; +} + +/* + * Driver has finished processing "available" chains and calling + * vq_relchain on each one. If driver used all the available + * chains, used_all should be set. + * + * If the "used" index moved we may need to inform the guest, i.e., + * deliver an interrupt. Even if the used index did NOT move we + * may need to deliver an interrupt, if the avail ring is empty and + * we are supposed to interrupt on empty. + * + * Note that used_all_avail is provided by the caller because it's + * a snapshot of the ring state when he decided to finish interrupt + * processing -- it's possible that descriptors became available after + * that point. (It's also typically a constant 1/True as well.) + */ +void +vq_endchains(struct vqueue_info *vq, int used_all_avail) +{ + struct virtio_softc *vs; + uint16_t event_idx, new_idx, old_idx; + int intr; + + /* + * Interrupt generation: if we're using EVENT_IDX, + * interrupt if we've crossed the event threshold. + * Otherwise interrupt is generated if we added "used" entries, + * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. + * + * In any case, though, if NOTIFY_ON_EMPTY is set and the + * entire avail was processed, we need to interrupt always. + */ + vs = vq->vq_vs; + old_idx = vq->vq_save_used; + vq->vq_save_used = new_idx = vq->vq_used->vu_idx; + if (used_all_avail && + (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) + intr = 1; + else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { + event_idx = VQ_USED_EVENT_IDX(vq); + /* + * This calculation is per docs and the kernel + * (see src/sys/dev/virtio/virtio_ring.h). + */ + intr = (uint16_t)(new_idx - event_idx - 1) < + (uint16_t)(new_idx - old_idx); + } else { + intr = new_idx != old_idx && + !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); + } + if (intr) + vq_interrupt(vs, vq); +} + +/* + * Handle pci config space reads. + * If it's to the interrupt system, do that + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +uint64_t +vi_mmio_read(struct vmctx *ctx, int vcpu, struct mmio_devinst *di, + int baridx, uint64_t offset, int size) +{ + struct virtio_softc *vs = di->pi_arg; + struct virtio_consts *vc; + const char *name; + uint64_t sel; + uint32_t value; + int error; + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + value = size == 1 ? 0xff : size == 2 ? 
0xffff : 0xffffffff; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (offset >= VIRTIO_MMIO_CONFIG) { + error = (*vc->vc_cfgread)(DEV_SOFTC(vs), + offset - VIRTIO_MMIO_CONFIG, + size, + &value); + if (error) + goto bad; + + CFG_RW_DBG(offset, value); + goto done; + } + + switch (offset) { + case VIRTIO_MMIO_MAGIC_VALUE: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_MAGIC_VALUE, value); + break; + case VIRTIO_MMIO_VERSION: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_VERSION, value); + break; + case VIRTIO_MMIO_DEVICE_ID: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_DEVICE_ID, value); + break; + case VIRTIO_MMIO_VENDOR_ID: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_VENDOR_ID, value); + break; + case VIRTIO_MMIO_INTERRUPT_STATUS: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_INTERRUPT_STATUS, value); + break; + case VIRTIO_MMIO_STATUS: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_STATUS, value); + break; + case VIRTIO_MMIO_HOST_FEATURES: + sel = mmio_get_cfgreg32(di, VIRTIO_MMIO_HOST_FEATURES_SEL); + value = (vc->vc_hv_caps >> (32 * sel)) & 0xffffffff; + CFG_RW_DBG(VIRTIO_MMIO_HOST_FEATURES, value); + break; + case VIRTIO_MMIO_QUEUE_NUM_MAX: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_qsize : 0; + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_NUM_MAX, value); + break; + case VIRTIO_MMIO_QUEUE_PFN: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_pfn : 0; + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_PFN, value); + break; + default: + CFG_RW_DBG(offset, value); + goto bad; + break; + } + + goto done; + +bad: + fprintf(stderr, "%s: read from bad offset/size: %jd/%d\r\n", + name, (uintmax_t)offset, size); + +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); + return (value); +} + +/* + * Handle pci config space writes. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. 
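[Reviewer note: the HOST_FEATURES/HOST_FEATURES_SEL pair handled above exposes the 64-bit vc_hv_caps word one 32-bit half at a time; a sketch of what the guest sees, with a hypothetical capability value.]

    uint64_t caps = 0x0000000110000006ULL;       // hypothetical vc_hv_caps
    uint32_t lo = (uint32_t)(caps >> (32 * 0));  // sel == 0 -> 0x10000006
    uint32_t hi = (uint32_t)(caps >> (32 * 1));  // sel == 1 -> 0x00000001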
+ */ +void +vi_mmio_write(struct vmctx *ctx, int vcpu, struct mmio_devinst *di, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct virtio_softc *vs = di->pi_arg; + struct vqueue_info *vq; + struct virtio_consts *vc; + const char *name; + int error; + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (offset >= VIRTIO_MMIO_CONFIG) { + error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), + offset - VIRTIO_MMIO_CONFIG, + size, value); + if (error) + goto bad; + + CFG_RW_DBG(offset, value); + goto done; + } + + switch (offset) { + case VIRTIO_MMIO_HOST_FEATURES_SEL: + CFG_RW_DBG(VIRTIO_MMIO_HOST_FEATURES_SEL, value); + mmio_set_cfgreg32(di, offset, value); + break; + case VIRTIO_MMIO_GUEST_FEATURES_SEL: + CFG_RW_DBG(VIRTIO_MMIO_GUEST_FEATURES_SEL, value); + mmio_set_cfgreg32(di, offset, value); + break; + case VIRTIO_MMIO_INTERRUPT_ACK: + CFG_RW_DBG(VIRTIO_MMIO_INTERRUPT_ACK, value); + mmio_lintr_deassert(di); + mmio_set_cfgreg32(di, offset, value); + break; + case VIRTIO_MMIO_STATUS: + CFG_RW_DBG(VIRTIO_MMIO_STATUS, value); + mmio_set_cfgreg32(di, offset, value); + vs->vs_status = value; + if (value == 0) + (*vc->vc_reset)(DEV_SOFTC(vs)); + break; + case VIRTIO_MMIO_QUEUE_NUM: + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_NUM, value); + mmio_set_cfgreg32(di, offset, value); + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_qsize = value; + break; + case VIRTIO_MMIO_GUEST_FEATURES: + CFG_RW_DBG(VIRTIO_MMIO_GUEST_FEATURES, value); + mmio_set_cfgreg32(di, offset, value); + vs->vs_negotiated_caps = value & vc->vc_hv_caps; + if (vc->vc_apply_features) + (*vc->vc_apply_features)(DEV_SOFTC(vs), + vs->vs_negotiated_caps); + break; + case VIRTIO_MMIO_GUEST_PAGE_SIZE: + mmio_set_cfgreg32(di, offset, value); + vs->vs_guest_page_size = value; + break; + case VIRTIO_MMIO_QUEUE_SEL: + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_SEL, value); + mmio_set_cfgreg32(di, offset, value); + /* + * Note that the guest is allowed to select an + * invalid queue; we just need to return a QNUM + * of 0 while the bad queue is selected. 
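[Reviewer note: taken together, the cases in this switch (continuing below) implement the legacy virtio-mmio queue setup sequence a guest driver performs; roughly, with illustrative values:]

    // write GUEST_PAGE_SIZE = 4096        -> vs_guest_page_size
    // write QUEUE_SEL       = 0           -> vs_curq selects the queue
    // read  QUEUE_NUM_MAX                 <- advertised vq_qsize for that queue
    // write QUEUE_NUM       = 128         -> vq_qsize
    // write QUEUE_ALIGN     = 4096        -> vs_align
    // write QUEUE_PFN       = gpa >> 12   -> vi_vq_init() maps the ring
    // write STATUS          = driver-ok   -> stored in vs_status (a write of 0
    //                                        instead triggers vc_reset)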
+ */ + vs->vs_curq = value; + break; + case VIRTIO_MMIO_QUEUE_ALIGN: + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_ALIGN, value); + mmio_set_cfgreg32(di, offset, value); + vs->vs_align = value; + break; + case VIRTIO_MMIO_QUEUE_PFN: + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_PFN, value); + mmio_set_cfgreg32(di, offset, value); + if (vs->vs_curq >= vc->vc_nvq) + fprintf(stderr, "%s: curq %d >= max %d\r\n", + name, vs->vs_curq, vc->vc_nvq); + else + vi_vq_init(vs, value); + break; + case VIRTIO_MMIO_QUEUE_NOTIFY: + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_NOTIFY, value); + if (value >= vc->vc_nvq) { + fprintf(stderr, "%s: queue %d notify out of range\r\n", + name, (int)value); + break; + } + mmio_set_cfgreg32(di, offset, value); + vq = &vs->vs_queues[value]; + if (vq->vq_notify) + (*vq->vq_notify)(DEV_SOFTC(vs), vq); + else if (vc->vc_qnotify) + (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); + else + fprintf(stderr, + "%s: qnotify queue %d: missing vq/vc notify\r\n", + name, (int)value); + break; + default: + CFG_RW_DBG(offset, value); + goto bad; + break; + } + + goto done; + +bad: + fprintf(stderr, "%s: write to bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); +} + +void +vi_devemu_init(struct mmio_devinst *di, uint32_t type) +{ + uint32_t id; + + switch (type) { + case VIRTIO_TYPE_NET: + id = VIRTIO_ID_NETWORK; + break; + case VIRTIO_TYPE_BLOCK: + id = VIRTIO_ID_BLOCK; + break; + case VIRTIO_TYPE_CONSOLE: + id = VIRTIO_ID_CONSOLE; + break; + case VIRTIO_TYPE_ENTROPY: + id = VIRTIO_ID_ENTROPY; + break; + default: + return; + } + + mmio_set_cfgreg32(di, VIRTIO_MMIO_MAGIC_VALUE, VIRTIO_MMIO_MAGIC_NUM); + mmio_set_cfgreg32(di, VIRTIO_MMIO_VERSION, VIRTIO_MMIO_VERSION_NUM); + mmio_set_cfgreg32(di, VIRTIO_MMIO_DEVICE_ID, id); + mmio_set_cfgreg32(di, VIRTIO_MMIO_VENDOR_ID, VIRTIO_VENDOR); +} Index: usr.sbin/bhyve/mmio/mmio_virtio_block.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio_block.c @@ -0,0 +1,424 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" + +#include "block_if.h" + +#define VTBLK_RINGSZ 128 + +_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); + +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 +#define VTBLK_S_UNSUPP 2 + +#define VTBLK_BLK_ID_BYTES 20 + 1 + +/* Capability bits */ +#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ +#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ + +/* + * Host capabilities + */ +#define VTBLK_S_HOSTCAPS \ + ( VTBLK_F_SEG_MAX | \ + VTBLK_F_BLK_SIZE | \ + VTBLK_F_FLUSH | \ + VTBLK_F_TOPOLOGY | \ + VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ + +/* + * Config space "registers" + */ +struct vtblk_config { + uint64_t vbc_capacity; + uint32_t vbc_size_max; + uint32_t vbc_seg_max; + struct { + uint16_t cylinders; + uint8_t heads; + uint8_t sectors; + } vbc_geometry; + uint32_t vbc_blk_size; + struct { + uint8_t physical_block_exp; + uint8_t alignment_offset; + uint16_t min_io_size; + uint32_t opt_io_size; + } vbc_topology; + uint8_t vbc_writeback; +} __packed; + +/* + * Fixed-size block header + */ +struct virtio_blk_hdr { +#define VBH_OP_READ 0 +#define VBH_OP_WRITE 1 +#define VBH_OP_FLUSH 4 +#define VBH_OP_FLUSH_OUT 5 +#define VBH_OP_IDENT 8 +#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ + uint32_t vbh_type; + uint32_t vbh_ioprio; + uint64_t vbh_sector; +} __packed; + +/* + * Debug printf + */ +static int pci_vtblk_debug; +#define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params + +struct pci_vtblk_ioreq { + struct blockif_req io_req; + struct pci_vtblk_softc *io_sc; + uint8_t *io_status; + uint16_t io_idx; +}; + +/* + * Per-device softc + */ +struct pci_vtblk_softc { + struct virtio_softc vbsc_vs; + pthread_mutex_t vsc_mtx; + struct vqueue_info vbsc_vq; + struct vtblk_config vbsc_cfg; + struct blockif_ctxt *bc; + char vbsc_ident[VTBLK_BLK_ID_BYTES]; + struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; +}; + +static void pci_vtblk_reset(void *); +static void pci_vtblk_notify(void *, struct vqueue_info *); +static int pci_vtblk_cfgread(void *, int, int, uint32_t *); +static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); + +static struct virtio_consts vtblk_vi_consts = { + "vtblk", /* our name */ + 1, /* we support 1 virtqueue */ + sizeof(struct vtblk_config), /* config reg size */ + pci_vtblk_reset, /* reset */ + pci_vtblk_notify, /* device-wide qnotify */ + pci_vtblk_cfgread, /* read PCI config */ + pci_vtblk_cfgwrite, /* write PCI config */ + NULL, /* apply negotiated features */ + VTBLK_S_HOSTCAPS, /* our capabilities */ +}; + +static void +pci_vtblk_reset(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device reset requested !")); + vi_reset_dev(&sc->vbsc_vs); +} + +static void +pci_vtblk_done(struct blockif_req *br, int err) +{ + struct pci_vtblk_ioreq *io = br->br_param; + struct pci_vtblk_softc *sc = io->io_sc; + + /* convert errno into a virtio block error return */ + if (err == EOPNOTSUPP || err == ENOSYS) + *io->io_status = VTBLK_S_UNSUPP; + else if (err != 0) + 
*io->io_status = VTBLK_S_IOERR; + else + *io->io_status = VTBLK_S_OK; + + /* + * Return the descriptor back to the host. + * We wrote 1 byte (our status) to host. + */ + pthread_mutex_lock(&sc->vsc_mtx); + vq_relchain(&sc->vbsc_vq, io->io_idx, 1); + vq_endchains(&sc->vbsc_vq, 0); + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) +{ + struct virtio_blk_hdr *vbh; + struct pci_vtblk_ioreq *io; + int i, n; + int err; + ssize_t iolen; + int writeop, type; + struct iovec iov[BLOCKIF_IOV_MAX + 2]; + uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; + + n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); + + /* + * The first descriptor will be the read-only fixed header, + * and the last is for status (hence +2 above and below). + * The remaining iov's are the actual data I/O vectors. + * + * XXX - note - this fails on crash dump, which does a + * VIRTIO_BLK_T_FLUSH with a zero transfer length + */ + assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); + + io = &sc->vbsc_ios[idx]; + assert((flags[0] & VRING_DESC_F_WRITE) == 0); + assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); + vbh = iov[0].iov_base; + memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); + io->io_req.br_iovcnt = n - 2; + io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE; + io->io_status = iov[--n].iov_base; + assert(iov[n].iov_len == 1); + assert(flags[n] & VRING_DESC_F_WRITE); + + /* + * XXX + * The guest should not be setting the BARRIER flag because + * we don't advertise the capability. + */ + type = vbh->vbh_type & ~VBH_FLAG_BARRIER; + writeop = (type == VBH_OP_WRITE); + + iolen = 0; + for (i = 1; i < n; i++) { + /* + * - write op implies read-only descriptor, + * - read/ident op implies write-only descriptor, + * therefore test the inverse of the descriptor bit + * to the op. + */ + assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); + iolen += iov[i].iov_len; + } + io->io_req.br_resid = iolen; + + DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %lld", + writeop ? "write" : "read/ident", iolen, i - 1, + (long long) io->io_req.br_offset)); + + switch (type) { + case VBH_OP_READ: + err = blockif_read(sc->bc, &io->io_req); + break; + case VBH_OP_WRITE: + err = blockif_write(sc->bc, &io->io_req); + break; + case VBH_OP_FLUSH: + case VBH_OP_FLUSH_OUT: + err = blockif_flush(sc->bc, &io->io_req); + break; + case VBH_OP_IDENT: + /* Assume a single buffer */ + /* S/n equal to buffer is not zero-terminated. 
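[Reviewer note: for reference, the descriptor chain parsed above has the canonical virtio-blk shape, with sizes taken from the structures defined earlier in this file.]

    // iov[0]      : struct virtio_blk_hdr, 16 bytes, device reads only
    //               (vbh_type, vbh_ioprio, vbh_sector)
    // iov[1..n-2] : payload buffers; read-only for writes,
    //               write-only for reads and ident
    // iov[n-1]    : 1-byte status, device writes VTBLK_S_OK / _IOERR / _UNSUPP

So the "+2" sizing of the iov[]/flags[] arrays accounts for the header and the status slot on top of BLOCKIF_IOV_MAX data segments.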
*/ + memset(iov[1].iov_base, 0, iov[1].iov_len); + strncpy(iov[1].iov_base, sc->vbsc_ident, + MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); + pci_vtblk_done(&io->io_req, 0); + return; + default: + pci_vtblk_done(&io->io_req, EOPNOTSUPP); + return; + } + assert(err == 0); +} + +static void +pci_vtblk_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtblk_softc *sc = vsc; + + while (vq_has_descs(vq)) + pci_vtblk_proc(sc, vq); +} + +static int +pci_vtblk_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; + MD5_CTX mdctx; + u_char digest[16]; + struct pci_vtblk_softc *sc; + off_t size; + int i, sectsz, sts, sto; + + if (opts == NULL) { + WPRINTF(("virtio-block: backing device required")); + return (1); + } + + /* + * The supplied backing file has to exist + */ + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->di_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + perror("Could not open backing file"); + return (1); + } + + size = blockif_size(bctxt); + sectsz = blockif_sectsz(bctxt); + blockif_psectsz(bctxt, &sts, &sto); + + sc = calloc(1, sizeof(struct pci_vtblk_softc)); + sc->bc = bctxt; + for (i = 0; i < VTBLK_RINGSZ; i++) { + struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; + io->io_req.br_callback = pci_vtblk_done; + io->io_req.br_param = io; + io->io_sc = sc; + io->io_idx = i; + } + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* init virtio softc and virtqueues */ + vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq); + sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; + + sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; + /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ + + /* + * Create an identifier for the backing file. Use parts of the + * md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, strlen(opts)); + MD5Final(digest, &mdctx); + snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); + + /* setup virtio block config space */ + sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ + sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ + + /* + * If Linux is presented with a seg_max greater than the virtio queue + * size, it can stumble into situations where it violates its own + * invariants and panics. For safety, we keep seg_max clamped, paying + * heed to the two extra descriptors needed for the header and status + * of a request. + */ + sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); + sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geometry.heads = 0; + sc->vbsc_cfg.vbc_geometry.sectors = 0; + sc->vbsc_cfg.vbc_blk_size = sectsz; + sc->vbsc_cfg.vbc_topology.physical_block_exp = + (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; + sc->vbsc_cfg.vbc_topology.alignment_offset = + (sto != 0) ? ((sts - sto) / sectsz) : 0; + sc->vbsc_cfg.vbc_topology.min_io_size = 0; + sc->vbsc_cfg.vbc_topology.opt_io_size = 0; + sc->vbsc_cfg.vbc_writeback = 0; + + /* + * Should we move some of this into virtio.c? Could + * have the device, class, and subdev_0 as fields in + * the virtio constants structure. 
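[Reviewer note: a quick worked example of the topology math above, with illustrative numbers: a backing device reporting a 512-byte logical sector (sectsz) on 4096-byte physical sectors (sts) with no offset (sto == 0).]

    // vbc_blk_size                    = 512
    // vbc_topology.physical_block_exp = ffsll(4096 / 512) - 1 = 3  (2^3 sectors)
    // vbc_topology.alignment_offset   = 0
    // vbc_capacity                    = size / DEV_BSIZE  (512-byte units)

The guest therefore sees a 512e-style disk and can align its I/O to 4 KB boundaries.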
+ */ + vi_devemu_init(pi, VIRTIO_TYPE_BLOCK); + + if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { + blockif_close(sc->bc); + free(sc); + return (1); + } + vi_set_io_res(&sc->vbsc_vs, 0); + return (0); +} + +static int +pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + + DPRINTF(("vtblk: write to readonly reg %d", offset)); + return (1); +} + +static int +pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtblk_softc *sc = vsc; + void *ptr; + + /* our caller has already verified offset and size */ + ptr = (uint8_t *)&sc->vbsc_cfg + offset; + memcpy(retval, ptr, size); + return (0); +} + +struct mmio_devemu pci_de_vblk = { + .de_emu = "virtio-blk", + .de_init = pci_vtblk_init, + .de_write = vi_mmio_write, + .de_read = vi_mmio_read +}; +MMIO_EMUL_SET(pci_de_vblk); Index: usr.sbin/bhyve/mmio/mmio_virtio_console.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio_console.c @@ -0,0 +1,680 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 iXsystems Inc. + * All rights reserved. + * + * This software was developed by Jakub Klama + * under sponsorship from iXsystems Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" + +#include "mevent.h" +#include "sockstream.h" + +#define VTCON_RINGSZ 64 +#define VTCON_MAXPORTS 16 +#define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) + +#define VTCON_DEVICE_READY 0 +#define VTCON_DEVICE_ADD 1 +#define VTCON_DEVICE_REMOVE 2 +#define VTCON_PORT_READY 3 +#define VTCON_CONSOLE_PORT 4 +#define VTCON_CONSOLE_RESIZE 5 +#define VTCON_PORT_OPEN 6 +#define VTCON_PORT_NAME 7 + +#define VTCON_F_SIZE 0 +#define VTCON_F_MULTIPORT 1 +#define VTCON_F_EMERG_WRITE 2 +#define VTCON_S_HOSTCAPS \ + (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) + +static int pci_vtcon_debug; +#define DPRINTF(params) if (pci_vtcon_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params + +struct pci_vtcon_softc; +struct pci_vtcon_port; +struct pci_vtcon_config; +typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *, + int); + +struct pci_vtcon_port { + struct pci_vtcon_softc * vsp_sc; + int vsp_id; + const char * vsp_name; + bool vsp_enabled; + bool vsp_console; + bool vsp_rx_ready; + bool vsp_open; + int vsp_rxq; + int vsp_txq; + void * vsp_arg; + pci_vtcon_cb_t * vsp_cb; +}; + +struct pci_vtcon_sock +{ + struct pci_vtcon_port * vss_port; + const char * vss_path; + struct mevent * vss_server_evp; + struct mevent * vss_conn_evp; + int vss_server_fd; + int vss_conn_fd; + bool vss_open; +}; + +struct pci_vtcon_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTCON_MAXQ]; + pthread_mutex_t vsc_mtx; + uint64_t vsc_cfg; + uint64_t vsc_features; + char * vsc_rootdir; + int vsc_kq; + int vsc_nports; + bool vsc_ready; + struct pci_vtcon_port vsc_control_port; + struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS]; + struct pci_vtcon_config *vsc_config; +}; + +struct pci_vtcon_config { + uint16_t cols; + uint16_t rows; + uint32_t max_nr_ports; + uint32_t emerg_wr; +} __attribute__((packed)); + +struct pci_vtcon_control { + uint32_t id; + uint16_t event; + uint16_t value; +} __attribute__((packed)); + +struct pci_vtcon_console_resize { + uint16_t cols; + uint16_t rows; +} __attribute__((packed)); + +static void pci_vtcon_reset(void *); +static void pci_vtcon_notify_rx(void *, struct vqueue_info *); +static void pci_vtcon_notify_tx(void *, struct vqueue_info *); +static int pci_vtcon_cfgread(void *, int, int, uint32_t *); +static int pci_vtcon_cfgwrite(void *, int, int, uint32_t); +static void pci_vtcon_neg_features(void *, uint64_t); +static void pci_vtcon_sock_accept(int, enum ev_type, void *); +static void pci_vtcon_sock_rx(int, enum ev_type, void *); +static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *, + int); +static void pci_vtcon_control_send(struct pci_vtcon_softc *, + struct pci_vtcon_control *, const void *, size_t); +static void pci_vtcon_announce_port(struct pci_vtcon_port *); +static void pci_vtcon_open_port(struct pci_vtcon_port *, bool); + +static struct virtio_consts vtcon_vi_consts = { + "vtcon", /* our name */ + VTCON_MAXQ, /* we support VTCON_MAXQ virtqueues */ + sizeof(struct pci_vtcon_config), /* config reg size */ + pci_vtcon_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtcon_cfgread, /* read 
virtio config */ + pci_vtcon_cfgwrite, /* write virtio config */ + pci_vtcon_neg_features, /* apply negotiated features */ + VTCON_S_HOSTCAPS, /* our capabilities */ +}; + + +static void +pci_vtcon_reset(void *vsc) +{ + struct pci_vtcon_softc *sc; + + sc = vsc; + + DPRINTF(("vtcon: device reset requested!")); + vi_reset_dev(&sc->vsc_vs); +} + +static void +pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtcon_softc *sc = vsc; + + sc->vsc_features = negotiated_features; +} + +static int +pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtcon_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline struct pci_vtcon_port * +pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq) +{ + uint16_t num = vq->vq_num; + + if (num == 0 || num == 1) + return (&sc->vsc_ports[0]); + + if (num == 2 || num == 3) + return (&sc->vsc_control_port); + + return (&sc->vsc_ports[(num / 2) - 1]); +} + +static inline struct vqueue_info * +pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue) +{ + int qnum; + + qnum = tx_queue ? port->vsp_txq : port->vsp_rxq; + return (&port->vsp_sc->vsc_queues[qnum]); +} + +static struct pci_vtcon_port * +pci_vtcon_port_add(struct pci_vtcon_softc *sc, const char *name, + pci_vtcon_cb_t *cb, void *arg) +{ + struct pci_vtcon_port *port; + + if (sc->vsc_nports == VTCON_MAXPORTS) { + errno = EBUSY; + return (NULL); + } + + port = &sc->vsc_ports[sc->vsc_nports++]; + port->vsp_id = sc->vsc_nports - 1; + port->vsp_sc = sc; + port->vsp_name = name; + port->vsp_cb = cb; + port->vsp_arg = arg; + + if (port->vsp_id == 0) { + /* port0 */ + port->vsp_txq = 0; + port->vsp_rxq = 1; + } else { + port->vsp_txq = sc->vsc_nports * 2; + port->vsp_rxq = port->vsp_txq + 1; + } + + port->vsp_enabled = true; + return (port); +} + +static int +pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name, + const char *path) +{ + struct pci_vtcon_sock *sock; + struct sockaddr_un sun; + char *pathcopy; + int s = -1, fd = -1, error = 0; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + sock = calloc(1, sizeof(struct pci_vtcon_sock)); + if (sock == NULL) { + error = -1; + goto out; + } + + s = socket(AF_UNIX, SOCK_STREAM, 0); + if (s < 0) { + error = -1; + goto out; + } + + pathcopy = strdup(path); + if (pathcopy == NULL) { + error = -1; + goto out; + } + + fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY); + if (fd < 0) { + free(pathcopy); + error = -1; + goto out; + } + + sun.sun_family = AF_UNIX; + sun.sun_len = sizeof(struct sockaddr_un); + strcpy(pathcopy, path); + strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); + free(pathcopy); + + if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { + error = -1; + goto out; + } + + if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) { + error = -1; + goto out; + } + + if (listen(s, 1) < 0) { + error = -1; + goto out; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + sock->vss_port = pci_vtcon_port_add(sc, name, pci_vtcon_sock_tx, sock); + if (sock->vss_port == NULL) { + error = -1; + goto out; + } + + sock->vss_open = false; + sock->vss_conn_fd = -1; + sock->vss_server_fd = s; + sock->vss_server_evp 
= mevent_add(s, EVF_READ, pci_vtcon_sock_accept, + sock); + + if (sock->vss_server_evp == NULL) { + error = -1; + goto out; + } + +out: + if (fd != -1) + close(fd); + + if (error != 0) { + if (s != -1) + close(s); + free(sock); + } + + return (error); +} + +static void +pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + int s; + + s = accept(sock->vss_server_fd, NULL, NULL); + if (s < 0) + return; + + if (sock->vss_open) { + close(s); + return; + } + + sock->vss_open = true; + sock->vss_conn_fd = s; + sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock); + + pci_vtcon_open_port(sock->vss_port, true); +} + +static void +pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_port *port; + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + struct vqueue_info *vq; + struct iovec iov; + static char dummybuf[2048]; + int len, n; + uint16_t idx; + + port = sock->vss_port; + vq = pci_vtcon_port_to_vq(port, true); + + if (!sock->vss_open || !port->vsp_rx_ready) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + if (len == 0) + goto close; + + return; + } + + if (!vq_has_descs(vq)) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + vq_endchains(vq, 1); + if (len == 0) + goto close; + + return; + } + + do { + n = vq_getchain(vq, &idx, &iov, 1, NULL); + len = readv(sock->vss_conn_fd, &iov, n); + + if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) { + vq_retchains(vq, 1); + vq_endchains(vq, 0); + if (len == 0) + goto close; + + return; + } + + vq_relchain(vq, idx, len); + } while (vq_has_descs(vq)); + + vq_endchains(vq, 1); + +close: + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; +} + +static void +pci_vtcon_sock_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_sock *sock; + int i, ret; + + sock = (struct pci_vtcon_sock *)arg; + + if (sock->vss_conn_fd == -1) + return; + + for (i = 0; i < niov; i++) { + ret = stream_write(sock->vss_conn_fd, iov[i].iov_base, + iov[i].iov_len); + if (ret <= 0) + break; + } + + if (ret <= 0) { + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; + } +} + +static void +pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *tmp; + struct pci_vtcon_control resp, *ctrl; + int i; + + assert(niov == 1); + + sc = port->vsp_sc; + ctrl = (struct pci_vtcon_control *)iov->iov_base; + + switch (ctrl->event) { + case VTCON_DEVICE_READY: + sc->vsc_ready = true; + /* set port ready events for registered ports */ + for (i = 0; i < VTCON_MAXPORTS; i++) { + tmp = &sc->vsc_ports[i]; + if (tmp->vsp_enabled) + pci_vtcon_announce_port(tmp); + + if (tmp->vsp_open) + pci_vtcon_open_port(tmp, true); + } + break; + + case VTCON_PORT_READY: + if (ctrl->id >= sc->vsc_nports) { + WPRINTF(("VTCON_PORT_READY event for unknown port %d", + ctrl->id)); + return; + } + + tmp = &sc->vsc_ports[ctrl->id]; + if (tmp->vsp_console) { + resp.event = VTCON_CONSOLE_PORT; + resp.id = ctrl->id; + resp.value = 1; + pci_vtcon_control_send(sc, &resp, NULL, 0); + } + break; + } +} + +static void +pci_vtcon_announce_port(struct pci_vtcon_port *port) +{ + struct pci_vtcon_control event; + + event.id = port->vsp_id; + event.event = VTCON_DEVICE_ADD; + event.value = 1; + pci_vtcon_control_send(port->vsp_sc, &event, 
NULL, 0); + + event.event = VTCON_PORT_NAME; + pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name, + strlen(port->vsp_name)); +} + +static void +pci_vtcon_open_port(struct pci_vtcon_port *port, bool open) +{ + struct pci_vtcon_control event; + + if (!port->vsp_sc->vsc_ready) { + port->vsp_open = true; + return; + } + + event.id = port->vsp_id; + event.event = VTCON_PORT_OPEN; + event.value = (int)open; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); +} + +static void +pci_vtcon_control_send(struct pci_vtcon_softc *sc, + struct pci_vtcon_control *ctrl, const void *payload, size_t len) +{ + struct vqueue_info *vq; + struct iovec iov; + uint16_t idx; + int n; + + vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true); + + if (!vq_has_descs(vq)) + return; + + n = vq_getchain(vq, &idx, &iov, 1, NULL); + + assert(n == 1); + + memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control)); + if (payload != NULL && len > 0) + memcpy(iov.iov_base + sizeof(struct pci_vtcon_control), + payload, len); + + vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len); + vq_endchains(vq, 1); +} + +static void +pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + struct iovec iov[1]; + uint16_t idx, n; + uint16_t flags[8]; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, 1, flags); + assert(n >= 1); + if (port != NULL) + port->vsp_cb(port, port->vsp_arg, iov, 1); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, 0); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + +static void +pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + if (!port->vsp_rx_ready) { + port->vsp_rx_ready = 1; + vq_kick_disable(vq); + } +} + +static int +pci_vtcon_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + struct pci_vtcon_softc *sc; + char *portname = NULL; + char *portpath = NULL; + char *opt; + int i; + + sc = calloc(1, sizeof(struct pci_vtcon_softc)); + sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config)); + sc->vsc_config->max_nr_ports = VTCON_MAXPORTS; + sc->vsc_config->cols = 80; + sc->vsc_config->rows = 25; + + vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + for (i = 0; i < VTCON_MAXQ; i++) { + sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ; + sc->vsc_queues[i].vq_notify = i % 2 == 0 + ? 
pci_vtcon_notify_rx + : pci_vtcon_notify_tx; + } + + /* initialize config space */ + vi_devemu_init(pi, VIRTIO_TYPE_CONSOLE); + + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_res(&sc->vsc_vs, 0); + + /* create control port */ + sc->vsc_control_port.vsp_sc = sc; + sc->vsc_control_port.vsp_txq = 2; + sc->vsc_control_port.vsp_rxq = 3; + sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx; + sc->vsc_control_port.vsp_enabled = true; + + while ((opt = strsep(&opts, ",")) != NULL) { + portname = strsep(&opt, "="); + portpath = opt; + + /* create port */ + if (pci_vtcon_sock_add(sc, portname, portpath) < 0) { + EPRINTLN("cannot create port %s: %s", + portname, strerror(errno)); + return (1); + } + } + + return (0); +} + +struct mmio_devemu pci_de_vcon = { + .de_emu = "virtio-console", + .de_init = pci_vtcon_init, + .de_write = vi_mmio_write, + .de_read = vi_mmio_read +}; +MMIO_EMUL_SET(pci_de_vcon); Index: usr.sbin/bhyve/mmio/mmio_virtio_net.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio_net.c @@ -0,0 +1,697 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" + +#include "mevent.h" +#include "net_utils.h" +#include "net_backends.h" +#include "iov.h" + +#define VTNET_RINGSZ 1024 + +#define VTNET_MAXSEGS 256 + +#define VTNET_MAX_PKT_LEN (65536 + 64) + +#define VTNET_S_HOSTCAPS \ + ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \ + VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) + +/* + * PCI config-space "registers" + */ +struct virtio_net_config { + uint8_t mac[6]; + uint16_t status; +} __packed; + +/* + * Queue definitions. 
+ */ +#define VTNET_RXQ 0 +#define VTNET_TXQ 1 +#define VTNET_CTLQ 2 /* NB: not yet supported */ + +#define VTNET_MAXQ 3 + +/* + * Debug printf + */ +static int pci_vtnet_debug; +#define DPRINTF(params) if (pci_vtnet_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params + +/* + * Per-device softc + */ +struct pci_vtnet_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; + pthread_mutex_t vsc_mtx; + + net_backend_t *vsc_be; + + int resetting; /* protected by tx_mtx */ + + uint64_t vsc_features; /* negotiated features */ + + pthread_mutex_t rx_mtx; + int rx_merge; /* merged rx bufs in use */ + + pthread_t tx_tid; + pthread_mutex_t tx_mtx; + pthread_cond_t tx_cond; + int tx_in_progress; + + size_t vhdrlen; + size_t be_vhdrlen; + + struct virtio_net_config vsc_config; + struct virtio_consts vsc_consts; +}; + +static void pci_vtnet_reset(void *); +/* static void pci_vtnet_notify(void *, struct vqueue_info *); */ +static int pci_vtnet_cfgread(void *, int, int, uint32_t *); +static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); +static void pci_vtnet_neg_features(void *, uint64_t); + +static struct virtio_consts vtnet_vi_consts = { + "vtnet", /* our name */ + VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ + sizeof(struct virtio_net_config), /* config reg size */ + pci_vtnet_reset, /* reset */ + NULL, /* device-wide qnotify -- not used */ + pci_vtnet_cfgread, /* read PCI config */ + pci_vtnet_cfgwrite, /* write PCI config */ + pci_vtnet_neg_features, /* apply negotiated features */ + VTNET_S_HOSTCAPS, /* our capabilities */ +}; + +static void +pci_vtnet_reset(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device reset requested !")); + + /* Acquire the RX lock to block RX processing. */ + pthread_mutex_lock(&sc->rx_mtx); + + /* + * Make sure receive operation is disabled at least until we + * re-negotiate the features, since receive operation depends + * on the value of sc->rx_merge and the header length, which + * are both set in pci_vtnet_neg_features(). + * Receive operation will be enabled again once the guest adds + * the first receive buffers and kicks us. + */ + netbe_rx_disable(sc->vsc_be); + + /* Set sc->resetting and give a chance to the TX thread to stop. */ + pthread_mutex_lock(&sc->tx_mtx); + sc->resetting = 1; + while (sc->tx_in_progress) { + pthread_mutex_unlock(&sc->tx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->tx_mtx); + } + + /* + * Now reset rings, MSI-X vectors, and negotiated capabilities. + * Do that with the TX lock held, since we need to reset + * sc->resetting. + */ + vi_reset_dev(&sc->vsc_vs); + + sc->resetting = 0; + pthread_mutex_unlock(&sc->tx_mtx); + pthread_mutex_unlock(&sc->rx_mtx); +} + +static __inline struct iovec * +iov_trim_hdr(struct iovec *iov, int *iovcnt, unsigned int hlen) +{ + struct iovec *riov; + + if (iov[0].iov_len < hlen) { + /* + * Not enough header space in the first fragment. + * That's not ok for us. + */ + return NULL; + } + + iov[0].iov_len -= hlen; + if (iov[0].iov_len == 0) { + *iovcnt -= 1; + if (*iovcnt == 0) { + /* + * Only space for the header. That's not + * enough for us. 
+ */ + return NULL; + } + riov = &iov[1]; + } else { + iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + hlen); + riov = &iov[0]; + } + + return (riov); +} + +struct virtio_mrg_rxbuf_info { + uint16_t idx; + uint16_t pad; + uint32_t len; +}; + +static void +pci_vtnet_rx(struct pci_vtnet_softc *sc) +{ + int prepend_hdr_len = sc->vhdrlen - sc->be_vhdrlen; + struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS]; + struct iovec iov[VTNET_MAXSEGS + 1]; + struct vqueue_info *vq; + + + + vq = &sc->vsc_queues[VTNET_RXQ]; + for (;;) { + struct virtio_net_rxhdr *hdr; + uint32_t riov_bytes; + struct iovec *riov; + uint32_t ulen; + int riov_len; + int n_chains; + ssize_t rlen; + ssize_t plen; + + plen = netbe_peek_recvlen(sc->vsc_be); + if (plen <= 0) { + /* + * No more packets (plen == 0), or backend errored + * (plen < 0). Interrupt if needed and stop. + */ + vq_endchains(vq, /*used_all_avail=*/0); + return; + } + plen += prepend_hdr_len; + + /* + * Get a descriptor chain to store the next ingress + * packet. In case of mergeable rx buffers, get as + * many chains as necessary in order to make room + * for a maximum sized LRO packet. + */ + riov_bytes = 0; + riov_len = 0; + riov = iov; + n_chains = 0; + do { + int n = vq_getchain(vq, &info[n_chains].idx, riov, + VTNET_MAXSEGS - riov_len, NULL); + + if (n == 0) { + /* + * No rx buffers. Enable RX kicks and double + * check. + */ + vq_kick_enable(vq); + if (!vq_has_descs(vq)) { + /* + * Still no buffers. Return the unused + * chains (if any), interrupt if needed + * (including for NOTIFY_ON_EMPTY), and + * disable the backend until the next + * kick. + */ + vq_retchains(vq, n_chains); + vq_endchains(vq, /*used_all_avail=*/1); + netbe_rx_disable(sc->vsc_be); + return; + } + + /* More rx buffers found, so keep going. */ + vq_kick_disable(vq); + continue; + } + assert(n >= 1 && riov_len + n <= VTNET_MAXSEGS); + riov_len += n; + if (!sc->rx_merge) { + n_chains = 1; + break; + } + info[n_chains].len = (uint32_t)count_iov(riov, n); + riov_bytes += info[n_chains].len; + riov += n; + n_chains++; + } while (riov_bytes < plen && riov_len < VTNET_MAXSEGS); + + riov = iov; + hdr = riov[0].iov_base; + if (prepend_hdr_len > 0) { + /* + * The frontend uses a virtio-net header, but the + * backend does not. We need to prepend a zeroed + * header. + */ + riov = iov_trim_hdr(riov, &riov_len, prepend_hdr_len); + if (riov == NULL) { + /* + * The first collected chain is nonsensical, + * as it is not even enough to store the + * virtio-net header. Just drop it. + */ + vq_relchain(vq, info[0].idx, 0); + vq_retchains(vq, n_chains - 1); + continue; + } + memset(hdr, 0, prepend_hdr_len); + } + + rlen = netbe_recv(sc->vsc_be, riov, riov_len); + + if (rlen != plen - prepend_hdr_len) { + /* + * No more packets (len == 0), or backend errored + * (err < 0). Return unused available buffers + * and stop. + */ + vq_retchains(vq, n_chains); + /* Interrupt if needed/appropriate and stop. */ + vq_endchains(vq, /*used_all_avail=*/0); + return; + } + + ulen = (uint32_t)plen; /* avoid too many casts below */ + + /* Publish the used buffers to the guest. 
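[Reviewer note: when mergeable rx buffers are in use, the publication step below spreads a single received packet across however many chains were collected; a sketch of the accounting, with illustrative values.]

    // plen = 6000-byte packet (incl. prepended header), three chains collected:
    //   info[0].len = 2048 -> vq_relchain(vq, info[0].idx, 2048)
    //   info[1].len = 2048 -> vq_relchain(vq, info[1].idx, 2048)
    //   info[2].len = 4096 -> vq_relchain(vq, info[2].idx, 1904)  // remainder
    // hdr->vrh_bufs = 3; any unused chains are handed back via vq_retchains()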
*/ + if (!sc->rx_merge) { + vq_relchain(vq, info[0].idx, ulen); + } else { + uint32_t iolen; + int i = 0; + + do { + iolen = info[i].len; + if (iolen > ulen) { + iolen = ulen; + } + vq_relchain(vq, info[i].idx, iolen); + ulen -= iolen; + i++; + } while (ulen > 0); + + hdr->vrh_bufs = i; + // TODO add publish for arm64 + //vq_relchain_publish(vq); + vq_retchains(vq, n_chains - i); + } + } + +} + +/* + * Called when there is read activity on the backend file descriptor. + * Each buffer posted by the guest is assumed to be able to contain + * an entire ethernet frame + rx header. + */ +static void +pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) +{ + struct pci_vtnet_softc *sc = param; + + pthread_mutex_lock(&sc->rx_mtx); + pci_vtnet_rx(sc); + pthread_mutex_unlock(&sc->rx_mtx); + +} + +/* Called on RX kick. */ +static void +pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * A qnotify means that the rx process can now begin. + */ + pthread_mutex_lock(&sc->rx_mtx); + vq_kick_disable(vq); + netbe_rx_enable(sc->vsc_be); + pthread_mutex_unlock(&sc->rx_mtx); +} + +/* TX virtqueue processing, called by the TX thread. */ +static void +pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) +{ + struct iovec iov[VTNET_MAXSEGS + 1]; + struct iovec *siov = iov; + uint16_t idx; + ssize_t len; + int n; + + /* + * Obtain chain of descriptors. The first descriptor also + * contains the virtio-net header. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + + if (sc->vhdrlen != sc->be_vhdrlen) { + /* + * The frontend uses a virtio-net header, but the backend + * does not. We simply strip the header and ignore it, as + * it should be zero-filled. + */ + siov = iov_trim_hdr(siov, &n, sc->vhdrlen); + } + + if (siov == NULL) { + /* The chain is nonsensical. Just drop it. */ + len = 0; + } else { + len = netbe_send(sc->vsc_be, siov, n); + if (len < 0) { + /* + * If send failed, report that 0 bytes + * were read. + */ + len = 0; + } + } + + /* + * Return the processed chain to the guest, reporting + * the number of bytes that we read. + */ + vq_relchain(vq, idx, len > 0 ? len : 0); +} + +/* Called on TX kick. */ +static void +pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * Any ring entries to process? 
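+	 * If so, just wake up the TX thread; the descriptors themselves are
+	 * drained by pci_vtnet_tx_thread(), so the notifying vcpu never
+	 * blocks in the backend send path.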
+ */ + if (!vq_has_descs(vq)) + return; + + /* Signal the tx thread for processing */ + pthread_mutex_lock(&sc->tx_mtx); + vq_kick_disable(vq); + if (sc->tx_in_progress == 0) + pthread_cond_signal(&sc->tx_cond); + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * Thread which will handle processing of TX desc + */ +static void * +pci_vtnet_tx_thread(void *param) +{ + struct pci_vtnet_softc *sc = param; + struct vqueue_info *vq; + int error; + + vq = &sc->vsc_queues[VTNET_TXQ]; + + /* + * Let us wait till the tx queue pointers get initialised & + * first tx signaled + */ + pthread_mutex_lock(&sc->tx_mtx); + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + + for (;;) { + /* note - tx mutex is locked here */ + while (sc->resetting || !vq_has_descs(vq)) { + vq_kick_enable(vq); + if (!sc->resetting && vq_has_descs(vq)) + break; + + sc->tx_in_progress = 0; + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + } + vq_kick_disable(vq); + sc->tx_in_progress = 1; + pthread_mutex_unlock(&sc->tx_mtx); + + do { + /* + * Run through entries, placing them into + * iovecs and sending when an end-of-packet + * is found + */ + pci_vtnet_proctx(sc, vq); + } while (vq_has_descs(vq)); + + /* + * Generate an interrupt if needed. + */ + vq_endchains(vq, /*used_all_avail=*/1); + + pthread_mutex_lock(&sc->tx_mtx); + } +} + +#ifdef notyet +static void +pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) +{ + + DPRINTF(("vtnet: control qnotify!")); +} +#endif + +static int +pci_vtnet_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + struct pci_vtnet_softc *sc; + char tname[MAXCOMLEN + 1]; + int mac_provided; + + /* + * Allocate data structures for further virtio initializations. + * sc also contains a copy of vtnet_vi_consts, since capabilities + * change depending on the backend. + */ + sc = calloc(1, sizeof(struct pci_vtnet_softc)); + + sc->vsc_consts = vtnet_vi_consts; + pthread_mutex_init(&sc->vsc_mtx, NULL); + + sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; + sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; +#ifdef notyet + sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; +#endif + + /* + * Attempt to open the backend device and read the MAC address + * if specified. + */ + mac_provided = 0; + if (opts != NULL) { + char *devname; + char *vtopts; + int err; + + devname = vtopts = strdup(opts); + (void) strsep(&vtopts, ","); + + if (vtopts != NULL) { + err = net_parsemac(vtopts, sc->vsc_config.mac); + if (err != 0) { + free(devname); + free(sc); + return (err); + } + mac_provided = 1; + } + + err = netbe_init(&sc->vsc_be, devname, pci_vtnet_rx_callback, + sc); + free(devname); + if (err) { + free(sc); + return (err); + } + sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be); + } + + if (!mac_provided) { + net_genmac(pi, sc->vsc_config.mac); + } + + /* initialize config space */ + vi_devemu_init(pi, VIRTIO_TYPE_NET); + + /* Link is up if we managed to open backend device. 
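+	 * The status byte is what the guest reads back through the
+	 * VIRTIO_NET_F_STATUS config field, so a value of 1 here
+	 * corresponds to VIRTIO_NET_S_LINK_UP.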
*/ + sc->vsc_config.status = (opts == NULL || sc->vsc_be); + + vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) { + free(sc); + return (1); + } + + /* use BAR 0 to map config regs in IO space */ + vi_set_io_res(&sc->vsc_vs, 0); + + sc->resetting = 0; + + sc->rx_merge = 0; + pthread_mutex_init(&sc->rx_mtx, NULL); + + /* + * Initialize tx semaphore & spawn TX processing thread. + * As of now, only one thread for TX desc processing is + * spawned. + */ + sc->tx_in_progress = 0; + pthread_mutex_init(&sc->tx_mtx, NULL); + pthread_cond_init(&sc->tx_cond, NULL); + pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); + snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, + pi->di_func); + pthread_set_name_np(sc->tx_tid, tname); + + return (0); +} + +static int +pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + if (offset < (int)sizeof(sc->vsc_config.mac)) { + assert(offset + size <= (int)sizeof(sc->vsc_config.mac)); + /* + * The driver is allowed to change the MAC address + */ + ptr = &sc->vsc_config.mac[offset]; + memcpy(ptr, &value, size); + } else { + /* silently ignore other writes */ + DPRINTF(("vtnet: write to readonly reg %d", offset)); + } + + return (0); +} + +static int +pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static void +pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtnet_softc *sc = vsc; + + sc->vsc_features = negotiated_features; + + if (negotiated_features & VIRTIO_NET_F_MRG_RXBUF) { + sc->vhdrlen = sizeof(struct virtio_net_rxhdr); + sc->rx_merge = 1; + } else { + /* + * Without mergeable rx buffers, virtio-net header is 2 + * bytes shorter than sizeof(struct virtio_net_rxhdr). + */ + sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; + sc->rx_merge = 0; + } + + /* Tell the backend to enable some capabilities it has advertised. */ + netbe_set_cap(sc->vsc_be, negotiated_features, sc->vhdrlen); + sc->be_vhdrlen = netbe_get_vnet_hdr_len(sc->vsc_be); +} + +static struct mmio_devemu pci_de_vnet = { + .de_emu = "virtio-net", + .de_init = pci_vtnet_init, + .de_write = vi_mmio_write, + .de_read = vi_mmio_read +}; +MMIO_EMUL_SET(pci_de_vnet); Index: usr.sbin/bhyve/mmio/mmio_virtio_rnd.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio_rnd.c @@ -0,0 +1,208 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * virtio entropy device emulation. + * Randomness is sourced from /dev/random which does not block + * once it has been seeded at bootup. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" + +#define VTRND_RINGSZ 64 + + +static int pci_vtrnd_debug; +#define DPRINTF(params) if (pci_vtrnd_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params + +/* + * Per-device softc + */ +struct pci_vtrnd_softc { + struct virtio_softc vrsc_vs; + struct vqueue_info vrsc_vq; + pthread_mutex_t vrsc_mtx; + uint64_t vrsc_cfg; + int vrsc_fd; +}; + +static void pci_vtrnd_reset(void *); +static void pci_vtrnd_notify(void *, struct vqueue_info *); + +static struct virtio_consts vtrnd_vi_consts = { + "vtrnd", /* our name */ + 1, /* we support 1 virtqueue */ + 0, /* config reg size */ + pci_vtrnd_reset, /* reset */ + pci_vtrnd_notify, /* device-wide qnotify */ + NULL, /* read virtio config */ + NULL, /* write virtio config */ + NULL, /* apply negotiated features */ + 0, /* our capabilities */ +}; + + +static void +pci_vtrnd_reset(void *vsc) +{ + struct pci_vtrnd_softc *sc; + + sc = vsc; + + DPRINTF(("vtrnd: device reset requested !")); + vi_reset_dev(&sc->vrsc_vs); +} + + +static void +pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) +{ + struct iovec iov; + struct pci_vtrnd_softc *sc; + int len; + uint16_t idx; + + sc = vsc; + + if (sc->vrsc_fd < 0) { + vq_endchains(vq, 0); + return; + } + + while (vq_has_descs(vq)) { + vq_getchain(vq, &idx, &iov, 1, NULL); + + len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); + + DPRINTF(("vtrnd: vtrnd_notify(): %d", len)); + + /* Catastrophe if unable to read from /dev/random */ + assert(len > 0); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, len); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + + +static int +pci_vtrnd_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + struct pci_vtrnd_softc *sc; + int fd; + int len; + uint8_t v; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + /* + * Should always be able to open /dev/random. + */ + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + + assert(fd >= 0); + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_READ); + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Check that device is seeded and non-blocking. 
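+	 * The descriptor was opened with O_NONBLOCK, so a one-byte read
+	 * that returns data shows the entropy pool is already seeded; an
+	 * unseeded pool would fail with EWOULDBLOCK here instead.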
+ */ + len = read(fd, &v, sizeof(v)); + if (len <= 0) { + WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); + close(fd); + return (1); + } + + sc = calloc(1, sizeof(struct pci_vtrnd_softc)); + + vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); + sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; + + sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; + + /* keep /dev/random opened while emulating */ + sc->vrsc_fd = fd; + + /* initialize config space */ + vi_devemu_init(pi, VIRTIO_TYPE_ENTROPY); + + if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_res(&sc->vrsc_vs, 0); + + return (0); +} + + +struct mmio_devemu pci_de_vrnd = { + .de_emu = "virtio-rnd", + .de_init = pci_vtrnd_init, + .de_write = vi_mmio_write, + .de_read = vi_mmio_read +}; +MMIO_EMUL_SET(pci_de_vrnd); Index: usr.sbin/bhyve/mmio/mmio_virtio_scsi.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio_scsi.c @@ -0,0 +1,741 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama . + * Copyright (c) 2018 Marcelo Araujo . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" +#include "iov.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" + +#define VTSCSI_RINGSZ 64 +#define VTSCSI_REQUESTQ 1 +#define VTSCSI_THR_PER_Q 16 +#define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) +#define VTSCSI_MAXSEG 64 + +#define VTSCSI_IN_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) + +#define VTSCSI_OUT_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) + +#define VIRTIO_SCSI_MAX_CHANNEL 0 +#define VIRTIO_SCSI_MAX_TARGET 0 +#define VIRTIO_SCSI_MAX_LUN 16383 + +#define VIRTIO_SCSI_F_INOUT (1 << 0) +#define VIRTIO_SCSI_F_HOTPLUG (1 << 1) +#define VIRTIO_SCSI_F_CHANGE (1 << 2) + +static int pci_vtscsi_debug = 0; +#define DPRINTF(params) if (pci_vtscsi_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params + +struct pci_vtscsi_config { + uint32_t num_queues; + uint32_t seg_max; + uint32_t max_sectors; + uint32_t cmd_per_lun; + uint32_t event_info_size; + uint32_t sense_size; + uint32_t cdb_size; + uint16_t max_channel; + uint16_t max_target; + uint32_t max_lun; +} __attribute__((packed)); + +struct pci_vtscsi_queue { + struct pci_vtscsi_softc * vsq_sc; + struct vqueue_info * vsq_vq; + pthread_mutex_t vsq_mtx; + pthread_mutex_t vsq_qmtx; + pthread_cond_t vsq_cv; + STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; + LIST_HEAD(, pci_vtscsi_worker) vsq_workers; +}; + +struct pci_vtscsi_worker { + struct pci_vtscsi_queue * vsw_queue; + pthread_t vsw_thread; + bool vsw_exiting; + LIST_ENTRY(pci_vtscsi_worker) vsw_link; +}; + +struct pci_vtscsi_request { + struct pci_vtscsi_queue * vsr_queue; + struct iovec vsr_iov_in[VTSCSI_MAXSEG]; + int vsr_niov_in; + struct iovec vsr_iov_out[VTSCSI_MAXSEG]; + int vsr_niov_out; + uint32_t vsr_idx; + STAILQ_ENTRY(pci_vtscsi_request) vsr_link; +}; + +/* + * Per-device softc + */ +struct pci_vtscsi_softc { + struct virtio_softc vss_vs; + struct vqueue_info vss_vq[VTSCSI_MAXQ]; + struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; + pthread_mutex_t vss_mtx; + int vss_iid; + int vss_ctl_fd; + uint32_t vss_features; + struct pci_vtscsi_config vss_config; +}; + +#define VIRTIO_SCSI_T_TMF 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 +#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 +#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 +#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 +#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 + +/* command-specific response values */ +#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 +#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 +#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 + +struct pci_vtscsi_ctrl_tmf { + uint32_t type; + uint32_t subtype; + uint8_t lun[8]; + uint64_t id; + uint8_t response; +} __attribute__((packed)); + +#define VIRTIO_SCSI_T_AN_QUERY 1 +#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 +#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 +#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 +#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 +#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 +#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 + +struct pci_vtscsi_ctrl_an { + uint32_t type; + uint8_t lun[8]; + uint32_t 
event_requested; + uint32_t event_actual; + uint8_t response; +} __attribute__((packed)); + +/* command-specific response values */ +#define VIRTIO_SCSI_S_OK 0 +#define VIRTIO_SCSI_S_OVERRUN 1 +#define VIRTIO_SCSI_S_ABORTED 2 +#define VIRTIO_SCSI_S_BAD_TARGET 3 +#define VIRTIO_SCSI_S_RESET 4 +#define VIRTIO_SCSI_S_BUSY 5 +#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 +#define VIRTIO_SCSI_S_TARGET_FAILURE 7 +#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 +#define VIRTIO_SCSI_S_FAILURE 9 +#define VIRTIO_SCSI_S_INCORRECT_LUN 12 + +/* task_attr */ +#define VIRTIO_SCSI_S_SIMPLE 0 +#define VIRTIO_SCSI_S_ORDERED 1 +#define VIRTIO_SCSI_S_HEAD 2 +#define VIRTIO_SCSI_S_ACA 3 + +struct pci_vtscsi_event { + uint32_t event; + uint8_t lun[8]; + uint32_t reason; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_rd { + uint8_t lun[8]; + uint64_t id; + uint8_t task_attr; + uint8_t prio; + uint8_t crn; + uint8_t cdb[]; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_wr { + uint32_t sense_len; + uint32_t residual; + uint16_t status_qualifier; + uint8_t status; + uint8_t response; + uint8_t sense[]; +} __attribute__((packed)); + +static void *pci_vtscsi_proc(void *); +static void pci_vtscsi_reset(void *); +static void pci_vtscsi_neg_features(void *, uint64_t); +static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); +static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); +static inline int pci_vtscsi_get_lun(uint8_t *); +static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); +static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_tmf *); +static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_an *); +static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, + int, struct iovec *, int); +static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); +static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, + struct pci_vtscsi_queue *, int); +static int pci_vtscsi_init(struct vmctx *, struct mmio_devinst *, char *); + +static struct virtio_consts vtscsi_vi_consts = { + "vtscsi", /* our name */ + VTSCSI_MAXQ, /* we support 2+n virtqueues */ + sizeof(struct pci_vtscsi_config), /* config reg size */ + pci_vtscsi_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtscsi_cfgread, /* read virtio config */ + pci_vtscsi_cfgwrite, /* write virtio config */ + pci_vtscsi_neg_features, /* apply negotiated features */ + 0, /* our capabilities */ +}; + +static void * +pci_vtscsi_proc(void *arg) +{ + struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; + struct pci_vtscsi_queue *q = worker->vsw_queue; + struct pci_vtscsi_request *req; + int iolen; + + for (;;) { + pthread_mutex_lock(&q->vsq_mtx); + + while (STAILQ_EMPTY(&q->vsq_requests) + && !worker->vsw_exiting) + pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); + + if (worker->vsw_exiting) + break; + + req = STAILQ_FIRST(&q->vsq_requests); + STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); + + pthread_mutex_unlock(&q->vsq_mtx); + iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, + req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); + + pthread_mutex_lock(&q->vsq_qmtx); + vq_relchain(q->vsq_vq, req->vsr_idx, iolen); + vq_endchains(q->vsq_vq, 0); + pthread_mutex_unlock(&q->vsq_qmtx); + + DPRINTF(("virtio-scsi: request completed", + req->vsr_idx)); + free(req); + } + + 
pthread_mutex_unlock(&q->vsq_mtx); + return (NULL); +} + +static void +pci_vtscsi_reset(void *vsc) +{ + struct pci_vtscsi_softc *sc; + + sc = vsc; + + DPRINTF(("vtscsi: device reset requested")); + vi_reset_dev(&sc->vss_vs); + + /* initialize config structure */ + sc->vss_config = (struct pci_vtscsi_config){ + .num_queues = VTSCSI_REQUESTQ, + /* Leave room for the request and the response. */ + .seg_max = VTSCSI_MAXSEG - 2, + .max_sectors = 2, + .cmd_per_lun = 1, + .event_info_size = sizeof(struct pci_vtscsi_event), + .sense_size = 96, + .cdb_size = 32, + .max_channel = VIRTIO_SCSI_MAX_CHANNEL, + .max_target = VIRTIO_SCSI_MAX_TARGET, + .max_lun = VIRTIO_SCSI_MAX_LUN + }; +} + +static void +pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtscsi_softc *sc = vsc; + + sc->vss_features = negotiated_features; +} + +static int +pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtscsi_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vss_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline int +pci_vtscsi_get_lun(uint8_t *lun) +{ + + return (((lun[2] << 8) | lun[3]) & 0x3fff); +} + +static int +pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, + size_t bufsize) +{ + struct pci_vtscsi_ctrl_tmf *tmf; + struct pci_vtscsi_ctrl_an *an; + uint32_t type; + + type = *(uint32_t *)buf; + + if (type == VIRTIO_SCSI_T_TMF) { + tmf = (struct pci_vtscsi_ctrl_tmf *)buf; + return (pci_vtscsi_tmf_handle(sc, tmf)); + } + + if (type == VIRTIO_SCSI_T_AN_QUERY) { + an = (struct pci_vtscsi_ctrl_an *)buf; + return (pci_vtscsi_an_handle(sc, an)); + } + + return (0); +} + +static int +pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_tmf *tmf) +{ + union ctl_io *io; + int err; + + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.io_type = CTL_IO_TASK; + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); + io->taskio.tag_type = CTL_TAG_SIMPLE; + io->taskio.tag_num = (uint32_t)tmf->id; + + switch (tmf->subtype) { + case VIRTIO_SCSI_T_TMF_ABORT_TASK: + io->taskio.task_action = CTL_TASK_ABORT_TASK; + break; + + case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: + io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_ACA: + io->taskio.task_action = CTL_TASK_CLEAR_ACA; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: + io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: + io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; + break; + + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + io->taskio.task_action = CTL_TASK_LUN_RESET; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK: + io->taskio.task_action = CTL_TASK_QUERY_TASK; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: + io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; + break; + } + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) + WPRINTF(("CTL_IO: err=%d (%s)", errno, strerror(errno))); + + tmf->response = io->taskio.task_status; + ctl_scsi_free_io(io); + return (1); +} + +static int +pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_an *an) +{ + + return (0); +} + 
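+/*
+ * The request queue addresses logical units through the 8-byte virtio-scsi
+ * LUN field. With the single-level ("flat") addressing commonly used by
+ * guest drivers, byte 0 is 1, byte 1 is the target and bytes 2-3 carry
+ * 0x4000 | lun, which is why pci_vtscsi_get_lun() masks with 0x3fff.
+ * For example, LUN 5 on target 0 arrives as
+ *
+ *	uint8_t lun[8] = { 1, 0, 0x40, 0x05, 0, 0, 0, 0 };
+ *
+ * and pci_vtscsi_get_lun(lun) yields 5, which is used as the CTL
+ * nexus targ_lun below.
+ */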
+static int +pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, + int niov_in, struct iovec *iov_out, int niov_out) +{ + struct pci_vtscsi_softc *sc = q->vsq_sc; + struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL; + struct pci_vtscsi_req_cmd_wr *cmd_wr; + struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG]; + union ctl_io *io; + int data_niov_in, data_niov_out; + void *ext_data_ptr = NULL; + uint32_t ext_data_len = 0, ext_sg_entries = 0; + int err, nxferred; + + seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in, + VTSCSI_IN_HEADER_LEN(sc)); + seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out, + VTSCSI_OUT_HEADER_LEN(sc)); + + truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc)); + truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc)); + iov_to_buf(iov_in, niov_in, (void **)&cmd_rd); + + cmd_wr = malloc(VTSCSI_OUT_HEADER_LEN(sc)); + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun); + + io->io_hdr.io_type = CTL_IO_SCSI; + + if (data_niov_in > 0) { + ext_data_ptr = (void *)data_iov_in; + ext_sg_entries = data_niov_in; + ext_data_len = count_iov(data_iov_in, data_niov_in); + io->io_hdr.flags |= CTL_FLAG_DATA_OUT; + } else if (data_niov_out > 0) { + ext_data_ptr = (void *)data_iov_out; + ext_sg_entries = data_niov_out; + ext_data_len = count_iov(data_iov_out, data_niov_out); + io->io_hdr.flags |= CTL_FLAG_DATA_IN; + } + + io->scsiio.sense_len = sc->vss_config.sense_size; + io->scsiio.tag_num = (uint32_t)cmd_rd->id; + switch (cmd_rd->task_attr) { + case VIRTIO_SCSI_S_ORDERED: + io->scsiio.tag_type = CTL_TAG_ORDERED; + break; + case VIRTIO_SCSI_S_HEAD: + io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE; + break; + case VIRTIO_SCSI_S_ACA: + io->scsiio.tag_type = CTL_TAG_ACA; + break; + case VIRTIO_SCSI_S_SIMPLE: + default: + io->scsiio.tag_type = CTL_TAG_SIMPLE; + break; + } + io->scsiio.ext_sg_entries = ext_sg_entries; + io->scsiio.ext_data_ptr = ext_data_ptr; + io->scsiio.ext_data_len = ext_data_len; + io->scsiio.ext_data_filled = 0; + io->scsiio.cdb_len = sc->vss_config.cdb_size; + memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size); + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) { + WPRINTF(("CTL_IO: err=%d (%s)", errno, strerror(errno))); + cmd_wr->response = VIRTIO_SCSI_S_FAILURE; + } else { + cmd_wr->sense_len = MIN(io->scsiio.sense_len, + sc->vss_config.sense_size); + cmd_wr->residual = io->scsiio.residual; + cmd_wr->status = io->scsiio.scsi_status; + cmd_wr->response = VIRTIO_SCSI_S_OK; + memcpy(&cmd_wr->sense, &io->scsiio.sense_data, + cmd_wr->sense_len); + } + + buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0); + nxferred = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled; + free(cmd_rd); + free(cmd_wr); + ctl_scsi_free_io(io); + return (nxferred); +} + +static void +pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t idx, n; + void *buf = NULL; + size_t bufsize; + int iolen; + + sc = vsc; + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, NULL); + bufsize = iov_to_buf(iov, n, &buf); + iolen = pci_vtscsi_control_handle(sc, buf, bufsize); + buf_to_iov(buf + bufsize - iolen, iolen, 
iov, n, + bufsize - iolen); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, iolen); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ + free(buf); +} + +static void +pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq) +{ + + vq_kick_disable(vq); +} + +static void +pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct pci_vtscsi_queue *q; + struct pci_vtscsi_request *req; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t flags[VTSCSI_MAXSEG]; + uint16_t idx, n, i; + int readable; + + sc = vsc; + q = &sc->vss_queues[vq->vq_num - 2]; + + while (vq_has_descs(vq)) { + readable = 0; + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, flags); + + /* Count readable descriptors */ + for (i = 0; i < n; i++) { + if (flags[i] & VRING_DESC_F_WRITE) + break; + + readable++; + } + + req = calloc(1, sizeof(struct pci_vtscsi_request)); + req->vsr_idx = idx; + req->vsr_queue = q; + req->vsr_niov_in = readable; + req->vsr_niov_out = n - readable; + memcpy(req->vsr_iov_in, iov, + req->vsr_niov_in * sizeof(struct iovec)); + memcpy(req->vsr_iov_out, iov + readable, + req->vsr_niov_out * sizeof(struct iovec)); + + pthread_mutex_lock(&q->vsq_mtx); + STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link); + pthread_cond_signal(&q->vsq_cv); + pthread_mutex_unlock(&q->vsq_mtx); + + DPRINTF(("virtio-scsi: request enqueued", idx)); + } +} + +static int +pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_queue *queue, int num) +{ + struct pci_vtscsi_worker *worker; + char tname[MAXCOMLEN + 1]; + int i; + + queue->vsq_sc = sc; + queue->vsq_vq = &sc->vss_vq[num + 2]; + + pthread_mutex_init(&queue->vsq_mtx, NULL); + pthread_mutex_init(&queue->vsq_qmtx, NULL); + pthread_cond_init(&queue->vsq_cv, NULL); + STAILQ_INIT(&queue->vsq_requests); + LIST_INIT(&queue->vsq_workers); + + for (i = 0; i < VTSCSI_THR_PER_Q; i++) { + worker = calloc(1, sizeof(struct pci_vtscsi_worker)); + worker->vsw_queue = queue; + + pthread_create(&worker->vsw_thread, NULL, &pci_vtscsi_proc, + (void *)worker); + + snprintf(tname, sizeof(tname), "vtscsi:%d-%d", num, i); + pthread_set_name_np(worker->vsw_thread, tname); + LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link); + } + + return (0); +} + +static int +pci_vtscsi_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + struct pci_vtscsi_softc *sc; + char *opt, *optname; + const char *devname; + int i, optidx = 0; + + sc = calloc(1, sizeof(struct pci_vtscsi_softc)); + devname = "/dev/cam/ctl"; + while ((opt = strsep(&opts, ",")) != NULL) { + optname = strsep(&opt, "="); + if (opt == NULL && optidx == 0) { + if (optname[0] != 0) + devname = optname; + } else if (strcmp(optname, "dev") == 0 && opt != NULL) { + devname = opt; + } else if (strcmp(optname, "iid") == 0 && opt != NULL) { + sc->vss_iid = strtoul(opt, NULL, 10); + } else { + EPRINTLN("Invalid option %s", optname); + free(sc); + return (1); + } + optidx++; + } + + sc->vss_ctl_fd = open(devname, O_RDWR); + if (sc->vss_ctl_fd < 0) { + WPRINTF(("cannot open %s: %s", devname, strerror(errno))); + free(sc); + return (1); + } + + vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq); + sc->vss_vs.vs_mtx = &sc->vss_mtx; + + /* controlq */ + sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; + + /* eventq */ + sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; + + /* request queues */ + for (i = 2; i < VTSCSI_MAXQ; i++) { + 
sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify; + pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); + } + + /* initialize config space */ + mmio_set_cfgreg16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); + mmio_set_cfgreg16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + mmio_set_cfgreg8(pi, PCIR_CLASS, PCIC_STORAGE); + mmio_set_cfgreg16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_SCSI); + mmio_set_cfgreg16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_res(&sc->vss_vs, 0); + + return (0); +} + + +struct mmio_devemu pci_de_vscsi = { + .de_emu = "virtio-scsi", + .de_init = pci_vtscsi_init, + .de_write = vi_mmio_write, + .de_read = vi_mmio_read +}; +MMIO_EMUL_SET(pci_de_vscsi); Index: usr.sbin/bhyve/mmio/net_backends.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/net_backends.h @@ -0,0 +1,95 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __NET_BACKENDS_H__ +#define __NET_BACKENDS_H__ + +#include + +/* Opaque type representing a network backend. */ +typedef struct net_backend net_backend_t; + +/* Interface between network frontends and the network backends. */ +typedef void (*net_be_rxeof_t)(int, enum ev_type, void *param); +int netbe_init(net_backend_t **be, const char *opts, net_be_rxeof_t cb, + void *param); +void netbe_cleanup(net_backend_t *be); +uint64_t netbe_get_cap(net_backend_t *be); +int netbe_set_cap(net_backend_t *be, uint64_t cap, + unsigned vnet_hdr_len); +size_t netbe_get_vnet_hdr_len(net_backend_t *be); +ssize_t netbe_send(net_backend_t *be, const struct iovec *iov, int iovcnt); +ssize_t netbe_peek_recvlen(net_backend_t *be); +ssize_t netbe_recv(net_backend_t *be, const struct iovec *iov, int iovcnt); +ssize_t netbe_rx_discard(net_backend_t *be); +void netbe_rx_disable(net_backend_t *be); +void netbe_rx_enable(net_backend_t *be); + + +/* + * Network device capabilities taken from the VirtIO standard. + * Despite the name, these capabilities can be used by different frontents + * (virtio-net, ptnet) and supported by different backends (netmap, tap, ...). 
+ */ +#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ +#define VIRTIO_NET_F_MTU (1 << 3) /* initial MTU advice */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ +#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ +#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ +#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ +#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ +#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ +#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ +#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ +#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ +#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ +#define VIRTIO_NET_F_GUEST_ANNOUNCE \ + (1 << 21) /* guest can send gratuitous pkts */ +#define VIRTIO_NET_F_MQ (1 << 22) /* host supports multiple VQ pairs */ + +/* + * Fixed network header size + */ +struct virtio_net_rxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +} __packed; + +#endif /* __NET_BACKENDS_H__ */ Index: usr.sbin/bhyve/mmio/net_backends.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/net_backends.c @@ -0,0 +1,1108 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This file implements multiple network backends (tap, netmap, ...), + * to be used by network frontends such as virtio-net and e1000. + * The API to access the backend (e.g. 
send/receive packets, negotiate + * features) is exported by net_backends.h. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include /* u_short etc */ +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include + +#include +#include +#include +#define NETMAP_WITH_LIBS +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef NETGRAPH +#include +#include +#include +#endif + +#include "debug.h" +#include "iov.h" +#include "mevent.h" +#include "net_backends.h" + +#include + +/* + * Each network backend registers a set of function pointers that are + * used to implement the net backends API. + * This might need to be exposed if we implement backends in separate files. + */ +struct net_backend { + const char *prefix; /* prefix matching this backend */ + + /* + * Routines used to initialize and cleanup the resources needed + * by a backend. The cleanup function is used internally, + * and should not be called by the frontend. + */ + int (*init)(struct net_backend *be, const char *devname, + const char *opts, net_be_rxeof_t cb, void *param); + void (*cleanup)(struct net_backend *be); + + /* + * Called to serve a guest transmit request. The scatter-gather + * vector provided by the caller has 'iovcnt' elements and contains + * the packet to send. + */ + ssize_t (*send)(struct net_backend *be, const struct iovec *iov, + int iovcnt); + + /* + * Get the length of the next packet that can be received from + * the backend. If no packets are currently available, this + * function returns 0. + */ + ssize_t (*peek_recvlen)(struct net_backend *be); + + /* + * Called to receive a packet from the backend. When the function + * returns a positive value 'len', the scatter-gather vector + * provided by the caller contains a packet with such length. + * The function returns 0 if the backend doesn't have a new packet to + * receive. + */ + ssize_t (*recv)(struct net_backend *be, const struct iovec *iov, + int iovcnt); + + /* + * Ask the backend to enable or disable receive operation in the + * backend. On return from a disable operation, it is guaranteed + * that the receive callback won't be called until receive is + * enabled again. Note however that it is up to the caller to make + * sure that netbe_recv() is not currently being executed by another + * thread. + */ + void (*recv_enable)(struct net_backend *be); + void (*recv_disable)(struct net_backend *be); + + /* + * Ask the backend for the virtio-net features it is able to + * support. Possible features are TSO, UFO and checksum offloading + * in both rx and tx direction and for both IPv4 and IPv6. + */ + uint64_t (*get_cap)(struct net_backend *be); + + /* + * Tell the backend to enable/disable the specified virtio-net + * features (capabilities). + */ + int (*set_cap)(struct net_backend *be, uint64_t features, + unsigned int vnet_hdr_len); + + struct pci_vtnet_softc *sc; + int fd; + + /* + * Length of the virtio-net header used by the backend and the + * frontend, respectively. A zero value means that the header + * is not used. + */ + unsigned int be_vnet_hdr_len; + unsigned int fe_vnet_hdr_len; + + /* Size of backend-specific private data. */ + size_t priv_size; + + /* Room for backend-specific data. 
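+	 * netbe_init() allocates the backend as a single
+	 * calloc(1, sizeof(struct net_backend) + priv_size), so each
+	 * backend casts this trailing area to its private state, e.g.
+	 * (struct tap_priv *)be->opaque or (struct netmap_priv *)be->opaque.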
*/ + char opaque[0]; +}; + +SET_DECLARE(net_backend_set, struct net_backend); + +#define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr) + +#define WPRINTF(params) PRINTLN params + +/* + * The tap backend + */ + +struct tap_priv { + struct mevent *mevp; + /* + * A bounce buffer that allows us to implement the peek_recvlen + * callback. In the future we may get the same information from + * the kevent data. + */ + char bbuf[1 << 16]; + ssize_t bbuflen; +}; + +static void +tap_cleanup(struct net_backend *be) +{ + struct tap_priv *priv = (struct tap_priv *)be->opaque; + + if (priv->mevp) { + mevent_delete(priv->mevp); + } + if (be->fd != -1) { + close(be->fd); + be->fd = -1; + } +} + +static int +tap_init(struct net_backend *be, const char *devname, + const char *opts, net_be_rxeof_t cb, void *param) +{ + struct tap_priv *priv = (struct tap_priv *)be->opaque; + char tbuf[80]; + int opt = 1; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + if (cb == NULL) { + WPRINTF(("TAP backend requires non-NULL callback")); + return (-1); + } + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, devname, sizeof(tbuf)); + + be->fd = open(tbuf, O_RDWR); + if (be->fd == -1) { + WPRINTF(("open of tap device %s failed", tbuf)); + goto error; + } + + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + if (ioctl(be->fd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed")); + goto error; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(be->fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + memset(priv->bbuf, 0, sizeof(priv->bbuf)); + priv->bbuflen = 0; + + priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); + if (priv->mevp == NULL) { + WPRINTF(("Could not register event")); + goto error; + } + + return (0); + +error: + tap_cleanup(be); + return (-1); +} + +/* + * Called to send a buffer chain out to the tap device + */ +static ssize_t +tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ + return (writev(be->fd, iov, iovcnt)); +} + +static ssize_t +tap_peek_recvlen(struct net_backend *be) +{ + struct tap_priv *priv = (struct tap_priv *)be->opaque; + ssize_t ret; + + if (priv->bbuflen > 0) { + /* + * We already have a packet in the bounce buffer. + * Just return its length. + */ + return priv->bbuflen; + } + + /* + * Read the next packet (if any) into the bounce buffer, so + * that we get to know its length and we can return that + * to the caller. + */ + ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf)); + if (ret < 0 && errno == EWOULDBLOCK) { + return (0); + } + + if (ret > 0) + priv->bbuflen = ret; + + return (ret); +} + +static ssize_t +tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ + struct tap_priv *priv = (struct tap_priv *)be->opaque; + ssize_t ret; + + if (priv->bbuflen > 0) { + /* + * A packet is available in the bounce buffer, so + * we read it from there. + */ + ret = buf_to_iov(priv->bbuf, priv->bbuflen, + iov, iovcnt, 0); + + /* Mark the bounce buffer as empty. 
*/ + priv->bbuflen = 0; + + return (ret); + } + + ret = readv(be->fd, iov, iovcnt); + if (ret < 0 && errno == EWOULDBLOCK) { + return (0); + } + + return (ret); +} + +static void +tap_recv_enable(struct net_backend *be) +{ + struct tap_priv *priv = (struct tap_priv *)be->opaque; + + mevent_enable(priv->mevp); +} + +static void +tap_recv_disable(struct net_backend *be) +{ + struct tap_priv *priv = (struct tap_priv *)be->opaque; + + mevent_disable(priv->mevp); +} + +static uint64_t +tap_get_cap(struct net_backend *be) +{ + + return (0); /* no capabilities for now */ +} + +static int +tap_set_cap(struct net_backend *be, uint64_t features, + unsigned vnet_hdr_len) +{ + + return ((features || vnet_hdr_len) ? -1 : 0); +} + +static struct net_backend tap_backend = { + .prefix = "tap", + .priv_size = sizeof(struct tap_priv), + .init = tap_init, + .cleanup = tap_cleanup, + .send = tap_send, + .peek_recvlen = tap_peek_recvlen, + .recv = tap_recv, + .recv_enable = tap_recv_enable, + .recv_disable = tap_recv_disable, + .get_cap = tap_get_cap, + .set_cap = tap_set_cap, +}; + +/* A clone of the tap backend, with a different prefix. */ +static struct net_backend vmnet_backend = { + .prefix = "vmnet", + .priv_size = sizeof(struct tap_priv), + .init = tap_init, + .cleanup = tap_cleanup, + .send = tap_send, + .peek_recvlen = tap_peek_recvlen, + .recv = tap_recv, + .recv_enable = tap_recv_enable, + .recv_disable = tap_recv_disable, + .get_cap = tap_get_cap, + .set_cap = tap_set_cap, +}; + +DATA_SET(net_backend_set, tap_backend); +DATA_SET(net_backend_set, vmnet_backend); + +#ifdef NETGRAPH + +/* + * Netgraph backend + */ + +#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024) + +static int +ng_init(struct net_backend *be, const char *devname, + const char *opts, net_be_rxeof_t cb, void *param) +{ + struct tap_priv *p = (struct tap_priv *)be->opaque; + struct ngm_connect ngc; + char *ngopts, *tofree; + char nodename[NG_NODESIZ]; + int sbsz; + int ctrl_sock; + int flags; + int path_provided; + int peerhook_provided; + int socket_provided; + unsigned long maxsbsz; + size_t msbsz; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + if (cb == NULL) { + WPRINTF(("Netgraph backend requires non-NULL callback")); + return (-1); + } + + be->fd = -1; + + memset(&ngc, 0, sizeof(ngc)); + + strncpy(ngc.ourhook, "vmlink", NG_HOOKSIZ - 1); + + tofree = ngopts = strdup(opts); + + if (ngopts == NULL) { + WPRINTF(("strdup error")); + return (-1); + } + + socket_provided = 0; + path_provided = 0; + peerhook_provided = 0; + + while (ngopts != NULL) { + char *value = ngopts; + char *key; + + key = strsep(&value, "="); + if (value == NULL) + break; + ngopts = value; + (void) strsep(&ngopts, ","); + + if (strcmp(key, "socket") == 0) { + strncpy(nodename, value, NG_NODESIZ - 1); + socket_provided = 1; + } else if (strcmp(key, "path") == 0) { + strncpy(ngc.path, value, NG_PATHSIZ - 1); + path_provided = 1; + } else if (strcmp(key, "hook") == 0) { + strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1); + } else if (strcmp(key, "peerhook") == 0) { + strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1); + peerhook_provided = 1; + } + } + + free(tofree); + + if (!path_provided) { + WPRINTF(("path must be provided")); + return (-1); + } + + if (!peerhook_provided) { + WPRINTF(("peer hook must be provided")); + return (-1); + } + + if (NgMkSockNode(socket_provided ? 
nodename : NULL, + &ctrl_sock, &be->fd) < 0) { + WPRINTF(("can't get Netgraph sockets")); + return (-1); + } + + if (NgSendMsg(ctrl_sock, ".", + NGM_GENERIC_COOKIE, + NGM_CONNECT, &ngc, sizeof(ngc)) < 0) { + WPRINTF(("can't connect to node")); + close(ctrl_sock); + goto error; + } + + close(ctrl_sock); + + flags = fcntl(be->fd, F_GETFL); + + if (flags < 0) { + WPRINTF(("can't get socket flags")); + goto error; + } + + if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) { + WPRINTF(("can't set O_NONBLOCK flag")); + goto error; + } + + /* + * The default ng_socket(4) buffer's size is too low. + * Calculate the minimum value between NG_SBUF_MAX_SIZE + * and kern.ipc.maxsockbuf. + */ + msbsz = sizeof(maxsbsz); + if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz, + NULL, 0) < 0) { + WPRINTF(("can't get 'kern.ipc.maxsockbuf' value")); + goto error; + } + + /* + * We can't set the socket buffer size to kern.ipc.maxsockbuf value, + * as it takes into account the mbuf(9) overhead. + */ + maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES); + + sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz); + + if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz, + sizeof(sbsz)) < 0) { + WPRINTF(("can't set TX buffer size")); + goto error; + } + + if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz, + sizeof(sbsz)) < 0) { + WPRINTF(("can't set RX buffer size")); + goto error; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(be->fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + memset(p->bbuf, 0, sizeof(p->bbuf)); + p->bbuflen = 0; + + p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); + if (p->mevp == NULL) { + WPRINTF(("Could not register event")); + goto error; + } + + return (0); + +error: + tap_cleanup(be); + return (-1); +} + +static struct net_backend ng_backend = { + .prefix = "netgraph", + .priv_size = sizeof(struct tap_priv), + .init = ng_init, + .cleanup = tap_cleanup, + .send = tap_send, + .peek_recvlen = tap_peek_recvlen, + .recv = tap_recv, + .recv_enable = tap_recv_enable, + .recv_disable = tap_recv_disable, + .get_cap = tap_get_cap, + .set_cap = tap_set_cap, +}; + +DATA_SET(net_backend_set, ng_backend); + +#endif /* NETGRAPH */ + +/* + * The netmap backend + */ + +/* The virtio-net features supported by netmap. 
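+ * They are only advertised when the netmap port accepts a virtio-net
+ * header (see netmap_get_cap() and netmap_has_vnet_hdr_len()); otherwise
+ * no offloads are offered and plain Ethernet frames are exchanged.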
*/ +#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \ + VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \ + VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO) + +struct netmap_priv { + char ifname[IFNAMSIZ]; + struct nm_desc *nmd; + uint16_t memid; + struct netmap_ring *rx; + struct netmap_ring *tx; + struct mevent *mevp; + net_be_rxeof_t cb; + void *cb_param; +}; + +static void +nmreq_init(struct nmreq *req, char *ifname) +{ + + memset(req, 0, sizeof(*req)); + strlcpy(req->nr_name, ifname, sizeof(req->nr_name)); + req->nr_version = NETMAP_API; +} + +static int +netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len) +{ + int err; + struct nmreq req; + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + + nmreq_init(&req, priv->ifname); + req.nr_cmd = NETMAP_BDG_VNET_HDR; + req.nr_arg1 = vnet_hdr_len; + err = ioctl(be->fd, NIOCREGIF, &req); + if (err) { + WPRINTF(("Unable to set vnet header length %d", + vnet_hdr_len)); + return (err); + } + + be->be_vnet_hdr_len = vnet_hdr_len; + + return (0); +} + +static int +netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len) +{ + int prev_hdr_len = be->be_vnet_hdr_len; + int ret; + + if (vnet_hdr_len == prev_hdr_len) { + return (1); + } + + ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len); + if (ret) { + return (0); + } + + netmap_set_vnet_hdr_len(be, prev_hdr_len); + + return (1); +} + +static uint64_t +netmap_get_cap(struct net_backend *be) +{ + + return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ? + NETMAP_FEATURES : 0); +} + +static int +netmap_set_cap(struct net_backend *be, uint64_t features, + unsigned vnet_hdr_len) +{ + + return (netmap_set_vnet_hdr_len(be, vnet_hdr_len)); +} + +static int +netmap_init(struct net_backend *be, const char *devname, + const char *opts, net_be_rxeof_t cb, void *param) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + + strlcpy(priv->ifname, devname, sizeof(priv->ifname)); + priv->ifname[sizeof(priv->ifname) - 1] = '\0'; + + priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL); + if (priv->nmd == NULL) { + WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)", + devname, strerror(errno))); + free(priv); + return (-1); + } + + priv->memid = priv->nmd->req.nr_arg2; + priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0); + priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0); + priv->cb = cb; + priv->cb_param = param; + be->fd = priv->nmd->fd; + + priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); + if (priv->mevp == NULL) { + WPRINTF(("Could not register event")); + return (-1); + } + + return (0); +} + +static void +netmap_cleanup(struct net_backend *be) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + + if (priv->mevp) { + mevent_delete(priv->mevp); + } + if (priv->nmd) { + nm_close(priv->nmd); + } + be->fd = -1; +} + +static ssize_t +netmap_send(struct net_backend *be, const struct iovec *iov, + int iovcnt) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + struct netmap_ring *ring; + ssize_t totlen = 0; + int nm_buf_size; + int nm_buf_len; + uint32_t head; + void *nm_buf; + int j; + + ring = priv->tx; + head = ring->head; + if (head == ring->tail) { + WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt))); + goto txsync; + } + nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); + nm_buf_size = ring->nr_buf_size; + nm_buf_len = 0; + + for (j = 0; j < iovcnt; j++) { + int iov_frag_size = iov[j].iov_len; + void *iov_frag_buf = 
iov[j].iov_base; + + totlen += iov_frag_size; + + /* + * Split each iovec fragment over more netmap slots, if + * necessary. + */ + for (;;) { + int copylen; + + copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size; + memcpy(nm_buf, iov_frag_buf, copylen); + + iov_frag_buf += copylen; + iov_frag_size -= copylen; + nm_buf += copylen; + nm_buf_size -= copylen; + nm_buf_len += copylen; + + if (iov_frag_size == 0) { + break; + } + + ring->slot[head].len = nm_buf_len; + ring->slot[head].flags = NS_MOREFRAG; + head = nm_ring_next(ring, head); + if (head == ring->tail) { + /* + * We ran out of netmap slots while + * splitting the iovec fragments. + */ + WPRINTF(("No space, drop %zu bytes", + count_iov(iov, iovcnt))); + goto txsync; + } + nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); + nm_buf_size = ring->nr_buf_size; + nm_buf_len = 0; + } + } + + /* Complete the last slot, which must not have NS_MOREFRAG set. */ + ring->slot[head].len = nm_buf_len; + ring->slot[head].flags = 0; + head = nm_ring_next(ring, head); + + /* Now update ring->head and ring->cur. */ + ring->head = ring->cur = head; +txsync: + ioctl(be->fd, NIOCTXSYNC, NULL); + + return (totlen); +} + +static ssize_t +netmap_peek_recvlen(struct net_backend *be) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + struct netmap_ring *ring = priv->rx; + uint32_t head = ring->head; + ssize_t totlen = 0; + + while (head != ring->tail) { + struct netmap_slot *slot = ring->slot + head; + + totlen += slot->len; + if ((slot->flags & NS_MOREFRAG) == 0) + break; + head = nm_ring_next(ring, head); + } + + return (totlen); +} + +static ssize_t +netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + struct netmap_slot *slot = NULL; + struct netmap_ring *ring; + void *iov_frag_buf; + int iov_frag_size; + ssize_t totlen = 0; + uint32_t head; + + assert(iovcnt); + + ring = priv->rx; + head = ring->head; + iov_frag_buf = iov->iov_base; + iov_frag_size = iov->iov_len; + + do { + int nm_buf_len; + void *nm_buf; + + if (head == ring->tail) { + return (0); + } + + slot = ring->slot + head; + nm_buf = NETMAP_BUF(ring, slot->buf_idx); + nm_buf_len = slot->len; + + for (;;) { + int copylen = nm_buf_len < iov_frag_size ? + nm_buf_len : iov_frag_size; + + memcpy(iov_frag_buf, nm_buf, copylen); + nm_buf += copylen; + nm_buf_len -= copylen; + iov_frag_buf += copylen; + iov_frag_size -= copylen; + totlen += copylen; + + if (nm_buf_len == 0) { + break; + } + + iov++; + iovcnt--; + if (iovcnt == 0) { + /* No space to receive. */ + WPRINTF(("Short iov, drop %zd bytes", + totlen)); + return (-ENOSPC); + } + iov_frag_buf = iov->iov_base; + iov_frag_size = iov->iov_len; + } + + head = nm_ring_next(ring, head); + + } while (slot->flags & NS_MOREFRAG); + + /* Release slots to netmap. 
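+	 * Advancing head (and cur) marks the consumed slots as free for the
+	 * kernel to refill on the next rxsync.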
*/ + ring->head = ring->cur = head; + + return (totlen); +} + +static void +netmap_recv_enable(struct net_backend *be) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + + mevent_enable(priv->mevp); +} + +static void +netmap_recv_disable(struct net_backend *be) +{ + struct netmap_priv *priv = (struct netmap_priv *)be->opaque; + + mevent_disable(priv->mevp); +} + +static struct net_backend netmap_backend = { + .prefix = "netmap", + .priv_size = sizeof(struct netmap_priv), + .init = netmap_init, + .cleanup = netmap_cleanup, + .send = netmap_send, + .peek_recvlen = netmap_peek_recvlen, + .recv = netmap_recv, + .recv_enable = netmap_recv_enable, + .recv_disable = netmap_recv_disable, + .get_cap = netmap_get_cap, + .set_cap = netmap_set_cap, +}; + +/* A clone of the netmap backend, with a different prefix. */ +static struct net_backend vale_backend = { + .prefix = "vale", + .priv_size = sizeof(struct netmap_priv), + .init = netmap_init, + .cleanup = netmap_cleanup, + .send = netmap_send, + .peek_recvlen = netmap_peek_recvlen, + .recv = netmap_recv, + .recv_enable = netmap_recv_enable, + .recv_disable = netmap_recv_disable, + .get_cap = netmap_get_cap, + .set_cap = netmap_set_cap, +}; + +DATA_SET(net_backend_set, netmap_backend); +DATA_SET(net_backend_set, vale_backend); + +/* + * Initialize a backend and attach to the frontend. + * This is called during frontend initialization. + * @pbe is a pointer to the backend to be initialized + * @devname is the backend-name as supplied on the command line, + * e.g. -s 2:0,frontend-name,backend-name[,other-args] + * @cb is the receive callback supplied by the frontend, + * and it is invoked in the event loop when a receive + * event is generated in the hypervisor, + * @param is a pointer to the frontend, and normally used as + * the argument for the callback. + */ +int +netbe_init(struct net_backend **ret, const char *opts, net_be_rxeof_t cb, + void *param) +{ + struct net_backend **pbe, *nbe, *tbe = NULL; + char *devname; + char *options; + int err; + + devname = options = strdup(opts); + + if (devname == NULL) { + return (-1); + } + + devname = strsep(&options, ","); + + /* + * Find the network backend that matches the user-provided + * device name. net_backend_set is built using a linker set. + */ + SET_FOREACH(pbe, net_backend_set) { + if (strncmp(devname, (*pbe)->prefix, + strlen((*pbe)->prefix)) == 0) { + tbe = *pbe; + assert(tbe->init != NULL); + assert(tbe->cleanup != NULL); + assert(tbe->send != NULL); + assert(tbe->recv != NULL); + assert(tbe->get_cap != NULL); + assert(tbe->set_cap != NULL); + break; + } + } + + *ret = NULL; + if (tbe == NULL) { + free(devname); + return (EINVAL); + } + + nbe = calloc(1, sizeof(*nbe) + tbe->priv_size); + *nbe = *tbe; /* copy the template */ + nbe->fd = -1; + nbe->sc = param; + nbe->be_vnet_hdr_len = 0; + nbe->fe_vnet_hdr_len = 0; + + /* Initialize the backend. */ + err = nbe->init(nbe, devname, options, cb, param); + if (err) { + free(devname); + free(nbe); + return (err); + } + + *ret = nbe; + free(devname); + + return (0); +} + +void +netbe_cleanup(struct net_backend *be) +{ + + if (be != NULL) { + be->cleanup(be); + free(be); + } +} + +uint64_t +netbe_get_cap(struct net_backend *be) +{ + + assert(be != NULL); + return (be->get_cap(be)); +} + +int +netbe_set_cap(struct net_backend *be, uint64_t features, + unsigned vnet_hdr_len) +{ + int ret; + + assert(be != NULL); + + /* There are only three valid lengths, i.e., 0, 10 and 12. 
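0 means the header is disabled, 10 is the size of the basic virtio-net header, and 12 (VNET_HDR_LEN) is the same header followed by the 16-bit num_buffers field used when merged receive buffers are negotiated.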
*/ + if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN + && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) + return (-1); + + be->fe_vnet_hdr_len = vnet_hdr_len; + + ret = be->set_cap(be, features, vnet_hdr_len); + assert(be->be_vnet_hdr_len == 0 || + be->be_vnet_hdr_len == be->fe_vnet_hdr_len); + + return (ret); +} + +ssize_t +netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ + + return (be->send(be, iov, iovcnt)); +} + +ssize_t +netbe_peek_recvlen(struct net_backend *be) +{ + + return (be->peek_recvlen(be)); +} + +/* + * Try to read a packet from the backend, without blocking. + * If no packets are available, return 0. In case of success, return + * the length of the packet just read. Return -1 in case of errors. + */ +ssize_t +netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) +{ + + return (be->recv(be, iov, iovcnt)); +} + +/* + * Read a packet from the backend and discard it. + * Returns the size of the discarded packet or zero if no packet was available. + * A negative error code is returned in case of read error. + */ +ssize_t +netbe_rx_discard(struct net_backend *be) +{ + /* + * MP note: the dummybuf is only used to discard frames, + * so there is no need for it to be per-vtnet or locked. + * We only make it large enough for TSO-sized segment. + */ + static uint8_t dummybuf[65536 + 64]; + struct iovec iov; + + iov.iov_base = dummybuf; + iov.iov_len = sizeof(dummybuf); + + return netbe_recv(be, &iov, 1); +} + +void +netbe_rx_disable(struct net_backend *be) +{ + + return be->recv_disable(be); +} + +void +netbe_rx_enable(struct net_backend *be) +{ + + return be->recv_enable(be); +} + +size_t +netbe_get_vnet_hdr_len(struct net_backend *be) +{ + + return (be->be_vnet_hdr_len); +} Index: usr.sbin/bhyve/mmio/net_utils.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/net_utils.h @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NET_UTILS_H_ +#define _NET_UTILS_H_ + +#include +#include "mmio_emul.h" + +void net_genmac(struct mmio_devinst *pi, uint8_t *macaddr); +int net_parsemac(char *mac_str, uint8_t *mac_addr); + +#endif /* _NET_UTILS_H_ */ Index: usr.sbin/bhyve/mmio/net_utils.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/net_utils.c @@ -0,0 +1,90 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include + +#include "../arm64/bhyverun.h" +#include "debug.h" +#include "net_utils.h" + +int +net_parsemac(char *mac_str, uint8_t *mac_addr) +{ + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); + + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + EPRINTLN("Invalid MAC %s", mac_str); + return (EINVAL); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } + + return (0); +} + +void +net_genmac(struct mmio_devinst *pi, uint8_t *macaddr) +{ + /* + * The default MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the PCI slot/func number and dev name + */ + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->di_func, vmname); + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, (unsigned int)strlen(nstr)); + MD5Final(digest, &mdctx); + + macaddr[0] = 0x00; + macaddr[1] = 0xa0; + macaddr[2] = 0x98; + macaddr[3] = digest[0]; + macaddr[4] = digest[1]; + macaddr[5] = digest[2]; +} Index: usr.sbin/bhyve/mmio/pl011.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/pl011.c @@ -0,0 +1,384 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2020 Andrew Turner + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include + +#include "mevent.h" +#include "uart_backend.h" +#include "uart_emul.h" + +#define UART_FIFO_SIZE 16 + +#define UARTDR 0x00 +#define UARTDR_RSR_SHIFT 8 + +#define UARTRSR 0x01 +#define UARTRSR_OE (1 << 3) + +#define UARTFR 0x06 +#define UARTFR_TXFE (1 << 7) +#define UARTFR_RXFF (1 << 6) +#define UARTFR_TXFF (1 << 5) +#define UARTFR_RXFE (1 << 4) + +#define UARTRTINTR (1 << 6) +#define UARTTXINTR (1 << 5) +#define UARTRXINTR (1 << 4) + +#define UARTIBRD 0x09 + +#define UARTFBRD 0x0a +#define UARTFBRD_MASK 0x003f + +#define UARTLCR_H 0x0b +#define UARTLCR_H_MASK 0x00ff +#define UARTLCR_H_FEN (1 << 4) + +#define UARTCR 0x0c +/* TODO: Check the flags in the UARTCR register */ +#define UARTCR_MASK 0xffc7 +#define UARTCR_LBE (1 << 7) + +#define UARTIFLS 0x0d +#define UARTIFLS_MASK 0x003f +#define UARTIFLS_RXIFLSEL(x) (((x) >> 3) & 0x7) +#define UARTIFLS_TXIFLSEL(x) (((x) >> 0) & 0x7) + +#define UARTIMSC 0x0e +#define UARTIMSC_MASK 0x07ff + +#define UARTRIS 0x0f +#define UARTMIS 0x10 + +#define UARTICR 0x11 + + +#define UARTPeriphID 0x00241011 +#define UARTPeriphID0 0x3f8 +#define UARTPeriphID0_VAL (((UARTPeriphID) >> 0) & 0xff) +#define UARTPeriphID1 0x3f9 +#define UARTPeriphID1_VAL (((UARTPeriphID) >> 8) & 0xff) +#define UARTPeriphID2 0x3fa +#define UARTPeriphID2_VAL (((UARTPeriphID) >> 16) & 0xff) +#define UARTPeriphID3 0x3fb +#define UARTPeriphID3_VAL (((UARTPeriphID) >> 24) & 0xff) + +#define UARTPCellID 0xb105f00d +#define UARTPCellID0 0x3fc +#define UARTPCellID0_VAL (((UARTPCellID) >> 0) & 0xff) +#define UARTPCellID1 0x3fd +#define UARTPCellID1_VAL (((UARTPCellID) >> 8) & 0xff) +#define UARTPCellID2 0x3fe +#define UARTPCellID2_VAL (((UARTPCellID) >> 16) & 0xff) +#define UARTPCellID3 0x3ff +#define UARTPCellID3_VAL (((UARTPCellID) >> 24) & 0xff) + +static void +uart_reset(struct uart_softc *sc) +{ + + sc->ifls = 0x12; + + /* no fifo until enabled by software */ + uart_rxfifo_reset(sc->backend, 1); +} + +static int +uart_rx_trigger_level(struct uart_softc *sc) +{ + + /* If the FIFO is disabled trigger when we have any data */ + if ((sc->lcr_h & UARTLCR_H_FEN) != 0) + return (1); + + /* Trigger base on how full the 
fifo is */ + switch(UARTIFLS_RXIFLSEL(sc->ifls)) { + case 0: + return (UART_FIFO_SIZE / 8); + case 1: + return (UART_FIFO_SIZE / 4); + case 2: + return (UART_FIFO_SIZE / 2); + case 3: + return (UART_FIFO_SIZE * 3 / 4); + case 4: + return (UART_FIFO_SIZE * 7 / 8); + default: + /* TODO: Find out what happens in this case */ + return (UART_FIFO_SIZE); + } +} + +static void +uart_toggle_intr(struct uart_softc *sc) +{ + if ((sc->irq_state & sc->imsc) == 0) + (*sc->intr_deassert)(sc->arg, sc->irqno); + else + (*sc->intr_assert)(sc->arg, sc->irqno); +} + +static void +uart_drain(int fd, enum ev_type ev, void *arg) +{ + struct uart_softc *sc; + int old_size, trig_lvl; + bool loopback; + + sc = arg; + + assert(ev == EVF_READ); + + /* + * This routine is called in the context of the mevent thread + * to take out the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + old_size = uart_rxfifo_numchars(sc->backend); + + loopback = (sc->cr & UARTCR_LBE) != 0; + uart_rxfifo_drain(sc->backend, loopback); + + /* If we cross the trigger level raise UARTRXINTR */ + trig_lvl = uart_rx_trigger_level(sc); + if (old_size < trig_lvl && + uart_rxfifo_numchars(sc->backend) >= trig_lvl) + sc->irq_state |= UARTRXINTR; + + if (uart_rxfifo_numchars(sc->backend) > 0) + sc->irq_state |= UARTRTINTR; + if (!loopback) + uart_toggle_intr(sc); + + pthread_mutex_unlock(&sc->mtx); +} + +void +uart_write(struct uart_softc *sc, int offset, uint32_t value) +{ + bool loopback; + + pthread_mutex_lock(&sc->mtx); + switch (offset) { + case UARTDR: + loopback = (sc->cr & UARTCR_LBE) != 0; + if (!uart_rxfifo_write(sc->backend, loopback, value & 0xff)) + sc->rsr |= UARTRSR_OE; + + /* We don't have a TX fifo, so trigger when we have data */ + sc->irq_state |= UARTTXINTR; + break; + case UARTRSR: + /* Any write clears this register */ + sc->rsr = 0; + break; + case UARTFR: + /* UARTFR is a read-only register */ + break; + /* TODO: UARTILPR */ + case UARTIBRD: + sc->ibrd = value; + break; + case UARTFBRD: + sc->fbrd = value & UARTFBRD_MASK; + break; + case UARTLCR_H: + /* Check if the FIFO enable bit changed */ + if (((sc->lcr_h ^ value) & UARTLCR_H_FEN) != 0) { + if ((value & UARTLCR_H_FEN) != 0) { + uart_rxfifo_reset(sc->backend, UART_FIFO_SIZE); + } else { + uart_rxfifo_reset(sc->backend, 1); + } + } + sc->lcr_h = value & UARTLCR_H_MASK; + break; + case UARTCR: + sc->cr = value & UARTCR_MASK; + break; + case UARTIFLS: + sc->ifls = value & UARTCR_MASK; + break; + case UARTIMSC: + sc->imsc = value & UARTIMSC_MASK; + break; + case UARTRIS: + case UARTMIS: + /* UARTRIS and UARTMIS are read-only registers */ + break; + case UARTICR: + sc->irq_state &= ~value; + break; + default: + /* Ignore writes to unassigned/ID registers */ + break; + } + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); +} + +uint32_t +uart_read(struct uart_softc *sc, int offset) +{ + uint32_t reg; + int fifo_sz; + + reg = 0; + pthread_mutex_lock(&sc->mtx); + switch(offset) { + case UARTDR: + reg = uart_rxfifo_getchar(sc->backend); + /* Deassert the irq if below the trigger level */ + fifo_sz = uart_rxfifo_numchars(sc->backend); + if (fifo_sz < uart_rx_trigger_level(sc)) + sc->irq_state &= ~UARTRXINTR; + if (fifo_sz == 0) + sc->irq_state &= ~UARTRTINTR; + + reg |= sc->rsr << UARTDR_RSR_SHIFT; + + /* After reading from the fifo there is now space in it */ + sc->rsr &= UARTRSR_OE; + break; + case UARTRSR: + /* Any write clears this register */ + reg = sc->rsr; + break; + case UARTFR: + /* Transmit is 
intstant, so the fifo is always empty */ + reg = UARTFR_TXFE; + + /* Set the receive fifo full/empty flags */ + fifo_sz = uart_rxfifo_numchars(sc->backend); + if (fifo_sz == UART_FIFO_SIZE) + reg |= UARTFR_RXFF; + else if (fifo_sz == 0) + reg |= UARTFR_RXFE; + break; + /* TODO: UARTILPR */ + case UARTIBRD: + reg = sc->ibrd; + break; + case UARTFBRD: + reg = sc->fbrd; + break; + case UARTLCR_H: + reg = sc->lcr_h; + break; + case UARTCR: + reg = sc->cr; + break; + case UARTIMSC: + reg = sc->imsc; + break; + case UARTRIS: + reg = sc->irq_state; + break; + case UARTMIS: + reg = sc->irq_state & sc->imsc; + break; + case UARTICR: + reg = 0; + break; + case UARTPeriphID0: + reg = UARTPeriphID0_VAL; + break; + case UARTPeriphID1: + reg =UARTPeriphID1_VAL; + break; + case UARTPeriphID2: + reg = UARTPeriphID2_VAL; + break; + case UARTPeriphID3: + reg = UARTPeriphID3_VAL; + break; + case UARTPCellID0: + reg = UARTPCellID0_VAL; + break; + case UARTPCellID1: + reg = UARTPCellID1_VAL; + break; + case UARTPCellID2: + reg = UARTPCellID2_VAL; + break; + case UARTPCellID3: + reg = UARTPCellID3_VAL; + break; + default: + /* Return 0 in reads from unasigned registers */ + reg = 0; + break; + } + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); + + return (reg); +} + +struct uart_softc * +uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, + void *arg) +{ + struct uart_softc *sc; + + sc = calloc(1, sizeof(struct uart_softc)); + + sc->arg = arg; + sc->intr_assert = intr_assert; + sc->intr_deassert = intr_deassert; + sc->backend = uart_backend_alloc(); + + pthread_mutex_init(&sc->mtx, NULL); + + uart_reset(sc); + + return (sc); +} + +int +uart_set_backend(struct uart_softc *sc, const char *opts) +{ + int retval; + + retval = uart_backend_open(sc->backend, opts, uart_drain, sc); + return (retval); +} + Index: usr.sbin/bhyve/mmio/uart_backend.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/uart_backend.h @@ -0,0 +1,46 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _UART_BACKEND_H_ +#define _UART_BACKEND_H_ + +struct uart_backend; + +struct uart_backend *uart_backend_alloc(void); +int uart_backend_open(struct uart_backend *b, const char *opts, + void (*func)(int, enum ev_type, void *), void *arg); + +void uart_rxfifo_reset(struct uart_backend *b, int size); +int uart_rxfifo_getchar(struct uart_backend *b); +int uart_rxfifo_numchars(struct uart_backend *b); +void uart_rxfifo_drain(struct uart_backend *b, bool loopback); +bool uart_rxfifo_write(struct uart_backend *b, bool loopback, uint8_t ch); + +#endif Index: usr.sbin/bhyve/mmio/uart_backend.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/uart_backend.c @@ -0,0 +1,351 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mevent.h" +#include "uart_emul.h" +#include "debug.h" + +#define FIFOSZ 16 + +static bool uart_stdio; /* stdio in use for i/o */ +static struct termios tio_stdio_orig; + +struct fifo { + uint8_t buf[FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of characters in the fifo */ + int size; /* size of the fifo */ +}; + +struct ttyfd { + bool opened; + int rfd; /* fd for reading */ + int wfd; /* fd for writing, may be == rfd */ +}; + +struct uart_backend { + struct fifo rxfifo; + struct mevent *mev; + struct ttyfd tty; +}; + +static void +ttyclose(void) +{ + + tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); +} + +static void +ttyopen(struct ttyfd *tf) +{ + struct termios orig, new; + + tcgetattr(tf->rfd, &orig); + new = orig; + cfmakeraw(&new); + new.c_cflag |= CLOCAL; + tcsetattr(tf->rfd, TCSANOW, &new); + if (uart_stdio) { + tio_stdio_orig = orig; + atexit(ttyclose); + } + raw_stdio = 1; +} + +static int +ttyread(struct ttyfd *tf) +{ + unsigned char rb; + + if (read(tf->rfd, &rb, 1) == 1) + return (rb); + else + return (-1); +} + +static void +ttywrite(struct ttyfd *tf, unsigned char wb) +{ + (void)write(tf->wfd, &wb, 1); +} + +void +uart_rxfifo_reset(struct uart_backend *sc, int size) +{ + char flushbuf[32]; + struct fifo *fifo; + ssize_t nread; + int error; + + fifo = &sc->rxfifo; + bzero(fifo, sizeof(struct fifo)); + fifo->size = size; + + if (sc->tty.opened) { + /* + * Flush any unread input from the tty buffer. + */ + while (1) { + nread = read(sc->tty.rfd, flushbuf, sizeof(flushbuf)); + if (nread != sizeof(flushbuf)) + break; + } + + /* + * Enable mevent to trigger when new characters are available + * on the tty fd. + */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +} + +static int +rxfifo_available(struct uart_backend *sc) +{ + struct fifo *fifo; + + fifo = &sc->rxfifo; + return (fifo->num < fifo->size); +} + +static int +rxfifo_putchar(struct uart_backend *sc, uint8_t ch) +{ + struct fifo *fifo; + int error; + + fifo = &sc->rxfifo; + + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = ch; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + if (!rxfifo_available(sc)) { + if (sc->tty.opened) { + /* + * Disable mevent callback if the FIFO is full. 
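It is re-enabled from uart_rxfifo_getchar() once a character has been drained from a previously full FIFO, and from uart_rxfifo_reset(), so the tty is only polled while there is room to store more input.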
+ */ + error = mevent_disable(sc->mev); + assert(error == 0); + } + } + return (0); + } else + return (-1); +} + +int +uart_rxfifo_getchar(struct uart_backend *sc) +{ + struct fifo *fifo; + int c, error, wasfull; + + wasfull = 0; + fifo = &sc->rxfifo; + if (fifo->num > 0) { + if (!rxfifo_available(sc)) + wasfull = 1; + c = fifo->buf[fifo->rindex]; + fifo->rindex = (fifo->rindex + 1) % fifo->size; + fifo->num--; + if (wasfull) { + if (sc->tty.opened) { + error = mevent_enable(sc->mev); + assert(error == 0); + } + } + return (c); + } else + return (-1); +} + +int +uart_rxfifo_numchars(struct uart_backend *sc) +{ + struct fifo *fifo = &sc->rxfifo; + + return (fifo->num); +} + +void +uart_rxfifo_drain(struct uart_backend *b, bool loopback) +{ + int ch; + + if (loopback) { + (void) ttyread(&b->tty); + } else { + while (rxfifo_available(b) && + ((ch = ttyread(&b->tty)) != -1)) { + rxfifo_putchar(b, ch); + } + } +} + +bool +uart_rxfifo_write(struct uart_backend *b, bool loopback, uint8_t ch) +{ + if (loopback) { + if (rxfifo_putchar(b, ch) != 0) + return (false); + } else if (b->tty.opened) { + ttywrite(&b->tty, ch); + } /* else drop on floor */ + + return (true); +} + +static void +uart_opentty(struct uart_backend *sc, void (*func)(int, enum ev_type, void *), + void *arg) +{ + ttyopen(&sc->tty); + sc->mev = mevent_add(sc->tty.rfd, EVF_READ, func, arg); + assert(sc->mev != NULL); +} + +static int +uart_stdio_backend(struct uart_backend *sc) +{ +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; +#endif + + if (uart_stdio) + return (-1); + + sc->tty.rfd = STDIN_FILENO; + sc->tty.wfd = STDOUT_FILENO; + sc->tty.opened = true; + + if (fcntl(sc->tty.rfd, F_SETFL, O_NONBLOCK) != 0) + return (-1); + if (fcntl(sc->tty.wfd, F_SETFL, O_NONBLOCK) != 0) + return (-1); + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ); + if (caph_rights_limit(sc->tty.rfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(sc->tty.rfd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + uart_stdio = true; + + return (0); +} + +static int +uart_tty_backend(struct uart_backend *sc, const char *opts) +{ +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; +#endif + int fd; + + fd = open(opts, O_RDWR | O_NONBLOCK); + if (fd < 0) + return (-1); + + if (!isatty(fd)) { + close(fd); + return (-1); + } + + sc->tty.rfd = sc->tty.wfd = fd; + sc->tty.opened = true; + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + return (0); +} + +struct uart_backend * +uart_backend_alloc(void) +{ + struct uart_backend *b; + + b = calloc(1, sizeof(struct uart_backend)); + assert(b != NULL); + + return (b); +} + +int +uart_backend_open(struct uart_backend *b, const char *opts, + void (*func)(int, enum ev_type, void *), void *arg) +{ + int retval; + + if (opts == NULL) + return (0); + + if (strcmp("stdio", opts) == 0) + retval = uart_stdio_backend(b); + else + retval = uart_tty_backend(b, opts); + if (retval == 0) + uart_opentty(b, func, arg); + + return (retval); +} Index: usr.sbin/bhyve/mmio/uart_emul.h 
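For reference, a minimal sketch of how a device model could consume this UART API. The uart_attach() wrapper, the spi argument and the use of the vmctx as the interrupt-callback argument are illustrative assumptions, not part of this change; only uart_init(), uart_set_backend(), uart_read()/uart_write() and vm_assert_irq()/vm_deassert_irq() come from the patch itself.

#include <sys/types.h>
#include <stdint.h>
#include <pthread.h>
#include <err.h>

#include <vmmapi.h>

#include "uart_emul.h"

static void
uart_intr_assert(void *arg, uint32_t irq)
{
	/* Assumption: the opaque callback argument is the guest's vmctx. */
	vm_assert_irq((struct vmctx *)arg, irq, 0);
}

static void
uart_intr_deassert(void *arg, uint32_t irq)
{
	vm_deassert_irq((struct vmctx *)arg, irq, 0);
}

/* Hypothetical helper: create a PL011 instance and hook up a backend. */
static struct uart_softc *
uart_attach(struct vmctx *ctx, uint32_t spi, const char *opts)
{
	struct uart_softc *sc;

	sc = uart_init(uart_intr_assert, uart_intr_deassert, ctx);
	sc->irqno = spi;
	if (uart_set_backend(sc, opts) != 0)
		errx(1, "cannot open UART backend '%s'", opts);
	return (sc);
}

Guest accesses to the PL011 window would then be forwarded to uart_read()/uart_write(); note that the register defines above are word indices, i.e. the byte offset divided by four.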
=================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/uart_emul.h @@ -0,0 +1,66 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _UART_EMUL_H_ +#define _UART_EMUL_H_ + +typedef void (*uart_intr_func_t)(void *arg, uint32_t irq); + +struct uart_softc { + struct uart_backend *backend; + pthread_mutex_t mtx; /* protects all softc elements */ + + uint16_t irq_state; + + uint16_t rsr; + + uint16_t cr; + uint16_t ifls; + uint16_t imsc; + uint16_t lcr_h; + + uint16_t ibrd; + uint16_t fbrd; + + void *arg; + uint32_t irqno; + uart_intr_func_t intr_assert; + uart_intr_func_t intr_deassert; +}; + + +struct uart_softc *uart_init(uart_intr_func_t intr_assert, + uart_intr_func_t intr_deassert, void *arg); + +int uart_legacy_alloc(int unit, int *ioaddr, int *irq); +uint32_t uart_read(struct uart_softc *sc, int offset); +void uart_write(struct uart_softc *sc, int offset, uint32_t value); +int uart_set_backend(struct uart_softc *sc, const char *opt); +#endif Index: usr.sbin/bhyve/pci_ahci.c =================================================================== --- usr.sbin/bhyve/pci_ahci.c +++ usr.sbin/bhyve/pci_ahci.c @@ -63,6 +63,10 @@ #include "pci_emul.h" #include "ahci.h" #include "block_if.h" +#include "bhyverun.h" +#include "pci_emul.h" +#include "ahci.h" +#include "block_if.h" #define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define MAX_PORTS 32 /* AHCI supports 32 ports */ Index: usr.sbin/bhyve/pci_virtio_net.c =================================================================== --- usr.sbin/bhyve/pci_virtio_net.c +++ usr.sbin/bhyve/pci_virtio_net.c @@ -117,7 +117,7 @@ int resetting; /* protected by tx_mtx */ uint64_t vsc_features; /* negotiated features */ - + pthread_mutex_t rx_mtx; int rx_merge; /* merged rx bufs in use */ Index: usr.sbin/bhyvectl/Makefile =================================================================== --- usr.sbin/bhyvectl/Makefile +++ usr.sbin/bhyvectl/Makefile @@ -5,16 +5,13 @@ .include PROG= bhyvectl -SRCS= bhyvectl.c PACKAGE= bhyve -MAN= bhyvectl.8 - LIBADD= vmmapi util WARNS?= 3 -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm +CFLAGS+= 
-I${SRCTOP}/sys/${MACHINE}/vmm .if ${MK_BHYVE_SNAPSHOT} != "no" CFLAGS+= -DBHYVE_SNAPSHOT @@ -24,4 +21,5 @@ CFLAGS+= -I${SRCTOP}/usr.sbin/bhyve .endif +.include "${.CURDIR}/${MACHINE}/Makefile.inc" .include Index: usr.sbin/bhyvectl/amd64/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyvectl/amd64/Makefile.inc @@ -0,0 +1,7 @@ +# +# $FreeBSD$ +# +.PATH: ${.CURDIR}/amd64 + +SRCS= bhyvectl.c +MAN= bhyvectl.8 Index: usr.sbin/bhyvectl/arm64/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyvectl/arm64/Makefile.inc @@ -0,0 +1,7 @@ +# +# $FreeBSD$ +# +.PATH: ${.CURDIR}/arm64 + +SRCS= bhyvectl.c +MAN= bhyvectl.8 Index: usr.sbin/bhyvectl/arm64/bhyvectl.8 =================================================================== --- /dev/null +++ usr.sbin/bhyvectl/arm64/bhyvectl.8 @@ -0,0 +1,97 @@ +.\" Copyright (c) 2015 Christian Brueffer +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd November 13, 2016 +.Dt BHYVECTL 8 +.Os +.Sh NAME +.Nm bhyvectl +.Nd "control utility for bhyve instances" +.Sh SYNOPSIS +.Nm +.Fl -vm= Ns Ar +.Op Fl -create +.Op Fl -destroy +.Op Fl -get-stats +.Op Fl -inject-nmi +.Op Fl -force-reset +.Op Fl -force-poweroff +.Sh DESCRIPTION +The +.Nm +command is a control utility for active +.Xr bhyve 8 +virtual machine instances. +.Pp +.Em Note : +Most +.Nm +flags are intended for querying and setting the state of an active instance. +These commands are intended for development purposes, and are not documented here. +A complete list can be obtained by executing +.Nm +without any arguments. +.Pp +The user-facing options are as follows: +.Bl -tag -width ".Fl d Ar argument" +.It Fl -vm= Ns Ar +Operate on the virtual machine +.Ar . +.It Fl -create +Create the specified VM. +.It Fl -destroy +Destroy the specified VM. +.It Fl -get-stats +Retrieve statistics for the specified VM. +.It Fl -inject-nmi +Inject a non-maskable interrupt (NMI) into the VM. +.It Fl -force-reset +Force the VM to reset. +.It Fl -force-poweroff +Force the VM to power off. 
+.El +.Sh EXIT STATUS +.Ex -std +.Sh EXAMPLES +Destroy the VM called fbsd10: +.Pp +.Dl "bhyvectl --vm=fbsd10 --destroy" +.Sh SEE ALSO +.Xr bhyve 8 , +.Xr bhyveload 8 +.Sh HISTORY +The +.Nm +command first appeared in +.Fx 10.1 . +.Sh AUTHORS +.An -nosplit +The +.Nm +utility was written by +.An Peter Grehan +and +.An Neel Natu . Index: usr.sbin/bhyvectl/arm64/bhyvectl.c =================================================================== --- /dev/null +++ usr.sbin/bhyvectl/arm64/bhyvectl.c @@ -0,0 +1,143 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#define MB (1UL << 20) +#define GB (1UL << 30) + +#define REQ_ARG required_argument +#define NO_ARG no_argument +#define OPT_ARG optional_argument + +#define eprintf(fmt, ...) 
printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__) + +static const char *progname; + +static void +usage() +{ + + (void)fprintf(stderr, + "Usage: %s --vm=\n" + " %*s [--destroy]\n", + progname, (int)strlen(progname), ""); + exit(1); +} + +static int create; +static int destroy; + +enum { + VMNAME = 1000, /* avoid collision with return values from getopt */ +}; + +const struct option opts[] = { + { "vm", REQ_ARG, NULL, VMNAME }, + { "destroy", NO_ARG, &destroy, 1 }, + { NULL, 0, NULL, 1 }, +}; + +int +main(int argc, char *argv[]) +{ + char *vmname; + int error, ch; + struct vmctx *ctx; + + vmname = NULL; + progname = basename(argv[0]); + + while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { + switch (ch) { + case 0: + break; + case VMNAME: + vmname = optarg; + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + + if (vmname == NULL) + usage(); + + error = 0; + if (!error && create) + error = vm_create(vmname); + if (!error) { + ctx = vm_open(vmname); + if (ctx == NULL) { + printf("VM:%s is not created.\n", vmname); + exit(1); + } + } + + + if (error) + printf("errno = %d\n", errno); + + if (!error && destroy) + vm_destroy(ctx); + + exit(error); +} Index: usr.sbin/bhyveload/Makefile =================================================================== --- usr.sbin/bhyveload/Makefile +++ usr.sbin/bhyveload/Makefile @@ -1,14 +1,17 @@ # $FreeBSD$ PROG= bhyveload -SRCS= bhyveload.c -MAN= bhyveload.8 PACKAGE= bhyve +BHYVELOAD_SYSDIR?=${SRCTOP} +BHYVELOAD_SRCTOP?=${.CURDIR} + LIBADD= vmmapi WARNS?= 3 CFLAGS+=-I${SRCTOP}/stand/userboot +.include "${BHYVELOAD_SRCTOP}/${MACHINE}/Makefile.inc" + .include Index: usr.sbin/bhyveload/amd64/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyveload/amd64/Makefile.inc @@ -0,0 +1,7 @@ +# $FreeBSD$ +.PATH: ${BHYVELOAD_SRCTOP}/amd64/ + +SRCS= bhyveload.c +MAN= bhyveload.8 + +CFLAGS+=-I${SRCTOP}/sys/boot/userboot Index: usr.sbin/bhyveload/arm64/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyveload/arm64/Makefile.inc @@ -0,0 +1,13 @@ +# $FreeBSD$ +LIBADD+= util + +.PATH: ${BHYVELOAD_SRCTOP}/arm64/ + +SRCS= bhyveload.c \ + boot.c + +.PATH: ${.CURDIR}/../../sys/arm64/vmm + +CFLAGS += -I${.CURDIR}/../../stand/common + +MK_MAN=no Index: usr.sbin/bhyveload/arm64/bhyveload.c =================================================================== --- /dev/null +++ usr.sbin/bhyveload/arm64/bhyveload.c @@ -0,0 +1,470 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "boot.h" + +#define gvatovm(addr) ((uint64_t)(addr) - KERNBASE + \ + kernel_load_address - memory_base_address) +#define overlap(x_start, x_end, y_start, y_end) \ + ((x_start) >= (y_start) && (x_start) < (y_end) || \ + (x_end) >= (y_start) && (x_end) < (y_end)) + +#define MB (1024 * 1024UL) +#define BSP 0 +#define KERNEL_IMAGE_NAME_LEN 256 + +#define GIC_V3_DIST_START 0x2f000000UL +#define GIC_V3_DIST_SIZE 0x10000UL +#define GIC_V3_REDIST_START 0x2f100000UL +#define GIC_V3_REDIST_SIZE 0x200000UL + +struct env { + const char *str; + SLIST_ENTRY(env) next; +}; +static SLIST_HEAD(envhead, env) envhead; + +static uint64_t memory_base_address, kernel_load_address; + +static char *vmname, *progname; +static struct vmctx *ctx; + +static int +env_add(const char *str) +{ + struct env *env; + + env = malloc(sizeof(*env)); + if (env == NULL) + return (ENOMEM); + env->str = str; + SLIST_INSERT_HEAD(&envhead, env, next); + + return (0); +} + +static int +env_tostr(char **envstrp, int *envlen) +{ + struct env *env; + int i; + + *envlen = 0; + SLIST_FOREACH(env, &envhead, next) + *envlen = *envlen + strlen(env->str) + 1; + /* Make room for the two terminating zeroes */ + if (*envlen == 0) + *envlen = 2; + else + (*envlen)++; + + *envstrp = malloc(*envlen * sizeof(char)); + if (*envstrp == NULL) + return (ENOMEM); + + i = 0; + SLIST_FOREACH(env, &envhead, next) { + strncpy(*envstrp + i, env->str, strlen(env->str)); + i += strlen(env->str); + (*envstrp)[i++] = 0; + } + (*envstrp)[i] = 0; + + /* + * At this point we have envstr[0] == 0 if the environment is empty. + * Add the second 0 to properly terminate the environment string. + */ + if (SLIST_EMPTY(&envhead)) + (*envstrp)[1] = 0; + + /* + for (i = 0; i < *envlen; i++) + printf("%d ", (int)(*envstrp)[i]); + printf("\n"); + */ + + return (0); +} + +/* + * Guest virtual machinee + */ +static int +guest_copyin(const void *from, uint64_t to, size_t size) +{ + char *ptr; + ptr = vm_map_ipa(ctx, to, size); + if (ptr == NULL) + return (EFAULT); + + memcpy(ptr, from, size); + return (0); +} + +static int +guest_copyout(uint64_t from, void *to, size_t size) +{ + char *ptr; + + ptr = vm_map_ipa(ctx, from, size); + if (ptr == NULL) + return (EFAULT); + + memcpy(to, ptr, size); + return (0); +} + +static void +guest_setreg(enum vm_reg_name vmreg, uint64_t v) +{ + int error; + + error = vm_set_register(ctx, BSP, vmreg, v); + if (error) + perror("vm_set_register"); +} + +#if 0 +static int +parse_memsize(const char *optarg, size_t *ret_memsize) +{ + char *endptr; + size_t optval; + int error; + + optval = strtoul(optarg, &endptr, 0); + if (*optarg != '\0' && *endptr == '\0') { + /* Memory size must be at least one megabyte. 
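A bare number smaller than one megabyte is therefore interpreted as a count of megabytes, while suffixed values such as 512m or 4g fall through to expand_number(3).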
*/ + if (optval < MB) + optval = optval * MB; + *ret_memsize = optval; + error = 0; + } else { + error = expand_number(optarg, ret_memsize); + } + + return (error); +} +#endif + +static void +usage(int code) +{ + fprintf(stderr, + "Usage: %s [-h] [-k ] [-d dtb-offset] [-t ] [-e ] [-b base-address]\n" + " %*s [-m mem-size] [-l load-address] \n" + " -k: path to guest kernel image\n" + " -d: where to load the device tree, an offset from the start of the kernel address\n" + " -t: path to guest device tree file\n" + " -e: guest boot environment\n" + " -b: memory base address\n" + " -m: memory size\n" + " -l: kernel load address in the guest physical memory\n" + " -h: help\n", + progname, (int)strlen(progname), ""); + exit(code); +} + +int +main(int argc, char** argv) +{ + struct vm_bootparams bootparams; + uint64_t mem_size; + int opt, error; + int kernel_image_fd, dtb_fd; + uint64_t periphbase; + char kernel_image_name[KERNEL_IMAGE_NAME_LEN]; + char device_tree_name[KERNEL_IMAGE_NAME_LEN]; + struct stat st, dtb_st; + void *addr, *dtb_addr; + char *envstr; + int envlen; + uint64_t dtb_address = 0x0; + bool dtb_address_is_offset = false; + bool use_dtb_file = false; + + progname = basename(argv[0]); + + mem_size = 128 * MB; + memory_base_address = VM_GUEST_BASE_IPA; + kernel_load_address = memory_base_address; + periphbase = 0x2c000000UL; + strncpy(kernel_image_name, "kernel.bin", KERNEL_IMAGE_NAME_LEN); + memset(&bootparams, 0, sizeof(struct vm_bootparams)); + + while ((opt = getopt(argc, argv, "hk:l:b:m:e:d:t:")) != -1) { + switch (opt) { + case 't': + strncpy(device_tree_name, optarg, KERNEL_IMAGE_NAME_LEN); + use_dtb_file = true; + break; + case 'd': + dtb_address = strtoul(optarg, NULL, 0); + dtb_address_is_offset = true; + break; + case 'k': + strncpy(kernel_image_name, optarg, KERNEL_IMAGE_NAME_LEN); + break; + case 'l': + kernel_load_address = strtoul(optarg, NULL, 0); + break; + case 'b': + memory_base_address = strtoul(optarg, NULL, 0); + break; + case 'm': + error = vm_parse_memsize(optarg, &mem_size); + if (error) { + fprintf(stderr, "Invalid memsize '%s'\n", optarg); + exit(1); + } + break; + case 'e': + error = env_add(optarg); + if (error) { + perror("env_add"); + exit(1); + } + break; + case 'h': + usage(0); + default: + fprintf(stderr, "Unknown argument '%c'\n", opt); + usage(1); + } + } + + argc -= optind; + argv += optind; + + if (argc != 1) { + fprintf(stderr, "Missing or unknown arguments\n"); + usage(1); + } + + if (kernel_load_address < memory_base_address) { + fprintf(stderr, "Kernel load address is below memory base address\n"); + exit(1); + } + + vmname = argv[0]; + + kernel_image_fd = open(kernel_image_name, O_RDONLY); + if (kernel_image_fd == -1) { + perror("open kernel_image_name"); + exit(1); + } + + error = vm_create(vmname); + if (error) { + perror("vm_create"); + exit(1); + } + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(1); + } + + error = vm_setup_memory(ctx, memory_base_address, mem_size, VM_MMAP_ALL); + if (error) { + perror("vm_setup_memory"); + exit(1); + } + + error = fstat(kernel_image_fd, &st); + if (error) { + perror("fstat"); + exit(1); + } + + if ((uint64_t)st.st_size > mem_size) { + fprintf(stderr, "Kernel image larger than memory size\n"); + exit(1); + } + if (kernel_load_address + st.st_size >= memory_base_address + mem_size) { + fprintf(stderr, "Kernel image out of bounds of guest memory\n"); + exit(1); + } + + addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, kernel_image_fd, 0); + if (addr == MAP_FAILED) { + 
perror("mmap kernel_image_fd"); + exit(1); + } + + if (guest_copyin(addr, kernel_load_address - memory_base_address, st.st_size) != 0) { + perror("guest_copyin"); + exit(1); + } + + error = env_tostr(&envstr, &envlen); + if (error) { + perror("parse boot environment\n"); + exit(1); + } + + bootparams.envstr = envstr; + bootparams.envlen = envlen; + error = parse_kernel(addr, st.st_size, ctx, &bootparams); + if (error) { + fprintf(stderr, "Error parsing image\n"); + exit(1); + } + + if (dtb_address == 0) + dtb_address = kernel_load_address + st.st_size; + else if (dtb_address_is_offset) + dtb_address += kernel_load_address; + + if (use_dtb_file) { + dtb_fd = open(device_tree_name, O_RDONLY); + if (dtb_fd == -1) { + perror("open device_tree_name"); + exit(1); + } + + error = fstat(dtb_fd, &dtb_st); + if (error) { + perror("fstat"); + exit(1); + } + + dtb_addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, dtb_fd, 0); + if (dtb_addr == MAP_FAILED) { + perror("mmap dtb_fd"); + exit(1); + } + + if (guest_copyin(dtb_addr, dtb_address, dtb_st.st_size) != 0) { + perror("guest_copyin"); + exit(1); + } + + free(bootparams.modulep); + + bootparams.modulep = calloc(1, dtb_st.st_size); + if (bootparams.modulep == NULL) { + perror("calloc"); + return (ENOMEM); + } + + memcpy(bootparams.modulep, dtb_addr, dtb_st.st_size); + + bootparams.module_len = dtb_st.st_size; + } + + /* + fprintf(stderr, "bootparams.envp_gva = 0x%016lx\n", bootparams.envp_gva); + fprintf(stderr, "gvatom(bootparams.envp_gva) = 0x%016lx\n", gvatovm(bootparams.envp_gva)); + fprintf(stderr, "vm_map_ipa() = 0x%016lx\n", (uint64_t)vm_map_ipa(ctx, gvatovm(bootparams.envp_gva), PAGE_SIZE)); + fprintf(stderr, "\n"); + + fprintf(stderr, "bootparams.mudulep_gva = 0x%016lx\n", bootparams.modulep_gva); + fprintf(stderr, "gvatom(bootparams.modulep_gva) = 0x%016lx\n", gvatovm(bootparams.modulep_gva)); + fprintf(stderr, "vm_map_ipa() = 0x%016lx\n", (uint64_t)vm_map_ipa(ctx, gvatovm(bootparams.modulep_gva), PAGE_SIZE)); + fprintf(stderr, "\n"); + */ + + /* Copy the environment string in the guest memory */ + if (guest_copyin((void *)envstr, gvatovm(bootparams.envp_gva), envlen) != 0) { + perror("guest_copyin"); + exit(1); + } + + /* Copy the module data in the guest memory */ + if (guest_copyin(bootparams.modulep, gvatovm(bootparams.modulep_gva), bootparams.module_len) != 0) { + perror("guest_copyin"); + exit(1); + } + + uint64_t mem_end = memory_base_address + mem_size; + uint64_t dist_end = GIC_V3_DIST_START + GIC_V3_DIST_SIZE; + uint64_t redist_end = GIC_V3_REDIST_START + GIC_V3_REDIST_SIZE; + + if (overlap(GIC_V3_DIST_SIZE, dist_end, memory_base_address, mem_end)) { + fprintf(stderr, "Guest memory overlaps with VGIC Distributor\n"); + exit(1); + } + + if (overlap(GIC_V3_REDIST_SIZE, redist_end, memory_base_address, mem_end)) { + fprintf(stderr, "Guest memory overlaps with VGIC Redistributor\n"); + exit(1); + } + + error = vm_attach_vgic(ctx, GIC_V3_DIST_START, GIC_V3_DIST_SIZE, + GIC_V3_REDIST_START, GIC_V3_REDIST_SIZE); + if (error) { + fprintf(stderr, "Error attaching VGIC to the virtual machine\n"); + exit(1); + } + + munmap(addr, st.st_size); + if (use_dtb_file) + munmap(dtb_addr, dtb_st.st_size); + + /* TODO: If we want to boot Linux, this entry_off should be not fine + * bootparams.entry_off = 0x80000; + * Based on the Linux ARM64/boot documentation + */ + guest_setreg(VM_REG_ELR_EL2, kernel_load_address + bootparams.entry_off); + guest_setreg(VM_REG_GUEST_X0, bootparams.modulep_gva); + + return 0; +} Index: 
usr.sbin/bhyveload/arm64/boot.h =================================================================== --- /dev/null +++ usr.sbin/bhyveload/arm64/boot.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _BOOT_H_ +#define _BOOT_H_ + + + +struct vm_bootparams { + uint64_t entry_off; + uint64_t modulep_gva; /* Guest virtual address of modulep data */ + uint64_t envp_gva; /* Guest virtual address for env */ + char *envstr; + int envlen; + int module_len; + void *modulep; /* Bhyveload address of modulep data */ +}; + +int parse_kernel(void *addr, size_t img_size, struct vmctx *ctx, + struct vm_bootparams *bootparams); + +#endif Index: usr.sbin/bhyveload/arm64/boot.c =================================================================== --- /dev/null +++ usr.sbin/bhyveload/arm64/boot.c @@ -0,0 +1,622 @@ +/* + * Copyright (C) 2015-2021 Mihai Carabas + * Copyright (C) 2017-2019 Alexandru Elisei + * Copyright (C) 2017-2021 Darius Mihai + * Copyright (C) 2019-2021 Andrei Martin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "boot.h" + +#define gvatou(gva, addr) ((vm_offset_t)(gva) - KERNBASE + (vm_offset_t)(addr)) + +struct elf_file { + Elf_Phdr *ph; + Elf_Ehdr *ehdr; + Elf_Sym *symtab; + Elf_Hashelt *hashtab; + Elf_Hashelt nbuckets; + Elf_Hashelt nchains; + Elf_Hashelt *buckets; + Elf_Hashelt *chains; + Elf_Rel *rel; + size_t relsz; + Elf_Rela *rela; + size_t relasz; + char *strtab; + size_t strsz; + caddr_t firstpage_u; /* Userspace address of mmap'ed guest kernel */ +}; + +static uint64_t parse_image(struct preloaded_file *img, struct elf_file *ef); +static void image_addmetadata(struct preloaded_file *img, int type, + size_t size, void *addr); +static int image_addmodule(struct preloaded_file *img, char *modname, int version); +static void parse_metadata(struct preloaded_file *img, struct elf_file *ef, + Elf_Addr p_startu, Elf_Addr p_endu); +static int lookup_symbol(struct elf_file *ef, const char *name, Elf_Sym *symp); +static struct kernel_module *image_findmodule(struct preloaded_file *img, char *modname, + struct mod_depend *verinfo); +static uint64_t moddata_len(struct preloaded_file *img); +static void moddata_copy(vm_offset_t dest, struct preloaded_file *img); + +static int +load_elf_header(struct elf_file *ef) +{ + Elf_Ehdr *ehdr; + + ehdr = ef->ehdr = (Elf_Ehdr *)ef->firstpage_u; + /* Is it ELF? */ + if (!IS_ELF(*ehdr)) + return (EFTYPE); + + if (ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||/* Layout ? */ + ehdr->e_ident[EI_DATA] != ELF_TARG_DATA || + ehdr->e_ident[EI_VERSION] != EV_CURRENT || /* Version ? */ + ehdr->e_version != EV_CURRENT || + ehdr->e_machine != ELF_TARG_MACH) /* Machine ? 
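ELF_TARG_MACH is EM_AARCH64 for an arm64 build.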
*/ + return (EFTYPE); + + return (0); +} + +static caddr_t +preload_search_by_type(const char *type, caddr_t preload_metadata) +{ + caddr_t curp, lname; + uint32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + lname = NULL; + for (;;) { + hdr = (uint32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* remember the start of each record */ + if (hdr[0] == MODINFO_NAME) + lname = curp; + + /* Search for a MODINFO_TYPE field */ + if ((hdr[0] == MODINFO_TYPE) && + !strcmp(type, curp + sizeof(uint32_t) * 2)) + return(lname); + + /* skip to next field */ + next = sizeof(uint32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +int +parse_kernel(void *addr, size_t img_size, struct vmctx *ctx, + struct vm_bootparams *bootparams) +{ + struct elf_file ef; + struct preloaded_file img; + Elf_Ehdr *ehdr_u; + int err; + vm_offset_t lastaddr_gva; + uint64_t kernend; + uint64_t size; + uint64_t modlen; + int boothowto; + + //fprintf(stderr, "[PARSE_KERNEL]\n\n"); + + memset(&ef, 0, sizeof(struct elf_file)); + memset(&img, 0, sizeof(struct preloaded_file)); + + ef.firstpage_u = (caddr_t)addr; + err = load_elf_header(&ef); + if (err != 0) + return (err); + + ehdr_u = ef.ehdr; + if (ehdr_u->e_type != ET_EXEC) { + fprintf(stderr, "Image not a kernel\n"); + return (EPERM); + } + img.f_name = "elf kernel"; + img.f_type = "elf kernel"; + img.f_size = img_size; + + size = parse_image(&img, &ef); + if (size == 0) + return (ENOEXEC); + bootparams->entry_off = ehdr_u->e_entry - KERNBASE; + + image_addmetadata(&img, MODINFOMD_ELFHDR, sizeof(*ehdr_u), ehdr_u); + + /* XXX: Add boothowto options? */ + boothowto = 0; + image_addmetadata(&img, MODINFOMD_HOWTO, sizeof(boothowto), &boothowto); + + lastaddr_gva = roundup(img.f_addr + img.f_size + 0x3fd000, PAGE_SIZE); + image_addmetadata(&img, MODINFOMD_ENVP, sizeof(lastaddr_gva), &lastaddr_gva); + bootparams->envp_gva = lastaddr_gva; + + lastaddr_gva = roundup(lastaddr_gva + bootparams->envlen, PAGE_SIZE); + /* Module data start in the guest kernel virtual address space */ + bootparams->modulep_gva = lastaddr_gva; + + modlen = moddata_len(&img); + kernend = roundup(bootparams->modulep_gva + modlen, PAGE_SIZE); + image_addmetadata(&img, MODINFOMD_KERNEND, sizeof(kernend), &kernend); + + bootparams->module_len = roundup(modlen, PAGE_SIZE); + bootparams->modulep = calloc(1, bootparams->module_len); + if (bootparams->modulep == NULL) { + perror("calloc"); + return (ENOMEM); + } + + moddata_copy((vm_offset_t)bootparams->modulep, &img); + + return (0); +} + +static uint64_t +parse_image(struct preloaded_file *img, struct elf_file *ef) +{ + Elf_Ehdr *ehdr; + Elf_Phdr *phdr; + Elf_Phdr *php; + Elf_Shdr *shdr; + Elf_Dyn *dp; + Elf_Addr adp; + Elf_Addr ctors; + Elf_Addr ssym, esym; + Elf_Addr p_start, p_end; + Elf_Size size; + Elf_Sym sym; + vm_offset_t firstaddr, lastaddr; + vm_offset_t shstr_addr; + char *shstr; + int symstrindex; + int symtabindex; + size_t chunk_len; + uint64_t ret; + int ndp; + int i; + unsigned int j; + + dp = NULL; + shdr = NULL; + ret = 0; + + ehdr = ef->ehdr; + phdr = (Elf_Phdr *)(ef->firstpage_u + ehdr->e_phoff); + + firstaddr = lastaddr = 0; + for (i = 0; i < ehdr->e_phnum; i++) { + if (phdr[i].p_type != PT_LOAD) + continue; + if (firstaddr == 0 || firstaddr > phdr[i].p_vaddr) + firstaddr = phdr[i].p_vaddr; + /* We mmap'ed the kernel, so p_memsz == p_filesz. 
*/ + if (lastaddr == 0 || lastaddr < (phdr[i].p_vaddr + phdr[i].p_filesz)) + lastaddr = phdr[i].p_vaddr + phdr[i].p_filesz; + } + lastaddr = roundup(lastaddr, sizeof(long)); + + /* + * Get the section headers. We need this for finding the .ctors + * section as well as for loading any symbols. Both may be hard + * to do if reading from a .gz file as it involves seeking. I + * think the rule is going to have to be that you must strip a + * file to remove symbols before gzipping it. + */ + chunk_len = ehdr->e_shnum * ehdr->e_shentsize; + if (chunk_len == 0 || ehdr->e_shoff == 0) + goto nosyms; + shdr = (Elf_Shdr *)(ef->firstpage_u + ehdr->e_shoff); + image_addmetadata(img, MODINFOMD_SHDR, chunk_len, shdr); + + /* + * Read the section string table and look for the .ctors section. + * We need to tell the kernel where it is so that it can call the + * ctors. + */ + chunk_len = shdr[ehdr->e_shstrndx].sh_size; + if (chunk_len > 0) { + shstr_addr = (vm_offset_t)(ef->firstpage_u + \ + shdr[ehdr->e_shstrndx].sh_offset); + shstr = malloc(chunk_len); + memcpy(shstr, (void *)shstr_addr, chunk_len); + for (i = 0; i < ehdr->e_shnum; i++) { + if (strcmp(shstr + shdr[i].sh_name, ".ctors") != 0) + continue; + ctors = shdr[i].sh_addr; + image_addmetadata(img, MODINFOMD_CTORS_ADDR, + sizeof(ctors), &ctors); + size = shdr[i].sh_size; + image_addmetadata(img, MODINFOMD_CTORS_SIZE, + sizeof(size), &size); + break; + } + free(shstr); + } + + /* + * Now load any symbols. + */ + symtabindex = -1; + symstrindex = -1; + for (i = 0; i < ehdr->e_shnum; i++) { + if (shdr[i].sh_type != SHT_SYMTAB) + continue; + for (j = 0; j < ehdr->e_phnum; j++) { + if (phdr[j].p_type != PT_LOAD) + continue; + if (shdr[i].sh_offset >= phdr[j].p_offset && + (shdr[i].sh_offset + shdr[i].sh_size <= + phdr[j].p_offset + phdr[j].p_filesz)) { + shdr[i].sh_offset = 0; + shdr[i].sh_size = 0; + break; + } + } + if (shdr[i].sh_offset == 0 || shdr[i].sh_size == 0) + continue; /* alread loaded in a PT_LOAD above */ + /* Save it for loading below */ + symtabindex = i; + symstrindex = shdr[i].sh_link; + } + if (symtabindex < 0 || symstrindex < 0) + goto nosyms; + + ssym = lastaddr; + i = symtabindex; + for (;;) { + size = shdr[i].sh_size; + lastaddr += sizeof(size); + lastaddr += shdr[i].sh_size; + lastaddr = roundup(lastaddr, sizeof(size)); + + if (i == symtabindex) + i = symstrindex; + else if (i == symstrindex) + break; + } + esym = lastaddr; + + image_addmetadata(img, MODINFOMD_SSYM, sizeof(ssym), &ssym); + image_addmetadata(img, MODINFOMD_ESYM, sizeof(esym), &esym); + +nosyms: + ret = lastaddr - firstaddr; + img->f_addr = firstaddr; + + php = NULL; + for (i = 0; i < ehdr->e_phnum; i++) { + if (phdr[i].p_type == PT_DYNAMIC) { + php = &phdr[i]; + adp = php->p_vaddr; + image_addmetadata(img, MODINFOMD_DYNAMIC, + sizeof(adp), &adp); + break; + } + } + if (php == NULL) + goto out; + ndp = php->p_filesz / sizeof(Elf_Dyn); + if (ndp == 0) + goto out; + + ef->strsz = 0; + dp = (Elf_Dyn *)(ef->firstpage_u + php->p_offset); + for (i = 0; i < ndp; i++) { + if (dp[i].d_tag == 0) + break; + switch(dp[i].d_tag) { + case DT_HASH: + ef->hashtab = (Elf_Hashelt *)(uintptr_t)dp[i].d_un.d_ptr; + break; + case DT_STRTAB: + ef->strtab = (char *)(uintptr_t)dp[i].d_un.d_ptr; + case DT_STRSZ: + ef->strsz = dp[i].d_un.d_val; + break; + case DT_SYMTAB: + ef->symtab = (Elf_Sym *)(uintptr_t)dp[i].d_un.d_ptr; + break; + case DT_REL: + ef->rel = (Elf_Rel *)(uintptr_t)dp[i].d_un.d_ptr; + break; + case DT_RELSZ: + ef->relsz = dp[i].d_un.d_val; + break; + case DT_RELA: + ef->rela 
= (Elf_Rela *)(uintptr_t)dp[i].d_un.d_ptr; + break; + case DT_RELASZ: + ef->relasz = dp[i].d_un.d_val; + break; + } + } + if (ef->hashtab == NULL || ef->symtab == NULL || + ef->strtab == NULL || ef->strsz == 0) + goto out; + + memcpy(&ef->nbuckets, (void *)gvatou(ef->hashtab, ef->firstpage_u), sizeof(ef->nbuckets)); + memcpy(&ef->nchains, (void *)gvatou(ef->hashtab + 1, ef->firstpage_u), sizeof(ef->nchains)); + ef->buckets = (Elf_Hashelt *)gvatou(ef->hashtab + 2, ef->firstpage_u); + ef->chains = ef->buckets + ef->nbuckets; + + if (lookup_symbol(ef, "__start_set_modmetadata_set", &sym) != 0) { + ret = 0; + goto out; + } + p_start = gvatou(sym.st_value, ef->firstpage_u); + if (lookup_symbol(ef, "__stop_set_modmetadata_set", &sym) != 0) { + ret = ENOENT; + goto out; + } + p_end = gvatou(sym.st_value, ef->firstpage_u); + parse_metadata(img, ef, p_start, p_end); + +out: + return ret; +} + +static uint64_t +moddata_len(struct preloaded_file *img) +{ + struct file_metadata *md; + uint64_t len; + + /* Count the kernel image name */ + len = 8 + roundup(strlen(img->f_name) + 1, sizeof(uint64_t)); + /* Count the kernel's type */ + len += 8 + roundup(strlen(img->f_type) + 1, sizeof(uint64_t)); + /* Count the kernel's virtual address */ + len += 8 + roundup(sizeof(img->f_addr), sizeof(uint64_t)); + /* Count the kernel's size */ + len += 8 + roundup(sizeof(img->f_size), sizeof(uint64_t)); + /* Count the metadata size */ + for (md = img->f_metadata; md != NULL; md = md->md_next) + len += 8 + roundup(md->md_size, sizeof(uint64_t)); + + return len; +} + +#define COPY32(dest, what) \ + do { \ + uint32_t w = (what); \ + memcpy((void *)dest, &w, sizeof(w)); \ + dest += sizeof(w); \ + } while (0) + +#define COPY_MODINFO(modinfo, dest, val, len) \ + do { \ + COPY32(dest, modinfo); \ + COPY32(dest, len); \ + memcpy((void *)dest, val, len); \ + dest += roundup(len, sizeof(uint64_t)); \ + } while (0) + +#define COPY_MODEND(dest) \ + do { \ + COPY32(dest, MODINFO_END); \ + COPY32(dest, 0); \ + } while (0); + +static void +moddata_copy(vm_offset_t dest, struct preloaded_file *img) +{ + struct file_metadata *md; + + COPY_MODINFO(MODINFO_NAME, dest, img->f_name, strlen(img->f_name) + 1); + COPY_MODINFO(MODINFO_TYPE, dest, img->f_type, strlen(img->f_type) + 1); + COPY_MODINFO(MODINFO_ADDR, dest, &img->f_addr, sizeof(img->f_addr)); + COPY_MODINFO(MODINFO_SIZE, dest, &img->f_size, sizeof(img->f_size)); + + for (md = img->f_metadata; md != NULL; md = md->md_next) + COPY_MODINFO(MODINFO_METADATA | md->md_type, dest, + md->md_data, md->md_size); + + COPY_MODEND(dest); +} + +static void +image_addmetadata(struct preloaded_file *img, int type, + size_t size, void *addr) +{ + struct file_metadata *md; + + md = malloc(sizeof(struct file_metadata) - sizeof(md->md_data) + size); + md->md_size = size; + md->md_type = type; + memcpy(md->md_data, addr, size); + md->md_next = img->f_metadata; + img->f_metadata = md; +} + +static uint64_t +elf_hash(const char *name) +{ + const unsigned char *p = (const unsigned char *)name; + uint64_t h; + uint64_t g; + + h = 0; + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xf0000000) != 0) + h ^= g >> 24; + h &= ~g; + } + + return h; +} + +static int +lookup_symbol(struct elf_file *ef, const char *name, Elf_Sym *symp) +{ + Elf_Hashelt symnum; + Elf_Sym sym; + char *strp; + uint64_t hash; + + hash = elf_hash(name); + memcpy(&symnum, &ef->buckets[hash % ef->nbuckets], sizeof(symnum)); + + while (symnum != STN_UNDEF) { + if (symnum >= ef->nchains) { + fprintf(stderr, "lookup_symbol: 
corrupt symbol table\n");
+			return ENOENT;
+		}
+
+		memcpy(&sym, (void *)gvatou(ef->symtab + symnum, ef->firstpage_u), sizeof(sym));
+		if (sym.st_name == 0) {
+			fprintf(stderr, "lookup_symbol: corrupt symbol table\n");
+			return ENOENT;
+		}
+
+		strp = strdup((char *)gvatou(ef->strtab + sym.st_name, ef->firstpage_u));
+		if (strcmp(name, strp) == 0) {
+			free(strp);
+			if (sym.st_shndx != SHN_UNDEF ||
+			    (sym.st_value != 0 &&
+			    ELF_ST_TYPE(sym.st_info) == STT_FUNC)) {
+				*symp = sym;
+				return 0;
+			}
+			return ENOENT;
+		}
+		free(strp);
+		memcpy(&symnum, &ef->chains[symnum], sizeof(symnum));
+	}
+
+	return ENOENT;
+}
+
+static void
+parse_metadata(struct preloaded_file *img, struct elf_file *ef,
+    Elf_Addr p_startu, Elf_Addr p_endu)
+{
+	struct mod_metadata md;
+	struct mod_version mver;
+	char *s;
+	int modcnt;
+	Elf_Addr v, p;
+
+	modcnt = 0;
+	for (p = p_startu; p < p_endu; p += sizeof(Elf_Addr)) {
+		memcpy(&v, (void *)p, sizeof(v));
+		memcpy(&md, (void *)gvatou(v, ef->firstpage_u), sizeof(md));
+		if (md.md_type == MDT_VERSION) {
+			s = strdup((char *)gvatou(md.md_cval, ef->firstpage_u));
+			memcpy(&mver,
+			    (void *)gvatou(md.md_data, ef->firstpage_u),
+			    sizeof(mver));
+			image_addmodule(img, s, mver.mv_version);
+			free(s);
+			modcnt++;
+		}
+	}
+
+	/* No module metadata found; register the kernel itself. */
+	if (modcnt == 0)
+		image_addmodule(img, "kernel", 1);
+}
+
+static int
+image_addmodule(struct preloaded_file *img, char *modname, int version)
+{
+	struct kernel_module *mp;
+	struct mod_depend mdepend;
+
+	bzero(&mdepend, sizeof(mdepend));
+	mdepend.md_ver_preferred = version;
+
+	mp = image_findmodule(img, modname, &mdepend);
+	if (mp)
+		return (EEXIST);
+	mp = malloc(sizeof(struct kernel_module));
+	if (mp == NULL)
+		return (ENOMEM);
+
+	bzero(mp, sizeof(struct kernel_module));
+	mp->m_name = strdup(modname);
+	mp->m_version = version;
+	mp->m_fp = img;
+	mp->m_next = img->f_modules;
+	img->f_modules = mp;
+
+	return (0);
+}
+
+static struct kernel_module *
+image_findmodule(struct preloaded_file *img, char *modname,
+    struct mod_depend *verinfo)
+{
+	struct kernel_module *mp, *best;
+	int bestver, mver;
+
+	best = NULL;
+	bestver = 0;
+	for (mp = img->f_modules; mp != NULL; mp = mp->m_next) {
+		if (strcmp(modname, mp->m_name) == 0) {
+			if (verinfo == NULL)
+				return (mp);
+			mver = mp->m_version;
+			if (mver == verinfo->md_ver_preferred)
+				return (mp);
+			if (mver >= verinfo->md_ver_minimum &&
+			    mver <= verinfo->md_ver_maximum &&
+			    mver > bestver) {
+				best = mp;
+				bestver = mver;
+			}
+		}
+	}
+
+	return (best);
+}
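Note (reviewer sketch, not part of the patch): moddata_copy() above serializes the preloaded-kernel information into the flat record stream the guest kernel's preload code expects, the same framing that preload_search_by_type() walks earlier in this file: a 32-bit type word, a 32-bit length word, a payload padded to an 8-byte boundary, and a terminating MODINFO_END record with both words zero. The stand-alone C sketch below only illustrates that framing; walk_modinfo() and MD_ALIGN() are made-up names and nothing here is added to boot.c.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Round a payload length up to the 8-byte alignment used by COPY_MODINFO. */
#define MD_ALIGN(len)	(((len) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))

static void
walk_modinfo(const char *p)
{
	uint32_t type, len;

	for (;;) {
		memcpy(&type, p, sizeof(type));
		memcpy(&len, p + sizeof(type), sizeof(len));
		if (type == 0 && len == 0)	/* MODINFO_END terminator */
			break;
		printf("modinfo record: type %#x, %u payload bytes\n",
		    (unsigned)type, (unsigned)len);
		/* Skip the 8-byte header plus the padded payload. */
		p += 2 * sizeof(uint32_t) + MD_ALIGN(len);
	}
}

Calling walk_modinfo(bootparams->modulep) after parse_kernel() returns would, for example, list the MODINFO_NAME/TYPE/ADDR/SIZE records and the MODINFO_METADATA entries built for the guest.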