Index: lib/Makefile =================================================================== --- lib/Makefile +++ lib/Makefile @@ -203,6 +203,9 @@ .if ${MACHINE_CPUARCH} == "amd64" SUBDIR.${MK_PMC}+= libipt +.endif + +.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "aarch64" SUBDIR.${MK_BHYVE}+= libvmmapi .endif Index: lib/libvmmapi/Makefile =================================================================== --- lib/libvmmapi/Makefile +++ lib/libvmmapi/Makefile @@ -1,14 +1,21 @@ # $FreeBSD$ -PACKAGE=lib${LIB} -LIB= vmmapi -SRCS= vmmapi.c vmmapi_freebsd.c -INCS= vmmapi.h +PACKAGE= lib${LIB} +SHLIBDIR?= /lib +LIB_SRCTOP?= ${.CURDIR} +LIB= vmmapi WARNS?= 2 -LIBADD= util +.if exists(${LIB_SRCTOP}/${MACHINE}) +LIB_ARCH= ${MACHINE} +.elif exists(${LIB_SRCTOP}/${MACHINE_ARCH}) +LIB_ARCH= ${MACHINE_ARCH} +.else +LIB_ARCH= ${MACHINE_CPUARCH} +.endif -CFLAGS+= -I${.CURDIR} +CFLAGS+= -I${LIB_SRCTOP}/${LIB_ARCH} +.include "${LIB_SRCTOP}/${LIB_ARCH}/Makefile.inc" .include Index: lib/libvmmapi/amd64/Makefile.inc =================================================================== --- /dev/null +++ lib/libvmmapi/amd64/Makefile.inc @@ -0,0 +1,7 @@ +# $FreeBSD$ +.PATH: ${LIB_SRCTOP}/amd64/ + +SRCS= vmmapi.c vmmapi_freebsd.c +INCS= vmmapi.h + +LIBADD= util Index: lib/libvmmapi/arm64/Makefile.inc =================================================================== --- /dev/null +++ lib/libvmmapi/arm64/Makefile.inc @@ -0,0 +1,7 @@ +# $FreeBSD$ +.PATH: ${LIB_SRCTOP}/arm64/ + +SRCS= vmmapi.c +INCS= vmmapi.h + +LIBADD= util Index: lib/libvmmapi/arm64/vmmapi.h =================================================================== --- /dev/null +++ lib/libvmmapi/arm64/vmmapi.h @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMMAPI_H_ +#define _VMMAPI_H_ + +struct vmctx; +struct vm_exit; +enum vm_cap_type; + +/* + * Different styles of mapping the memory assigned to a VM into the address + * space of the controlling process. 
+ */ +enum vm_mmap_style { + VM_MMAP_NONE, /* no mapping */ + VM_MMAP_ALL, /* fully and statically mapped */ + VM_MMAP_SPARSE, /* mappings created on-demand */ +}; + +int vm_create(const char *name); +struct vmctx *vm_open(const char *name); +void vm_destroy(struct vmctx *ctx); +int vm_parse_memsize(const char *optarg, size_t *memsize); +int vm_get_memory_seg(struct vmctx *ctx, uint64_t gpa, size_t *ret_len); +int vm_setup_memory(struct vmctx *ctx, uint64_t membase, size_t len, enum vm_mmap_style s); +void *vm_map_ipa(struct vmctx *ctx, uint64_t gaddr, size_t len); +uint32_t vm_get_mem_limit(struct vmctx *ctx); +void vm_set_mem_limit(struct vmctx *ctx, uint32_t limit); +int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); +int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); +int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, + struct vm_exit *ret_vmexit); +const char *vm_capability_type2name(int type); +int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval); +int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int val); +int vm_assert_irq(struct vmctx *ctx, uint32_t irq); +int vm_deassert_irq(struct vmctx *ctx, uint32_t irq); + +/* + * Return a pointer to the statistics buffer. Note that this is not MT-safe. 
+ */ +uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries); +const char *vm_get_stat_desc(struct vmctx *ctx, int index); + +/* Reset vcpu register state */ +int vcpu_reset(struct vmctx *ctx, int vcpu); + +int vm_attach_vgic(struct vmctx *ctx, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size); +#endif /* _VMMAPI_H_ */ Index: lib/libvmmapi/arm64/vmmapi.c =================================================================== --- /dev/null +++ lib/libvmmapi/arm64/vmmapi.c @@ -0,0 +1,392 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "vmmapi.h" + +#define MB (1024 * 1024UL) +#define GB (1024 * 1024 * 1024UL) + +struct vmctx { + int fd; + uint32_t mem_limit; + enum vm_mmap_style vms; + size_t mem_size; + uint64_t mem_base; + char *mem_addr; + char *name; +}; + +#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) +#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) + +static int +vm_device_open(const char *name) +{ + int fd, len; + char *vmfile; + + len = strlen("/dev/vmm/") + strlen(name) + 1; + vmfile = malloc(len); + assert(vmfile != NULL); + snprintf(vmfile, len, "/dev/vmm/%s", name); + + /* Open the device file */ + fd = open(vmfile, O_RDWR, 0); + + free(vmfile); + return (fd); +} + +int +vm_create(const char *name) +{ + + return (CREATE((char *)name)); +} + +struct vmctx * +vm_open(const char *name) +{ + struct vmctx *vm; + + vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); + assert(vm != NULL); + + vm->fd = -1; + vm->mem_limit = 2 * GB; + vm->name = (char *)(vm + 1); + strcpy(vm->name, name); + + if ((vm->fd = vm_device_open(vm->name)) < 0) + goto err; + + return (vm); +err: + vm_destroy(vm); + return (NULL); +} + +void +vm_destroy(struct vmctx *vm) +{ + assert(vm != NULL); + + if (vm->fd >= 0) + close(vm->fd); 
+ DESTROY(vm->name); + + free(vm); +} + +int +vm_parse_memsize(const char *optarg, size_t *ret_memsize) +{ + char *endptr; + size_t optval; + int error; + + optval = strtoul(optarg, &endptr, 0); + if (*optarg != '\0' && *endptr == '\0') { + /* + * For the sake of backward compatibility if the memory size + * specified on the command line is less than a megabyte then + * it is interpreted as being in units of MB. + */ + if (optval < MB) + optval *= MB; + *ret_memsize = optval; + error = 0; + } else + error = expand_number(optarg, ret_memsize); + + return (error); +} + +int +vm_get_memory_seg(struct vmctx *ctx, uint64_t gpa, size_t *ret_len) +{ + int error; + struct vm_memory_segment seg; + + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); + *ret_len = seg.len; + return (error); +} + +uint32_t +vm_get_mem_limit(struct vmctx *ctx) +{ + + return (ctx->mem_limit); +} + +void +vm_set_mem_limit(struct vmctx *ctx, uint32_t limit) +{ + + ctx->mem_limit = limit; +} + +static int +setup_memory_segment(struct vmctx *ctx, uint64_t gpa, size_t len, char **addr) +{ + int error; + struct vm_memory_segment seg; + + /* + * Create and optionally map 'len' bytes of memory at guest + * physical address 'gpa' + */ + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + seg.len = len; + error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); + if (error == 0 && addr != NULL) { + *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, + ctx->fd, gpa); + } + return (error); +} + +int +vm_setup_memory(struct vmctx *ctx, uint64_t membase, size_t memsize, enum vm_mmap_style vms) +{ + int error; + + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(vms == VM_MMAP_ALL); + + ctx->vms = vms; + ctx->mem_base = membase; + + assert(memsize <= ctx->mem_limit); + ctx->mem_size = memsize; + + if (ctx->mem_size > 0) { + error = setup_memory_segment(ctx, ctx->mem_base, ctx->mem_size, + &ctx->mem_addr); + if (error) + return (error); + } + + return (0); +} + +void * 
+vm_map_ipa(struct vmctx *ctx, uint64_t iaddr, size_t len) +{ + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(ctx->vms == VM_MMAP_ALL); + + if (iaddr < ctx->mem_base) + return ((void *)(ctx->mem_addr + iaddr)); + else + return ((void *)(ctx->mem_addr + (iaddr - ctx->mem_base))); +} + + +int +vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + vmreg.regval = val; + + error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); + return (error); +} + +int +vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); + *ret_val = vmreg.regval; + return (error); +} + +int +vm_run(struct vmctx *ctx, int vcpu, uint64_t pc, struct vm_exit *vmexit) +{ + int error; + struct vm_run vmrun; + + bzero(&vmrun, sizeof(vmrun)); + vmrun.cpuid = vcpu; + vmrun.pc = pc; + + error = ioctl(ctx->fd, VM_RUN, &vmrun); + bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); + return (error); +} + +static struct { + const char *name; + int type; +} capstrmap[] = { + { "hlt_exit", VM_CAP_HALT_EXIT }, + { "mtrap_exit", VM_CAP_MTRAP_EXIT }, + { "pause_exit", VM_CAP_PAUSE_EXIT }, + { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST }, + { 0 } +}; + +int +vm_capability_name2type(const char *capname) +{ + int i; + + for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) { + if (strcmp(capstrmap[i].name, capname) == 0) + return (capstrmap[i].type); + } + + return (-1); +} + +const char * +vm_capability_type2name(int type) +{ + int i; + + for (i = 0; capstrmap[i].name != NULL; i++) { + if (capstrmap[i].type == type) + return (capstrmap[i].name); + } + + return (NULL); +} + +int +vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval) +{ + int 
error; + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + + error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); + *retval = vmcap.capval; + return (error); +} + +int +vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) +{ + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + vmcap.capval = val; + + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); +} + +uint64_t * +vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries) +{ + int error; + + static struct vm_stats vmstats; + + vmstats.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_STATS, &vmstats); + if (error == 0) { + if (ret_entries) + *ret_entries = vmstats.num_entries; + if (ret_tv) + *ret_tv = vmstats.tv; + return (vmstats.statbuf); + } else + return (NULL); +} + +const char * +vm_get_stat_desc(struct vmctx *ctx, int index) +{ + static struct vm_stat_desc statdesc; + + statdesc.index = index; + if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) + return (statdesc.desc); + else + return (NULL); +} + +int +vcpu_reset(struct vmctx *vmctx, int vcpu) +{ + return (ENXIO); +} + +int +vm_attach_vgic(struct vmctx *ctx, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size) +{ + struct vm_attach_vgic vav; + + bzero(&vav, sizeof(vav)); + vav.dist_start = dist_start; + vav.dist_size = dist_size; + vav.redist_start = redist_start; + vav.redist_size = redist_size; + + return (ioctl(ctx->fd, VM_ATTACH_VGIC, &vav)); +} + +int +vm_assert_irq(struct vmctx *ctx, uint32_t irq) +{ + struct vm_irq vi; + + bzero(&vi, sizeof(vi)); + vi.irq = irq; + + return (ioctl(ctx->fd, VM_ASSERT_IRQ, &vi)); +} + +int +vm_deassert_irq(struct vmctx *ctx, uint32_t irq) +{ + struct vm_irq vi; + + bzero(&vi, sizeof(vi)); + vi.irq = irq; + + return (ioctl(ctx->fd, VM_DEASSERT_IRQ, &vi)); +} Index: sys/arm/arm/generic_timer.h 
=================================================================== --- /dev/null +++ sys/arm/arm/generic_timer.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _ARM_GENERIC_TIMER_H_ +#define _ARM_GENERIC_TIMER_H_ + +#define GT_PHYS_SECURE 0 +#define GT_PHYS_NONSECURE 1 +#define GT_VIRT 2 +#define GT_HYP 3 + +int arm_tmr_setup_intr(int gt_type, driver_filter_t filter, + driver_intr_t handler, void *arg); +int arm_tmr_teardown_intr(int gt_type); + +#endif Index: sys/arm/arm/generic_timer.c =================================================================== --- sys/arm/arm/generic_timer.c +++ sys/arm/arm/generic_timer.c @@ -63,6 +63,10 @@ #include /* For arm_set_delay */ #endif +#if defined(__aarch64__) +#include /* For virt_enabled() */ +#endif + #ifdef FDT #include #include @@ -74,6 +78,8 @@ #include #endif +#include "generic_timer.h" + #define GT_CTRL_ENABLE (1 << 0) #define GT_CTRL_INT_MASK (1 << 1) #define GT_CTRL_INT_STAT (1 << 2) @@ -123,6 +129,8 @@ .tc_fill_vdso_timehands = arm_tmr_fill_vdso_timehands, }; +static device_t arm_tmr_dev; + #ifdef __arm__ #define get_el0(x) cp15_## x ##_get() #define get_el1(x) cp15_## x ##_get() @@ -314,6 +322,39 @@ return (FILTER_HANDLED); } +int +arm_tmr_setup_intr(int gt_type, driver_filter_t filter, driver_intr_t handler, + void *arg) +{ + if (gt_type != GT_PHYS_SECURE && + gt_type != GT_PHYS_NONSECURE && + gt_type != GT_VIRT && + gt_type != GT_HYP) + return (ENXIO); + + if (arm_tmr_sc->res[gt_type] == NULL) + return (ENXIO); + + return (bus_setup_intr(arm_tmr_dev, arm_tmr_sc->res[gt_type], + INTR_TYPE_CLK, filter, handler, arg, &arm_tmr_sc->ihl[gt_type])); +} + +int +arm_tmr_teardown_intr(int gt_type) +{ + if (gt_type != GT_PHYS_SECURE && + gt_type != GT_PHYS_NONSECURE && + gt_type != GT_VIRT && + gt_type != GT_HYP) + return (ENXIO); + + if (arm_tmr_sc->res[gt_type] == NULL) + return (ENXIO); + + return (bus_teardown_intr(arm_tmr_dev, arm_tmr_sc->res[gt_type], + arm_tmr_sc->ihl[gt_type])); +} + #ifdef FDT static int arm_tmr_fdt_probe(device_t dev) @@ -447,13 +488,26 @@ last_timer = 1; } +#ifdef __aarch64__ + sc->physical |= virt_enabled(); +#endif 
+ arm_tmr_sc = sc; /* Setup secure, non-secure and virtual IRQs handler */ - for (i = first_timer; i <= last_timer; i++) { + for (i = GT_PHYS_SECURE; i <= GT_VIRT; i++) { /* If we do not have the interrupt, skip it. */ if (sc->res[i] == NULL) continue; +#if defined(__aarch64__) + if (i == 2 && virt_enabled()) { + /* + * Do not install an interrupt handler for the virtual + * timer. This will be used by the VM. + */ + continue; + } +#endif error = bus_setup_intr(dev, sc->res[i], INTR_TYPE_CLK, arm_tmr_intr, NULL, sc, &sc->ihl[i]); if (error) { @@ -461,7 +515,6 @@ return (ENXIO); } } - /* Disable the virtual timer until we are ready */ if (sc->res[2] != NULL) arm_tmr_disable(false); @@ -488,6 +541,8 @@ arm_set_delay(arm_tmr_do_delay, sc); #endif + arm_tmr_dev = dev; + return (0); } Index: sys/arm/arm/gic.h =================================================================== --- sys/arm/arm/gic.h +++ sys/arm/arm/gic.h @@ -47,13 +47,16 @@ struct arm_gic_softc { device_t gic_dev; + bool is_root; void * gic_intrhand; struct gic_irqsrc * gic_irqs; - struct resource * gic_res[3]; + struct resource * gic_res[6]; bus_space_tag_t gic_c_bst; bus_space_tag_t gic_d_bst; bus_space_handle_t gic_c_bsh; bus_space_handle_t gic_d_bsh; + bus_space_tag_t gic_h_bst; + bus_space_handle_t gic_h_bsh; uint8_t ver; struct mtx mutex; uint32_t nirqs; Index: sys/arm/arm/gic.c =================================================================== --- sys/arm/arm/gic.c +++ sys/arm/arm/gic.c @@ -128,10 +128,14 @@ static struct resource_spec arm_gic_spec[] = { { SYS_RES_MEMORY, 0, RF_ACTIVE }, /* Distributor registers */ { SYS_RES_MEMORY, 1, RF_ACTIVE }, /* CPU Interrupt Intf. 
registers */ - { SYS_RES_IRQ, 0, RF_ACTIVE | RF_OPTIONAL }, /* Parent interrupt */ + { SYS_RES_MEMORY, 2, RF_ACTIVE | RF_OPTIONAL }, /* Virtual Interface Control */ + { SYS_RES_MEMORY, 3, RF_ACTIVE | RF_OPTIONAL }, /* Virtual CPU interface */ + { SYS_RES_IRQ, 0, RF_ACTIVE | RF_OPTIONAL }, /* vGIC maintenance interrupt or parent interrupt */ { -1, 0 } }; +extern char hypmode_enabled[]; + #if defined(__arm__) && defined(INVARIANTS) static int gic_debug_spurious = 1; #else @@ -154,6 +158,22 @@ #define gic_d_write_4(_sc, _reg, _val) \ bus_space_write_4((_sc)->gic_d_bst, (_sc)->gic_d_bsh, (_reg), (_val)) +#define gic_h_read_4(_sc, _reg) \ + bus_space_read_4((_sc)->gic_h_bst, (_sc)->gic_h_bsh, (_reg)) +#define gic_h_write_4(_sc, _reg, _val) \ + bus_space_write_4((_sc)->gic_h_bst, (_sc)->gic_h_bsh, (_reg), (_val)) + +struct arm_gic_softc * +arm_gic_get_sc(void) +{ + return gic_sc; +} +uint32_t +arm_gic_get_lr_num(void) +{ + return (gic_h_read_4(gic_sc, GICH_VTR) & 0x3f) + 1; +} + static inline void gic_irq_unmask(struct arm_gic_softc *sc, u_int irq) { @@ -322,12 +342,25 @@ mtx_init(&sc->mutex, "GIC lock", NULL, MTX_SPIN); /* Distributor Interface */ - sc->gic_d_bst = rman_get_bustag(sc->gic_res[0]); - sc->gic_d_bsh = rman_get_bushandle(sc->gic_res[0]); + sc->gic_d_bst = rman_get_bustag(sc->gic_res[DISTRIBUTOR_RES_IDX]); + sc->gic_d_bsh = rman_get_bushandle(sc->gic_res[DISTRIBUTOR_RES_IDX]); /* CPU Interface */ - sc->gic_c_bst = rman_get_bustag(sc->gic_res[1]); - sc->gic_c_bsh = rman_get_bushandle(sc->gic_res[1]); + sc->gic_c_bst = rman_get_bustag(sc->gic_res[CPU_INTERFACE_RES_IDX]); + sc->gic_c_bsh = rman_get_bushandle(sc->gic_res[CPU_INTERFACE_RES_IDX]); + + /* Virtual Interface Control */ + if (sc->is_root) { + if (sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX] == NULL) { + device_printf(dev, "Cannot find Virtual Interface Control Registers. 
Disabling Hyp-Mode...\n"); + hypmode_enabled[0] = -1; + } else { + sc->gic_h_bst = rman_get_bustag(sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]); + sc->gic_h_bsh = rman_get_bushandle(sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]); + } + } else { + hypmode_enabled[0] = -1; + } /* Disable interrupt forwarding to the CPU interface */ gic_d_write_4(sc, GICD_CTLR, 0x00); @@ -507,6 +540,33 @@ ("arm_gic_read_ivar: Invalid bus type %u", sc->gic_bus)); *result = sc->gic_bus; return (0); + case GIC_IVAR_VIRTUAL_INT_CTRL_RES: + *result = (uintptr_t)sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]; + return (0); + case GIC_IVAR_VIRTUAL_INT_CTRL_VADDR: + *result = (uintptr_t)rman_get_virtual(sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]); + return (0); + case GIC_IVAR_VIRTUAL_INT_CTRL_PADDR: + *result = (uintptr_t)rman_get_start(sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]); + return (0); + case GIC_IVAR_VIRTUAL_INT_CTRL_SIZE: + *result = rman_get_size(sc->gic_res[VIRT_INTERFACE_CONTROL_RES_IDX]); + return (0); + case GIC_IVAR_VIRTUAL_CPU_INT_PADDR: + *result = rman_get_start(sc->gic_res[VIRT_CPU_INTERFACE_RES_IDX]); + return (0); + case GIC_IVAR_VIRTUAL_CPU_INT_SIZE: + *result = rman_get_size(sc->gic_res[VIRT_CPU_INTERFACE_RES_IDX]); + return (0); + case GIC_IVAR_LR_NUM: + *result = (gic_h_read_4(gic_sc, GICH_VTR) & 0x3f) + 1; + return (0); + case GIC_IVAR_MAINTENANCE_INTR_RES: + if (sc->is_root) + *result = (uintptr_t)sc->gic_res[MAINTENANCE_INTR_RES_IDX]; + else + *result = 0; /* non-root GIC: report no maintenance interrupt resource */ + return (0); } return (ENOENT); @@ -979,7 +1039,7 @@ if (CPU_ISSET(i, &cpus)) val |= arm_gic_map[i] << GICD_SGI_TARGET_SHIFT; - gic_d_write_4(sc, GICD_SGIR, val | gi->gi_irq); + gic_d_write_4(sc, GICD_SGIR(0), val | gi->gi_irq); } static int Index: sys/arm/arm/gic_common.h =================================================================== --- sys/arm/arm/gic_common.h +++ sys/arm/arm/gic_common.h @@ -32,8 +32,25 @@ #ifndef _GIC_COMMON_H_ #define _GIC_COMMON_H_ -#define GIC_IVAR_HW_REV 500 -#define GIC_IVAR_BUS 
501 +#ifndef __ASSEMBLER__ + +#define DISTRIBUTOR_RES_IDX 0 +#define CPU_INTERFACE_RES_IDX 1 +#define VIRT_INTERFACE_CONTROL_RES_IDX 2 +#define VIRT_CPU_INTERFACE_RES_IDX 3 +#define MAINTENANCE_INTR_RES_IDX 4 +#define INTRNG_RES_IDX 5 + +#define GIC_IVAR_HW_REV 500 +#define GIC_IVAR_BUS 501 +#define GIC_IVAR_VIRTUAL_INT_CTRL_RES 502 +#define GIC_IVAR_VIRTUAL_INT_CTRL_VADDR 503 +#define GIC_IVAR_VIRTUAL_INT_CTRL_PADDR 505 +#define GIC_IVAR_VIRTUAL_INT_CTRL_SIZE 504 +#define GIC_IVAR_VIRTUAL_CPU_INT_PADDR 506 +#define GIC_IVAR_VIRTUAL_CPU_INT_SIZE 507 +#define GIC_IVAR_LR_NUM 508 +#define GIC_IVAR_MAINTENANCE_INTR_RES 509 /* GIC_IVAR_BUS values */ #define GIC_BUS_UNKNOWN 0 @@ -43,6 +60,19 @@ __BUS_ACCESSOR(gic, hw_rev, GIC, HW_REV, u_int); __BUS_ACCESSOR(gic, bus, GIC, BUS, u_int); +__BUS_ACCESSOR(gic, virtual_int_ctrl_res, GIC, VIRTUAL_INT_CTRL_RES, struct resource *); +__BUS_ACCESSOR(gic, virtual_int_ctrl_vaddr, GIC, VIRTUAL_INT_CTRL_VADDR, uint64_t); +__BUS_ACCESSOR(gic, virtual_int_ctrl_paddr, GIC, VIRTUAL_INT_CTRL_PADDR, uint64_t); +__BUS_ACCESSOR(gic, virtual_int_ctrl_size, GIC, VIRTUAL_INT_CTRL_SIZE, uint32_t); +__BUS_ACCESSOR(gic, virtual_cpu_int_paddr, GIC, VIRTUAL_CPU_INT_PADDR, uint32_t); +__BUS_ACCESSOR(gic, virtual_cpu_int_size, GIC, VIRTUAL_CPU_INT_SIZE, uint32_t); +__BUS_ACCESSOR(gic, lr_num, GIC, LR_NUM, uint32_t); +__BUS_ACCESSOR(gic, maintenance_intr_res, GIC, MAINTENANCE_INTR_RES, struct resource *); + +struct arm_gic_softc *arm_gic_get_sc(void); +uint32_t arm_gic_get_lr_num(void); + +#endif /*__ASSEMBLER__ */ /* Software Generated Interrupts */ #define GIC_FIRST_SGI 0 /* Irqs 0-15 are SGIs/IPIs. 
*/ @@ -56,7 +86,9 @@ /* Common register values */ #define GICD_CTLR 0x0000 /* v1 ICDDCR */ #define GICD_TYPER 0x0004 /* v1 ICDICTR */ -#define GICD_TYPER_I_NUM(n) ((((n) & 0x1F) + 1) * 32) +#define GICD_TYPER_ITLINESNUM_MASK (0x1f) +#define GICD_TYPER_I_NUM(n) \ + ((((n) & GICD_TYPER_ITLINESNUM_MASK) + 1) * 32) #define GICD_IIDR 0x0008 /* v1 ICDIIDR */ #define GICD_IIDR_PROD_SHIFT 24 #define GICD_IIDR_PROD_MASK 0xff000000 @@ -74,19 +106,30 @@ #define GICD_IIDR_IMPL_MASK 0x00000fff #define GICD_IIDR_IMPL(x) \ (((x) & GICD_IIDR_IMPL_MASK) >> GICD_IIDR_IMPL_SHIFT) -#define GICD_IGROUPR(n) (0x0080 + (((n) >> 5) * 4)) /* v1 ICDISER */ +#define GICD_IGROUPR_BASE (0x0080) +#define GICD_IGROUPR(n) \ + (GICD_IGROUPR_BASE + (((n) >> 5) * 4)) /* v1 ICDISER */ #define GICD_I_PER_IGROUPRn 32 -#define GICD_ISENABLER(n) (0x0100 + (((n) >> 5) * 4)) /* v1 ICDISER */ +#define GICD_ISENABLER_BASE (0x0100) +#define GICD_ISENABLER(n) \ + (GICD_ISENABLER_BASE + (((n) >> 5) * 4)) /* v1 ICDISER */ #define GICD_I_MASK(n) (1ul << ((n) & 0x1f)) #define GICD_I_PER_ISENABLERn 32 -#define GICD_ICENABLER(n) (0x0180 + (((n) >> 5) * 4)) /* v1 ICDICER */ +#define GICD_ICENABLER_BASE (0x0180) +#define GICD_ICENABLER(n) \ + (GICD_ICENABLER_BASE + (((n) >> 5) * 4)) /* v1 ICDICER */ #define GICD_ISPENDR(n) (0x0200 + (((n) >> 5) * 4)) /* v1 ICDISPR */ #define GICD_ICPENDR(n) (0x0280 + (((n) >> 5) * 4)) /* v1 ICDICPR */ +#define GICD_ISACTIVER(n) (0x0300 + (((n) >> 5) * 4)) /* v1 ICDABR */ #define GICD_ICACTIVER(n) (0x0380 + (((n) >> 5) * 4)) /* v1 ICDABR */ -#define GICD_IPRIORITYR(n) (0x0400 + (((n) >> 2) * 4)) /* v1 ICDIPR */ +#define GICD_IPRIORITYR_BASE (0x0400) +#define GICD_IPRIORITYR(n) \ + (GICD_IPRIORITYR_BASE + (((n) >> 2) * 4)) /* v1 ICDIPR */ #define GICD_I_PER_IPRIORITYn 4 #define GICD_ITARGETSR(n) (0x0800 + (((n) >> 2) * 4)) /* v1 ICDIPTR */ -#define GICD_ICFGR(n) (0x0C00 + (((n) >> 4) * 4)) /* v1 ICDICFR */ +#define GICD_ICFGR_BASE (0x0C00) +#define GICD_ICFGR(n) \ + (GICD_ICFGR_BASE + 
(((n) >> 4) * 4)) /* v1 ICDICFR */ #define GICD_I_PER_ICFGRn 16 /* First bit is a polarity bit (0 - low, 1 - high) */ #define GICD_ICFGR_POL_LOW (0 << 0) @@ -96,7 +139,34 @@ #define GICD_ICFGR_TRIG_LVL (0 << 1) #define GICD_ICFGR_TRIG_EDGE (1 << 1) #define GICD_ICFGR_TRIG_MASK 0x2 -#define GICD_SGIR 0x0F00 /* v1 ICDSGIR */ +#define GICD_SGIR(n) (0x0F00 + ((n) * 4)) /* v1 ICDSGIR */ #define GICD_SGI_TARGET_SHIFT 16 +/* GIC Hypervisor specific registers */ +#define GICH_HCR 0x0 +#define GICH_VTR 0x4 +#define GICH_VMCR 0x8 +#define GICH_VMCR_VMGRP1EN (1 << 1) +#define GICH_MISR 0x10 +#define GICH_EISR0 0x20 +#define GICH_EISR1 0x24 +#define GICH_ELSR0 0x30 +#define GICH_ELSR1 0x34 +#define GICH_APR 0xF0 +#define GICH_LR0 0x100 + +#define GICH_HCR_EN (1 << 0) +#define GICH_HCR_UIE (1 << 1) + +#define GICH_LR_VIRTID (0x3FF << 0) +#define GICH_LR_PHYSID_CPUID_SHIFT 10 +#define GICH_LR_PHYSID_CPUID (7 << GICH_LR_PHYSID_CPUID_SHIFT) +#define GICH_LR_STATE (3 << 28) +#define GICH_LR_PENDING (1 << 28) +#define GICH_LR_ACTIVE (1 << 29) +#define GICH_LR_EOI (1 << 19) + +#define GICH_MISR_EOI (1 << 0) +#define GICH_MISR_U (1 << 1) + #endif /* _GIC_COMMON_H_ */ Index: sys/arm/arm/gic_fdt.c =================================================================== --- sys/arm/arm/gic_fdt.c +++ sys/arm/arm/gic_fdt.c @@ -129,18 +129,25 @@ gic_fdt_attach(device_t dev) { struct arm_gic_fdt_softc *sc = device_get_softc(dev); - phandle_t pxref; - intptr_t xref; + phandle_t pxref = ofw_bus_find_iparent(ofw_bus_get_node(dev)); + intptr_t xref = OF_xref_from_node(ofw_bus_get_node(dev)); int err; + sc->base.is_root = false; + /* + * Controller is root if: + * - doesn't have interrupt parent + * - his interrupt parent is this controller + */ + if (pxref == 0 || xref == pxref) + sc->base.is_root = true; + sc->base.gic_bus = GIC_BUS_FDT; err = arm_gic_attach(dev); if (err != 0) return (err); - xref = OF_xref_from_node(ofw_bus_get_node(dev)); - /* * Now, when everything is initialized, it's right 
time to * register interrupt controller to interrupt framefork. @@ -150,13 +157,7 @@ goto cleanup; } - /* - * Controller is root if: - * - doesn't have interrupt parent - * - his interrupt parent is this controller - */ - pxref = ofw_bus_find_iparent(ofw_bus_get_node(dev)); - if (pxref == 0 || xref == pxref) { + if (sc->base.is_root) { if (intr_pic_claim_root(dev, xref, arm_gic_intr, sc, GIC_LAST_SGI - GIC_FIRST_SGI + 1) != 0) { device_printf(dev, "could not set PIC as a root\n"); @@ -164,13 +165,13 @@ goto cleanup; } } else { - if (sc->base.gic_res[2] == NULL) { + if (sc->base.gic_res[INTRNG_RES_IDX] == NULL) { device_printf(dev, "not root PIC must have defined interrupt\n"); intr_pic_deregister(dev, xref); goto cleanup; } - if (bus_setup_intr(dev, sc->base.gic_res[2], INTR_TYPE_CLK, + if (bus_setup_intr(dev, sc->base.gic_res[INTRNG_RES_IDX], INTR_TYPE_CLK, arm_gic_intr, NULL, sc, &sc->base.gic_intrhand)) { device_printf(dev, "could not setup irq handler\n"); intr_pic_deregister(dev, xref); @@ -199,9 +200,8 @@ struct arm_gic_devinfo *di; di = device_get_ivars(child); - KASSERT(di != NULL, ("gic_fdt_get_resource_list: No devinfo")); - return (&di->rl); + return di ? 
(&di->rl) : (NULL); } static int Index: sys/arm64/arm64/gic_v3.c =================================================================== --- sys/arm64/arm64/gic_v3.c +++ sys/arm64/arm64/gic_v3.c @@ -99,6 +99,11 @@ static u_int sgi_first_unused = GIC_FIRST_SGI; #endif +static struct resource *maint_res; +static device_t gic_dev; +static int maint_rid; +static void *maint_cookie; + static device_method_t gic_v3_methods[] = { /* Device interface */ DEVMETHOD(device_detach, gic_v3_detach), @@ -366,12 +371,49 @@ return (0); } +void +gic_v3_alloc_maint_res(device_t dev) +{ + gic_dev = dev; + maint_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &maint_rid, + RF_ACTIVE); + if (!maint_res) + device_printf(dev, + "Could not allocate resource for maintenance interrupt\n"); +} + +int +gic_v3_setup_maint_intr(driver_filter_t filter, driver_intr_t handler, + void *arg) +{ + int flags; + + if (!maint_res) + return (EINVAL); + + flags = INTR_TYPE_MISC | INTR_MPSAFE; + return (bus_setup_intr(gic_dev, maint_res, flags, filter, handler, + arg, &maint_cookie)); +} + +int +gic_v3_teardown_maint_intr(void) +{ + if (!maint_res) + return (EINVAL); + + return (bus_teardown_intr(gic_dev, maint_res, maint_cookie)); +} + static int gic_v3_get_domain(device_t dev, device_t child, int *domain) { struct gic_v3_devinfo *di; di = device_get_ivars(child); + if (di == NULL) + return (0); + if (di->gic_domain < 0) return (ENOENT); @@ -978,22 +1020,25 @@ struct resource *res; u_int cpuid; size_t us_left = 1000000; + uint32_t rwp; cpuid = PCPU_GET(cpuid); switch (xdist) { case DIST: res = sc->gic_dist; + rwp = GICD_CTLR_RWP; break; case REDIST: res = &sc->gic_redists.pcpu[cpuid]->res; + rwp = GICR_CTLR_RWP; break; default: KASSERT(0, ("%s: Attempt to wait for unknown RWP", __func__)); return; } - while ((bus_read_4(res, GICD_CTLR) & GICD_CTLR_RWP) != 0) { + while ((bus_read_4(res, GICD_CTLR) & rwp) != 0) { DELAY(1); if (us_left-- == 0) panic("GICD Register write pending for too long"); Index: 
sys/arm64/arm64/gic_v3_acpi.c =================================================================== --- sys/arm64/arm64/gic_v3_acpi.c +++ sys/arm64/arm64/gic_v3_acpi.c @@ -284,6 +284,8 @@ if (device_get_children(dev, &sc->gic_children, &sc->gic_nchildren) !=0) sc->gic_nchildren = 0; + gic_v3_alloc_maint_res(dev); + return (0); error: Index: sys/arm64/arm64/gic_v3_fdt.c =================================================================== --- sys/arm64/arm64/gic_v3_fdt.c +++ sys/arm64/arm64/gic_v3_fdt.c @@ -171,6 +171,8 @@ if (device_get_children(dev, &sc->gic_children, &sc->gic_nchildren) != 0) sc->gic_nchildren = 0; + gic_v3_alloc_maint_res(dev); + return (err); error: @@ -194,12 +196,19 @@ static int gic_v3_fdt_print_child(device_t bus, device_t child) { - struct gic_v3_ofw_devinfo *di = device_get_ivars(child); - struct resource_list *rl = &di->di_rl; + struct gic_v3_ofw_devinfo *di; + struct resource_list *rl; int retval = 0; retval += bus_print_child_header(bus, child); + + di = device_get_ivars(child); + if (di == NULL) + goto footer; + rl = &di->di_rl; + retval += resource_list_print_type(rl, "mem", SYS_RES_MEMORY, "%#jx"); +footer: retval += bus_print_child_footer(bus, child); return (retval); @@ -280,6 +289,8 @@ size_cells = 2; OF_getencprop(parent, "#size-cells", &size_cells, sizeof(size_cells)); + + /* Iterate through all GIC subordinates */ for (node = OF_child(parent); node > 0; node = OF_peer(node)) { /* Allocate and populate devinfo. 
*/ Index: sys/arm64/arm64/gic_v3_reg.h =================================================================== --- sys/arm64/arm64/gic_v3_reg.h +++ sys/arm64/arm64/gic_v3_reg.h @@ -56,14 +56,22 @@ #define GICD_CTLR_G1 (1 << 0) #define GICD_CTLR_G1A (1 << 1) #define GICD_CTLR_ARE_NS (1 << 4) +#define GICD_CTLR_DS (1 << 6) +#define GICD_CTLR_E1NWF (1 << 7) #define GICD_CTLR_RWP (1 << 31) /* GICD_TYPER */ #define GICD_TYPER_IDBITS(n) ((((n) >> 19) & 0x1F) + 1) +#define GICD_TYPER_SECURITYEXTN \ + (1 << 10) +#define GICD_TYPER_DVIS (1 << 18) +#define GICD_TYPER_LPIS (1 << 17) /* * Registers (v3) */ -#define GICD_IROUTER(n) (0x6000 + ((n) * 8)) +#define GICD_IROUTER_BASE (0x6000) +#define GICD_IROUTER(n) (GICD_IROUTER_BASE + ((n) * 8)) +#define GICD_IROUTER_IRM (31) #define GICD_PIDR4 0xFFD0 #define GICD_PIDR5 0xFFD4 @@ -84,7 +92,11 @@ /* Redistributor registers */ #define GICR_CTLR GICD_CTLR -#define GICR_CTLR_LPI_ENABLE (1 << 0) +#define GICR_CTLR_RWP (1 << 3) +#define GICR_CTLR_UWP (1U << 31) +#define GICR_CTLR_LPI_ENABLE (1 << 0) +#define GICR_CTLR_DPG1NS (1 << 25) +#define GICR_CTLR_DPG0 (1 << 24) #define GICR_PIDR2 GICD_PIDR2 @@ -97,6 +109,10 @@ #define GICR_TYPER_CPUNUM(x) \ (((x) & GICR_TYPER_CPUNUM_MASK) >> GICR_TYPER_CPUNUM_SHIFT) #define GICR_TYPER_AFF_SHIFT (32) +#define GICR_TYPER_AFF0(x) (((x) >> GICR_TYPER_AFF_SHIFT) & 0xff) +#define GICR_TYPER_AFF1(x) (((x) >> (GICR_TYPER_AFF_SHIFT + 8)) & 0xff) +#define GICR_TYPER_AFF2(x) (((x) >> (GICR_TYPER_AFF_SHIFT + 16)) & 0xff) +#define GICR_TYPER_AFF3(x) (((x) >> (GICR_TYPER_AFF_SHIFT + 24)) & 0xff) #define GICR_WAKER (0x0014) #define GICR_WAKER_PS (1 << 1) /* Processor sleep */ @@ -193,8 +209,13 @@ #define GICR_I_ENABLER_SGI_MASK (0x0000FFFF) #define GICR_I_ENABLER_PPI_MASK (0xFFFF0000) +#define GICR_IPRIORITYR_BASE (0x0400) #define GICR_I_PER_IPRIORITYn (GICD_I_PER_IPRIORITYn) +#define GICR_ICFGR0_BASE (0x0C00) +#define GICR_ICFGR1_BASE (0x0C04) + + /* ITS registers */ #define GITS_PIDR2 GICR_PIDR2 #define 
GITS_PIDR2_ARCH_MASK GICR_PIDR2_ARCH_MASK Index: sys/arm64/arm64/gic_v3_var.h =================================================================== --- sys/arm64/arm64/gic_v3_var.h +++ sys/arm64/arm64/gic_v3_var.h @@ -109,6 +109,10 @@ void gic_r_write_4(device_t, bus_size_t, uint32_t var); void gic_r_write_8(device_t, bus_size_t, uint64_t var); +void gic_v3_alloc_maint_res(device_t); +int gic_v3_setup_maint_intr(driver_filter_t, driver_intr_t, void *); +int gic_v3_teardown_maint_intr(void); + /* * GIC Distributor accessors. * Notice that only GIC sofc can be passed. Index: sys/arm64/arm64/hyp_stub.S =================================================================== --- /dev/null +++ sys/arm64/arm64/hyp_stub.S @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2017 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <machine/asm.h> + +__FBSDID("$FreeBSD$"); + + .text + +/* + * Install a new exception vector table with the base address supplied by the + * parameter in register x0. + */ +ENTRY(handle_stub_el1h_sync) + msr vbar_el2, x0 + eret +END(handle_stub_el1h_sync) + +.macro vempty + .align 7 + 1: b 1b +.endm + +.macro vector name + .align 7 + b handle_\name +.endm + + .align 11 + .globl hyp_stub_vectors +hyp_stub_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* SError EL2t */ + + vempty /* Synchronous EL2h */ + vempty /* IRQ EL2h */ + vempty /* FIQ EL2h */ + vempty /* SError EL2h */ + + vector stub_el1h_sync /* Synchronous 64-bit EL1 */ + vempty /* IRQ 64-bit EL1 */ + vempty /* FIQ 64-bit EL1 */ + vempty /* SError 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* SError 32-bit EL1 */ Index: sys/arm64/arm64/locore.S =================================================================== --- sys/arm64/arm64/locore.S +++ sys/arm64/arm64/locore.S @@ -237,6 +237,11 @@ END(mpentry) #endif + .align 3 + .globl _C_LABEL(hypmode_enabled) +_C_LABEL(hypmode_enabled): + .zero 8 + /* * If we are started in EL2, configure the required hypervisor * registers and drop to EL1.
@@ -246,10 +251,26 @@ lsr x23, x23, #2 cmp x23, #0x2 b.eq 1f + + /* We didn't start in EL2, hypmode will remain disabled */ ret 1: - /* Configure the Hypervisor */ - mov x2, #(HCR_RW) + /* + * If the MMU is active, then it is using a page table where VA == PA. + * But the page table won't have entries for the hypervisor EL2 + * initialization code which is loaded into memory with the vmm module. + * + * So we disable the MMU in EL2 to make the vmm hypervisor code run + * successfully. + */ + dsb sy + mrs x2, sctlr_el2 + bic x2, x2, SCTLR_M + msr sctlr_el2, x2 + isb + + /* Enable the HVC Instruction and Make EL1 aarch64 */ + ldr x2, hcr msr hcr_el2, x2 /* Load the Virtualization Process ID Register */ @@ -279,10 +300,20 @@ /* Set the counter offset to a known value */ msr cntvoff_el2, xzr - /* Hypervisor trap functions */ - adr x2, hyp_vectors + /* Install hypervisor trap functions */ + adrp x2, hyp_stub_vectors + add x2, x2, :lo12:hyp_stub_vectors msr vbar_el2, x2 + /* Use the host VTTBR_EL2 to tell the host and the guests apart */ + mov x2, #VTTBR_HOST + msr vttbr_el2, x2 + + /* Mark hypervisor mode as enabled */ + mov x1, #1 + adr x2, hypmode_enabled + str x1, [x2] + mov x2, #(PSR_F | PSR_I | PSR_A | PSR_D | PSR_M_EL1h) msr spsr_el2, x2 @@ -299,7 +330,6 @@ orr x2, x2, #ICC_SRE_EL2_SRE /* Enable system registers */ msr icc_sre_el2, x2 2: - /* Set the address to return to our return address */ msr elr_el2, x30 isb @@ -310,31 +340,9 @@ .Lsctlr_res1: .quad SCTLR_RES1 -#define VECT_EMPTY \ - .align 7; \ - 1: b 1b - - .align 11 -hyp_vectors: - VECT_EMPTY /* Synchronous EL2t */ - VECT_EMPTY /* IRQ EL2t */ - VECT_EMPTY /* FIQ EL2t */ - VECT_EMPTY /* Error EL2t */ - - VECT_EMPTY /* Synchronous EL2h */ - VECT_EMPTY /* IRQ EL2h */ - VECT_EMPTY /* FIQ EL2h */ - VECT_EMPTY /* Error EL2h */ - - VECT_EMPTY /* Synchronous 64-bit EL1 */ - VECT_EMPTY /* IRQ 64-bit EL1 */ - VECT_EMPTY /* FIQ 64-bit EL1 */ - VECT_EMPTY /* Error 64-bit EL1 */ - - VECT_EMPTY /* Synchronous 32-bit EL1 */ - VECT_EMPTY /* IRQ 32-bit EL1 */ 
- VECT_EMPTY /* FIQ 32-bit EL1 */ - VECT_EMPTY /* Error 32-bit EL1 */ +hcr: + /* Make sure the HVC instruction is not disabled */ + .quad (HCR_RW & ~HCR_HCD) /* * Get the delta between the physical address we were loaded to and the Index: sys/arm64/arm64/pmap.c =================================================================== --- sys/arm64/arm64/pmap.c +++ sys/arm64/arm64/pmap.c @@ -407,6 +407,8 @@ static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); +static uint64_t pa_range_bits = 0; + /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at @@ -431,9 +433,19 @@ memcpy(d, s, PAGE_SIZE); } +#define pmap_l0_index(va) (((va) >> L0_SHIFT) & L0_ADDR_MASK) +#define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) +#define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) +#define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) + +#define STAGE2_L1_ADDR_MASK ((1UL << (pa_range_bits - L1_SHIFT)) - 1) +#define pmap_stage2_l1_index(va) (((va) >> L1_SHIFT) & STAGE2_L1_ADDR_MASK) + static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { + KASSERT(pmap->pm_stage != PM_STAGE2, + ("Level 0 table is invalid for PM_STAGE2 pmap")); return (&pmap->pm_l0[pmap_l0_index(va)]); } @@ -450,6 +462,9 @@ static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { + if (pmap->pm_stage == PM_STAGE2) + return (&pmap->pm_l0[pmap_stage2_l1_index(va)]); + pd_entry_t *l0; l0 = pmap_l0(pmap, va); @@ -459,6 +474,32 @@ return (pmap_l0_to_l1(l0, va)); } +static __inline vm_page_t +pmap_l1pg(pmap_t pmap, vm_offset_t va) +{ + if (pmap->pm_stage == PM_STAGE1) { + pd_entry_t *l0, tl0; + + l0 = pmap_l0(pmap, va); + tl0 = pmap_load(l0); + + return (PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK)); + } else { + vm_paddr_t pa, pa_offset; + + /* + * The offset will be the bits + * [pa_range_bits-1:L0_SHIFT] + */ + va = 
va & ((1UL << pa_range_bits) - 1); + pa_offset = va >> L0_SHIFT; + pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0) + \ + (pa_offset << PAGE_SHIFT); + + return (PHYS_TO_VM_PAGE(pa)); + } +} + static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va) { @@ -519,18 +560,28 @@ { pd_entry_t *l0, *l1, *l2, desc; - l0 = pmap_l0(pmap, va); - desc = pmap_load(l0) & ATTR_DESCR_MASK; - if (desc != L0_TABLE) { - *level = -1; - return (NULL); - } + if (pmap->pm_stage == PM_STAGE1) { + l0 = pmap_l0(pmap, va); + desc = pmap_load(l0) & ATTR_DESCR_MASK; + if (desc != L0_TABLE) { + *level = -1; + return (NULL); + } - l1 = pmap_l0_to_l1(l0, va); - desc = pmap_load(l1) & ATTR_DESCR_MASK; - if (desc != L1_TABLE) { - *level = 0; - return (l0); + l1 = pmap_l0_to_l1(l0, va); + desc = pmap_load(l1) & ATTR_DESCR_MASK; + if (desc != L1_TABLE) { + *level = 0; + return (l0); + } + } else { + l1 = pmap_l1(pmap, va); + desc = pmap_load(l1) & ATTR_DESCR_MASK; + if (desc != L1_TABLE) { + /* For PM_STAGE2 mappings the first level is level 1 */ + *level = -1; + return (NULL); + } } l2 = pmap_l1_to_l2(l1, va); @@ -607,13 +658,18 @@ if (pmap->pm_l0 == NULL) return (false); - l0p = pmap_l0(pmap, va); - *l0 = l0p; + if (pmap->pm_stage == PM_STAGE1) { + l0p = pmap_l0(pmap, va); + *l0 = l0p; - if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) - return (false); + if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) + return (false); - l1p = pmap_l0_to_l1(l0p, va); + l1p = pmap_l0_to_l1(l0p, va); + } else { + *l0 = NULL; + l1p = pmap_l1(pmap, va); + } *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { @@ -948,6 +1004,7 @@ pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { + uint64_t id_aa64mmfr0_el1; vm_offset_t freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t start_pa, pa, min_pa; @@ -1036,6 +1093,35 @@ physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); + id_aa64mmfr0_el1 = READ_SPECIALREG(id_aa64mmfr0_el1); + switch 
(ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1)) { + case ID_AA64MMFR0_PARange_4G: + pa_range_bits = 32; + break; + case ID_AA64MMFR0_PARange_64G: + pa_range_bits = 36; + break; + case ID_AA64MMFR0_PARange_1T: + pa_range_bits = 40; + break; + case ID_AA64MMFR0_PARange_4T: + pa_range_bits = 42; + break; + case ID_AA64MMFR0_PARange_16T: + pa_range_bits = 44; + break; + case ID_AA64MMFR0_PARange_256T: + pa_range_bits = 48; + break; + default: + /* + * Unknown PA range bits, will lead to a panic if a stage 2 + * pmap starting at level 1 is created. + */ + pa_range_bits = 0; + break; + } + cpu_tlb_flushID(); } @@ -1619,10 +1705,12 @@ */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ - pd_entry_t *l0; + if (pmap->pm_stage == PM_STAGE1) { + pd_entry_t *l0; - l0 = pmap_l0(pmap, va); - pmap_clear(l0); + l0 = pmap_l0(pmap, va); + pmap_clear(l0); + } } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; @@ -1648,12 +1736,16 @@ pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ - pd_entry_t *l0, tl0; vm_page_t l1pg; + pd_entry_t *l0, tl0; - l0 = pmap_l0(pmap, va); - tl0 = pmap_load(l0); - l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); + if (pmap->pm_stage == PM_STAGE1) { + l0 = pmap_l0(pmap, va); + tl0 = pmap_load(l0); + l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); + } else { + l1pg = pmap_l1pg(pmap, va); + } pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); @@ -1728,12 +1820,48 @@ { vm_page_t m; + KASSERT((stage == PM_STAGE1 || stage == PM_STAGE2), + ("Invalid pmap stage %d", stage)); + KASSERT(!((stage == PM_STAGE2) && (pa_range_bits == 0)), + ("Unknown PARange bits")); + /* * allocate the l0 page */ - while ((m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) - vm_wait(NULL); + if (stage == PM_STAGE1) { + while ((m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 
+ vm_wait(NULL); + } else { + uint64_t npages; + uint64_t alignment; + + if (pa_range_bits <= L0_SHIFT) { + /* + * The level 1 translation table is not larger than a + * PM_STAGE1 level 1 table, use only one page. + */ + npages = 1; + alignment = PAGE_SIZE; + } else { + /* + * The level 1 translation table is larger than a + * regular PM_STAGE1 level 1 table, for every x bits + * that is larger we need 2^x pages and the table must + * be aligned at a 2^(x + 12) boundary. + * + * See Table D5-25 and Example D4-5 from the DDI0487B + * ARMv8 Architecture Manual for more information. + */ + npages = 1 << (pa_range_bits - L0_SHIFT); + alignment = 1 << (PAGE_SHIFT + pa_range_bits - L0_SHIFT); + } + while ((m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO, + npages, DMAP_MIN_PHYSADDR, DMAP_MAX_PHYSADDR, + alignment, 0, VM_MEMATTR_DEFAULT)) == NULL) + vm_wait(NULL); + } pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); @@ -1742,6 +1870,7 @@ pagezero(pmap->pm_l0); pmap->pm_root.rt_root = 0; + pmap->pm_stage = stage; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); @@ -1852,25 +1981,30 @@ pd_entry_t tl0; l1index = ptepindex - NUL2E; - l0index = l1index >> L0_ENTRIES_SHIFT; - - l0 = &pmap->pm_l0[l0index]; - tl0 = pmap_load(l0); - if (tl0 == 0) { - /* recurse for allocating page dir */ - if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, - lockp) == NULL) { - vm_page_unwire_noq(m); - vm_page_free_zero(m); - return (NULL); + if (pmap->pm_stage == PM_STAGE1) { + l0index = l1index >> L0_ENTRIES_SHIFT; + l0 = &pmap->pm_l0[l0index]; + tl0 = pmap_load(l0); + if (tl0 == 0) { + /* recurse for allocating page dir */ + if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + } else { + l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); + l1pg->ref_count++; } + + l1 = 
(pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); + l1 = &l1[ptepindex & Ln_ADDR_MASK]; } else { - l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); + l1pg = pmap_l1pg(pmap, (vm_offset_t)l1index << L1_SHIFT); l1pg->ref_count++; + l1 = &pmap->pm_l0[l1index & STAGE2_L1_ADDR_MASK]; } - - l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); - l1 = &l1[ptepindex & Ln_ADDR_MASK]; pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); } else { vm_pindex_t l0index, l1index; @@ -1878,24 +2012,40 @@ pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; - l0index = l1index >> L0_ENTRIES_SHIFT; - - l0 = &pmap->pm_l0[l0index]; - tl0 = pmap_load(l0); - if (tl0 == 0) { - /* recurse for allocating page dir */ - if (_pmap_alloc_l3(pmap, NUL2E + l1index, - lockp) == NULL) { - vm_page_unwire_noq(m); - vm_page_free_zero(m); - return (NULL); - } + if (pmap->pm_stage == PM_STAGE1) { + l0index = l1index >> L0_ENTRIES_SHIFT; + l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); - l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); - l1 = &l1[l1index & Ln_ADDR_MASK]; + if (tl0 == 0) { + /* recurse for allocating page dir */ + if (_pmap_alloc_l3(pmap, NUL2E + l1index, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + tl0 = pmap_load(l0); + l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); + l1 = &l1[l1index & Ln_ADDR_MASK]; + } else { + l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); + l1 = &l1[l1index & Ln_ADDR_MASK]; + tl1 = pmap_load(l1); + if (tl1 == 0) { + /* recurse for allocating page dir */ + if (_pmap_alloc_l3(pmap, NUL2E + l1index, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + } else { + l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); + l2pg->ref_count++; + } + } } else { - l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); - l1 = &l1[l1index & Ln_ADDR_MASK]; + l1 = &pmap->pm_l0[l1index & STAGE2_L1_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ @@ -2085,9 
+2235,27 @@ 
mtx_unlock_spin(&set->asid_set_mutex); } - m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); - vm_page_unwire_noq(m); - vm_page_free_zero(m); + if (pmap->pm_stage == PM_STAGE1) { + m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } else { + uint64_t i, page_cnt; + vm_paddr_t pa; + + if (pa_range_bits < L0_SHIFT) + page_cnt = 1; + else + page_cnt = 1 << (pa_range_bits - L0_SHIFT); + + pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0); + for (i = 0; i < page_cnt; i++) { + m = PHYS_TO_VM_PAGE(pa); + vm_page_unwire_noq(m); + vm_page_free_zero(m); + pa += PAGE_SIZE; + } + } } static int @@ -3003,12 +3171,14 @@ if (pmap->pm_stats.resident_count == 0) break; - l0 = pmap_l0(pmap, sva); - if (pmap_load(l0) == 0) { - va_next = (sva + L0_SIZE) & ~L0_OFFSET; - if (va_next < sva) - va_next = eva; - continue; + if (pmap->pm_stage == PM_STAGE1) { + l0 = pmap_l0(pmap, sva); + if (pmap_load(l0) == 0) { + va_next = (sva + L0_SIZE) & ~L0_OFFSET; + if (va_next < sva) + va_next = eva; + continue; + } } va_next = (sva + L1_SIZE) & ~L1_OFFSET; @@ -3862,33 +4032,19 @@ new_l3 |= ATTR_S1_UXN; if (pmap != kernel_pmap) new_l3 |= ATTR_S1_nG; - } else { - /* - * Clear the access flag on executable mappings, this will be - * set later when the page is accessed. The fault handler is - * required to invalidate the I-cache. - * - * TODO: Switch to the valid flag to allow hardware management - * of the access flag. Much of the pmap code assumes the - * valid flag is set and fails to destroy the old page tables - * correctly if it is clear. 
- */ - if (prot & VM_PROT_EXECUTE) - new_l3 &= ~ATTR_AF; - } - if ((m->oflags & VPO_UNMANAGED) == 0) { - new_l3 |= ATTR_SW_MANAGED; - if ((prot & VM_PROT_WRITE) != 0) { - new_l3 |= ATTR_SW_DBM; - if ((flags & VM_PROT_WRITE) == 0) { - if (pmap->pm_stage == PM_STAGE1) - new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); - else - new_l3 &= - ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); + if ((m->oflags & VPO_UNMANAGED) == 0) { + new_l3 |= ATTR_SW_MANAGED; + if ((prot & VM_PROT_WRITE) != 0) { + new_l3 |= ATTR_SW_DBM; + if ((flags & VM_PROT_WRITE) == 0) + new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); } } + } else { + new_l3 = (pd_entry_t)(pa | ATTR_ST2_DEFAULT | L3_PAGE); } + if ((flags & PMAP_ENTER_WIRED) != 0) + new_l3 |= ATTR_SW_WIRED; CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); @@ -3942,6 +4098,7 @@ } /* We need to allocate an L3 table. */ } + if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; Index: sys/arm64/arm64/pmap_guest.c =================================================================== --- /dev/null +++ sys/arm64/arm64/pmap_guest.c @@ -0,0 +1,5025 @@ +/*- + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * Copyright (c) 1994 David Greenman + * All rights reserved. + * Copyright (c) 2003 Peter Wemm + * All rights reserved. + * Copyright (c) 2005-2010 Alan L. Cox + * All rights reserved. + * Copyright (c) 2014 Andrew Turner + * All rights reserved. + * Copyright (c) 2014-2016 The FreeBSD Foundation + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * This software was developed by Andrew Turner under sponsorship from + * the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 + */ +/*- + * Copyright (c) 2003 Networks Associates Technology, Inc. + * All rights reserved. 
+ * + * This software was developed for the FreeBSD Project by Jake Burkholder, + * Safeport Network Services, and Network Associates Laboratories, the + * Security Research Division of Network Associates, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA + * CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * Manages physical address maps. + * + * Since the information managed by this module is + * also stored by the logical address mapping module, + * this module may throw away valid virtual-to-physical + * mappings at almost any time. However, invalidations + * of virtual-to-physical mappings must be done as + * requested. 
+ * + * In order to cope with hardware architectures which + * make virtual-to-physical map invalidates expensive, + * this module may delay invalidate or reduced protection + * operations until such time as they are actually + * necessary. This module is given full information as + * to which processors are currently using which maps, + * and to when physical maps must be made correct. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) +#define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) +#define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) +#define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) + +#define NUL0E L0_ENTRIES +#define NUL1E (NUL0E * NL1PG) +#define NUL2E (NUL1E * NL2PG) + +#if !defined(DIAGNOSTIC) +#ifdef __GNUC_GNU_INLINE__ +#define PMAP_INLINE __attribute__((__gnu_inline__)) inline +#else +#define PMAP_INLINE extern inline +#endif +#else +#define PMAP_INLINE +#endif + +/* + * These are configured by the mair_el1 register. 
This is set up in locore.S + */ +#define DEVICE_MEMORY 0 +#define UNCACHED_MEMORY 1 +#define CACHED_MEMORY 2 + + +#ifdef PV_STATS +#define PV_STAT(x) do { x ; } while (0) +#else +#define PV_STAT(x) do { } while (0) +#endif + +#define pmap_l2_pindex(v) ((v) >> L2_SHIFT) +#define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)]) + +#define NPV_LIST_LOCKS MAXCPU + +#define PHYS_TO_PV_LIST_LOCK(pa) \ + (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) + +#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ + struct rwlock **_lockp = (lockp); \ + struct rwlock *_new_lock; \ + \ + _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ + if (_new_lock != *_lockp) { \ + if (*_lockp != NULL) \ + rw_wunlock(*_lockp); \ + *_lockp = _new_lock; \ + rw_wlock(*_lockp); \ + } \ +} while (0) + +#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) + +#define RELEASE_PV_LIST_LOCK(lockp) do { \ + struct rwlock **_lockp = (lockp); \ + \ + if (*_lockp != NULL) { \ + rw_wunlock(*_lockp); \ + *_lockp = NULL; \ + } \ +} while (0) + +#define VM_PAGE_TO_PV_LIST_LOCK(m) \ + PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) + +struct pmap kernel_pmap_store; + +vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ +vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ +vm_offset_t kernel_vm_end = 0; + +struct msgbuf *msgbufp = NULL; + +/* + * Data for the pv entry allocation mechanism. + * Updates to pv_invl_gen are protected by the pv_list_locks[] + * elements, but reads are not. 
+ */ +static struct md_page *pv_table; +static struct md_page pv_dummy; + +vm_paddr_t dmap_phys_base; /* The start of the dmap region */ +vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ +vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ + +/* This code assumes all L1 DMAP entries will be used */ +CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); +CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); + +#define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) +extern pt_entry_t pagetable_dmap[]; + +static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); + +static int superpages_enabled = 1; +SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, + CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, + "Are large page mappings enabled?"); + +/* + * Data for the pv entry allocation mechanism + */ +static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); +static struct mtx pv_chunks_mutex; +static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; + +static void free_pv_chunk(struct pv_chunk *pc); +static void free_pv_entry(pmap_t pmap, pv_entry_t pv); +static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); +static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); +static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); +static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, + vm_offset_t va); + +static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode); +static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); +static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); +static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, + vm_offset_t va, struct rwlock **lockp); +static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); +static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, + vm_page_t 
m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); +static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, + pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); +static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, + pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); +static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, + vm_page_t m, struct rwlock **lockp); + +static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, + struct rwlock **lockp); + +static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, + struct spglist *free); +static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); +static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); + +static uint64_t pa_range_bits = 0; + +/* + * These load the old table data and store the new value. + * They need to be atomic as the System MMU may write to the table at + * the same time as the CPU. + */ +#define pmap_load_store(table, entry) atomic_swap_64(table, entry) +#define pmap_set(table, mask) atomic_set_64(table, mask) +#define pmap_load_clear(table) atomic_swap_64(table, 0) +#define pmap_load(table) (*table) + +/********************/ +/* Inline functions */ +/********************/ + +static __inline void +pagecopy(void *s, void *d) +{ + + memcpy(d, s, PAGE_SIZE); +} + +#define pmap_l0_index(va) (((va) >> L0_SHIFT) & L0_ADDR_MASK) +#define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) +#define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) +#define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) + +#define STAGE2_L1_ADDR_MASK ((1UL << (pa_range_bits - L1_SHIFT)) - 1) +#define pmap_stage2_l1_index(va) (((va) >> L1_SHIFT) & STAGE2_L1_ADDR_MASK) + +static __inline pd_entry_t * +pmap_l0(pmap_t pmap, vm_offset_t va) +{ + KASSERT(pmap->pm_type != PT_STAGE2, + ("Level 0 table is invalid for PT_STAGE2 pmap")); + return (&pmap->pm_l0[pmap_l0_index(va)]); +} + 
+static __inline pd_entry_t * +pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) +{ + pd_entry_t *l1; + + l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); + return (&l1[pmap_l1_index(va)]); +} + +static __inline pd_entry_t * +pmap_l1(pmap_t pmap, vm_offset_t va) +{ + if (pmap->pm_type == PT_STAGE2) + return (&pmap->pm_l0[pmap_stage2_l1_index(va)]); + + pd_entry_t *l0; + + l0 = pmap_l0(pmap, va); + if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) + return (NULL); + + return(pmap_l0_to_l1(l0, va)); +} + +static __inline vm_page_t +pmap_l1pg(pmap_t pmap, vm_offset_t va) +{ + if (pmap->pm_type == PT_STAGE1) { + pd_entry_t *l0, tl0; + + l0 = pmap_l0(pmap, va); + tl0 = pmap_load(l0); + + return (PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK)); + } else { + vm_paddr_t pa, pa_offset; + + /* + * The offset will be the bits + * [pa_range_bits-1:L0_SHIFT] + */ + va = va & ((1 << pa_range_bits) - 1); /* NOTE(review): int-width shift is undefined once pa_range_bits >= 32; switch to 1UL only after confirming every pmap_l1pg() caller passes a VA, not a table index */ + pa_offset = va >> L0_SHIFT; + pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0) + \ + (pa_offset << PAGE_SHIFT); + + return (PHYS_TO_VM_PAGE(pa)); + } +} + +static __inline pd_entry_t * +pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) +{ + pd_entry_t *l2; + + l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); + return (&l2[pmap_l2_index(va)]); +} + +static __inline pd_entry_t * +pmap_l2(pmap_t pmap, vm_offset_t va) +{ + pd_entry_t *l1; + + l1 = pmap_l1(pmap, va); + if (l1 == NULL || (pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) + return (NULL); + + return (pmap_l1_to_l2(l1, va)); +} + +static __inline pt_entry_t * +pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) +{ + pt_entry_t *l3; + + l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); + return (&l3[pmap_l3_index(va)]); +} + +/* + * Returns the lowest valid pde for a given virtual address. + * The next level may or may not point to a valid page or block.
+ */ +static __inline pd_entry_t * +pmap_pde(pmap_t pmap, vm_offset_t va, int *level) +{ + pd_entry_t *l0, *l1, *l2, desc; + + if (pmap->pm_type == PT_STAGE1) { + l0 = pmap_l0(pmap, va); + desc = pmap_load(l0) & ATTR_DESCR_MASK; + if (desc != L0_TABLE) { + *level = -1; + return (NULL); + } + + l1 = pmap_l0_to_l1(l0, va); + desc = pmap_load(l1) & ATTR_DESCR_MASK; + if (desc != L1_TABLE) { + *level = 0; + return (l0); + } + } else { + l1 = pmap_l1(pmap, va); + desc = pmap_load(l1) & ATTR_DESCR_MASK; + if (desc != L1_TABLE) { + /* For PT_STAGE2 mappings the first level is level 1 */ + *level = -1; + return (NULL); + } + } + + l2 = pmap_l1_to_l2(l1, va); + desc = pmap_load(l2) & ATTR_DESCR_MASK; + if (desc != L2_TABLE) { + *level = 1; + return (l1); + } + + *level = 2; + return (l2); +} + +/* + * Returns the lowest valid pte block or table entry for a given virtual + * address. If there are no valid entries return NULL and set the level to + * the first invalid level. + */ +static __inline pt_entry_t * +pmap_pte(pmap_t pmap, vm_offset_t va, int *level) +{ + pd_entry_t *l1, *l2, desc; + pt_entry_t *l3; + + l1 = pmap_l1(pmap, va); + if (l1 == NULL) { + *level = 0; + return (NULL); + } + desc = pmap_load(l1) & ATTR_DESCR_MASK; + if (desc == L1_BLOCK) { + *level = 1; + return (l1); + } + + if (desc != L1_TABLE) { + *level = 1; + return (NULL); + } + + l2 = pmap_l1_to_l2(l1, va); + desc = pmap_load(l2) & ATTR_DESCR_MASK; + if (desc == L2_BLOCK) { + *level = 2; + return (l2); + } + + if (desc != L2_TABLE) { + *level = 2; + return (NULL); + } + + *level = 3; + l3 = pmap_l2_to_l3(l2, va); + if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) + return (NULL); + + return (l3); +} + +static inline bool +pmap_superpages_enabled(void) +{ + + return (superpages_enabled != 0); +} + +bool +pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, + pd_entry_t **l2, pt_entry_t **l3) +{ + pd_entry_t *l0p, *l1p, *l2p; + + if (pmap->pm_l0 == NULL) + return (false); + + 
if (pmap->pm_type == PT_STAGE1) {
+		l0p = pmap_l0(pmap, va);
+		*l0 = l0p;
+
+		if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
+			return (false);
+
+		l1p = pmap_l0_to_l1(l0p, va);
+	} else {
+		/* There is no level 0 entry to report for this pmap type. */
+		*l0 = NULL;
+		l1p = pmap_l1(pmap, va);
+	}
+	*l1 = l1p;
+
+	/* A block mapping at L1 ends the walk; no L2/L3 entries exist. */
+	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
+		*l2 = NULL;
+		*l3 = NULL;
+		return (true);
+	}
+
+	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
+		return (false);
+
+	l2p = pmap_l1_to_l2(l1p, va);
+	*l2 = l2p;
+
+	/* A block mapping at L2 ends the walk; no L3 entry exists. */
+	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
+		*l3 = NULL;
+		return (true);
+	}
+
+	/*
+	 * Anything other than a table descriptor means there is no L3 table
+	 * to walk; an L3 pointer derived from such an entry would be bogus,
+	 * so fail the same way the L0 and L1 checks above do.
+	 */
+	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
+		return (false);
+
+	*l3 = pmap_l2_to_l3(l2p, va);
+
+	return (true);
+}
+
+/*
+ * Returns non-zero when the given L3 entry value is a page descriptor.
+ */
+static __inline int
+pmap_l3_valid(pt_entry_t l3)
+{
+
+	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
+}
+
+
+CTASSERT(L1_BLOCK == L2_BLOCK);
+
+/*
+ * Checks if the page is dirty. We currently lack proper tracking of this on
+ * arm64, so for now assume that a page which is mapped read/write and has
+ * been accessed is dirty.
+ */
+static inline int
+pmap_page_dirty(pt_entry_t pte)
+{
+
+	return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) ==
+	    (ATTR_AF | ATTR_AP(ATTR_AP_RW)));
+}
+
+/*
+ * Add count to the pmap's resident page count.  The pmap lock must be held.
+ */
+static __inline void
+pmap_resident_count_inc(pmap_t pmap, int count)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	pmap->pm_stats.resident_count += count;
+}
+
+/*
+ * Subtract count from the pmap's resident page count, asserting that the
+ * count does not underflow.  The pmap lock must be held.
+ */
+static __inline void
+pmap_resident_count_dec(pmap_t pmap, int count)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT(pmap->pm_stats.resident_count >= count,
+	    ("pmap %p resident count underflow %ld %d", pmap,
+	    pmap->pm_stats.resident_count, count));
+	pmap->pm_stats.resident_count -= count;
+}
+
+static pt_entry_t *
+pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
+    u_int *l2_slot)
+{
+	pt_entry_t *l2;
+	pd_entry_t *l1;
+
+	l1 = (pd_entry_t *)l1pt;
+	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
+
+	/* Check locore has used a table L1 map */
+	KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE,
+	    ("Invalid bootstrap L1 table"));
+	/* Find the address of the L2 table */
+	l2 = (pt_entry_t *)init_pt_va;
+	*l2_slot =
pmap_l2_index(va); + + return (l2); +} + +static vm_paddr_t +pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) +{ + u_int l1_slot, l2_slot; + pt_entry_t *l2; + + l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); + + return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); +} + +static void +pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) +{ + vm_offset_t va; + vm_paddr_t pa; + u_int l1_slot; + + pa = dmap_phys_base = min_pa & ~L1_OFFSET; + va = DMAP_MIN_ADDRESS; + for (; va < DMAP_MAX_ADDRESS && pa < max_pa; + pa += L1_SIZE, va += L1_SIZE, l1_slot++) { + l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); + + pmap_load_store(&pagetable_dmap[l1_slot], + (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN | + ATTR_IDX(CACHED_MEMORY) | L1_BLOCK); + } + + /* Set the upper limit of the DMAP region */ + dmap_phys_max = pa; + dmap_max_addr = va; + + cpu_tlb_flushID(); +} + +static vm_offset_t +pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) +{ + vm_offset_t l2pt; + vm_paddr_t pa; + pd_entry_t *l1; + u_int l1_slot; + + KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); + + l1 = (pd_entry_t *)l1pt; + l1_slot = pmap_l1_index(va); + l2pt = l2_start; + + for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { + KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); + pa = pmap_early_vtophys(l1pt, l2pt); + pmap_load_store(&l1[l1_slot], + (pa & ~Ln_TABLE_MASK) | L1_TABLE); + l2pt += PAGE_SIZE; + } + + + /* Clean the L2 page table */ + memset((void *)l2_start, 0, l2pt - l2_start); + + return l2pt; +} + +static vm_offset_t +pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) +{ + vm_offset_t l2pt, l3pt; + vm_paddr_t pa; + pd_entry_t *l2; + u_int l2_slot; + + KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); + + l2 = pmap_l2(kernel_pmap, va); + l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); + l2pt = (vm_offset_t)l2; + l2_slot = pmap_l2_index(va); + l3pt = l3_start; + + for 
(; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { + KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); + + pa = pmap_early_vtophys(l1pt, l3pt); + pmap_load_store(&l2[l2_slot], + (pa & ~Ln_TABLE_MASK) | L2_TABLE); + l3pt += PAGE_SIZE; + } + + /* Clean the L2 page table */ + memset((void *)l3_start, 0, l3pt - l3_start); + + return l3pt; +} + +/* + * Bootstrap the system enough to run with virtual memory. + */ +void +pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, + vm_size_t kernlen) +{ + u_int l1_slot, l2_slot, avail_slot, map_slot, used_map_slot; + uint64_t kern_delta; + uint64_t id_aa64mmfr0_el1; + pt_entry_t *l2; + vm_offset_t va, freemempos; + vm_offset_t dpcpu, msgbufpv; + vm_paddr_t pa, max_pa, min_pa; + int i; + + kern_delta = KERNBASE - kernstart; + physmem = 0; + + printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); + printf("%lx\n", l1pt); + printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); + + /* Set this early so we can use the pagetable walking functions */ + kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; + PMAP_LOCK_INIT(kernel_pmap); + + /* Assume the address we were loaded to is a valid physical address */ + min_pa = max_pa = KERNBASE - kern_delta; + + /* + * Find the minimum physical address. physmap is sorted, + * but may contain empty ranges. + */ + for (i = 0; i < (physmap_idx * 2); i += 2) { + if (physmap[i] == physmap[i + 1]) + continue; + if (physmap[i] <= min_pa) + min_pa = physmap[i]; + if (physmap[i + 1] > max_pa) + max_pa = physmap[i + 1]; + } + + /* Create a direct map region early so we can use it for pa -> va */ + pmap_bootstrap_dmap(l1pt, min_pa, max_pa); + + va = KERNBASE; + pa = KERNBASE - kern_delta; + + /* + * Start to initialise phys_avail by copying from physmap + * up to the physical address KERNBASE points at. 
+	 */
+	map_slot = avail_slot = 0;
+	for (; map_slot < (physmap_idx * 2) &&
+	    avail_slot < (PHYS_AVAIL_SIZE - 2); map_slot += 2) {
+		/* Skip empty ranges. */
+		if (physmap[map_slot] == physmap[map_slot + 1])
+			continue;
+
+		/* Stop at the range that contains the kernel. */
+		if (physmap[map_slot] <= pa &&
+		    physmap[map_slot + 1] > pa)
+			break;
+
+		phys_avail[avail_slot] = physmap[map_slot];
+		phys_avail[avail_slot + 1] = physmap[map_slot + 1];
+		physmem += (phys_avail[avail_slot + 1] -
+		    phys_avail[avail_slot]) >> PAGE_SHIFT;
+		avail_slot += 2;
+	}
+
+	/*
+	 * Add the memory before the kernel.
+	 *
+	 * The range being split is physmap[map_slot], so that is the entry
+	 * to test; avail_slot lags behind map_slot whenever an empty range
+	 * was skipped above.  map_slot is only a valid index when the loop
+	 * stopped before running off the end of physmap.
+	 */
+	if (map_slot < (physmap_idx * 2) && physmap[map_slot] < pa &&
+	    avail_slot < (PHYS_AVAIL_SIZE - 2)) {
+		phys_avail[avail_slot] = physmap[map_slot];
+		phys_avail[avail_slot + 1] = pa;
+		physmem += (phys_avail[avail_slot + 1] -
+		    phys_avail[avail_slot]) >> PAGE_SHIFT;
+		avail_slot += 2;
+	}
+	used_map_slot = map_slot;
+
+	/*
+	 * Read the page table to find out what is already mapped.
+	 * This assumes we have mapped a block of memory from KERNBASE
+	 * using a single L1 entry.
+	 */
+	l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
+
+	/* Sanity check the index, KERNBASE should be the first VA */
+	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
+
+	/* Find how many pages we have mapped */
+	for (; l2_slot < Ln_ENTRIES; l2_slot++) {
+		if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0)
+			break;
+
+		/* Check locore used L2 blocks */
+		KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK,
+		    ("Invalid bootstrap L2 table"));
+		KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa,
+		    ("Incorrect PA in L2 table"));
+
+		va += L2_SIZE;
+		pa += L2_SIZE;
+	}
+
+	va = roundup2(va, L1_SIZE);
+
+	/* Free memory begins right after the loaded kernel image. */
+	freemempos = KERNBASE + kernlen;
+
+	/* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */
+	freemempos = pmap_bootstrap_l2(l1pt, va, freemempos);
+
+	/* And the l3 tables for the early devmap */
+	freemempos = pmap_bootstrap_l3(l1pt,
+	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
+
+	cpu_tlb_flushID();
+
+
+/* Carve np zeroed pages off freemempos and store their address in var. */
+#define alloc_pages(var, np)						\
+	(var) = freemempos;						\
+	freemempos += ((np) * PAGE_SIZE);				\
+ memset((char *)(var), 0, ((np) * PAGE_SIZE)); + + /* Allocate dynamic per-cpu area. */ + alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); + dpcpu_init((void *)dpcpu, 0); + + /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ + alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); + msgbufp = (void *)msgbufpv; + + virtual_avail = roundup2(freemempos, L1_SIZE); + virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; + kernel_vm_end = virtual_avail; + + pa = pmap_early_vtophys(l1pt, freemempos); + + /* Finish initialising physmap */ + map_slot = used_map_slot; + for (; avail_slot < (PHYS_AVAIL_SIZE - 2) && + map_slot < (physmap_idx * 2); map_slot += 2) { + if (physmap[map_slot] == physmap[map_slot + 1]) + continue; + + /* Have we used the current range? */ + if (physmap[map_slot + 1] <= pa) + continue; + + /* Do we need to split the entry? */ + if (physmap[map_slot] < pa) { + phys_avail[avail_slot] = pa; + phys_avail[avail_slot + 1] = physmap[map_slot + 1]; + } else { + phys_avail[avail_slot] = physmap[map_slot]; + phys_avail[avail_slot + 1] = physmap[map_slot + 1]; + } + physmem += (phys_avail[avail_slot + 1] - + phys_avail[avail_slot]) >> PAGE_SHIFT; + + avail_slot += 2; + } + phys_avail[avail_slot] = 0; + phys_avail[avail_slot + 1] = 0; + + /* + * Maxmem isn't the "maximum memory", it's one larger than the + * highest page of the physical address space. It should be + * called something like "Maxphyspage". 
+ */ + Maxmem = atop(phys_avail[avail_slot - 1]); + + id_aa64mmfr0_el1 = READ_SPECIALREG(id_aa64mmfr0_el1); + switch (ID_AA64MMFR0_PA_RANGE(id_aa64mmfr0_el1)) { + case ID_AA64MMFR0_PA_RANGE_4G: + pa_range_bits = 32; + break; + case ID_AA64MMFR0_PA_RANGE_64G: + pa_range_bits = 36; + break; + case ID_AA64MMFR0_PA_RANGE_1T: + pa_range_bits = 40; + break; + case ID_AA64MMFR0_PA_RANGE_4T: + pa_range_bits = 42; + break; + case ID_AA64MMFR0_PA_RANGE_16T: + pa_range_bits = 44; + break; + case ID_AA64MMFR0_PA_RANGE_256T: + pa_range_bits = 48; + break; + default: + /* + * Unknown PA range bits, will lead to a panic if a stage 2 + * pmap starting at level 1 is created. + */ + pa_range_bits = 0; + break; + } + + cpu_tlb_flushID(); +} + +/* + * Initialize a vm_page's machine-dependent fields. + */ +void +pmap_page_init(vm_page_t m) +{ + + TAILQ_INIT(&m->md.pv_list); + m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; +} + +/* + * Initialize the pmap module. + * Called by vm_init, to initialize any structures that the pmap + * system needs to map virtual memory. + */ +void +pmap_init(void) +{ + vm_size_t s; + int i, pv_npg; + + /* + * Are large page mappings enabled? + */ + TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); + + /* + * Initialize the pv chunk list mutex. + */ + mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); + + /* + * Initialize the pool of pv list locks. + */ + for (i = 0; i < NPV_LIST_LOCKS; i++) + rw_init(&pv_list_locks[i], "pmap pv list"); + + /* + * Calculate the size of the pv head table for superpages. + */ + pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); + + /* + * Allocate memory for the pv head table for superpages. 
+ */ + s = (vm_size_t)(pv_npg * sizeof(struct md_page)); + s = round_page(s); + pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, + M_WAITOK | M_ZERO); + for (i = 0; i < pv_npg; i++) + TAILQ_INIT(&pv_table[i].pv_list); + TAILQ_INIT(&pv_dummy.pv_list); +} + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, + "2MB page mapping counters"); + +static u_long pmap_l2_demotions; +SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_l2_demotions, 0, "2MB page demotions"); + +static u_long pmap_l2_p_failures; +SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, + &pmap_l2_p_failures, 0, "2MB page promotion failures"); + +static u_long pmap_l2_promotions; +SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, + &pmap_l2_promotions, 0, "2MB page promotions"); + +/* + * Invalidate a single TLB entry. + */ +PMAP_INLINE void +pmap_invalidate_page(pmap_t pmap, vm_offset_t va) +{ + + sched_pin(); + __asm __volatile( + "dsb ishst \n" + "tlbi vaae1is, %0 \n" + "dsb ish \n" + "isb \n" + : : "r"(va >> PAGE_SHIFT)); + sched_unpin(); +} + +PMAP_INLINE void +pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t addr; + + sched_pin(); + dsb(ishst); + for (addr = sva; addr < eva; addr += PAGE_SIZE) { + __asm __volatile( + "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT)); + } + __asm __volatile( + "dsb ish \n" + "isb \n"); + sched_unpin(); +} + +PMAP_INLINE void +pmap_invalidate_all(pmap_t pmap) +{ + + sched_pin(); + __asm __volatile( + "dsb ishst \n" + "tlbi vmalle1is \n" + "dsb ish \n" + "isb \n"); + sched_unpin(); +} + +/* + * Routine: pmap_extract + * Function: + * Extract the physical page address associated + * with the given map/virtual_address pair. + */ +vm_paddr_t +pmap_extract(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *pte, tpte; + vm_paddr_t pa; + int lvl; + + pa = 0; + PMAP_LOCK(pmap); + /* + * Find the block or page map for this virtual address. 
pmap_pte + * will return either a valid block/page entry, or NULL. + */ + pte = pmap_pte(pmap, va, &lvl); + if (pte != NULL) { + tpte = pmap_load(pte); + pa = tpte & ~ATTR_MASK; + switch(lvl) { + case 1: + KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, + ("pmap_extract: Invalid L1 pte found: %lx", + tpte & ATTR_DESCR_MASK)); + pa |= (va & L1_OFFSET); + break; + case 2: + KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, + ("pmap_extract: Invalid L2 pte found: %lx", + tpte & ATTR_DESCR_MASK)); + pa |= (va & L2_OFFSET); + break; + case 3: + KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, + ("pmap_extract: Invalid L3 pte found: %lx", + tpte & ATTR_DESCR_MASK)); + pa |= (va & L3_OFFSET); + break; + } + } + PMAP_UNLOCK(pmap); + return (pa); +} + +/* + * Routine: pmap_extract_and_hold + * Function: + * Atomically extract and hold the physical page + * with the given pmap and virtual address pair + * if that mapping permits the given protection. + */ +vm_page_t +pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + pt_entry_t *pte, tpte; + vm_offset_t off; + vm_paddr_t pa; + vm_page_t m; + int lvl; + + pa = 0; + m = NULL; + PMAP_LOCK(pmap); +retry: + pte = pmap_pte(pmap, va, &lvl); + if (pte != NULL) { + tpte = pmap_load(pte); + + KASSERT(lvl > 0 && lvl <= 3, + ("pmap_extract_and_hold: Invalid level %d", lvl)); + CTASSERT(L1_BLOCK == L2_BLOCK); + KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || + (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), + ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, + tpte & ATTR_DESCR_MASK)); + if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) || + ((prot & VM_PROT_WRITE) == 0)) { + switch(lvl) { + case 1: + off = va & L1_OFFSET; + break; + case 2: + off = va & L2_OFFSET; + break; + case 3: + default: + off = 0; + } + if (vm_page_pa_tryrelock(pmap, + (tpte & ~ATTR_MASK) | off, &pa)) + goto retry; + m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); + vm_page_hold(m); + } + } + PA_UNLOCK_COND(pa); + 
PMAP_UNLOCK(pmap); + return (m); +} + +vm_paddr_t +pmap_kextract(vm_offset_t va) +{ + pt_entry_t *pte, tpte; + vm_paddr_t pa; + int lvl; + + if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { + pa = DMAP_TO_PHYS(va); + } else { + pa = 0; + pte = pmap_pte(kernel_pmap, va, &lvl); + if (pte != NULL) { + tpte = pmap_load(pte); + pa = tpte & ~ATTR_MASK; + switch(lvl) { + case 1: + KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, + ("pmap_kextract: Invalid L1 pte found: %lx", + tpte & ATTR_DESCR_MASK)); + pa |= (va & L1_OFFSET); + break; + case 2: + KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, + ("pmap_kextract: Invalid L2 pte found: %lx", + tpte & ATTR_DESCR_MASK)); + pa |= (va & L2_OFFSET); + break; + case 3: + KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, + ("pmap_kextract: Invalid L3 pte found: %lx", + tpte & ATTR_DESCR_MASK)); + pa |= (va & L3_OFFSET); + break; + } + } + } + return (pa); +} + +/*************************************************** + * Low level mapping routines..... + ***************************************************/ + +static void +pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) +{ + pd_entry_t *pde; + pt_entry_t *pte, attr; + vm_offset_t va; + int lvl; + + KASSERT((pa & L3_OFFSET) == 0, + ("pmap_kenter: Invalid physical address")); + KASSERT((sva & L3_OFFSET) == 0, + ("pmap_kenter: Invalid virtual address")); + KASSERT((size & PAGE_MASK) == 0, + ("pmap_kenter: Mapping is not page-sized")); + + attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE; + if (mode == DEVICE_MEMORY) + attr |= ATTR_XN; + + va = sva; + while (size != 0) { + pde = pmap_pde(kernel_pmap, va, &lvl); + KASSERT(pde != NULL, + ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); + KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); + + pte = pmap_l2_to_l3(pde, va); + pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); + + va += PAGE_SIZE; + pa += PAGE_SIZE; + size -= PAGE_SIZE; + } + pmap_invalidate_range(kernel_pmap, sva, va); +} + +void 
+pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) +{ + + pmap_kenter(sva, size, pa, DEVICE_MEMORY); +} + +/* + * Remove a page from the kernel pagetables. + */ +PMAP_INLINE void +pmap_kremove(vm_offset_t va) +{ + pt_entry_t *pte; + int lvl; + + pte = pmap_pte(kernel_pmap, va, &lvl); + KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); + KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); + + pmap_load_clear(pte); + pmap_invalidate_page(kernel_pmap, va); +} + +void +pmap_kremove_device(vm_offset_t sva, vm_size_t size) +{ + pt_entry_t *pte; + vm_offset_t va; + int lvl; + + KASSERT((sva & L3_OFFSET) == 0, + ("pmap_kremove_device: Invalid virtual address")); + KASSERT((size & PAGE_MASK) == 0, + ("pmap_kremove_device: Mapping is not page-sized")); + + va = sva; + while (size != 0) { + pte = pmap_pte(kernel_pmap, va, &lvl); + KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); + KASSERT(lvl == 3, + ("Invalid device pagetable level: %d != 3", lvl)); + pmap_load_clear(pte); + + va += PAGE_SIZE; + size -= PAGE_SIZE; + } + pmap_invalidate_range(kernel_pmap, sva, va); +} + +/* + * Used to map a range of physical addresses into kernel + * virtual address space. + * + * The value passed in '*virt' is a suggested virtual address for + * the mapping. Architectures which can support a direct-mapped + * physical to virtual region can return the appropriate address + * within that region, leaving '*virt' unchanged. Other + * architectures should map the pages starting at '*virt' and + * update '*virt' with the first usable address after the mapped + * region. + */ +vm_offset_t +pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) +{ + return PHYS_TO_DMAP(start); +} + + +/* + * Add a list of wired pages to the kva + * this routine is only used for temporary + * kernel mappings that do not need to have + * page modification or references recorded. + * Note that old mappings are simply written + * over. 
The page *must* be wired. + * Note: SMP coherent. Uses a ranged shootdown IPI. + */ +void +pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) +{ + pd_entry_t *pde; + pt_entry_t *pte, pa; + vm_offset_t va; + vm_page_t m; + int i, lvl; + + va = sva; + for (i = 0; i < count; i++) { + pde = pmap_pde(kernel_pmap, va, &lvl); + KASSERT(pde != NULL, + ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); + KASSERT(lvl == 2, + ("pmap_qenter: Invalid level %d", lvl)); + + m = ma[i]; + pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | + ATTR_IDX(m->md.pv_memattr) | L3_PAGE; + if (m->md.pv_memattr == DEVICE_MEMORY) + pa |= ATTR_XN; + pte = pmap_l2_to_l3(pde, va); + pmap_load_store(pte, pa); + + va += L3_SIZE; + } + pmap_invalidate_range(kernel_pmap, sva, va); +} + +/* + * This routine tears out page mappings from the + * kernel -- it is meant only for temporary mappings. + */ +void +pmap_qremove(vm_offset_t sva, int count) +{ + pt_entry_t *pte; + vm_offset_t va; + int lvl; + + KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); + + va = sva; + while (count-- > 0) { + pte = pmap_pte(kernel_pmap, va, &lvl); + KASSERT(lvl == 3, + ("Invalid device pagetable level: %d != 3", lvl)); + if (pte != NULL) { + pmap_load_clear(pte); + } + + va += PAGE_SIZE; + } + pmap_invalidate_range(kernel_pmap, sva, va); +} + +/*************************************************** + * Page table page management routines..... + ***************************************************/ +static __inline void +pmap_free_zero_pages(struct spglist *free) +{ + vm_page_t m; + + while ((m = SLIST_FIRST(free)) != NULL) { + SLIST_REMOVE_HEAD(free, plinks.s.ss); + /* Preserve the page's PG_ZERO setting. */ + vm_page_free_toq(m); + } +} + +/* + * Schedule the specified unused page table page to be freed. Specifically, + * add the page to the specified list of pages that will be released to the + * physical memory manager after the TLB has been updated. 
+ */ +static __inline void +pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, + boolean_t set_PG_ZERO) +{ + + if (set_PG_ZERO) + m->flags |= PG_ZERO; + else + m->flags &= ~PG_ZERO; + SLIST_INSERT_HEAD(free, m, plinks.s.ss); +} + +/* + * Decrements a page table page's wire count, which is used to record the + * number of valid page table entries within the page. If the wire count + * drops to zero, then the page table page is unmapped. Returns TRUE if the + * page table page was unmapped and FALSE otherwise. + */ +static inline boolean_t +pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +{ + + --m->wire_count; + if (m->wire_count == 0) { + _pmap_unwire_l3(pmap, va, m, free); + return (TRUE); + } else + return (FALSE); +} + +static void +_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* + * unmap the page table page + */ + if (m->pindex >= (NUL2E + NUL1E)) { + /* l1 page */ + pd_entry_t *l0; + + l0 = pmap_l0(pmap, va); + pmap_load_clear(l0); + } else if (m->pindex >= NUL2E) { + /* l2 page */ + pd_entry_t *l1; + + l1 = pmap_l1(pmap, va); + pmap_load_clear(l1); + } else { + /* l3 page */ + pd_entry_t *l2; + + l2 = pmap_l2(pmap, va); + pmap_load_clear(l2); + } + pmap_resident_count_dec(pmap, 1); + if (m->pindex < NUL2E) { + /* We just released an l3, unhold the matching l2 */ + pd_entry_t *l1, tl1; + vm_page_t l2pg; + + l1 = pmap_l1(pmap, va); + tl1 = pmap_load(l1); + l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); + pmap_unwire_l3(pmap, va, l2pg, free); + } else if (m->pindex < (NUL2E + NUL1E)) { + /* We just released an l2, unhold the matching l1 */ + vm_page_t l1pg; + l1pg = pmap_l1pg(pmap, va); + pmap_unwire_l3(pmap, va, l1pg, free); + } + pmap_invalidate_page(pmap, va); + + /* + * This is a release store so that the ordinary store unmapping + * the page table page is globally performed before TLB shoot- + * down is begun. 
+	 */
+	atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
+
+	/*
+	 * Put page on a list so that it is released after
+	 * *ALL* TLB shootdown is done
+	 */
+	pmap_add_delayed_free_list(m, free, TRUE);
+}
+
+/*
+ * After removing a page table entry, this routine is used to
+ * conditionally free the page, and manage the hold/wire counts.
+ *
+ * Nothing is done for addresses at or above VM_MAXUSER_ADDRESS.  Otherwise
+ * the page table page referenced by ptepde is unwired and the result of
+ * pmap_unwire_l3() is returned.
+ */
+static int
+pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
+    struct spglist *free)
+{
+	vm_page_t mpte;
+
+	if (va >= VM_MAXUSER_ADDRESS)
+		return (0);
+	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
+	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
+	return (pmap_unwire_l3(pmap, va, mpte, free));
+}
+
+/*
+ * Initialize a pmap that shares the kernel pmap's level 0 table.
+ */
+void
+pmap_pinit0(pmap_t pmap)
+{
+
+	PMAP_LOCK_INIT(pmap);
+	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
+	pmap->pm_l0 = kernel_pmap->pm_l0;
+	pmap->pm_root.rt_root = 0;
+	/*
+	 * pm_l0 is a level 0 table here, so the table walkers must treat
+	 * this pmap as stage 1.  Set the type explicitly, as
+	 * pmap_pinit_type() does.
+	 * NOTE(review): presumably this used to rely on the pmap being
+	 * zeroed and PT_STAGE1 being 0 -- confirm.
+	 */
+	pmap->pm_type = PT_STAGE1;
+}
+
+/*
+ * Initialize a pmap of the given type by allocating its top level table.
+ *
+ * A PT_STAGE1 pmap gets a single level 0 page.  Any other type gets a
+ * physically contiguous level 1 table whose size and alignment depend on
+ * pa_range_bits.  Every page of the table is zeroed before use.  Sleeps in
+ * VM_WAIT until the allocation succeeds and always returns 1.
+ */
+int
+pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type)
+{
+	vm_paddr_t l0phys;
+	vm_page_t l0pt, m;
+	uint64_t alignment, i, npages;
+
+	KASSERT(pm_type < PT_INVALID, ("Unknown pmap type"));
+	KASSERT(!((pm_type == PT_STAGE2) && (pa_range_bits == 0)),
+	    ("Unknown PARange bits"));
+
+	/*
+	 * allocate the l0 page
+	 */
+	if (pm_type == PT_STAGE1) {
+		npages = 1;
+		while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
+		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
+			VM_WAIT;
+	} else {
+		if (pa_range_bits <= L0_SHIFT) {
+			/*
+			 * The level 1 translation table is not larger than a
+			 * PT_STAGE1 level 1 table, use only one page.
+			 */
+			npages = 1;
+			alignment = PAGE_SIZE;
+		} else {
+			/*
+			 * The level 1 translation table is larger than a
+			 * regular PT_STAGE1 level 1 table, for every x bits
+			 * that is larger we need 2^x pages and the table must
+			 * be aligned at a 2^(x + 12) boundary.
+			 *
+			 * See Table D5-25 and Example D4-5 from the DDI0487B
+			 * ARMv8 Architecture Manual for more information.
+			 */
+			npages = 1UL << (pa_range_bits - L0_SHIFT);
+			alignment = 1UL <<
+			    (PAGE_SHIFT + pa_range_bits - L0_SHIFT);
+		}
+		while ((l0pt = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL |
+		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO,
+		    npages, DMAP_MIN_PHYSADDR, DMAP_MAX_PHYSADDR,
+		    alignment, 0, VM_MEMATTR_DEFAULT)) == NULL)
+			VM_WAIT;
+	}
+
+	l0phys = VM_PAGE_TO_PHYS(l0pt);
+	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys);
+
+	/*
+	 * VM_ALLOC_ZERO is only a preference, so check PG_ZERO on every page
+	 * of the table, not just the first, and zero the ones that were not
+	 * handed out pre-zeroed.
+	 */
+	for (i = 0; i < npages; i++) {
+		m = PHYS_TO_VM_PAGE(l0phys + i * PAGE_SIZE);
+		if ((m->flags & PG_ZERO) == 0)
+			pagezero((char *)pmap->pm_l0 + i * PAGE_SIZE);
+	}
+
+	pmap->pm_root.rt_root = 0;
+	pmap->pm_type = pm_type;
+	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
+
+	return (1);
+}
+
+/*
+ * Initialize a PT_STAGE1 pmap.
+ */
+int
+pmap_pinit(pmap_t pmap)
+{
+	return (pmap_pinit_type(pmap, PT_STAGE1));
+}
+
+/*
+ * This routine is called if the desired page table page does not exist.
+ *
+ * If page table page allocation fails, this routine may sleep before
+ * returning NULL.  It sleeps only if a lock pointer was given.
+ *
+ * Note: If a page allocation fails at page table level two or three,
+ * one or two pages may be held during the wait, only to be released
+ * afterwards.  This conservative approach is easily argued to avoid
+ * race conditions.
+ */
+static vm_page_t
+_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
+{
+	vm_page_t m, l1pg, l2pg;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+	/*
+	 * Allocate a page table page.
+	 */
+	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
+	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
+		if (lockp != NULL) {
+			RELEASE_PV_LIST_LOCK(lockp);
+			PMAP_UNLOCK(pmap);
+			VM_WAIT;
+			PMAP_LOCK(pmap);
+		}
+
+		/*
+		 * Indicate the need to retry.  While waiting, the page table
+		 * page may have been allocated.
+		 */
+		return (NULL);
+	}
+	if ((m->flags & PG_ZERO) == 0)
+		pmap_zero_page(m);
+
+	/*
+	 * Map the pagetable page into the process address space, if
+	 * it isn't already there.
+	 */
+
+	if (ptepindex >= (NUL2E + NUL1E)) {
+		/* The new page is an L1 table: hook it into the L0 table. */
+		pd_entry_t *l0;
+		vm_pindex_t l0index;
+
+		l0index = ptepindex - (NUL2E + NUL1E);
+		l0 = &pmap->pm_l0[l0index];
+		pmap_load_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE);
+	} else if (ptepindex >= NUL2E) {
+		/* The new page is an L2 table: hook it into an L1 table. */
+		vm_pindex_t l0index, l1index;
+		pd_entry_t *l0, *l1;
+		pd_entry_t tl0;
+
+		l1index = ptepindex - NUL2E;
+		if (pmap->pm_type == PT_STAGE1) {
+			l0index = l1index >> L0_ENTRIES_SHIFT;
+			l0 = &pmap->pm_l0[l0index];
+			tl0 = pmap_load(l0);
+			if (tl0 == 0) {
+				/* recurse for allocating page dir */
+				if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
+				    lockp) == NULL) {
+					--m->wire_count;
+					/* XXX: release mem barrier? */
+					atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+					vm_page_free_zero(m);
+					return (NULL);
+				}
+			} else {
+				l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
+				l1pg->wire_count++;
+			}
+			l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
+			l1 = &l1[ptepindex & Ln_ADDR_MASK];
+		} else {
+			/*
+			 * pmap_l1pg() takes a virtual address and selects
+			 * the L1 table page from its bits above L0_SHIFT,
+			 * so turn the L1 index back into an address.
+			 * Passing the bare index always selected the first
+			 * page of the table, which does not match the page
+			 * _pmap_unwire_l3() later unwires using the real
+			 * address.
+			 */
+			l1pg = pmap_l1pg(pmap, (vm_offset_t)l1index << L1_SHIFT);
+			l1pg->wire_count++;
+			l1 = &pmap->pm_l0[l1index & STAGE2_L1_ADDR_MASK];
+		}
+		pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
+	} else {
+		/* The new page is an L3 table: hook it into an L2 table. */
+		vm_pindex_t l0index, l1index;
+		pd_entry_t *l0, *l1, *l2;
+		pd_entry_t tl0, tl1;
+
+		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
+		if (pmap->pm_type == PT_STAGE1) {
+			l0index = l1index >> L0_ENTRIES_SHIFT;
+			l0 = &pmap->pm_l0[l0index];
+			tl0 = pmap_load(l0);
+			if (tl0 == 0) {
+				/* recurse for allocating page dir */
+				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
+				    lockp) == NULL) {
+					--m->wire_count;
+					atomic_subtract_int(
+					    &vm_cnt.v_wire_count, 1);
+					vm_page_free_zero(m);
+					return (NULL);
+				}
+				tl0 = pmap_load(l0);
+				l1 = (pd_entry_t *)PHYS_TO_DMAP(
+				    tl0 & ~ATTR_MASK);
+				l1 = &l1[l1index & Ln_ADDR_MASK];
+			} else {
+				l1 = (pd_entry_t *)PHYS_TO_DMAP(
+				    tl0 & ~ATTR_MASK);
+				l1 = &l1[l1index & Ln_ADDR_MASK];
+				tl1 = pmap_load(l1);
+				if (tl1 == 0) {
+					/* recurse for allocating page dir */
+					if (_pmap_alloc_l3(pmap, NUL2E + l1index,
+					    lockp) == NULL) {
+
--m->wire_count; + /* XXX: release mem barrier? */ + atomic_subtract_int( + &vm_cnt.v_wire_count, 1); + vm_page_free_zero(m); + return (NULL); + } + } else { + l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); + l2pg->wire_count++; + } + } + } else { + l1 = &pmap->pm_l0[l1index & STAGE2_L1_ADDR_MASK]; + tl1 = pmap_load(l1); + if (tl1 == 0) { + /* recurse for allocating page dir */ + if (_pmap_alloc_l3(pmap, NUL2E + l1index, + lockp) == NULL) { + --m->wire_count; + /* XXX: release mem barrier? */ + atomic_subtract_int( + &vm_cnt.v_wire_count, 1); + vm_page_free_zero(m); + return (NULL); + } + } else { + l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); + l2pg->wire_count++; + } + } + + l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); + l2 = &l2[ptepindex & Ln_ADDR_MASK]; + pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); + } + + pmap_resident_count_inc(pmap, 1); + + return (m); +} + +static vm_page_t +pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +{ + vm_pindex_t ptepindex; + pd_entry_t *pde, tpde; +#ifdef INVARIANTS + pt_entry_t *pte; +#endif + vm_page_t m; + int lvl; + + /* + * Calculate pagetable page index + */ + ptepindex = pmap_l2_pindex(va); +retry: + /* + * Get the page directory entry + */ + pde = pmap_pde(pmap, va, &lvl); + + /* + * If the page table page is mapped, we just increment the hold count, + * and activate it. If we get a level 2 pde it will point to a level 3 + * table. 
+ */ + switch (lvl) { + case -1: + break; + case 0: +#ifdef INVARIANTS + pte = pmap_l0_to_l1(pde, va); + KASSERT(pmap_load(pte) == 0, + ("pmap_alloc_l3: TODO: l0 superpages")); +#endif + break; + case 1: +#ifdef INVARIANTS + pte = pmap_l1_to_l2(pde, va); + KASSERT(pmap_load(pte) == 0, + ("pmap_alloc_l3: TODO: l1 superpages")); +#endif + break; + case 2: + tpde = pmap_load(pde); + if (tpde != 0) { + m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); + m->wire_count++; + return (m); + } + break; + default: + panic("pmap_alloc_l3: Invalid level %d", lvl); + } + + /* + * Here if the pte page isn't mapped, or if it has been deallocated. + */ + m = _pmap_alloc_l3(pmap, ptepindex, lockp); + if (m == NULL && lockp != NULL) + goto retry; + + return (m); +} + + +/*************************************************** + * Pmap allocation/deallocation routines. + ***************************************************/ + +/* + * Release any resources held by the given physical map. + * Called when a pmap initialized by pmap_pinit is being released. + * Should only be called if the map contains no valid mappings. 
+ */
+void
+pmap_release(pmap_t pmap)
+{
+	vm_page_t m;
+
+	KASSERT(pmap->pm_stats.resident_count == 0,
+	    ("pmap_release: pmap resident count %ld != 0",
+	    pmap->pm_stats.resident_count));
+	KASSERT(vm_radix_is_empty(&pmap->pm_root),
+	    ("pmap_release: pmap has reserved page table page(s)"));
+
+	if (pmap->pm_type == PT_STAGE1) {
+		/* A stage 1 pmap releases the single page backing pm_l0. */
+		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0));
+
+		m->wire_count--;
+		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+		vm_page_free_zero(m);
+	} else {
+		/*
+		 * Any other pmap type: the table at pm_l0 is released as
+		 * page_cnt consecutive physical pages, where page_cnt is
+		 * 2^(pa_range_bits - L0_SHIFT), or one page when
+		 * pa_range_bits is below L0_SHIFT.
+		 */
+		uint64_t i, page_cnt;
+		vm_paddr_t pa;
+
+		if (pa_range_bits < L0_SHIFT)
+			page_cnt = 1;
+		else
+			page_cnt = 1 << (pa_range_bits - L0_SHIFT);
+
+		pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0);
+		for (i = 0; i < page_cnt; i++) {
+			m = PHYS_TO_VM_PAGE(pa);
+
+			m->wire_count--;
+			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+			vm_page_free_zero(m);
+
+			pa += PAGE_SIZE;
+		}
+	}
+}
+
+/*
+ * Handler for the vm.kvm_size sysctl: reports
+ * VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS.
+ */
+static int
+kvm_size(SYSCTL_HANDLER_ARGS)
+{
+	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
+
+	return sysctl_handle_long(oidp, &ksize, 0, req);
+}
+SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
+    0, 0, kvm_size, "LU", "Size of KVM");
+
+/*
+ * Handler for the vm.kvm_free sysctl: reports
+ * VM_MAX_KERNEL_ADDRESS - kernel_vm_end.
+ */
+static int
+kvm_free(SYSCTL_HANDLER_ARGS)
+{
+	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
+
+	return sysctl_handle_long(oidp, &kfree, 0, req);
+}
+SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
+    0, 0, kvm_free, "LU", "Amount of KVM free");
+
+/*
+ * Grow the number of kernel page table entries, if needed.
+ *
+ * "addr" is rounded up to an L2_SIZE boundary and clamped to
+ * kernel_map->max_offset.  kernel_vm_end is then advanced in L2_SIZE
+ * steps until it reaches that address; wherever the L1 entry is empty or
+ * the L2 entry is not yet in place, a wired, zeroed page is allocated and
+ * installed as the next-level table.  Panics if a page cannot be
+ * allocated.  The kernel_map system mutex must be held (asserted below).
+ */
+void
+pmap_growkernel(vm_offset_t addr)
+{
+	vm_paddr_t paddr;
+	vm_page_t nkpg;
+	pd_entry_t *l0, *l1, *l2;
+
+	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
+
+	addr = roundup2(addr, L2_SIZE);
+	/* "addr - 1" also catches addr having wrapped around to 0. */
+	if (addr - 1 >= kernel_map->max_offset)
+		addr = kernel_map->max_offset;
+	while (kernel_vm_end < addr) {
+		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
+		KASSERT(pmap_load(l0) != 0,
+		    ("pmap_growkernel: No level 0 kernel entry"));
+
+		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
+		if (pmap_load(l1) == 0) {
+			/* We need a new PDP entry */
+			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
+			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
+			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
+			if (nkpg == NULL)
+				panic("pmap_growkernel: no memory to grow kernel");
+			if ((nkpg->flags & PG_ZERO) == 0)
+				pmap_zero_page(nkpg);
+			paddr = VM_PAGE_TO_PHYS(nkpg);
+			pmap_load_store(l1, paddr | L1_TABLE);
+			continue; /* try again */
+		}
+		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
+		/* An L2 entry with ATTR_AF set is treated as already present. */
+		if ((pmap_load(l2) & ATTR_AF) != 0) {
+			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
+			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
+				kernel_vm_end = kernel_map->max_offset;
+				break;
+			}
+			continue;
+		}
+
+		/* Install a new, zeroed level 3 table under this L2 entry. */
+		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
+		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+		    VM_ALLOC_ZERO);
+		if (nkpg == NULL)
+			panic("pmap_growkernel: no memory to grow kernel");
+		if ((nkpg->flags & PG_ZERO) == 0)
+			pmap_zero_page(nkpg);
+		paddr = VM_PAGE_TO_PHYS(nkpg);
+		pmap_load_store(l2, paddr | L2_TABLE);
+		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
+
+		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
+		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
+			kernel_vm_end = kernel_map->max_offset;
+			break;
+		}
+	}
+}
+
+
+/***************************************************
+ * page management routines.
+ ***************************************************/
+
+/*
+ * Layout assumptions relied on below: a pv chunk occupies exactly one page,
+ * its free bitmap has three words, and it holds 168 pv entries.
+ */
+CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
+CTASSERT(_NPCM == 3);
+CTASSERT(_NPCPV == 168);
+
+/*
+ * Return the pv chunk that contains the given pv entry.  Because a chunk
+ * is exactly one page (see the CTASSERT above), masking off the page
+ * offset of the entry's address yields the chunk's address.
+ */
+static __inline struct pv_chunk *
+pv_to_chunk(pv_entry_t pv)
+{
+
+	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
+}
+
+/* The pmap recorded in the chunk that contains the given pv entry. */
+#define	PV_PMAP(pv)	(pv_to_chunk(pv)->pc_pmap)
+
+/*
+ * pc_map[] values of a completely free chunk; a set bit marks a free
+ * entry.  64 + 64 + 40 bits = 168 entries (_NPCPV).
+ */
+#define	PC_FREE0	0xfffffffffffffffful
+#define	PC_FREE1	0xfffffffffffffffful
+#define	PC_FREE2	0x000000fffffffffful
+
+static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
+
+/* The PV statistics counters and their sysctls are compiled out. */
+#if 0
+#ifdef PV_STATS
+static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
+	"Current number of pv entry chunks");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
+	"Current number of pv entry chunks allocated");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
+	"Current number of pv entry chunks frees");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
+	"Number of times tried to get a chunk page but failed.");
+
+static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
+static int pv_entry_spare;
+
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
+	"Current number of pv entry frees");
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
+	"Current number of pv entry allocs");
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
+	"Current number of pv entries");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
+	"Current number of spare pv entries");
+#endif
+#endif /* 0 */
+
+/*
+ * We are in a serious low memory condition.  Resort to
+ * drastic measures to free some pages so we can allocate
+ * another pv entry chunk.
+ *
+ * Returns NULL if PV entries were reclaimed from the specified pmap.
+ *
+ * We do not, however, unmap 2mpages because subsequent accesses will
+ * allocate per-page pv entries until repromotion occurs, thereby
+ * exacerbating the shortage of free pv entries.
+ *
+ * Otherwise the return value is a page the caller can use for a new
+ * chunk: either the page of a chunk that became entirely free, or a
+ * page table page freed while unmapping, re-wired here.
+ *
+ * locked_pmap must be locked by the caller and lockp must not be NULL
+ * (both are asserted).  The PV list lock may be released or changed.
+ */
+static vm_page_t
+reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
+{
+	struct pch new_tail;
+	struct pv_chunk *pc;
+	struct md_page *pvh;
+	pd_entry_t *pde;
+	pmap_t pmap;
+	pt_entry_t *pte, tpte;
+	pv_entry_t pv;
+	vm_offset_t va;
+	vm_page_t m, m_pc;
+	struct spglist free;
+	uint64_t inuse;
+	int bit, field, freed, lvl;
+
+	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
+	pmap = NULL;
+	m_pc = NULL;
+	SLIST_INIT(&free);
+	/* Chunks visited here are parked on new_tail and re-queued at the end. */
+	TAILQ_INIT(&new_tail);
+	mtx_lock(&pv_chunks_mutex);
+	/* Walk the global chunk list until some page has been freed. */
+	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
+		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+		mtx_unlock(&pv_chunks_mutex);
+		if (pmap != pc->pc_pmap) {
+			if (pmap != NULL && pmap != locked_pmap)
+				PMAP_UNLOCK(pmap);
+			pmap = pc->pc_pmap;
+			/* Avoid deadlock and lock recursion. */
+			if (pmap > locked_pmap) {
+				RELEASE_PV_LIST_LOCK(lockp);
+				PMAP_LOCK(pmap);
+			} else if (pmap != locked_pmap &&
+			    !PMAP_TRYLOCK(pmap)) {
+				/* Could not lock this pmap; skip its chunk. */
+				pmap = NULL;
+				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+				mtx_lock(&pv_chunks_mutex);
+				continue;
+			}
+		}
+
+		/*
+		 * Destroy every non-wired, 4 KB page mapping in the chunk.
+		 */
+		freed = 0;
+		for (field = 0; field < _NPCM; field++) {
+			/* Iterate over the in-use (clear) bits of this word. */
+			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
+			    inuse != 0; inuse &= ~(1UL << bit)) {
+				bit = ffsl(inuse) - 1;
+				pv = &pc->pc_pventry[field * 64 + bit];
+				va = pv->pv_va;
+				pde = pmap_pde(pmap, va, &lvl);
+				/* Only mappings reached through an L2 table entry. */
+				if (lvl != 2)
+					continue;
+				pte = pmap_l2_to_l3(pde, va);
+				tpte = pmap_load(pte);
+				if ((tpte & ATTR_SW_WIRED) != 0)
+					continue;
+				tpte = pmap_load_clear(pte);
+				pmap_invalidate_page(pmap, va);
+				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
+				if (pmap_page_dirty(tpte))
+					vm_page_dirty(m);
+				if ((tpte & ATTR_AF) != 0)
+					vm_page_aflag_set(m, PGA_REFERENCED);
+				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+				m->md.pv_gen++;
+				if (TAILQ_EMPTY(&m->md.pv_list) &&
+				    (m->flags & PG_FICTITIOUS) == 0) {
+					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+					if (TAILQ_EMPTY(&pvh->pv_list)) {
+						vm_page_aflag_clear(m,
+						    PGA_WRITEABLE);
+					}
+				}
+				/* Mark the pv entry free in the chunk's bitmap. */
+				pc->pc_map[field] |= 1UL << bit;
+				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
+				freed++;
+			}
+		}
+		if (freed == 0) {
+			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+			mtx_lock(&pv_chunks_mutex);
+			continue;
+		}
+		/* Every freed mapping is for a 4 KB page. */
+		pmap_resident_count_dec(pmap, freed);
+		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
+		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
+		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
+		    pc->pc_map[2] == PC_FREE2) {
+			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
+			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
+			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
+			/* Entire chunk is free; return it. */
+			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
+			dump_drop_page(m_pc->phys_addr);
+			mtx_lock(&pv_chunks_mutex);
+			break;
+		}
+		/* The chunk now has free entries; put it at the list head. */
+		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+		mtx_lock(&pv_chunks_mutex);
+		/* One freed pv entry in locked_pmap is sufficient. */
+		if (pmap == locked_pmap)
+			break;
+	}
+	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+	mtx_unlock(&pv_chunks_mutex);
+	if (pmap != NULL && pmap != locked_pmap)
+		PMAP_UNLOCK(pmap);
+	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
+		m_pc = SLIST_FIRST(&free);
+		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
+		/* Recycle a freed page table page. */
+		m_pc->wire_count = 1;
+		atomic_add_int(&vm_cnt.v_wire_count, 1);
+	}
+	pmap_free_zero_pages(&free);
+	return (m_pc);
+}
+
+/*
+ * Free the pv_entry back to the free list.
+ *
+ * Sets the entry's bit in its chunk's pc_map.  If the chunk still has
+ * entries in use it is kept and moved to the head of the pmap's chunk
+ * list; if it has become entirely free it is unlinked from the pmap and
+ * released with free_pv_chunk().  The pmap must be locked.
+ */
+static void
+free_pv_entry(pmap_t pmap, pv_entry_t pv)
+{
+	struct pv_chunk *pc;
+	int idx, field, bit;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
+	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
+	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
+	pc = pv_to_chunk(pv);
+	/* Index of pv within the chunk, split into bitmap word and bit. */
+	idx = pv - &pc->pc_pventry[0];
+	field = idx / 64;
+	bit = idx % 64;
+	pc->pc_map[field] |= 1ul << bit;
+	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
+	    pc->pc_map[2] != PC_FREE2) {
+		/* 98% of the time, pc is already at the head of the list. */
+		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
+			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+		}
+		return;
+	}
+	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+	free_pv_chunk(pc);
+}
+
+/*
+ * Release an entirely free pv chunk: unlink it from the global pv_chunks
+ * list (under pv_chunks_mutex), then unwire and free the page backing it.
+ * This function does not touch the owning pmap's pm_pvchunk list;
+ * free_pv_entry() unlinks the chunk from it before calling here.
+ */
+static void
+free_pv_chunk(struct pv_chunk *pc)
+{
+	vm_page_t m;
+
+	mtx_lock(&pv_chunks_mutex);
+	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+	mtx_unlock(&pv_chunks_mutex);
+	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
+	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
+	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
+	/* entire chunk is free, return it */
+	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
+	dump_drop_page(m->phys_addr);
+	vm_page_unwire(m, PQ_NONE);
+	vm_page_free(m);
+}
+
+/*
+ * Returns a new PV entry, allocating a new PV chunk from the system when
+ * needed.  If this PV chunk allocation fails and a PV list lock pointer was
+ * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
+ * returned.
+ *
+ * The given PV list lock may be released.
+ */
+static pv_entry_t
+get_pv_entry(pmap_t pmap, struct rwlock **lockp)
+{
+	int bit, field;
+	pv_entry_t pv;
+	struct pv_chunk *pc;
+	vm_page_t m;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
+retry:
+	/*
+	 * Only the first chunk on the pmap's list is examined: chunks that
+	 * become full are moved to the tail below.
+	 */
+	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+	if (pc != NULL) {
+		/* Find the first bitmap word with a free (set) bit. */
+		for (field = 0; field < _NPCM; field++) {
+			if (pc->pc_map[field]) {
+				bit = ffsl(pc->pc_map[field]) - 1;
+				break;
+			}
+		}
+		if (field < _NPCM) {
+			pv = &pc->pc_pventry[field * 64 + bit];
+			pc->pc_map[field] &= ~(1ul << bit);
+			/* If this was the last item, move it to tail */
+			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
+			    pc->pc_map[2] == 0) {
+				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
+				    pc_list);
+			}
+			PV_STAT(atomic_add_long(&pv_entry_count, 1));
+			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
+			return (pv);
+		}
+	}
+	/* No free items, allocate another chunk */
+	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+	    VM_ALLOC_WIRED);
+	if (m == NULL) {
+		/* A NULL lockp means the caller forbids reclamation. */
+		if (lockp == NULL) {
+			PV_STAT(pc_chunk_tryfail++);
+			return (NULL);
+		}
+		m = reclaim_pv_chunk(pmap, lockp);
+		/*
+		 * No page was returned, so entries were reclaimed from this
+		 * pmap instead; rescan its chunk list.
+		 */
+		if (m == NULL)
+			goto retry;
+	}
+	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
+	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
+	dump_add_page(m->phys_addr);
+	/* The chunk is addressed through the direct map. */
+	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+	pc->pc_pmap = pmap;
+	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
+	pc->pc_map[1] = PC_FREE1;
+	pc->pc_map[2] = PC_FREE2;
+	mtx_lock(&pv_chunks_mutex);
+	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
+	mtx_unlock(&pv_chunks_mutex);
+	/* Entry 0 of the new chunk is the one handed to the caller. */
+	pv = &pc->pc_pventry[0];
+	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+	PV_STAT(atomic_add_long(&pv_entry_count, 1));
+	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
+	return (pv);
+}
+
+/*
+ * Ensure that the number of spare PV entries in the specified pmap meets or
+ * exceeds the given count, "needed".
+ *
+ * The given PV list lock may be released.
+ */
+static void
+reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
+{
+	struct pch new_tail;
+	struct pv_chunk *pc;
+	int avail, free;
+	vm_page_t m;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
+
+	/*
+	 * Newly allocated PV chunks must be stored in a private list until
+	 * the required number of PV chunks have been allocated.  Otherwise,
+	 * reclaim_pv_chunk() could recycle one of these chunks.  In
+	 * contrast, these chunks must be added to the pmap upon allocation.
+	 */
+	TAILQ_INIT(&new_tail);
+retry:
+	/*
+	 * Count the spare entries in the chunks at the head of the pmap's
+	 * list, stopping at the first full chunk or once "needed" is met.
+	 */
+	avail = 0;
+	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
+		bit_count((bitstr_t *)pc->pc_map, 0,
+		    sizeof(pc->pc_map) * NBBY, &free);
+		if (free == 0)
+			break;
+		avail += free;
+		if (avail >= needed)
+			break;
+	}
+	/* Cover any shortfall with whole, entirely free chunks. */
+	for (; avail < needed; avail += _NPCPV) {
+		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+		    VM_ALLOC_WIRED);
+		if (m == NULL) {
+			m = reclaim_pv_chunk(pmap, lockp);
+			/*
+			 * No page came back, so entries were reclaimed from
+			 * this pmap instead; recount from the start.
+			 */
+			if (m == NULL)
+				goto retry;
+		}
+		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
+		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
+		dump_add_page(m->phys_addr);
+		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+		pc->pc_pmap = pmap;
+		pc->pc_map[0] = PC_FREE0;
+		pc->pc_map[1] = PC_FREE1;
+		pc->pc_map[2] = PC_FREE2;
+		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
+	}
+	/* Reservation complete: add the new chunks to the global list. */
+	if (!TAILQ_EMPTY(&new_tail)) {
+		mtx_lock(&pv_chunks_mutex);
+		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+		mtx_unlock(&pv_chunks_mutex);
+	}
+}
+
+/*
+ * First find and then remove the pv entry for the specified pmap and virtual
+ * address from the specified pv list.  Returns the pv entry if found and NULL
+ * otherwise.  This operation can be performed on pv lists for either 4KB or
+ * 2MB page mappings.
+ */
+static __inline pv_entry_t
+pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
+{
+	pv_entry_t pv;
+
+	/* TAILQ_FOREACH leaves pv NULL when the list is exhausted. */
+	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
+			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+			pvh->pv_gen++;
+			break;
+		}
+	}
+	return (pv);
+}
+
+/*
+ * After demotion from a 2MB page mapping to 512 4KB page mappings,
+ * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
+ * entries for each of the 4KB page mappings.
+ *
+ * The pmap must be locked and "pa" must be 2MB aligned (both asserted).
+ * The spare pv entries consumed here must already be present in the
+ * pmap's chunks; running out trips the "missing spare" assertion.
+ */
+static void
+pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	struct pv_chunk *pc;
+	pv_entry_t pv;
+	vm_offset_t va_last;
+	vm_page_t m;
+	int bit, field;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT((pa & L2_OFFSET) == 0,
+	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+
+	/*
+	 * Transfer the 2mpage's pv entry for this mapping to the first
+	 * page's pv list.  Once this transfer begins, the pv list lock
+	 * must not be released until the last pv entry is reinstantiated.
+	 */
+	pvh = pa_to_pvh(pa);
+	va = va & ~L2_OFFSET;
+	pv = pmap_pvh_remove(pvh, pmap, va);
+	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
+	m = PHYS_TO_VM_PAGE(pa);
+	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+	m->md.pv_gen++;
+	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
+	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
+	va_last = va + L2_SIZE - PAGE_SIZE;
+	for (;;) {
+		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
+		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
+		for (field = 0; field < _NPCM; field++) {
+			while (pc->pc_map[field]) {
+				bit = ffsl(pc->pc_map[field]) - 1;
+				pc->pc_map[field] &= ~(1ul << bit);
+				pv = &pc->pc_pventry[field * 64 + bit];
+				va += PAGE_SIZE;
+				pv->pv_va = va;
+				m++;
+				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+			    ("pmap_pv_demote_l2: page %p is not managed", m));
+				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+				m->md.pv_gen++;
+				if (va == va_last)
+					goto out;
+			}
+		}
+		/* This chunk is now full; rotate it to the tail. */
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+	}
+out:
+	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+	}
+	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
+	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
+}
+
+/*
+ * First find and then destroy the pv entry for the specified pmap and virtual
+ * address.  This operation can be performed on pv lists for either 4KB or 2MB
+ * page mappings.
+ */
+static void
+pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
+{
+	pv_entry_t pv;
+
+	pv = pmap_pvh_remove(pvh, pmap, va);
+	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
+	free_pv_entry(pmap, pv);
+}
+
+/*
+ * Conditionally create the PV entry for a 4KB page mapping if the required
+ * memory can be allocated without resorting to reclamation.
+ *
+ * Returns TRUE if the entry was created and added to the page's pv list,
+ * FALSE otherwise.  The pmap must be locked.
+ */
+static boolean_t
+pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
+    struct rwlock **lockp)
+{
+	pv_entry_t pv;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	/* Pass NULL instead of the lock pointer to disable reclamation. */
+	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
+		pv->pv_va = va;
+		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
+		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+		m->md.pv_gen++;
+		return (TRUE);
+	} else
+		return (FALSE);
+}
+
+/*
+ * pmap_remove_l2: do the things to unmap a level 2 superpage in a process
+ *
+ * Clears the L2 block entry at "l2" that maps "sva" (which must be L2_SIZE
+ * aligned), invalidates the range, updates the wired and resident counts
+ * and, for a managed mapping, frees its pv entry and propagates the
+ * dirty/referenced state to every constituent page.  A page table page
+ * saved for this address in the pmap's pm_root is queued on "free".
+ * Returns the result of pmap_unuse_pt() for the containing table.
+ */
+static int
+pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
+    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	pt_entry_t old_l2;
+	vm_offset_t eva, va;
+	vm_page_t m, ml3;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
+	old_l2 = pmap_load_clear(l2);
+	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
+	if (old_l2 & ATTR_SW_WIRED)
+		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
+	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
+	if (old_l2 & ATTR_SW_MANAGED) {
+		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
+		pvh = pa_to_pvh(old_l2 & ~ATTR_MASK);
+		pmap_pvh_free(pvh, pmap, sva);
+		eva = sva + L2_SIZE;
+		for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
+		    va < eva; va += PAGE_SIZE, m++) {
+			if (pmap_page_dirty(old_l2))
+				vm_page_dirty(m);
+			if (old_l2 & ATTR_AF)
+				vm_page_aflag_set(m, PGA_REFERENCED);
+			if (TAILQ_EMPTY(&m->md.pv_list) &&
+			    TAILQ_EMPTY(&pvh->pv_list))
+				vm_page_aflag_clear(m, PGA_WRITEABLE);
+		}
+	}
+	KASSERT(pmap != kernel_pmap,
+	    ("Attempting to remove an l2 kernel page"));
+	ml3 = pmap_remove_pt_page(pmap, sva);
+	if (ml3 != NULL) {
+		pmap_resident_count_dec(pmap, 1);
+		KASSERT(ml3->wire_count == NL3PG,
+		    ("pmap_remove_l2: l3 page wire count error"));
+		ml3->wire_count = 0;
+		pmap_add_delayed_free_list(ml3, free, FALSE);
+		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+	}
+	return (pmap_unuse_pt(pmap, sva, l1e, free));
+}
+
+/*
+ * pmap_remove_l3: do the things to unmap a page in a process
+ *
+ * Clears the L3 entry at "l3" that maps "va", invalidates the page,
+ * updates the wired and resident counts and, for a managed mapping,
+ * propagates the dirty/referenced state to the page and frees its pv
+ * entry.  Returns the result of pmap_unuse_pt() for the containing table.
+ */
+static int
+pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
+    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	pt_entry_t old_l3;
+	vm_page_t m;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	old_l3 = pmap_load_clear(l3);
+	pmap_invalidate_page(pmap, va);
+	if (old_l3 & ATTR_SW_WIRED)
+		pmap->pm_stats.wired_count -= 1;
+	pmap_resident_count_dec(pmap, 1);
+	if (old_l3 & ATTR_SW_MANAGED) {
+		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
+		if (pmap_page_dirty(old_l3))
+			vm_page_dirty(m);
+		if (old_l3 & ATTR_AF)
+			vm_page_aflag_set(m, PGA_REFERENCED);
+		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
+		pmap_pvh_free(&m->md, pmap, va);
+		if (TAILQ_EMPTY(&m->md.pv_list) &&
+		    (m->flags & PG_FICTITIOUS) == 0) {
+			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+			if (TAILQ_EMPTY(&pvh->pv_list))
+				vm_page_aflag_clear(m, PGA_WRITEABLE);
+		}
+	}
+	return (pmap_unuse_pt(pmap, va, l2e, free));
+}
+
+/*
+ * Remove the given range of addresses from the specified map.
+ *
+ * It is assumed that the start and end are properly
+ * rounded to the page size.
+ *
+ * TLB invalidation is performed as mappings are removed (by
+ * pmap_remove_l2(), pmap_remove_l3() and the range invalidations below),
+ * so no invalidation of the whole pmap is needed at the end.
+ */
+void
+pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	struct rwlock *lock;
+	vm_offset_t va, va_next;
+	pd_entry_t *l0, *l1, *l2;
+	pt_entry_t l3_paddr, *l3;
+	struct spglist free;
+
+	/*
+	 * Perform an unsynchronized read.  This is, however, safe.
+	 */
+	if (pmap->pm_stats.resident_count == 0)
+		return;
+
+	SLIST_INIT(&free);
+
+	PMAP_LOCK(pmap);
+
+	lock = NULL;
+	for (; sva < eva; sva = va_next) {
+
+		if (pmap->pm_stats.resident_count == 0)
+			break;
+
+		/*
+		 * Stage 1 pmaps are walked from level 0; any other pmap
+		 * type is walked starting at its level 1 entry.
+		 */
+		if (pmap->pm_type == PT_STAGE1) {
+			l0 = pmap_l0(pmap, sva);
+			if (pmap_load(l0) == 0) {
+				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
+				if (va_next < sva)
+					va_next = eva;
+				continue;
+			}
+
+			l1 = pmap_l0_to_l1(l0, sva);
+		} else {
+			l1 = pmap_l1(pmap, sva);
+		}
+		if (pmap_load(l1) == 0) {
+			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+
+		/*
+		 * Calculate index for next page table.
+		 */
+		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
+		if (va_next < sva)
+			va_next = eva;
+
+		l2 = pmap_l1_to_l2(l1, sva);
+		if (l2 == NULL)
+			continue;
+
+		l3_paddr = pmap_load(l2);
+
+		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
+			/*
+			 * Remove the superpage whole when the range covers
+			 * all of it; otherwise demote it and fall through
+			 * to the 4KB page loop.
+			 */
+			if (sva + L2_SIZE == va_next && eva >= va_next) {
+				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
+				    &free, &lock);
+				continue;
+			} else if (pmap_demote_l2_locked(pmap, l2,
+			    sva &~L2_OFFSET, &lock) == NULL)
+				continue;
+			l3_paddr = pmap_load(l2);
+		}
+
+		/*
+		 * Weed out invalid mappings.
+		 */
+		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
+			continue;
+
+		/*
+		 * Limit our scan to either the end of the va represented
+		 * by the current page table page, or to the end of the
+		 * range being removed.
+		 */
+		if (va_next > eva)
+			va_next = eva;
+
+		/*
+		 * "va" tracks the start of the current run of removed
+		 * pages; va == va_next means no run is open.
+		 */
+		va = va_next;
+		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
+		    sva += L3_SIZE) {
+			if (l3 == NULL)
+				panic("l3 == NULL");
+			if (pmap_load(l3) == 0) {
+				if (va != va_next) {
+					pmap_invalidate_range(pmap, va, sva);
+					va = va_next;
+				}
+				continue;
+			}
+			if (va == va_next)
+				va = sva;
+			/*
+			 * A nonzero return ends the scan of this table;
+			 * presumably pmap_unuse_pt() freed the page table
+			 * page -- TODO confirm.
+			 */
+			if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free,
+			    &lock)) {
+				sva += L3_SIZE;
+				break;
+			}
+		}
+		if (va != va_next)
+			pmap_invalidate_range(pmap, va, sva);
+	}
+	if (lock != NULL)
+		rw_wunlock(lock);
+	PMAP_UNLOCK(pmap);
+	pmap_free_zero_pages(&free);
+}
+
+/*
+ * Routine: pmap_remove_all
+ * Function:
+ *	Removes this physical page from
+ *	all physical maps in which it resides.
+ *	Reflects back modify bits to the pager.
+ *
+ * Notes:
+ *	Original versions of this routine were very
+ *	inefficient because they iteratively called
+ *	pmap_remove (slow...)
+ */
+
+void
+pmap_remove_all(vm_page_t m)
+{
+	struct md_page *pvh;
+	pv_entry_t pv;
+	pmap_t pmap;
+	struct rwlock *lock;
+	pd_entry_t *pde, tpde;
+	pt_entry_t *pte, tpte;
+	vm_offset_t va;
+	struct spglist free;
+	int lvl, pvh_gen, md_gen;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("pmap_remove_all: page %p is not managed", m));
+	SLIST_INIT(&free);
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
+retry:
+	rw_wlock(lock);
+	/* First demote every 2MB mapping that covers this page. */
+	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			/*
+			 * Take the locks in pmap, pv list order, then use
+			 * the generation count to detect a pv list change
+			 * made while the pv list lock was dropped.
+			 */
+			pvh_gen = pvh->pv_gen;
+			rw_wunlock(lock);
+			PMAP_LOCK(pmap);
+			rw_wlock(lock);
+			if (pvh_gen != pvh->pv_gen) {
+				rw_wunlock(lock);
+				PMAP_UNLOCK(pmap);
+				goto retry;
+			}
+		}
+		va = pv->pv_va;
+		pte = pmap_pte(pmap, va, &lvl);
+		KASSERT(pte != NULL,
+		    ("pmap_remove_all: no page table entry found"));
+		KASSERT(lvl == 2,
+		    ("pmap_remove_all: invalid pte level %d", lvl));
+
+		pmap_demote_l2_locked(pmap, pte, va, &lock);
+		PMAP_UNLOCK(pmap);
+	}
+	/* Then remove every 4KB mapping of the page. */
+	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			pvh_gen = pvh->pv_gen;
+			md_gen = m->md.pv_gen;
+			rw_wunlock(lock);
+			PMAP_LOCK(pmap);
+			rw_wlock(lock);
+			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
+				rw_wunlock(lock);
+				PMAP_UNLOCK(pmap);
+				goto retry;
+			}
+		}
+		pmap_resident_count_dec(pmap, 1);
+
+		pde = pmap_pde(pmap, pv->pv_va, &lvl);
+		KASSERT(pde != NULL,
+		    ("pmap_remove_all: no page directory entry found"));
+		KASSERT(lvl == 2,
+		    ("pmap_remove_all: invalid pde level %d", lvl));
+		tpde = pmap_load(pde);
+
+		pte = pmap_l2_to_l3(pde, pv->pv_va);
+		tpte = pmap_load(pte);
+		pmap_load_clear(pte);
+		pmap_invalidate_page(pmap, pv->pv_va);
+		if (tpte & ATTR_SW_WIRED)
+			pmap->pm_stats.wired_count--;
+		if ((tpte & ATTR_AF) != 0)
+			vm_page_aflag_set(m, PGA_REFERENCED);
+
+		/*
+		 * Update the vm_page_t clean and reference bits.
+		 */
+		if (pmap_page_dirty(tpte))
+			vm_page_dirty(m);
+		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
+		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+		m->md.pv_gen++;
+		free_pv_entry(pmap, pv);
+		PMAP_UNLOCK(pmap);
+	}
+	vm_page_aflag_clear(m, PGA_WRITEABLE);
+	rw_wunlock(lock);
+	pmap_free_zero_pages(&free);
+}
+
+/*
+ * Set the physical protection on the
+ * specified range of this map as requested.
+ *
+ * VM_PROT_NONE removes the range.  A request that allows both write and
+ * execute restricts nothing and returns immediately.  Otherwise each valid
+ * L3 entry in the range gets the read-only and/or execute-never attribute
+ * set; 2MB block mappings are demoted first.  Permissions are only ever
+ * reduced here, never added.
+ *
+ * NOTE(review): unlike pmap_remove(), the walk below always starts from
+ * pmap_l0() without checking pm_type -- confirm that only stage 1 pmaps
+ * reach this function.
+ */
+void
+pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
+{
+	vm_offset_t va_next;
+	pd_entry_t *l0, *l1, *l2;
+	pt_entry_t *l3p, l3, nbits;
+
+	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
+	if (prot == VM_PROT_NONE) {
+		pmap_remove(pmap, sva, eva);
+		return;
+	}
+
+	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
+	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
+		return;
+
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = va_next) {
+
+		l0 = pmap_l0(pmap, sva);
+		if (pmap_load(l0) == 0) {
+			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+
+		l1 = pmap_l0_to_l1(l0, sva);
+		if (pmap_load(l1) == 0) {
+			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+
+		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
+		if (va_next < sva)
+			va_next = eva;
+
+		l2 = pmap_l1_to_l2(l1, sva);
+		if (pmap_load(l2) == 0)
+			continue;
+
+		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
+			l3p = pmap_demote_l2(pmap, l2, sva);
+			if (l3p == NULL)
+				continue;
+		}
+		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
+		    ("pmap_protect: Invalid L2 entry after demotion"));
+
+		if (va_next > eva)
+			va_next = eva;
+
+		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
+		    sva += L3_SIZE) {
+			l3 = pmap_load(l3p);
+			if (!pmap_l3_valid(l3))
+				continue;
+
+			nbits = 0;
+			if ((prot & VM_PROT_WRITE) == 0) {
+				/*
+				 * Record the dirty state in the vm_page
+				 * before the mapping becomes read-only.
+				 */
+				if ((l3 & ATTR_SW_MANAGED) &&
+				    pmap_page_dirty(l3)) {
+					vm_page_dirty(PHYS_TO_VM_PAGE(l3 &
+					    ~ATTR_MASK));
+				}
+				nbits |= ATTR_AP(ATTR_AP_RO);
+			}
+			if ((prot & VM_PROT_EXECUTE) == 0)
+				nbits |= ATTR_XN;
+
+			pmap_set(l3p, nbits);
+			/* Invalidate the page whose entry was just changed. */
+			/* XXX: Use pmap_invalidate_range */
+			pmap_invalidate_page(pmap, sva);
+		}
+	}
+	PMAP_UNLOCK(pmap);
+
+	/* TODO: Only invalidate entries we are touching */
+	pmap_invalidate_all(pmap);
+}
+
+/*
+ * Inserts the specified page table page into the specified pmap's collection
+ * of idle page table pages.  Each of a pmap's page table pages is responsible
+ * for mapping a distinct range of virtual addresses.  The pmap's collection is
+ * ordered by this virtual address range.
+ *
+ * Returns the result of vm_radix_insert(); callers treat nonzero as failure.
+ */
+static __inline int
+pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	return (vm_radix_insert(&pmap->pm_root, mpte));
+}
+
+/*
+ * Removes the page table page mapping the specified virtual address from the
+ * specified pmap's collection of idle page table pages, and returns it.
+ * Otherwise, returns NULL if there is no page table page corresponding to the
+ * specified virtual address.
+ */
+static __inline vm_page_t
+pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
+}
+
+/*
+ * Performs a break-before-make update of a pmap entry. This is needed when
+ * either promoting or demoting pages to ensure the TLB doesn't get into an
+ * inconsistent state.
+ *
+ * The entry at "pte" is cleared, the range [va, va + size) is invalidated,
+ * and only then is "newpte" stored.  The pmap must be locked.
+ */
+static void
+pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
+    vm_offset_t va, vm_size_t size)
+{
+	register_t intr;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+	/*
+	 * Ensure we don't get switched out with the page table in an
+	 * inconsistent state. We also need to ensure no interrupts fire
+	 * as they may make use of an address we are about to invalidate.
+	 */
+	intr = intr_disable();
+	critical_enter();
+
+	/* Clear the old mapping */
+	pmap_load_clear(pte);
+	pmap_invalidate_range(pmap, va, va + size);
+
+	/* Create the new mapping */
+	pmap_load_store(pte, newpte);
+
+	critical_exit();
+	intr_restore(intr);
+}
+
+/*
+ * After promotion from 512 4KB page mappings to a single 2MB page mapping,
+ * replace the many pv entries for the 4KB page mappings by a single pv entry
+ * for the 2MB page mapping.
+ *
+ * "pa" must be 2MB aligned (asserted); "va" is truncated to a 2MB boundary.
+ */
+static void
+pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	pv_entry_t pv;
+	vm_offset_t va_last;
+	vm_page_t m;
+
+	KASSERT((pa & L2_OFFSET) == 0,
+	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+
+	/*
+	 * Transfer the first page's pv entry for this mapping to the 2mpage's
+	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
+	 * a transfer avoids the possibility that get_pv_entry() calls
+	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
+	 * mappings that is being promoted.
+	 */
+	m = PHYS_TO_VM_PAGE(pa);
+	va = va & ~L2_OFFSET;
+	pv = pmap_pvh_remove(&m->md, pmap, va);
+	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
+	pvh = pa_to_pvh(pa);
+	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+	pvh->pv_gen++;
+	/* Free the remaining NPTEPG - 1 pv entries. */
+	va_last = va + L2_SIZE - PAGE_SIZE;
+	do {
+		m++;
+		va += PAGE_SIZE;
+		pmap_pvh_free(&m->md, pmap, va);
+	} while (va < va_last);
+}
+
+/*
+ * Tries to promote the 512, contiguous 4KB page mappings that are within a
+ * single level 2 table entry to a single 2MB page mapping.  For promotion
+ * to occur, two conditions must be met: (1) the 4KB page mappings must map
+ * aligned, contiguous physical memory and (2) the 4KB page mappings must have
+ * identical characteristics.
+ */
+static void
+pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
+    struct rwlock **lockp)
+{
+	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
+	vm_page_t mpte;
+	vm_offset_t sva;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+	sva = va & ~L2_OFFSET;
+	firstl3 = pmap_l2_to_l3(l2, sva);
+	newl2 = pmap_load(firstl3);
+
+	/* Check that the first page's physical address is 2MB aligned. */
+	if (((newl2 & ~ATTR_MASK) & L2_OFFSET) != 0) {
+		atomic_add_long(&pmap_l2_p_failures, 1);
+		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
+		    " in pmap %p", va, pmap);
+		return;
+	}
+
+	/*
+	 * Walk the remaining entries from last to first.  "pa" is the whole
+	 * expected L3 entry (attribute bits included), so a single compare
+	 * checks both physical contiguity and identical attributes.
+	 */
+	pa = newl2 + L2_SIZE - PAGE_SIZE;
+	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
+		oldl3 = pmap_load(l3);
+		if (oldl3 != pa) {
+			atomic_add_long(&pmap_l2_p_failures, 1);
+			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
+			    " in pmap %p", va, pmap);
+			return;
+		}
+		pa -= PAGE_SIZE;
+	}
+
+	/*
+	 * Save the page table page in its current state until the L2
+	 * mapping the superpage is demoted by pmap_demote_l2() or
+	 * destroyed by pmap_remove_l2().
+	 */
+	mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
+	KASSERT(mpte >= vm_page_array &&
+	    mpte < &vm_page_array[vm_page_array_size],
+	    ("pmap_promote_l2: page table page is out of range"));
+	KASSERT(mpte->pindex == pmap_l2_pindex(va),
+	    ("pmap_promote_l2: page table page's pindex is wrong"));
+	if (pmap_insert_pt_page(pmap, mpte)) {
+		atomic_add_long(&pmap_l2_p_failures, 1);
+		CTR2(KTR_PMAP,
+		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
+		    pmap);
+		return;
+	}
+
+	if ((newl2 & ATTR_SW_MANAGED) != 0)
+		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
+
+	/* Turn the first L3 entry into the L2 block descriptor. */
+	newl2 &= ~ATTR_DESCR_MASK;
+	newl2 |= L2_BLOCK;
+
+	/* Replace the table entry using break-before-make. */
+	pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);
+
+	atomic_add_long(&pmap_l2_promotions, 1);
+	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
+		    pmap);
+}
+
+/*
+ * Insert the given physical page (p) at
+ * the specified virtual address (v) in the
+ * target physical map with the protection requested.
+ *
+ *	If specified, the page will be wired down, meaning
+ *	that the related pte can not be reclaimed.
+ *
+ *	NB:  This is the only routine which MAY NOT lazy-evaluate
+ *	or lose information.  That is, this routine must actually
+ *	insert this page into the given map NOW.
+ *
+ *	Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when
+ *	PMAP_ENTER_NOSLEEP is set in "flags" and no page table page
+ *	could be allocated for a user address.  "psind" is unused.
+ */
+int
+pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
+    u_int flags, int8_t psind __unused)
+{
+	struct rwlock *lock;
+	pd_entry_t *pde;
+	pt_entry_t new_l3, orig_l3;
+	pt_entry_t *l2, *l3;
+	pv_entry_t pv;
+	vm_paddr_t opa, pa, l1_pa, l2_pa, l3_pa;
+	vm_page_t mpte, om, l1_m, l2_m, l3_m;
+	boolean_t nosleep;
+	int lvl;
+
+	va = trunc_page(va);
+	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
+		VM_OBJECT_ASSERT_LOCKED(m->object);
+	pa = VM_PAGE_TO_PHYS(m);
+
+	/* Build the new L3 entry; stage 1 and stage 2 use different bits. */
+	if (pmap->pm_type == PT_STAGE1) {
+		new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT |
+		    ATTR_IDX(m->md.pv_memattr) | L3_PAGE);
+		if ((prot & VM_PROT_WRITE) == 0)
+			new_l3 |= ATTR_AP(ATTR_AP_RO);
+		if ((prot & VM_PROT_EXECUTE) == 0 ||
+		    m->md.pv_memattr == DEVICE_MEMORY)
+			new_l3 |= ATTR_XN;
+		if (va < VM_MAXUSER_ADDRESS)
+			new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;
+	} else {
+		new_l3 = (pt_entry_t)(pa | ATTR_ST2_DEFAULT | L3_PAGE);
+	}
+	if ((flags & PMAP_ENTER_WIRED) != 0)
+		new_l3 |= ATTR_SW_WIRED;
+
+	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
+
+	mpte = NULL;
+
+	lock = NULL;
+	PMAP_LOCK(pmap);
+
+	/*
+	 * If the address is currently covered by a 2MB block mapping, demote
+	 * it so that a single 4KB entry can be replaced.
+	 */
+	pde = pmap_pde(pmap, va, &lvl);
+	if (pde != NULL && lvl == 1) {
+		l2 = pmap_l1_to_l2(pde, va);
+		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
+		    (l3 = pmap_demote_l2_locked(pmap, l2, va & ~L2_OFFSET,
+		    &lock)) != NULL) {
+			l3 = &l3[pmap_l3_index(va)];
+			if (va < VM_MAXUSER_ADDRESS) {
+				mpte = PHYS_TO_VM_PAGE(
+				    pmap_load(l2) & ~ATTR_MASK);
+				mpte->wire_count++;
+			}
+			goto havel3;
+		}
+	}
+
+	if (va < VM_MAXUSER_ADDRESS) {
+		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
+		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
+		if (mpte == NULL && nosleep) {
+			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
+			if (lock != NULL)
+				rw_wunlock(lock);
+			PMAP_UNLOCK(pmap);
+			return (KERN_RESOURCE_SHORTAGE);
+		}
+		pde = pmap_pde(pmap, va, &lvl);
+		KASSERT(pde != NULL,
+		    ("pmap_enter: Invalid page entry, va: 0x%lx", va));
+		KASSERT(lvl == 2,
+		    ("pmap_enter: Invalid level %d", lvl));
+
+		l3 = pmap_l2_to_l3(pde, va);
+	} else {
+		/*
+		 * If we get a level 2 pde it must point to a level 3 entry
+		 * otherwise we will need to create the intermediate tables
+		 */
+		if (lvl < 2) {
+			switch(lvl) {
+			default:
+			case -1:
+				/* Get the l0 pde to update */
+				pde = pmap_l0(pmap, va);
+				KASSERT(pde != NULL, ("..."));
+
+				l1_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
+				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+				    VM_ALLOC_ZERO);
+				if (l1_m == NULL)
+					panic("pmap_enter: l1 pte_m == NULL");
+				if ((l1_m->flags & PG_ZERO) == 0)
+					pmap_zero_page(l1_m);
+
+				l1_pa = VM_PAGE_TO_PHYS(l1_m);
+				pmap_load_store(pde, l1_pa | L0_TABLE);
+				/* FALLTHROUGH */
+			case 0:
+				/*
+				 * Get the l1 pde to update.  "pde" is an L0
+				 * entry at this point, so the step down must
+				 * be L0 -> L1; pmap_l1_to_l2() is the
+				 * L1 -> L2 step used by case 1 below and was
+				 * the wrong helper for this level.
+				 */
+				pde = pmap_l0_to_l1(pde, va);
+				KASSERT(pde != NULL, ("..."));
+
+				l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
+				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+				    VM_ALLOC_ZERO);
+				if (l2_m == NULL)
+					panic("pmap_enter: l2 pte_m == NULL");
+				if ((l2_m->flags & PG_ZERO) == 0)
+					pmap_zero_page(l2_m);
+
+				l2_pa = VM_PAGE_TO_PHYS(l2_m);
+				pmap_load_store(pde, l2_pa | L1_TABLE);
+				/* FALLTHROUGH */
+			case 1:
+				/* Get the l2 pde to update */
+				pde = pmap_l1_to_l2(pde, va);
+
+				l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
+				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+				    VM_ALLOC_ZERO);
+				if (l3_m == NULL)
+					panic("pmap_enter: l3 pte_m == NULL");
+				if ((l3_m->flags & PG_ZERO) == 0)
+					pmap_zero_page(l3_m);
+
+				l3_pa = VM_PAGE_TO_PHYS(l3_m);
+				pmap_load_store(pde, l3_pa | L2_TABLE);
+				break;
+			}
+		}
+		l3 = pmap_l2_to_l3(pde, va);
+		pmap_invalidate_page(pmap, va);
+	}
+havel3:
+
+	om = NULL;
+	orig_l3 = pmap_load(l3);
+	opa = orig_l3 & ~ATTR_MASK;
+
+	/*
+	 * Is the specified virtual address already mapped?
+	 */
+	if (pmap_l3_valid(orig_l3)) {
+		/*
+		 * Wiring change, just update stats. We don't worry about
+		 * wiring PT pages as they remain resident as long as there
+		 * are valid mappings in them. Hence, if a user page is wired,
+		 * the PT page will be also.
+		 */
+		if ((flags & PMAP_ENTER_WIRED) != 0 &&
+		    (orig_l3 & ATTR_SW_WIRED) == 0)
+			pmap->pm_stats.wired_count++;
+		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
+		    (orig_l3 & ATTR_SW_WIRED) != 0)
+			pmap->pm_stats.wired_count--;
+
+		/*
+		 * Remove the extra PT page reference.
+		 */
+		if (mpte != NULL) {
+			mpte->wire_count--;
+			KASSERT(mpte->wire_count > 0,
+			    ("pmap_enter: missing reference to page table page,"
+			    " va: 0x%lx", va));
+		}
+
+		/*
+		 * Has the physical page changed?
+		 */
+		if (opa == pa) {
+			/*
+			 * No, might be a protection or wiring change.
+			 */
+			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
+				new_l3 |= ATTR_SW_MANAGED;
+				/*
+				 * Test the access-permission field with
+				 * ATTR_AP_RW_BIT, as every other read/write
+				 * check in this file does.  The previous
+				 * test masked with the value it compared
+				 * against, which cannot tell a read-only
+				 * entry from a writeable one if the RW
+				 * encoding has no bits set.
+				 */
+				if ((new_l3 & ATTR_AP_RW_BIT) ==
+				    ATTR_AP(ATTR_AP_RW)) {
+					vm_page_aflag_set(m, PGA_WRITEABLE);
+				}
+			}
+			goto validate;
+		}
+	} else {
+		/*
+		 * Increment the counters.
+		 */
+		if ((new_l3 & ATTR_SW_WIRED) != 0)
+			pmap->pm_stats.wired_count++;
+		pmap_resident_count_inc(pmap, 1);
+	}
+	/*
+	 * Enter on the PV list if part of our managed memory.
+	 */
+	if ((m->oflags & VPO_UNMANAGED) == 0) {
+		new_l3 |= ATTR_SW_MANAGED;
+		pv = get_pv_entry(pmap, &lock);
+		pv->pv_va = va;
+		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
+		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+		m->md.pv_gen++;
+		if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
+			vm_page_aflag_set(m, PGA_WRITEABLE);
+	}
+
+	/*
+	 * Update the L3 entry.
+	 */
+	if (orig_l3 != 0) {
+validate:
+		orig_l3 = pmap_load(l3);
+		opa = orig_l3 & ~ATTR_MASK;
+
+		if (opa != pa) {
+			/*
+			 * A different physical page was mapped here: replace
+			 * the entry and transfer the old page's dirty and
+			 * referenced state to its vm_page before dropping
+			 * its pv entry.
+			 */
+			pmap_update_entry(pmap, l3, new_l3, va, PAGE_SIZE);
+			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
+				om = PHYS_TO_VM_PAGE(opa);
+				if (pmap_page_dirty(orig_l3))
+					vm_page_dirty(om);
+				if ((orig_l3 & ATTR_AF) != 0)
+					vm_page_aflag_set(om, PGA_REFERENCED);
+				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
+				pmap_pvh_free(&om->md, pmap, va);
+				if ((om->aflags & PGA_WRITEABLE) != 0 &&
+				    TAILQ_EMPTY(&om->md.pv_list) &&
+				    ((om->flags & PG_FICTITIOUS) != 0 ||
+				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
+					vm_page_aflag_clear(om, PGA_WRITEABLE);
+			}
+		} else {
+			pmap_load_store(l3, new_l3);
+			pmap_invalidate_page(pmap, va);
+			if (pmap_page_dirty(orig_l3) &&
+			    (orig_l3 & ATTR_SW_MANAGED) != 0)
+				vm_page_dirty(m);
+		}
+	} else {
+		pmap_load_store(l3, new_l3);
+	}
+
+	pmap_invalidate_page(pmap, va);
+
+	if (pmap != pmap_kernel()) {
+		if (pmap == &curproc->p_vmspace->vm_pmap &&
+		    (prot & VM_PROT_EXECUTE) != 0)
+			cpu_icache_sync_range(va, PAGE_SIZE);
+
+		/*
+		 * Attempt a 2MB promotion once the page table page is fully
+		 * populated and the reservation holding "m" is fully
+		 * populated as well.
+		 */
+		if ((mpte == NULL || mpte->wire_count == NL3PG) &&
+		    pmap_superpages_enabled() &&
+		    (m->flags & PG_FICTITIOUS) == 0 &&
+		    vm_reserv_level_iffullpop(m) == 0) {
+			pmap_promote_l2(pmap, pde, va, &lock);
+		}
+	}
+
+	if (lock != NULL)
+		rw_wunlock(lock);
+	PMAP_UNLOCK(pmap);
+	return (KERN_SUCCESS);
+}
+
+/*
+ * Maps a sequence of resident pages belonging to the same object.
+ * The sequence begins with the given page m_start.  This page is
+ * mapped at the given virtual address start.  Each subsequent page is
+ * mapped at a virtual address that is offset from start by the same
+ * amount as the page is offset from m_start within the object.  The
+ * last page in the sequence is the page with the largest offset from
+ * m_start that can be mapped at a virtual address less than the given
+ * virtual address end.
Not every virtual page between start and end
+ * is mapped; only those for which a resident page exists with the
+ * corresponding offset from m_start are mapped.
+ */
+void
+pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
+    vm_page_t m_start, vm_prot_t prot)
+{
+	struct rwlock *lock;
+	vm_offset_t va;
+	vm_page_t m, mpte;
+	vm_pindex_t diff, psize;
+
+	VM_OBJECT_ASSERT_LOCKED(m_start->object);
+
+	psize = atop(end - start);
+	mpte = NULL;
+	m = m_start;
+	lock = NULL;
+	PMAP_LOCK(pmap);
+	/*
+	 * Follow the object's page list until it ends or a page falls
+	 * outside the [start, end) window.  The page table page returned by
+	 * one call is handed to the next so it can be reused.
+	 */
+	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
+		va = start + ptoa(diff);
+		mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock);
+		m = TAILQ_NEXT(m, listq);
+	}
+	if (lock != NULL)
+		rw_wunlock(lock);
+	PMAP_UNLOCK(pmap);
+}
+
+/*
+ * this code makes some *MAJOR* assumptions:
+ * 1. Current pmap & pmap exists.
+ * 2. Not wired.
+ * 3. Read access.
+ * 4. No page table pages.
+ * but is *MUCH* faster than pmap_enter...
+ *
+ * This is the single-page entry point: it takes the pmap lock, calls
+ * pmap_enter_quick_locked() without a cached page table page and drops
+ * any pv list lock that call acquired.
+ */
+
+void
+pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
+{
+	struct rwlock *lock;
+
+	lock = NULL;
+	PMAP_LOCK(pmap);
+	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
+	if (lock != NULL)
+		rw_wunlock(lock);
+	PMAP_UNLOCK(pmap);
+}
+
+/*
+ * Try to create a read-only 4KB mapping of "m" at "va" without sleeping.
+ * The pmap lock must be held by the caller.
+ *
+ * "mpte" is an optional hint: the page table page used by the previous
+ * call.  The return value is the page table page used for this mapping,
+ * or NULL when no mapping was created (2MB block already present, page
+ * table page allocation failed, address already mapped, or the pv entry
+ * could not be inserted) and always NULL for kernel addresses.
+ */
+static vm_page_t
+pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
+    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
+{
+	struct spglist free;
+	pd_entry_t *pde;
+	pt_entry_t *l2, *l3;
+	vm_paddr_t pa;
+	int lvl;
+
+	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
+	    (m->oflags & VPO_UNMANAGED) != 0,
+	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
+	/*
+	 * In the case that a page table page is not
+	 * resident, we are creating it here.
+	 */
+	if (va < VM_MAXUSER_ADDRESS) {
+		vm_pindex_t l2pindex;
+
+		/*
+		 * Calculate pagetable page index
+		 */
+		l2pindex = pmap_l2_pindex(va);
+		if (mpte && (mpte->pindex == l2pindex)) {
+			/* The caller's hint is the right page; reference it. */
+			mpte->wire_count++;
+		} else {
+			/*
+			 * Get the l2 entry
+			 */
+			pde = pmap_pde(pmap, va, &lvl);
+
+			/*
+			 * If the page table page is mapped, we just increment
+			 * the hold count, and activate it.  Otherwise, we
+			 * attempt to allocate a page table page.  If this
+			 * attempt fails, we don't retry.  Instead, we give up.
+			 */
+			if (lvl == 1) {
+				/* An existing 2MB block is left untouched. */
+				l2 = pmap_l1_to_l2(pde, va);
+				if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
+				    L2_BLOCK)
+					return (NULL);
+			}
+			if (lvl == 2 && pmap_load(pde) != 0) {
+				mpte =
+				    PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
+				mpte->wire_count++;
+			} else {
+				/*
+				 * Pass NULL instead of the PV list lock
+				 * pointer, because we don't intend to sleep.
+				 */
+				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
+				if (mpte == NULL)
+					return (mpte);
+			}
+		}
+		/* Reach the L3 table through the direct map. */
+		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
+		l3 = &l3[pmap_l3_index(va)];
+	} else {
+		mpte = NULL;
+		pde = pmap_pde(kernel_pmap, va, &lvl);
+		KASSERT(pde != NULL,
+		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
+		    va));
+		KASSERT(lvl == 2,
+		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
+		l3 = pmap_l2_to_l3(pde, va);
+	}
+
+	/* An existing entry is never replaced; undo the reference taken. */
+	if (pmap_load(l3) != 0) {
+		if (mpte != NULL) {
+			mpte->wire_count--;
+			mpte = NULL;
+		}
+		return (mpte);
+	}
+
+	/*
+	 * Enter on the PV list if part of our managed memory.
+	 */
+	if ((m->oflags & VPO_UNMANAGED) == 0 &&
+	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
+		if (mpte != NULL) {
+			SLIST_INIT(&free);
+			if (pmap_unwire_l3(pmap, va, mpte, &free)) {
+				pmap_invalidate_page(pmap, va);
+				pmap_free_zero_pages(&free);
+			}
+			mpte = NULL;
+		}
+		return (mpte);
+	}
+
+	/*
+	 * Increment counters
+	 */
+	pmap_resident_count_inc(pmap, 1);
+
+	/* "pa" is reused here to hold the complete L3 entry value. */
+	pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
+	    ATTR_AP(ATTR_AP_RO) | L3_PAGE;
+	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
+		pa |= ATTR_XN;
+	else if (va < VM_MAXUSER_ADDRESS)
+		pa |= ATTR_PXN;
+
+	/*
+	 * Now validate mapping with RO protection
+	 */
+	if ((m->oflags & VPO_UNMANAGED) == 0)
+		pa |= ATTR_SW_MANAGED;
+	pmap_load_store(l3, pa);
+	pmap_invalidate_page(pmap, va);
+	return (mpte);
+}
+
+/*
+ * This code maps large physical mmap regions into the
+ * processor address space.  Note that some shortcuts
+ * are taken, but the code works.
+ *
+ * NOTE(review): as written here the function only asserts that the
+ * object is write-locked and is a device or SG object; it creates no
+ * mappings.
+ */
+void
+pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
+    vm_pindex_t pindex, vm_size_t size)
+{
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
+	    ("pmap_object_init_pt: non-device object"));
+}
+
+/*
+ *	Clear the wired attribute from the mappings for the specified range of
+ *	addresses in the given pmap.  Every valid mapping within that range
+ *	must have the wired attribute set.  In contrast, invalid mappings
+ *	cannot have the wired attribute set, so they are ignored.
+ *
+ *	The wired attribute of the page table entry is not a hardware feature,
+ *	so there is no need to invalidate any TLB entries.
+ */
+void
+pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	vm_offset_t va_next;
+	pd_entry_t *l0, *l1, *l2;
+	pt_entry_t *l3;
+
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = va_next) {
+		/*
+		 * Skip whole L0/L1 regions that have no table.  In each
+		 * case "va_next < sva" means the address arithmetic wrapped,
+		 * so the walk is cut short at eva.
+		 */
+		l0 = pmap_l0(pmap, sva);
+		if (pmap_load(l0) == 0) {
+			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+
+		l1 = pmap_l0_to_l1(l0, sva);
+		if (pmap_load(l1) == 0) {
+			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+
+		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
+		if (va_next < sva)
+			va_next = eva;
+
+		l2 = pmap_l1_to_l2(l1, sva);
+		if (pmap_load(l2) == 0)
+			continue;
+
+		/*
+		 * A 2MB block is demoted first so the wired bit can be
+		 * cleared per 4KB page; if demotion fails the range is
+		 * skipped.
+		 */
+		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
+			l3 = pmap_demote_l2(pmap, l2, sva);
+			if (l3 == NULL)
+				continue;
+		}
+		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
+		    ("pmap_unwire: Invalid l2 entry after demotion"));
+
+		if (va_next > eva)
+			va_next = eva;
+		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
+		    sva += L3_SIZE) {
+			if (pmap_load(l3) == 0)
+				continue;
+			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
+				panic("pmap_unwire: l3 %#jx is missing "
+				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
+
+			/*
+			 * ATTR_SW_WIRED must be cleared atomically.  Although
+			 * the pmap lock synchronizes access to the wired bit,
+			 * other bits of the same entry could be updated
+			 * concurrently.  (This wording was inherited from
+			 * the x86 pmap, where it named PG_W, PG_M and PG_A.)
+			 */
+			atomic_clear_long(l3, ATTR_SW_WIRED);
+			pmap->pm_stats.wired_count--;
+		}
+	}
+	PMAP_UNLOCK(pmap);
+}
+
+/*
+ *	Copy the range specified by src_addr/len
+ *	from the source map to the range dst_addr/len
+ *	in the destination map.
+ *
+ *	This routine is only advisory and need not do anything.
+ *	This implementation is intentionally empty.
+ */
+
+void
+pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
+    vm_offset_t src_addr)
+{
+}
+
+/*
+ *	pmap_zero_page zeros the specified hardware page by mapping
+ *	the page into KVM and using bzero to clear its contents.
+ */
+void
+pmap_zero_page(vm_page_t m)
+{
+	/* The page is reached through the direct map; no mapping is made. */
+	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
+
+	pagezero((void *)va);
+}
+
+/*
+ *	pmap_zero_page_area zeros part of the specified hardware page.  The
+ *	page is addressed through the direct map; a whole-page request uses
+ *	pagezero() and anything smaller uses bzero().
+ *
+ *	off and size may not cover an area beyond a single hardware page.
+ */
+void
+pmap_zero_page_area(vm_page_t m, int off, int size)
+{
+	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
+
+	if (off == 0 && size == PAGE_SIZE)
+		pagezero((void *)va);
+	else
+		bzero((char *)va + off, size);
+}
+
+/*
+ *	pmap_copy_page copies the specified (machine independent)
+ *	page.  Both pages are addressed through the direct map and
+ *	the copy is done with a single pagecopy() call.
+ */
+void
+pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
+{
+	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
+	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
+
+	pagecopy((void *)src, (void *)dst);
+}
+
+/* NOTE(review): presumably read by the buffer cache code; confirm. */
+int unmapped_buf_allowed = 1;
+
+/*
+ * Copy "xfersize" bytes from byte offset "a_offset" within the page array
+ * "ma" to byte offset "b_offset" within the page array "mb".  Each pass
+ * copies the largest run that stays inside one source page and one
+ * destination page.  Panics if a page is not covered by the direct map.
+ */
+void
+pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
+    vm_offset_t b_offset, int xfersize)
+{
+	void *a_cp, *b_cp;
+	vm_page_t m_a, m_b;
+	vm_paddr_t p_a, p_b;
+	vm_offset_t a_pg_offset, b_pg_offset;
+	int cnt;
+
+	while (xfersize > 0) {
+		a_pg_offset = a_offset & PAGE_MASK;
+		m_a = ma[a_offset >> PAGE_SHIFT];
+		p_a = m_a->phys_addr;
+		b_pg_offset = b_offset & PAGE_MASK;
+		m_b = mb[b_offset >> PAGE_SHIFT];
+		p_b = m_b->phys_addr;
+		/* Clamp to the end of whichever page runs out first. */
+		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
+		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
+		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
+			panic("!DMAP a %lx", p_a);
+		} else {
+			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
+		}
+		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
+			panic("!DMAP b %lx", p_b);
+		} else {
+			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
+		}
+		bcopy(a_cp, b_cp, cnt);
+		a_offset += cnt;
+		b_offset += cnt;
+		xfersize -= cnt;
+	}
+}
+
+/*
+ * Return a kernel virtual address for the page.  The direct map address
+ * is returned, so nothing is allocated or mapped here.
+ */
+vm_offset_t
+pmap_quick_enter_page(vm_page_t m)
+{
+
+	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
+}
+
+/*
+ * Counterpart of pmap_quick_enter_page(); there is nothing to undo.
+ */
+void
+pmap_quick_remove_page(vm_offset_t addr)
+{
+}
+
+/*
+ * Returns true if the pmap's pv is one of the first
+ * 16 pvs linked to from this page.  This count may
+ * be changed upwards or downwards in the future; it
+ * is only necessary that true be returned for a small
+ * subset of pmaps for proper page aging.
+ *
+ * The 4KB pv list of the page is searched first, then (for non-fictitious
+ * pages) the superpage pv list; the limit of 16 is shared by both.
+ */
+boolean_t
+pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
+{
+	struct md_page *pvh;
+	struct rwlock *lock;
+	pv_entry_t pv;
+	int loops = 0;
+	boolean_t rv;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("pmap_page_exists_quick: page %p is not managed", m));
+	rv = FALSE;
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	rw_rlock(lock);
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+		if (PV_PMAP(pv) == pmap) {
+			rv = TRUE;
+			break;
+		}
+		loops++;
+		if (loops >= 16)
+			break;
+	}
+	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
+		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+			if (PV_PMAP(pv) == pmap) {
+				rv = TRUE;
+				break;
+			}
+			loops++;
+			if (loops >= 16)
+				break;
+		}
+	}
+	rw_runlock(lock);
+	return (rv);
+}
+
+/*
+ *	pmap_page_wired_mappings:
+ *
+ *	Return the number of managed mappings to the given physical page
+ *	that are wired.
+ */
+int
+pmap_page_wired_mappings(vm_page_t m)
+{
+	struct rwlock *lock;
+	struct md_page *pvh;
+	pmap_t pmap;
+	pt_entry_t *pte;
+	pv_entry_t pv;
+	int count, lvl, md_gen, pvh_gen;
+
+	/* Unmanaged pages have no pv entries to count. */
+	if ((m->oflags & VPO_UNMANAGED) != 0)
+		return (0);
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	rw_rlock(lock);
+restart:
+	count = 0;
+	/* First pass: 4KB mappings of this page. */
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			/*
+			 * The pmap lock could not be taken while holding the
+			 * pv list lock.  Drop the pv list lock, block on the
+			 * pmap lock, and start over if the pv list changed
+			 * in the meantime (detected through pv_gen).
+			 */
+			md_gen = m->md.pv_gen;
+			rw_runlock(lock);
+			PMAP_LOCK(pmap);
+			rw_rlock(lock);
+			if (md_gen != m->md.pv_gen) {
+				PMAP_UNLOCK(pmap);
+				goto restart;
+			}
+		}
+		pte = pmap_pte(pmap, pv->pv_va, &lvl);
+		if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
+			count++;
+		PMAP_UNLOCK(pmap);
+	}
+	/* Second pass: 2MB mappings that cover this page. */
+	if ((m->flags & PG_FICTITIOUS) == 0) {
+		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+			pmap = PV_PMAP(pv);
+			if (!PMAP_TRYLOCK(pmap)) {
+				/* Same protocol, watching both pv lists. */
+				md_gen = m->md.pv_gen;
+				pvh_gen = pvh->pv_gen;
+				rw_runlock(lock);
+				PMAP_LOCK(pmap);
+				rw_rlock(lock);
+				if (md_gen != m->md.pv_gen ||
+				    pvh_gen != pvh->pv_gen) {
+					PMAP_UNLOCK(pmap);
+					goto restart;
+				}
+			}
+			pte = pmap_pte(pmap, pv->pv_va, &lvl);
+			if (pte != NULL &&
+			    (pmap_load(pte) & ATTR_SW_WIRED) != 0)
+				count++;
+			PMAP_UNLOCK(pmap);
+		}
+	}
+	rw_runlock(lock);
+	return (count);
+}
+
+/*
+ * Destroy all managed, non-wired mappings in the given user-space
+ * pmap.  This pmap cannot be active on any processor besides the
+ * caller.
+ *
+ * This function cannot be applied to the kernel pmap.  Moreover, it
+ * is not intended for general use.  It is only to be used during
+ * process termination.  Consequently, it can be implemented in ways
+ * that make it faster than pmap_remove().  First, it can more quickly
+ * destroy mappings by iterating over the pmap's collection of PV
+ * entries, rather than searching the page table.  Second, it doesn't
+ * have to test and clear the page table entries atomically, because
+ * no processor is currently accessing the user address space.
In + * particular, a page table entry's dirty bit won't change state once + * this function starts. + */ +void +pmap_remove_pages(pmap_t pmap) +{ + pd_entry_t *pde; + pt_entry_t *pte, tpte; + struct spglist free; + vm_page_t m, ml3, mt; + pv_entry_t pv; + struct md_page *pvh; + struct pv_chunk *pc, *npc; + struct rwlock *lock; + int64_t bit; + uint64_t inuse, bitmask; + int allfree, field, freed, idx, lvl; + vm_paddr_t pa; + + lock = NULL; + + SLIST_INIT(&free); + PMAP_LOCK(pmap); + TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { + allfree = 1; + freed = 0; + for (field = 0; field < _NPCM; field++) { + inuse = ~pc->pc_map[field] & pc_freemask[field]; + while (inuse != 0) { + bit = ffsl(inuse) - 1; + bitmask = 1UL << bit; + idx = field * 64 + bit; + pv = &pc->pc_pventry[idx]; + inuse &= ~bitmask; + + pde = pmap_pde(pmap, pv->pv_va, &lvl); + KASSERT(pde != NULL, + ("Attempting to remove an unmapped page")); + + switch(lvl) { + case 1: + pte = pmap_l1_to_l2(pde, pv->pv_va); + tpte = pmap_load(pte); + KASSERT((tpte & ATTR_DESCR_MASK) == + L2_BLOCK, + ("Attempting to remove an invalid " + "block: %lx", tpte)); + tpte = pmap_load(pte); + break; + case 2: + pte = pmap_l2_to_l3(pde, pv->pv_va); + tpte = pmap_load(pte); + KASSERT((tpte & ATTR_DESCR_MASK) == + L3_PAGE, + ("Attempting to remove an invalid " + "page: %lx", tpte)); + break; + default: + panic( + "Invalid page directory level: %d", + lvl); + } + +/* + * We cannot remove wired pages from a process' mapping at this time + */ + if (tpte & ATTR_SW_WIRED) { + allfree = 0; + continue; + } + + pa = tpte & ~ATTR_MASK; + + m = PHYS_TO_VM_PAGE(pa); + KASSERT(m->phys_addr == pa, + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, + (uintmax_t)tpte)); + + KASSERT((m->flags & PG_FICTITIOUS) != 0 || + m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages: bad pte %#jx", + (uintmax_t)tpte)); + + pmap_load_clear(pte); + + /* + * Update the vm_page_t clean/reference bits. 
+ */ + if ((tpte & ATTR_AP_RW_BIT) == + ATTR_AP(ATTR_AP_RW)) { + switch (lvl) { + case 1: + for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) + vm_page_dirty(m); + break; + case 2: + vm_page_dirty(m); + break; + } + } + + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); + + /* Mark free */ + pc->pc_map[field] |= bitmask; + switch (lvl) { + case 1: + pmap_resident_count_dec(pmap, + L2_SIZE / PAGE_SIZE); + pvh = pa_to_pvh(tpte & ~ATTR_MASK); + TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); + pvh->pv_gen++; + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) + if ((mt->aflags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&mt->md.pv_list)) + vm_page_aflag_clear(mt, PGA_WRITEABLE); + } + ml3 = pmap_remove_pt_page(pmap, + pv->pv_va); + if (ml3 != NULL) { + pmap_resident_count_dec(pmap,1); + KASSERT(ml3->wire_count == NL3PG, + ("pmap_remove_pages: l3 page wire count error")); + ml3->wire_count = 0; + pmap_add_delayed_free_list(ml3, + &free, FALSE); + atomic_subtract_int( + &vm_cnt.v_wire_count, 1); + } + break; + case 2: + pmap_resident_count_dec(pmap, 1); + TAILQ_REMOVE(&m->md.pv_list, pv, + pv_next); + m->md.pv_gen++; + if ((m->aflags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh( + VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, + PGA_WRITEABLE); + } + break; + } + pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), + &free); + freed++; + } + } + PV_STAT(atomic_add_long(&pv_entry_frees, freed)); + PV_STAT(atomic_add_int(&pv_entry_spare, freed)); + PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); + if (allfree) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); + } + } + pmap_invalidate_all(pmap); + if (lock != NULL) + rw_wunlock(lock); + PMAP_UNLOCK(pmap); + pmap_free_zero_pages(&free); +} + +/* + * This is used to check if a page has been accessed or modified. 
As we
+ * don't have a bit to see if it has been modified we have to assume it
+ * has been if the page is read/write.
+ */
+static boolean_t
+pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
+{
+	struct rwlock *lock;
+	pv_entry_t pv;
+	struct md_page *pvh;
+	pt_entry_t *pte, mask, value;
+	pmap_t pmap;
+	int lvl, md_gen, pvh_gen;
+	boolean_t rv;
+
+	/*
+	 * Returns TRUE as soon as one mapping of "m" satisfies every
+	 * requested test: "modified" requires the access-permission bits to
+	 * read as read/write, "accessed" requires ATTR_AF together with a
+	 * valid descriptor of the expected kind.  4KB mappings are examined
+	 * first, then 2MB mappings.
+	 */
+	rv = FALSE;
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	rw_rlock(lock);
+restart:
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			/*
+			 * Drop the pv list lock to block on the pmap lock,
+			 * then restart if the pv list changed meanwhile.
+			 */
+			md_gen = m->md.pv_gen;
+			rw_runlock(lock);
+			PMAP_LOCK(pmap);
+			rw_rlock(lock);
+			if (md_gen != m->md.pv_gen) {
+				PMAP_UNLOCK(pmap);
+				goto restart;
+			}
+		}
+		pte = pmap_pte(pmap, pv->pv_va, &lvl);
+		KASSERT(lvl == 3,
+		    ("pmap_page_test_mappings: Invalid level %d", lvl));
+		mask = 0;
+		value = 0;
+		if (modified) {
+			mask |= ATTR_AP_RW_BIT;
+			value |= ATTR_AP(ATTR_AP_RW);
+		}
+		if (accessed) {
+			mask |= ATTR_AF | ATTR_DESCR_MASK;
+			value |= ATTR_AF | L3_PAGE;
+		}
+		rv = (pmap_load(pte) & mask) == value;
+		PMAP_UNLOCK(pmap);
+		if (rv)
+			goto out;
+	}
+	if ((m->flags & PG_FICTITIOUS) == 0) {
+		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+			pmap = PV_PMAP(pv);
+			if (!PMAP_TRYLOCK(pmap)) {
+				md_gen = m->md.pv_gen;
+				pvh_gen = pvh->pv_gen;
+				rw_runlock(lock);
+				PMAP_LOCK(pmap);
+				rw_rlock(lock);
+				if (md_gen != m->md.pv_gen ||
+				    pvh_gen != pvh->pv_gen) {
+					PMAP_UNLOCK(pmap);
+					goto restart;
+				}
+			}
+			pte = pmap_pte(pmap, pv->pv_va, &lvl);
+			KASSERT(lvl == 2,
+			    ("pmap_page_test_mappings: Invalid level %d", lvl));
+			/* Same test as above, against a block descriptor. */
+			mask = 0;
+			value = 0;
+			if (modified) {
+				mask |= ATTR_AP_RW_BIT;
+				value |= ATTR_AP(ATTR_AP_RW);
+			}
+			if (accessed) {
+				mask |= ATTR_AF | ATTR_DESCR_MASK;
+				value |= ATTR_AF | L2_BLOCK;
+			}
+			rv = (pmap_load(pte) & mask) == value;
+			PMAP_UNLOCK(pmap);
+			if (rv)
+				goto out;
+		}
+	}
+out:
+	rw_runlock(lock);
+	return (rv);
+}
+
+/*
+ *	pmap_is_modified:
+ *
+ *	Return whether
or not the specified physical page was modified
+ *	in any physical maps.
+ */
+boolean_t
+pmap_is_modified(vm_page_t m)
+{
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("pmap_is_modified: page %p is not managed", m));
+
+	/*
+	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
+	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
+	 * is clear, no mapping of the page can be writeable, and a writeable
+	 * mapping is what pmap_page_test_mappings() reports as modified.
+	 */
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
+		return (FALSE);
+	return (pmap_page_test_mappings(m, FALSE, TRUE));
+}
+
+/*
+ *	pmap_is_prefaultable:
+ *
+ *	Return whether or not the specified virtual address is eligible
+ *	for prefault.
+ *
+ *	NOTE(review): as written this returns TRUE when a non-zero entry
+ *	already exists for addr; confirm that this is the intended sense.
+ */
+boolean_t
+pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
+{
+	pt_entry_t *pte;
+	boolean_t rv;
+	int lvl;
+
+	rv = FALSE;
+	PMAP_LOCK(pmap);
+	pte = pmap_pte(pmap, addr, &lvl);
+	if (pte != NULL && pmap_load(pte) != 0) {
+		rv = TRUE;
+	}
+	PMAP_UNLOCK(pmap);
+	return (rv);
+}
+
+/*
+ *	pmap_is_referenced:
+ *
+ *	Return whether or not the specified physical page was referenced
+ *	in any physical maps.
+ */
+boolean_t
+pmap_is_referenced(vm_page_t m)
+{
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("pmap_is_referenced: page %p is not managed", m));
+	return (pmap_page_test_mappings(m, TRUE, FALSE));
+}
+
+/*
+ * Clear the write and modified bits in each of the given page's mappings.
+ *
+ * Writeable 2MB mappings covering the page are demoted first; afterwards
+ * every writeable 4KB mapping is switched to read-only and its TLB entry
+ * is invalidated.  PGA_WRITEABLE is cleared on the page at the end.
+ */
+void
+pmap_remove_write(vm_page_t m)
+{
+	struct md_page *pvh;
+	pmap_t pmap;
+	struct rwlock *lock;
+	pv_entry_t next_pv, pv;
+	pt_entry_t oldpte, *pte;
+	vm_offset_t va;
+	int lvl, md_gen, pvh_gen;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("pmap_remove_write: page %p is not managed", m));
+
+	/*
+	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
+	 * set by another thread while the object is locked.  Thus,
+	 * if PGA_WRITEABLE is clear, no page table entries need updating.
+	 */
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
+		return;
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
+retry_pv_loop:
+	rw_wlock(lock);
+	/* Pass 1: demote writeable superpage mappings of this page. */
+	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			/*
+			 * Block on the pmap lock without holding the pv list
+			 * lock; start over if the list changed meanwhile.
+			 */
+			pvh_gen = pvh->pv_gen;
+			rw_wunlock(lock);
+			PMAP_LOCK(pmap);
+			rw_wlock(lock);
+			if (pvh_gen != pvh->pv_gen) {
+				PMAP_UNLOCK(pmap);
+				rw_wunlock(lock);
+				goto retry_pv_loop;
+			}
+		}
+		va = pv->pv_va;
+		pte = pmap_pte(pmap, pv->pv_va, &lvl);
+		if ((pmap_load(pte) & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
+			pmap_demote_l2_locked(pmap, pte, va & ~L2_OFFSET,
+			    &lock);
+		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
+		    ("inconsistent pv lock %p %p for page %p",
+		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
+		PMAP_UNLOCK(pmap);
+	}
+	/* Pass 2: write-protect every 4KB mapping of this page. */
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+		pmap = PV_PMAP(pv);
+		if (!PMAP_TRYLOCK(pmap)) {
+			pvh_gen = pvh->pv_gen;
+			md_gen = m->md.pv_gen;
+			rw_wunlock(lock);
+			PMAP_LOCK(pmap);
+			rw_wlock(lock);
+			if (pvh_gen != pvh->pv_gen ||
+			    md_gen != m->md.pv_gen) {
+				PMAP_UNLOCK(pmap);
+				rw_wunlock(lock);
+				goto retry_pv_loop;
+			}
+		}
+		pte = pmap_pte(pmap, pv->pv_va, &lvl);
+retry:
+		oldpte = pmap_load(pte);
+		if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) {
+			/*
+			 * Compare-and-set so that a concurrent change to the
+			 * entry is not lost; reload and retry on failure.
+			 */
+			if (!atomic_cmpset_long(pte, oldpte,
+			    oldpte | ATTR_AP(ATTR_AP_RO)))
+				goto retry;
+			/* A writeable mapping that was accessed counts as dirty. */
+			if ((oldpte & ATTR_AF) != 0)
+				vm_page_dirty(m);
+			pmap_invalidate_page(pmap, pv->pv_va);
+		}
+		PMAP_UNLOCK(pmap);
+	}
+	rw_wunlock(lock);
+	vm_page_aflag_clear(m, PGA_WRITEABLE);
+}
+
+/*
+ * Report whether the reference (access) flag of "pte" may simply be
+ * cleared.  This implementation always answers FALSE, so callers take
+ * their fallback path instead.
+ */
+static __inline boolean_t
+safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
+{
+
+	return (FALSE);
+}
+
+/*
+ *	pmap_ts_referenced:
+ *
+ *	Return a count of reference bits for a page, clearing those bits.
+ * It is not necessary for every reference bit to be cleared, but it + * is necessary that 0 only be returned when there are truly no + * reference bits set. + * + * As an optimization, update the page's dirty field if a modified bit is + * found while counting reference bits. This opportunistic update can be + * performed at low cost and can eliminate the need for some future calls + * to pmap_is_modified(). However, since this function stops after + * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some + * dirty pages. Those dirty pages will only be detected by a future call + * to pmap_is_modified(). + */ +int +pmap_ts_referenced(vm_page_t m) +{ + struct md_page *pvh; + pv_entry_t pv, pvf; + pmap_t pmap; + struct rwlock *lock; + pd_entry_t *pde, tpde; + pt_entry_t *pte, tpte; + pt_entry_t *l3; + vm_offset_t va; + vm_paddr_t pa; + int cleared, md_gen, not_cleared, lvl, pvh_gen; + struct spglist free; + bool demoted; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_ts_referenced: page %p is not managed", m)); + SLIST_INIT(&free); + cleared = 0; + pa = VM_PAGE_TO_PHYS(m); + lock = PHYS_TO_PV_LIST_LOCK(pa); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); + rw_wlock(lock); +retry: + not_cleared = 0; + if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) + goto small_mappings; + pv = pvf; + do { + if (pvf == NULL) + pvf = pv; + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto retry; + } + } + va = pv->pv_va; + pde = pmap_pde(pmap, pv->pv_va, &lvl); + KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); + KASSERT(lvl == 1, + ("pmap_ts_referenced: invalid pde level %d", lvl)); + tpde = pmap_load(pde); + KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, + ("pmap_ts_referenced: found an invalid l1 table")); + pte = pmap_l1_to_l2(pde, pv->pv_va); + tpte = pmap_load(pte); + if (pmap_page_dirty(tpte)) { + /* + * Although "tpte" is mapping a 2MB page, because + * this function is called at a 4KB page granularity, + * we only update the 4KB page under test. + */ + vm_page_dirty(m); + } + if ((tpte & ATTR_AF) != 0) { + /* + * Since this reference bit is shared by 512 4KB + * pages, it should not be cleared every time it is + * tested. Apply a simple "hash" function on the + * physical page number, the virtual superpage number, + * and the pmap address to select one 4KB page out of + * the 512 on which testing the reference bit will + * result in clearing that reference bit. This + * function is designed to avoid the selection of the + * same 4KB page for every 2MB page mapping. + * + * On demotion, a mapping that hasn't been referenced + * is simply destroyed. To avoid the possibility of a + * subsequent page fault on a demoted wired mapping, + * always leave its reference bit set. Moreover, + * since the superpage is wired, the current state of + * its reference bit won't affect page replacement. 
+ */ + if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ + (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && + (tpte & ATTR_SW_WIRED) == 0) { + if (safe_to_clear_referenced(pmap, tpte)) { + /* + * TODO: We don't handle the access + * flag at all. We need to be able + * to set it in the exception handler. + */ + panic("ARM64TODO: " + "safe_to_clear_referenced\n"); + } else if (pmap_demote_l2_locked(pmap, pte, + pv->pv_va, &lock) != NULL) { + demoted = true; + va += VM_PAGE_TO_PHYS(m) - + (tpte & ~ATTR_MASK); + l3 = pmap_l2_to_l3(pte, va); + pmap_remove_l3(pmap, l3, va, + pmap_load(pte), NULL, &lock); + } else + demoted = true; + + if (demoted) { + /* + * The superpage mapping was removed + * entirely and therefore 'pv' is no + * longer valid. + */ + if (pvf == pv) + pvf = NULL; + pv = NULL; + } + cleared++; + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + } else + not_cleared++; + } + PMAP_UNLOCK(pmap); + /* Rotate the PV list if it has more than one entry. 
*/ + if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + } + if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) + goto out; + } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); +small_mappings: + if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) + goto out; + pv = pvf; + do { + if (pvf == NULL) + pvf = pv; + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto retry; + } + } + pde = pmap_pde(pmap, pv->pv_va, &lvl); + KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); + KASSERT(lvl == 2, + ("pmap_ts_referenced: invalid pde level %d", lvl)); + tpde = pmap_load(pde); + KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, + ("pmap_ts_referenced: found an invalid l2 table")); + pte = pmap_l2_to_l3(pde, pv->pv_va); + tpte = pmap_load(pte); + if (pmap_page_dirty(tpte)) + vm_page_dirty(m); + if ((tpte & ATTR_AF) != 0) { + if (safe_to_clear_referenced(pmap, tpte)) { + /* + * TODO: We don't handle the access flag + * at all. We need to be able to set it in + * the exception handler. + */ + panic("ARM64TODO: safe_to_clear_referenced\n"); + } else if ((tpte & ATTR_SW_WIRED) == 0) { + /* + * Wired pages cannot be paged out so + * doing accessed bit emulation for + * them is wasted effort. We do the + * hard work for unwired pages only. + */ + pmap_remove_l3(pmap, pte, pv->pv_va, tpde, + &free, &lock); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; + if (pvf == pv) + pvf = NULL; + pv = NULL; + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + } else + not_cleared++; + } + PMAP_UNLOCK(pmap); + /* Rotate the PV list if it has more than one entry. 
*/ + if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + } + } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + + not_cleared < PMAP_TS_REFERENCED_MAX); +out: + rw_wunlock(lock); + pmap_free_zero_pages(&free); + return (cleared + not_cleared); +} + +/* + * Apply the given advice to the specified range of addresses within the + * given pmap. Depending on the advice, clear the referenced and/or + * modified flags in each mapping and set the mapped page's dirty field. + */ +void +pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) +{ +} + +/* + * Clear the modify bits on the specified physical page. + */ +void +pmap_clear_modify(vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_clear_modify: page %p is not managed", m)); + VM_OBJECT_ASSERT_WLOCKED(m->object); + KASSERT(!vm_page_xbusied(m), + ("pmap_clear_modify: page %p is exclusive busied", m)); + + /* + * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. + * If the object containing the page is locked and the page is not + * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. + */ + if ((m->aflags & PGA_WRITEABLE) == 0) + return; + + /* ARM64TODO: We lack support for tracking if a page is modified */ +} + +void * +pmap_mapbios(vm_paddr_t pa, vm_size_t size) +{ + + return ((void *)PHYS_TO_DMAP(pa)); +} + +void +pmap_unmapbios(vm_paddr_t pa, vm_size_t size) +{ +} + +/* + * Sets the memory attribute for the specified page. + */ +void +pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) +{ + + m->md.pv_memattr = ma; + + /* + * If "m" is a normal page, update its direct mapping. This update + * can be relied upon to perform any cache operations that are + * required for data coherence. 
+ */ + if ((m->flags & PG_FICTITIOUS) == 0 && + pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, + m->md.pv_memattr) != 0) + panic("memory attribute change on the direct map failed"); +} + +/* + * Changes the specified virtual address range's memory type to that given by + * the parameter "mode". The specified virtual address range must be + * completely contained within either the direct map or the kernel map. If + * the virtual address range is contained within the kernel map, then the + * memory type for each of the corresponding ranges of the direct map is also + * changed. (The corresponding ranges of the direct map are those ranges that + * map the same physical pages as the specified virtual address range.) These + * changes to the direct map are necessary because Intel describes the + * behavior of their processors as "undefined" if two or more mappings to the + * same physical page have different memory types. + * + * Returns zero if the change completed successfully, and either EINVAL or + * ENOMEM if the change failed. Specifically, EINVAL is returned if some part + * of the virtual address range was not mapped, and ENOMEM is returned if + * there was insufficient memory available to complete the change. In the + * latter case, the memory type may have been changed on some part of the + * virtual address range or the direct map. 
+ */ +static int +pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) +{ + int error; + + PMAP_LOCK(kernel_pmap); + error = pmap_change_attr_locked(va, size, mode); + PMAP_UNLOCK(kernel_pmap); + return (error); +} + +static int +pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) +{ + vm_offset_t base, offset, tmpva; + pt_entry_t l3, *pte, *newpte; + int lvl; + + PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); + base = trunc_page(va); + offset = va & PAGE_MASK; + size = round_page(offset + size); + + if (!VIRT_IN_DMAP(base)) + return (EINVAL); + + for (tmpva = base; tmpva < base + size; ) { + pte = pmap_pte(kernel_pmap, tmpva, &lvl); + if (pte == NULL) + return (EINVAL); + + if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) { + /* + * We already have the correct attribute, + * ignore this entry. + */ + switch (lvl) { + default: + panic("Invalid DMAP table level: %d\n", lvl); + case 1: + tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; + break; + case 2: + tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; + break; + case 3: + tmpva += PAGE_SIZE; + break; + } + } else { + /* + * Split the entry to a level 3 table, then + * set the new attribute. + */ + switch (lvl) { + default: + panic("Invalid DMAP table level: %d\n", lvl); + case 1: + newpte = pmap_demote_l1(kernel_pmap, pte, + tmpva & ~L1_OFFSET); + if (newpte == NULL) + return (EINVAL); + pte = pmap_l1_to_l2(pte, tmpva); /* FALLTHROUGH */ + case 2: + newpte = pmap_demote_l2(kernel_pmap, pte, + tmpva & ~L2_OFFSET); + if (newpte == NULL) + return (EINVAL); + pte = pmap_l2_to_l3(pte, tmpva); /* FALLTHROUGH */ + case 3: + /* Update the entry */ + l3 = pmap_load(pte); + l3 &= ~ATTR_IDX_MASK; + l3 |= ATTR_IDX(mode); + if (mode == DEVICE_MEMORY) + l3 |= ATTR_XN; + + pmap_update_entry(kernel_pmap, pte, l3, tmpva, + PAGE_SIZE); + + /* + * If moving to a non-cacheable entry flush + * the cache. 
+ */ + if (mode == VM_MEMATTR_UNCACHEABLE) + cpu_dcache_wbinv_range(tmpva, L3_SIZE); + + break; + } + tmpva += PAGE_SIZE; + } + } + + return (0); +} + +/* + * Create an L2 table to map all addresses within an L1 mapping. + */ +static pt_entry_t * +pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) +{ + pt_entry_t *l2, newl2, oldl1; + vm_offset_t tmpl1; + vm_paddr_t l2phys, phys; + vm_page_t ml2; + int i; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldl1 = pmap_load(l1); + KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, + ("pmap_demote_l1: Demoting a non-block entry")); + KASSERT((va & L1_OFFSET) == 0, + ("pmap_demote_l1: Invalid virtual address %#lx", va)); + KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, + ("pmap_demote_l1: Level 1 table shouldn't be managed")); + + tmpl1 = 0; + if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { + tmpl1 = kva_alloc(PAGE_SIZE); + if (tmpl1 == 0) + return (NULL); + } + + if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" + " in pmap %p", va, pmap); + /* Release the temporary KVA reserved above. */ + if (tmpl1 != 0) + kva_free(tmpl1, PAGE_SIZE); + return (NULL); + } + + l2phys = VM_PAGE_TO_PHYS(ml2); + l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); + + /* Address the range points at */ + phys = oldl1 & ~ATTR_MASK; + /* The attributes from the old l1 table to be copied */ + newl2 = oldl1 & ATTR_MASK; + + /* Create the new entries */ + for (i = 0; i < Ln_ENTRIES; i++) { + l2[i] = newl2 | phys; + phys += L2_SIZE; + } + KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), + ("Invalid l2 page (%lx != %lx)", l2[0], + (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); + + if (tmpl1 != 0) { + pmap_kenter(tmpl1, PAGE_SIZE, + DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY); + l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); + } + + pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); + + if (tmpl1 != 0) { + pmap_kremove(tmpl1); + kva_free(tmpl1, PAGE_SIZE); + } + + return (l2); +} + +/* + * Create an L3 
table to map all addresses within an L2 mapping. + */ +static pt_entry_t * +pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, + struct rwlock **lockp) +{ + pt_entry_t *l3, newl3, oldl2; + vm_offset_t tmpl2; + vm_paddr_t l3phys, phys; + vm_page_t ml3; + int i; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + l3 = NULL; + oldl2 = pmap_load(l2); + KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, + ("pmap_demote_l2: Demoting a non-block entry")); + KASSERT((va & L2_OFFSET) == 0, + ("pmap_demote_l2: Invalid virtual address %#lx", va)); + + tmpl2 = 0; + if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { + tmpl2 = kva_alloc(PAGE_SIZE); + if (tmpl2 == 0) + return (NULL); + } + + if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { + ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), + (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + if (ml3 == NULL) { + CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" + " in pmap %p", va, pmap); + goto fail; + } + if (va < VM_MAXUSER_ADDRESS) + pmap_resident_count_inc(pmap, 1); + } + + l3phys = VM_PAGE_TO_PHYS(ml3); + l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); + + /* Address the range points at */ + phys = oldl2 & ~ATTR_MASK; + /* The attributed from the old l2 table to be copied */ + newl3 = (oldl2 & (ATTR_MASK & ~ATTR_DESCR_MASK)) | L3_PAGE; + + /* + * If the page table page is new, initialize it. + */ + if (ml3->wire_count == 1) { + for (i = 0; i < Ln_ENTRIES; i++) { + l3[i] = newl3 | phys; + phys += L3_SIZE; + } + } + KASSERT(l3[0] == ((oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE), + ("Invalid l3 page (%lx != %lx)", l3[0], + (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE)); + + /* + * Map the temporary page so we don't lose access to the l2 table. 
+ */ + if (tmpl2 != 0) { + pmap_kenter(tmpl2, PAGE_SIZE, + DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY); + l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); + } + + /* + * The spare PV entries must be reserved prior to demoting the + * mapping, that is, prior to changing the PDE. Otherwise, the state + * of the L2 and the PV lists will be inconsistent, which can result + * in reclaim_pv_chunk() attempting to remove a PV entry from the + * wrong PV list and pmap_pv_demote_l2() failing to find the expected + * PV entry for the 2MB page mapping that is being demoted. + */ + if ((oldl2 & ATTR_SW_MANAGED) != 0) + reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); + + pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); + + /* + * Demote the PV entry. + */ + if ((oldl2 & ATTR_SW_MANAGED) != 0) + pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); + + atomic_add_long(&pmap_l2_demotions, 1); + CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" + " in pmap %p %lx", va, pmap, l3[0]); + +fail: + if (tmpl2 != 0) { + pmap_kremove(tmpl2); + kva_free(tmpl2, PAGE_SIZE); + } + + return (l3); + +} + +static pt_entry_t * +pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) +{ + struct rwlock *lock; + pt_entry_t *l3; + + lock = NULL; + l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); + if (lock != NULL) + rw_wunlock(lock); + return (l3); +} + +/* + * perform the pmap work for mincore + */ +int +pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) +{ + pd_entry_t *l1p, l1; + pd_entry_t *l2p, l2; + pt_entry_t *l3p, l3; + vm_paddr_t pa; + bool managed; + int val; + + PMAP_LOCK(pmap); +retry: + pa = 0; + val = 0; + managed = false; + + l1p = pmap_l1(pmap, addr); + if (l1p == NULL) /* No l1 */ + goto done; + + l1 = pmap_load(l1p); + if ((l1 & ATTR_DESCR_MASK) == L1_INVAL) + goto done; + + if ((l1 & ATTR_DESCR_MASK) == L1_BLOCK) { + pa = (l1 & ~ATTR_MASK) | (addr & L1_OFFSET); + managed = (l1 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; 
+ val = MINCORE_SUPER | MINCORE_INCORE; + if (pmap_page_dirty(l1)) + val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; + if ((l1 & ATTR_AF) == ATTR_AF) + val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; + goto done; + } + + l2p = pmap_l1_to_l2(l1p, addr); + if (l2p == NULL) /* No l2 */ + goto done; + + l2 = pmap_load(l2p); + if ((l2 & ATTR_DESCR_MASK) == L2_INVAL) + goto done; + + if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK) { + pa = (l2 & ~ATTR_MASK) | (addr & L2_OFFSET); + managed = (l2 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; + val = MINCORE_SUPER | MINCORE_INCORE; + if (pmap_page_dirty(l2)) + val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; + if ((l2 & ATTR_AF) == ATTR_AF) + val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; + goto done; + } + + l3p = pmap_l2_to_l3(l2p, addr); + if (l3p == NULL) /* No l3 */ + goto done; + + l3 = pmap_load(l3p); + if ((l3 & ATTR_DESCR_MASK) == L3_INVAL) + goto done; + + if ((l3 & ATTR_DESCR_MASK) == L3_PAGE) { + pa = (l3 & ~ATTR_MASK) | (addr & L3_OFFSET); + managed = (l3 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; + val = MINCORE_INCORE; + if (pmap_page_dirty(l3)) + val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; + if ((l3 & ATTR_AF) == ATTR_AF) + val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; + } + +done: + if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != + (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { + /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
*/ + if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) + goto retry; + } else + PA_UNLOCK_COND(*locked_pa); + PMAP_UNLOCK(pmap); + + return (val); +} + +void +pmap_activate(struct thread *td) +{ + pmap_t pmap; + + critical_enter(); + pmap = vmspace_pmap(td->td_proc->p_vmspace); + td->td_pcb->pcb_l0addr = vtophys(pmap->pm_l0); + __asm __volatile("msr ttbr0_el1, %0" : : "r"(td->td_pcb->pcb_l0addr)); + pmap_invalidate_all(pmap); + critical_exit(); +} + +void +pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) +{ + + if (va >= VM_MIN_KERNEL_ADDRESS) { + cpu_icache_sync_range(va, sz); + } else { + u_int len, offset; + vm_paddr_t pa; + + /* Find the length of data in this page to flush */ + offset = va & PAGE_MASK; + len = imin(PAGE_SIZE - offset, sz); + + while (sz != 0) { + /* Extract the physical address & find it in the DMAP */ + pa = pmap_extract(pmap, va); + if (pa != 0) + cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); + + /* Move to the next page */ + sz -= len; + va += len; + /* Set the length for the next iteration */ + len = imin(PAGE_SIZE, sz); + } + } +} + +int +pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) +{ +#ifdef SMP + uint64_t par; +#endif + + switch (ESR_ELx_EXCEPTION(esr)) { + case EXCP_DATA_ABORT_L: + case EXCP_DATA_ABORT: + break; + default: + return (KERN_FAILURE); + } + +#ifdef SMP + PMAP_LOCK(pmap); + switch (esr & ISS_DATA_DFSC_MASK) { + case ISS_DATA_DFSC_TF_L0: + case ISS_DATA_DFSC_TF_L1: + case ISS_DATA_DFSC_TF_L2: + case ISS_DATA_DFSC_TF_L3: + /* Ask the MMU to check the address */ + if (pmap == kernel_pmap) + par = arm64_address_translate_s1e1r(far); + else + par = arm64_address_translate_s1e0r(far); + + /* + * If the translation was successful the address was invalid + * due to a break-before-make sequence. We can unlock and + * return success to the trap handler. 
+ */ + if (PAR_SUCCESS(par)) { + PMAP_UNLOCK(pmap); + return (KERN_SUCCESS); + } + break; + default: + break; + } + PMAP_UNLOCK(pmap); +#endif + + return (KERN_FAILURE); +} + +/* + * Increase the starting virtual address of the given mapping if a + * different alignment might result in more superpage mappings. + */ +void +pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + vm_offset_t superpage_offset; + + if (size < L2_SIZE) + return; + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + superpage_offset = offset & L2_OFFSET; + if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || + (*addr & L2_OFFSET) == superpage_offset) + return; + if ((*addr & L2_OFFSET) < superpage_offset) + *addr = (*addr & ~L2_OFFSET) + superpage_offset; + else + *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; +} + +/** + * Get the kernel virtual address of a set of physical pages. If there are + * physical addresses not covered by the DMAP perform a transient mapping + * that will be removed when calling pmap_unmap_io_transient. + * + * \param page The pages the caller wishes to obtain the virtual + * address on the kernel memory map. + * \param vaddr On return contains the kernel virtual memory address + * of the pages passed in the page parameter. + * \param count Number of pages passed in. + * \param can_fault TRUE if the thread using the mapped pages can take + * page faults, FALSE otherwise. + * + * \returns TRUE if the caller must call pmap_unmap_io_transient when + * finished or FALSE otherwise. + * + */ +boolean_t +pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, + boolean_t can_fault) +{ + vm_paddr_t paddr; + boolean_t needs_mapping; + int error, i; + + /* + * Allocate any KVA space that we need, this is done in a separate + * loop to prevent calling vmem_alloc while pinned. 
+ */ + needs_mapping = FALSE; + for (i = 0; i < count; i++) { + paddr = VM_PAGE_TO_PHYS(page[i]); + if (__predict_false(!PHYS_IN_DMAP(paddr))) { + error = vmem_alloc(kernel_arena, PAGE_SIZE, + M_BESTFIT | M_WAITOK, &vaddr[i]); + KASSERT(error == 0, ("vmem_alloc failed: %d", error)); + needs_mapping = TRUE; + } else { + vaddr[i] = PHYS_TO_DMAP(paddr); + } + } + + /* Exit early if everything is covered by the DMAP */ + if (!needs_mapping) + return (FALSE); + + if (!can_fault) + sched_pin(); + for (i = 0; i < count; i++) { + paddr = VM_PAGE_TO_PHYS(page[i]); + if (!PHYS_IN_DMAP(paddr)) { + panic( + "pmap_map_io_transient: TODO: Map out of DMAP data"); + } + } + + return (needs_mapping); +} + +void +pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, + boolean_t can_fault) +{ + vm_paddr_t paddr; + int i; + + if (!can_fault) + sched_unpin(); + for (i = 0; i < count; i++) { + paddr = VM_PAGE_TO_PHYS(page[i]); + if (!PHYS_IN_DMAP(paddr)) { + panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); + } + } +} Index: sys/arm64/include/armreg.h =================================================================== --- sys/arm64/include/armreg.h +++ sys/arm64/include/armreg.h @@ -209,7 +209,7 @@ #define ISS_DATA_DFSC_TLB_CONFLICT (0x30 << 0) #define ESR_ELx_IL (0x01 << 25) #define ESR_ELx_EC_SHIFT 26 -#define ESR_ELx_EC_MASK (0x3f << 26) +#define ESR_ELx_EC_MASK (0x3f << ESR_ELx_EC_SHIFT) #define ESR_ELx_EXCEPTION(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) #define EXCP_UNKNOWN 0x00 /* Unkwn exception */ #define EXCP_TRAP_WFI_WFE 0x01 /* Trapped WFI or WFE */ Index: sys/arm64/include/bitops.h =================================================================== --- /dev/null +++ sys/arm64/include/bitops.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) TODO + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _ARM_BITOPS_H_ +#define _ARM_BITOPS_H_ + +#include <sys/bitstring.h> + +#define for_each_set_bit(bit, addr, size) \ + for (bit_ffs((bitstr_t *)(addr), (size), (int *)&(bit)); \ + (bit) != -1; \ + bit_ffs_at((bitstr_t *)(addr), (bit) + 1, (size), (int *)&(bit))) + +/* same as for_each_set_bit() but use bit as value to start with */ +#define for_each_set_bit_from(bit, addr, size) \ + for (bit_ffs_at((bitstr_t *)(addr), (bit), (size), (int *)&(bit)); \ + (bit) != -1; \ + bit_ffs_at((bitstr_t *)(addr), (bit) + 1, (size), (int *)&(bit))) + +#define for_each_clear_bit(bit, addr, size) \ + for (bit_ffc((bitstr_t *)(addr), (size), (int *)&(bit)); \ + (bit) != -1; \ + bit_ffc_at((bitstr_t *)(addr), (bit) + 1, (size), (int *)&(bit))) + +/* same as for_each_clear_bit() but use bit as value to start with */ +#define for_each_clear_bit_from(bit, addr, size) \ + for (bit_ffc_at((bitstr_t *)(addr), (bit), (size), (int *)&(bit)); \ + (bit) != -1; \ + bit_ffc_at((bitstr_t *)(addr), (bit) + 1, (size), (int *)&(bit))) + +#endif /* _ARM_BITOPS_H_ */ Index: sys/arm64/include/cpu.h =================================================================== --- sys/arm64/include/cpu.h +++ sys/arm64/include/cpu.h @@ -115,6 +115,7 @@ #define CPU_IMPL_TO_MIDR(val) (((val) & 0xff) << 24) #define CPU_PART_TO_MIDR(val) (((val) & 0xfff) << 4) +#define CPU_ARCH_TO_MIDR(val) (((val) & 0xf) << 16) #define CPU_VAR_TO_MIDR(val) (((val) & 0xf) << 20) #define CPU_REV_TO_MIDR(val) (((val) & 0xf) << 0) Index: sys/arm64/include/hypervisor.h =================================================================== --- sys/arm64/include/hypervisor.h +++ sys/arm64/include/hypervisor.h @@ -182,4 +182,35 @@ #define VTTBR_VMID_SHIFT 48 #define VTTBR_HOST 0x0000000000000000 +/* VTCR_EL2 - Virtualization Translation Control Register */ +#define VTCR_EL2_RES1 (0x1UL << 31) +#define VTCR_EL2_T0SZ_MASK 0x3f +#define VTCR_EL2_SL0_SHIFT 6 +#define VTCR_EL2_SL0_4K_LVL2 (0x0 << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_4K_LVL1 (0x1 << 
VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_4K_LVL0 (0x2 << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_IRGN0_SHIFT 8 +#define VTCR_EL2_IRGN0_WBWA (0x1 << VTCR_EL2_IRGN0_SHIFT) +#define VTCR_EL2_ORGN0_SHIFT 10 +#define VTCR_EL2_ORGN0_WBWA (0x1 << VTCR_EL2_ORGN0_SHIFT) +#define VTCR_EL2_SH0_SHIFT 12 +#define VTCR_EL2_SH0_NS (0x0 << VTCR_EL2_SH0_SHIFT) +#define VTCR_EL2_SH0_OS (0x2 << VTCR_EL2_SH0_SHIFT) +#define VTCR_EL2_SH0_IS (0x3 << VTCR_EL2_SH0_SHIFT) +#define VTCR_EL2_TG0_SHIFT 14 +#define VTCR_EL2_TG0_4K (0x0 << VTCR_EL2_TG0_SHIFT) +#define VTCR_EL2_TG0_64K (0x1 << VTCR_EL2_TG0_SHIFT) +#define VTCR_EL2_TG0_16K (0x2 << VTCR_EL2_TG0_SHIFT) +#define VTCR_EL2_PS_SHIFT 16 +#define VTCR_EL2_PS_32BIT (0x0 << VTCR_EL2_PS_SHIFT) +#define VTCR_EL2_PS_36BIT (0x1 << VTCR_EL2_PS_SHIFT) +#define VTCR_EL2_PS_40BIT (0x2 << VTCR_EL2_PS_SHIFT) +#define VTCR_EL2_PS_42BIT (0x3 << VTCR_EL2_PS_SHIFT) +#define VTCR_EL2_PS_44BIT (0x4 << VTCR_EL2_PS_SHIFT) +#define VTCR_EL2_PS_48BIT (0x5 << VTCR_EL2_PS_SHIFT) + +/* HPFAR_EL2 - Hypervisor IPA Fault Address Register */ +#define HPFAR_EL2_FIPA_SHIFT 4 +#define HPFAR_EL2_FIPA_MASK 0xfffffffff0 + #endif /* !_MACHINE_HYPERVISOR_H_ */ Index: sys/arm64/include/pcpu.h =================================================================== --- sys/arm64/include/pcpu.h +++ sys/arm64/include/pcpu.h @@ -43,6 +43,7 @@ u_int pc_acpi_id; /* ACPI CPU id */ \ u_int pc_midr; /* stored MIDR value */ \ uint64_t pc_clock; \ + void *pc_vcpu; \ pcpu_bp_harden pc_bp_harden; \ pcpu_ssbd pc_ssbd; \ struct pmap *pc_curpmap; \ Index: sys/arm64/include/pmap.h =================================================================== --- sys/arm64/include/pmap.h +++ sys/arm64/include/pmap.h @@ -188,6 +188,7 @@ pd_entry_t **, pt_entry_t **); int pmap_fault(pmap_t, uint64_t, uint64_t); +int pmap_pinit_type(pmap_t, enum pmap_stage); /* System MMU (SMMU). 
*/ int pmap_senter(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_prot_t prot, Index: sys/arm64/include/pte.h =================================================================== --- sys/arm64/include/pte.h +++ sys/arm64/include/pte.h @@ -99,6 +99,35 @@ #define ATTR_DESCR_TYPE_TABLE 2 #define ATTR_DESCR_TYPE_PAGE 2 #define ATTR_DESCR_TYPE_BLOCK 0 +/* Stage 2 translation Block and Page attributes */ +#define ATTR_ST2_AF ATTR_AF +#define ATTR_ST2_SH(x) ATTR_SH(x) +#define ATTR_ST2_SH_MASK ATTR_SH_MASK +#define ATTR_ST2_SH_NS ATTR_SH_NS /* Non-shareable */ +#define ATTR_ST2_SH_OS ATTR_SH_OS /* Outer-shareable */ +#define ATTR_ST2_SH_IS ATTR_SH_IS /* Inner-shareable */ +#define ATTR_ST2_S2AP(x) ((x) << 6) /* Data access permissions */ +#define ATTR_ST2_S2AP_NONE (0 << 1) +#define ATTR_ST2_S2AP_R0 (1 << 0) +#define ATTR_ST2_S2AP_W0 (1 << 1) +#define ATTR_ST2_S2AP_RW (3 << 0) +#define ATTR_ST2_MEMATTR(x) ((x) << 2) /* Memory attributes */ +#define ATTR_ST2_MEM_DEV (0 << 2) /* Device memory */ +#define ATTR_ST2_MEM_DEV_nGnRnE (0 << 0) +#define ATTR_ST2_MEM_DEV_nGnRE (1 << 0) +#define ATTR_ST2_MEM_DEV_nGRE (1 << 1) +#define ATTR_ST2_MEM_DEV_GRE (3 << 0) +#define ATTR_ST2_MEM_ONC (1 << 2) /* Outer Non-cacheable */ +#define ATTR_ST2_MEM_OWT (2 << 2) /* Outer Write-Through Cacheable */ +#define ATTR_ST2_MEM_OWB (3 << 2) /* Outer Write-Back Cacheable */ +#define ATTR_ST2_MEM_INC (1 << 0) /* Inner Non-cacheable */ +#define ATTR_ST2_MEM_IWT (1 << 1) /* Inner Write-Through Cacheable */ +#define ATTR_ST2_MEM_IWB (3 << 0) /* Inner Write-Back Cacheable */ + +#define ATTR_ST2_DEFAULT (ATTR_ST2_AF | ATTR_ST2_SH(ATTR_ST2_SH_IS) | \ + ATTR_ST2_S2AP(ATTR_ST2_S2AP_RW) | \ + ATTR_ST2_MEMATTR(ATTR_ST2_MEM_OWB | ATTR_ST2_MEM_IWB)) + /* Level 0 table, 512GiB per entry */ #define L0_SHIFT 39 Index: sys/arm64/include/vmm.h =================================================================== --- /dev/null +++ sys/arm64/include/vmm.h @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#include +#include +#include + +#include "pte.h" +#include "pmap.h" + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. 
+ */ +enum vm_reg_name { + VM_REG_GUEST_X0, + VM_REG_GUEST_X1, + VM_REG_GUEST_X2, + VM_REG_GUEST_X3, + VM_REG_GUEST_X4, + VM_REG_GUEST_X5, + VM_REG_GUEST_X6, + VM_REG_GUEST_X7, + VM_REG_GUEST_X8, + VM_REG_GUEST_X9, + VM_REG_GUEST_X10, + VM_REG_GUEST_X11, + VM_REG_GUEST_X12, + VM_REG_GUEST_X13, + VM_REG_GUEST_X14, + VM_REG_GUEST_X15, + VM_REG_GUEST_X16, + VM_REG_GUEST_X17, + VM_REG_GUEST_X18, + VM_REG_GUEST_X19, + VM_REG_GUEST_X20, + VM_REG_GUEST_X21, + VM_REG_GUEST_X22, + VM_REG_GUEST_X23, + VM_REG_GUEST_X24, + VM_REG_GUEST_X25, + VM_REG_GUEST_X26, + VM_REG_GUEST_X27, + VM_REG_GUEST_X28, + VM_REG_GUEST_X29, + VM_REG_GUEST_LR, + VM_REG_GUEST_SP, + VM_REG_GUEST_ELR, + VM_REG_GUEST_SPSR, + VM_REG_ELR_EL2, + VM_REG_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + +#define VM_GUEST_BASE_IPA 0x80000000UL /* Guest kernel start ipa */ + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_exception; +struct vm_memory_segment; +struct vm_exit; +struct vm_run; +struct vm_object; +struct pmap; +struct hypctx; + +typedef int (*vmm_init_func_t)(int ipinum); +typedef int (*vmm_cleanup_func_t)(void); +typedef void (*vmm_resume_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct pmap *pmap, void *rendezvous_cookie, + void *suspend_cookie); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef void (*vmi_mmap_set_func_t)(void *arg, vm_offset_t va, + vm_offset_t pa, size_t len, + vm_prot_t prot); +typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *arg, vm_offset_t va); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int 
(*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); +typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); +typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); + +struct vmm_ops { + /* Module-wide functions */ + vmm_init_func_t init; + vmm_cleanup_func_t cleanup; + vmm_resume_func_t resume; + /* VM specific functions */ + vmi_init_func_t vminit; + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_mmap_set_func_t vmmapset; + vmi_mmap_get_func_t vmmapget; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; +}; + +extern struct vmm_ops vmm_ops_arm; + +int vm_create(const char *name, struct vm **retvm); +void vm_destroy(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, uint64_t gpa, size_t len); +uint64_t vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t size); +int vm_gpabase2memseg(struct vm *vm, uint64_t gpabase, + struct vm_memory_segment *seg); +boolean_t vm_mem_allocated(struct vm *vm, uint64_t gpa); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_run(struct vm *vm, struct vm_run *vmrun); +void* vm_get_cookie(struct vm *vm); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_activate_cpu(struct vm *vm, int vcpu); +int vm_attach_vgic(struct vm *vm, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size); +int vm_assert_irq(struct vm *vm, uint32_t irq); +int vm_deassert_irq(struct vm *vm, uint32_t irq); +struct 
vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); + +#ifdef _SYS__CPUSET_H_ +/* + * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. + * The rendezvous 'func(arg)' is not allowed to do anything that will + * cause the thread to be put to sleep. + * + * If the rendezvous is being initiated from a vcpu context then the + * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. + * + * The caller cannot hold any locks when initiating the rendezvous. + * + * The implementation of this API may cause vcpus other than those specified + * by 'dest' to be stalled. The caller should not rely on any vcpus making + * forward progress when the rendezvous is in progress. + */ +typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); +void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, + vm_rendezvous_func_t func, void *arg); +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ + +extern uint64_t hypmode_enabled; +static __inline bool +virt_enabled() +{ + return (hypmode_enabled != 0); +} + +static __inline int +vcpu_rendezvous_pending(void *rendezvous_cookie) +{ + + return (*(uintptr_t *)rendezvous_cookie != 0); +} + +static __inline int +vcpu_suspended(void *suspend_cookie) +{ + + return (*(int *)suspend_cookie); +} + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, + bool from_idle); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +#ifdef _SYS_PROC_H_ +static int __inline 
+vcpu_should_yield(struct vm *vm, int vcpu) +{ + + if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) + return (1); + else if (curthread->td_owepreempt) + return (1); + else + return (0); +} +#endif + +void *vcpu_stats(struct vm *vm, int vcpu); +void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); + +/* + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. + * + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); + +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. 
+ */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +int vcpu_trace_exceptions(struct vm *vm, int vcpuid); +#endif /* _KERNEL */ + +#define VM_MAXCPU 1 + +#define VM_DIR_READ 0 +#define VM_DIR_WRITE 1 + +struct vie { + uint8_t access_size:4, sign_extend:1, dir:1, unused:2; + enum vm_reg_name reg; +}; + +struct vre { + uint32_t inst_syndrome; + uint8_t dir:1, unused:7; + enum vm_reg_name reg; +}; + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_MAX +}; +enum vm_exitcode { + VM_EXITCODE_BOGUS, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_REG_EMUL, + VM_EXITCODE_HVC, + VM_EXITCODE_SUSPENDED, + VM_EXITCODE_HYP, + VM_EXITCODE_WFI, + VM_EXITCODE_MAX +}; + +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; + uint64_t pc; + union { + /* + * ARM specific payload. + */ + struct { + uint32_t exception_nr; + uint32_t esr_el2; /* Exception Syndrome Register */ + uint64_t far_el2; /* Fault Address Register */ + uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ + } hyp; + struct { + struct vre vre; + } reg_emul; + struct { + uint64_t gpa; + int fault_type; + } paging; + struct { + uint64_t gpa; + struct vie vie; + } inst_emul; + + struct { + struct hypctx *hypctx; + } wfi; + /* + * VMX specific payload. 
Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int status; /* vmx inst status */ + /* + * 'exit_reason' and 'exit_qualification' are valid + * only if 'status' is zero. + */ + uint32_t exit_reason; + uint64_t exit_qualification; + /* + * 'inst_error' and 'inst_type' are valid + * only if 'status' is non-zero. + */ + int inst_type; + int inst_error; + } vmx; + /* + * SVM specific payload. + */ + struct { + uint64_t exitcode; + uint64_t exitinfo1; + uint64_t exitinfo2; + } svm; + struct { +#ifdef __aarch64__ +#else + uint32_t code; /* ecx value */ + uint64_t wval; +#endif + } msr; + struct { + int vcpu; + uint64_t rip; + } spinup_ap; + struct { + uint64_t rflags; + } hlt; + struct { + int vector; + } ioapic_eoi; + struct { + enum vm_suspend_how how; + } suspended; + struct vm_task_switch task_switch; + } u; +}; + +#endif /* _VMM_H_ */ Index: sys/arm64/include/vmm_dev.h =================================================================== --- /dev/null +++ sys/arm64/include/vmm_dev.h @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +int vmmdev_cleanup(void); +#endif + +struct vm_memory_segment { + uint64_t gpa; /* in */ + size_t len; + int wired; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_run { + int cpuid; + uint64_t pc; + struct vm_exit vm_exit; + +}; + +struct vm_exception { + int cpuid; + int vector; + uint32_t error_code; + int error_code_valid; + int restart_instruction; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + + +struct vm_suspend { + enum vm_suspend_how how; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_attach_vgic { + uint64_t dist_start; + size_t dist_size; + uint64_t redist_start; + size_t redist_size; +}; + +struct vm_irq { + uint32_t irq; +}; + +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 + +enum { + /* general routines */ + IOCNUM_ABIVERS = 0, + IOCNUM_RUN = 1, + IOCNUM_SET_CAPABILITY = 2, + IOCNUM_GET_CAPABILITY = 3, + 
IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, + + /* memory apis */ + IOCNUM_MAP_MEMORY = 10, + IOCNUM_GET_MEMORY_SEG = 11, + IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA = 13, + + /* register/state accessors */ + IOCNUM_SET_REGISTER = 20, + IOCNUM_GET_REGISTER = 21, + + /* statistics */ + IOCNUM_VM_STATS = 50, + IOCNUM_VM_STAT_DESC = 51, + + /* interrupt injection */ + IOCNUM_ASSERT_IRQ = 80, + IOCNUM_DEASSERT_IRQ = 81, + + /* vm_cpuset */ + IOCNUM_ACTIVATE_CPU = 90, + IOCNUM_GET_CPUSET = 91, + + /* vm_attach_vgic */ + IOCNUM_ATTACH_VGIC = 110, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define VM_MAP_MEMORY \ + _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) +#define VM_GET_MEMORY_SEG \ + _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_ASSERT_IRQ \ + _IOW('v', IOCNUM_ASSERT_IRQ, struct vm_irq) +#define VM_DEASSERT_IRQ \ + _IOW('v', IOCNUM_DEASSERT_IRQ, struct vm_irq) +#define VM_GLA2GPA \ + _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_ACTIVATE_CPU \ + _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) +#define VM_GET_CPUS \ + _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_ATTACH_VGIC \ + _IOW('v', IOCNUM_ATTACH_VGIC, struct vm_attach_vgic) +#endif Index: sys/arm64/include/vmm_instruction_emul.h =================================================================== --- /dev/null +++ 
sys/arm64/include/vmm_instruction_emul.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Callback functions to read and write registers. 
+ */ +typedef int (*reg_read_t)(void *vm, int cpuid, uint64_t *rval, void *arg); +typedef int (*reg_write_t)(void *vm, int cpuid, uint64_t wval, void *arg); + +/* + * Emulate the decoded 'vie' instruction when it contains a memory operation. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t mrr, mem_region_write_t mrw, void *mrarg); + +/* + * Emulate the decoded 'vre' instruction when it contains a register access. + * + * The callbacks 'regread' and 'regwrite' emulate reads and writes to the + * register from 'vre'. 'regarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_register(void *vm, int vcpuid, struct vre *vre, reg_read_t regread, + reg_write_t regwrite, void *regarg); + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ Index: sys/arm64/vmm/arm64.h =================================================================== --- /dev/null +++ sys/arm64/vmm/arm64.h @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _VMM_ARM64_H_ +#define _VMM_ARM64_H_ + +#include +#include +#include +#include + +#include "mmu.h" +#include "io/vgic_v3.h" +#include "io/vtimer.h" + +struct hypctx { + struct reg regs; + + /* EL1 control registers */ + uint64_t actlr_el1; /* Auxiliary Control Register */ + uint64_t afsr0_el1; /* Auxiliary Fault Status Register 0 */ + uint64_t afsr1_el1; /* Auxiliary Fault Status Register 1 */ + uint64_t amair_el1; /* Auxiliary Memory Attribute Indirection Register */ + uint64_t contextidr_el1; /* Current Process Identifier */ + uint64_t cpacr_el1; /* Architectural Feature Access Control Register */ + uint64_t elr_el1; /* Exception Link Register */ + uint64_t esr_el1; /* Exception Syndrome Register */ + uint64_t far_el1; /* Fault Address Register */ + uint64_t fp; /* Frame Pointer */ + uint64_t mair_el1; /* Memory Attribute Indirection Register */ + uint64_t par_el1; /* Physical Address Register */ + uint64_t sctlr_el1; /* System Control Register */ + uint64_t sp_el0; /* Stack 
Pointer */ + uint64_t tcr_el1; /* Translation Control Register */ + uint64_t tpidr_el0; /* EL0 Software ID Register */ + uint64_t tpidrro_el0; /* Read-only Thread ID Register */ + uint64_t tpidr_el1; /* EL1 Software ID Register */ + uint64_t ttbr0_el1; /* Translation Table Base Register 0 */ + uint64_t ttbr1_el1; /* Translation Table Base Register 1 */ + uint64_t vbar_el1; /* Vector Base Address Register */ + uint32_t spsr_el1; /* Saved Program Status Register */ + + /* EL2 control registers */ + uint64_t cptr_el2; /* Architectural Feature Trap Register */ + uint64_t elr_el2; /* Exception Link Register */ + uint64_t hcr_el2; /* Hypervisor Configuration Register */ + uint64_t vpidr_el2; /* Virtualization Processor ID Register */ + uint64_t vmpidr_el2; /* Virtualization Multiprocessor ID Register */ + uint32_t spsr_el2; /* Saved Program Status Register */ + + uint32_t vcpu; + struct hyp *hyp; + struct { + uint64_t esr_el2; /* Exception Syndrome Register */ + uint64_t far_el2; /* Fault Address Register */ + uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ + } exit_info; + + struct vtimer_cpu vtimer_cpu; + struct vgic_v3_cpu_if vgic_cpu_if; + struct vgic_v3_redist vgic_redist; +#ifdef VFP + struct vfpstate vfpstate; +#endif +}; + +struct hyp { + pmap_t stage2_map; + struct hypctx ctx[VM_MAXCPU]; + struct vgic_mmio_region *vgic_mmio_regions; + size_t vgic_mmio_regions_num; + struct vgic_v3_dist vgic_dist; + struct vm *vm; + struct vtimer vtimer; + uint64_t vmid_generation; + uint64_t vttbr_el2; + bool vgic_attached; +}; + +uint64_t vmm_call_hyp(void *hyp_func_addr, ...); +void vmm_cleanup(void *hyp_stub_vectors); +uint64_t vmm_enter_guest(struct hypctx *hypctx); +uint64_t vmm_read_ich_vtr_el2(void); +uint64_t vmm_read_cnthctl_el2(void); +uint64_t vmm_read_tcr_el2(void); + +#define eprintf(fmt, ...) printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__) +//#define eprintf(fmt, ...) 
do {} while(0) + +#define VMID_GENERATION_MASK ((1UL<<8) - 1) +#define build_vttbr(vmid, ptaddr) \ + ((((vmid) & VMID_GENERATION_MASK) << VTTBR_VMID_SHIFT) | \ + (uint64_t)(ptaddr)) + +#define MPIDR_SMP_MASK (0x3 << 30) +#define MPIDR_AFF1_LEVEL(x) (((x) >> 2) << 8) +#define MPIDR_AFF0_LEVEL(x) (((x) & 0x3) << 0) + +/* + * Return true if the exception was caused by a translation fault in the stage 2 + * translation regime. The DFSC encoding for a translation fault has the format + * 0b0001LL, where LL (bits [1:0]) represents the level where the fault occured + * (page D7-2280 of the ARMv8 Architecture Manual). + */ +#define ISS_DATA_DFSC_TF(esr_iss) \ + (!((esr_iss) & 0b111000) && ((esr_iss) & 0b000100)) +#define FAR_EL2_PAGE_OFFSET(x) ((x) & PAGE_MASK) + +#define DEBUG_ME 0 + +#define arm64_get_active_vcpu() ((struct hypctx *)PCPU_GET(vcpu)) + +#endif /* !_VMM_ARM64_H_ */ Index: sys/arm64/vmm/arm64.c =================================================================== --- /dev/null +++ sys/arm64/vmm/arm64.c @@ -0,0 +1,764 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mmu.h" +#include "arm64.h" +#include "hyp.h" +#include "reset.h" +#include "io/vgic_v3.h" +#include "io/vtimer.h" + +#define HANDLED 1 +#define UNHANDLED 0 + +#define UNUSED 0 + +MALLOC_DEFINE(M_HYP, "ARM VMM HYP", "ARM VMM HYP"); + +extern char hyp_init_vectors[]; +extern char hyp_vectors[]; +extern char hyp_code_start[]; +extern char hyp_code_end[]; +extern char hyp_stub_vectors[]; + +char *stack; +pmap_t hyp_pmap; + +static uint64_t vmid_generation = 0; +static struct mtx vmid_generation_mtx; + +static inline void +arm64_set_active_vcpu(struct hypctx *hypctx) +{ + PCPU_SET(vcpu, hypctx); +} + +static void arm64_set_vttbr(struct hyp *hyp) +{ + if (hyp->vmid_generation != 0 && + ((hyp->vmid_generation & ~VMID_GENERATION_MASK) != + (atomic_load_acq_64(&vmid_generation) & ~VMID_GENERATION_MASK))) + goto out; + + mtx_lock(&vmid_generation_mtx); + + /* Another VCPU has change the VMID already */ + if (hyp->vmid_generation && + ((hyp->vmid_generation & ~VMID_GENERATION_MASK) != + (vmid_generation & ~VMID_GENERATION_MASK))) { + mtx_unlock(&vmid_generation_mtx); + goto out; + } + + vmid_generation++; + if (!(vmid_generation & VMID_GENERATION_MASK)) + vmid_generation++; + + hyp->vmid_generation 
= vmid_generation; + mtx_unlock(&vmid_generation_mtx); +out: + hyp->vttbr_el2 = build_vttbr(hyp->vmid_generation, + vtophys(hyp->stage2_map->pm_l0)); +} + +static int +arm_init(int ipinum) +{ + char *stack_top; + size_t hyp_code_len; + uint64_t ich_vtr_el2; + uint64_t cnthctl_el2; + uint64_t tcr_el1, tcr_el2; + uint64_t id_aa64mmfr0_el1; + uint64_t pa_range_bits; + uint32_t sctlr_el2; + uint32_t vtcr_el2; + register_t daif; + + if (!virt_enabled()) { + printf("arm_init: Processor doesn't have support for virtualization.\n"); + return (ENXIO); + } + + mtx_init(&vmid_generation_mtx, "vmid_generation_mtx", NULL, MTX_DEF); + + daif = intr_disable(); + arm64_set_active_vcpu(NULL); + /* + * Install the temporary vectors which will be responsible for + * initializing the VMM when we next trap into EL2. + * + * x0: the exception vector table responsible for hypervisor + * initialization on the next call. + */ + vmm_call_hyp((void *)vtophys(hyp_init_vectors)); + + /* Create the mappings for the hypervisor translation table. 
*/ + hyp_pmap = malloc(sizeof(*hyp_pmap), M_HYP, M_WAITOK | M_ZERO); + hypmap_init(hyp_pmap, PM_STAGE1); + hyp_code_len = (size_t)hyp_code_end - (size_t)hyp_code_start; + hypmap_map(hyp_pmap, (vm_offset_t)hyp_code_start, hyp_code_len, VM_PROT_EXECUTE); + + /* We need an identity mapping for when we activate the MMU */ + hypmap_map_identity(hyp_pmap, (vm_offset_t)hyp_code_start, hyp_code_len, + VM_PROT_EXECUTE); + + /* Create and map the hypervisor stack */ + stack = malloc(PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO); + stack_top = stack + PAGE_SIZE; + hypmap_map(hyp_pmap, (vm_offset_t)stack, PAGE_SIZE, VM_PROT_READ | VM_PROT_WRITE); + + /* Configure address translation at EL2 */ + tcr_el1 = READ_SPECIALREG(tcr_el1); + tcr_el2 = TCR_EL2_RES1; + + /* Set physical address size */ + id_aa64mmfr0_el1 = READ_SPECIALREG(id_aa64mmfr0_el1); + pa_range_bits = ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1); + tcr_el2 |= (pa_range_bits & 0x7) << TCR_EL2_PS_SHIFT; + + /* Use the same address translation attributes as the host */ + tcr_el2 |= tcr_el1 & TCR_T0SZ_MASK; + tcr_el2 |= tcr_el1 & (0xff << TCR_IRGN0_SHIFT); + + /* + * Configure the system control register for EL2: + * + * SCTLR_EL2_M: MMU on + * SCTLR_EL2_C: Data cacheability not affected + * SCTLR_EL2_I: Instruction cacheability not affected + * SCTLR_EL2_A: Instruction alignment check + * SCTLR_EL2_SA: Stack pointer alignment check + * SCTLR_EL2_WXN: Treat writable memory as execute never + * ~SCTLR_EL2_EE: Data accesses are little-endian + */ + sctlr_el2 = SCTLR_EL2_RES1; + sctlr_el2 |= SCTLR_EL2_M | SCTLR_EL2_C | SCTLR_EL2_I; + sctlr_el2 |= SCTLR_EL2_A | SCTLR_EL2_SA; + sctlr_el2 |= SCTLR_EL2_WXN; + sctlr_el2 &= ~SCTLR_EL2_EE; + + /* + * Configure the Stage 2 translation control register: + * + * VTCR_IRGN0_WBWA: Translation table walks access inner cacheable + * normal memory + * VTCR_ORGN0_WBWA: Translation table walks access outer cacheable + * normal memory + * VTCR_EL2_TG0_4K: Stage 2 uses 4K pages + * 
VTCR_EL2_SL0_4K_LVL1: Stage 2 uses concatenated level 1 tables + * VTCR_EL2_SH0_IS: Memory associated with Stage 2 walks is inner + * shareable + */ + vtcr_el2 = VTCR_EL2_RES1; + vtcr_el2 |= (pa_range_bits & 0x7) << VTCR_EL2_PS_SHIFT; /* OR in the PS field so the RES1 bits set above are kept */ + vtcr_el2 |= VTCR_EL2_IRGN0_WBWA | VTCR_EL2_ORGN0_WBWA; + vtcr_el2 |= VTCR_EL2_TG0_4K; + vtcr_el2 |= VTCR_EL2_SH0_IS; + if (pa_range_bits == ID_AA64MMFR0_PARange_1T) { + /* + * 40 bits of physical addresses, use concatenated level 1 + * tables + */ + vtcr_el2 |= 24 & VTCR_EL2_T0SZ_MASK; + vtcr_el2 |= VTCR_EL2_SL0_4K_LVL1; + } + + /* Special call to initialize EL2 */ + vmm_call_hyp((void *)vtophys(hyp_vectors), vtophys(hyp_pmap->pm_l0), + ktohyp(stack_top), tcr_el2, sctlr_el2, vtcr_el2); + + ich_vtr_el2 = vmm_call_hyp((void *)ktohyp(vmm_read_ich_vtr_el2)); + vgic_v3_init(ich_vtr_el2); + + cnthctl_el2 = vmm_call_hyp((void *)ktohyp(vmm_read_cnthctl_el2)); + vtimer_init(cnthctl_el2); + + intr_restore(daif); + + return 0; +} + +static int +arm_cleanup(void) +{ + register_t daif; + + /* + * vmm_cleanup() will disable the MMU. For the next few instructions, + * before the hardware disables the MMU, one of the following is + * possible: + * + * a. The instruction addresses are fetched with the MMU disabled, + * and they must represent the actual physical addresses. This will work + * because we call the vmm_cleanup() function by its physical address. + * + * b. The instruction addresses are fetched using the old translation + * tables. This will work because we have an identity mapping in place + * in the translation tables and vmm_cleanup() is called by its physical + * address. 
+ */ + daif = intr_disable(); + vmm_call_hyp((void *)vtophys(vmm_cleanup), vtophys(hyp_stub_vectors)); + intr_restore(daif); + + arm64_set_active_vcpu(NULL); + + vtimer_cleanup(); + + hypmap_cleanup(hyp_pmap); + free(hyp_pmap, M_HYP); + free(stack, M_HYP); + + mtx_destroy(&vmid_generation_mtx); + + return (0); +} + +static void * +arm_vminit(struct vm *vm) +{ + struct hyp *hyp; + struct hypctx *hypctx; + bool last_vcpu; + int i; + + hyp = malloc(sizeof(struct hyp), M_HYP, M_WAITOK | M_ZERO); + hyp->vm = vm; + hyp->vgic_attached = false; + + hyp->stage2_map = malloc(sizeof(*hyp->stage2_map), + M_HYP, M_WAITOK | M_ZERO); + hypmap_init(hyp->stage2_map, PM_STAGE2); + arm64_set_vttbr(hyp); + + for (i = 0; i < VM_MAXCPU; i++) { + hypctx = &hyp->ctx[i]; + hypctx->vcpu = i; + hypctx->hyp = hyp; + + reset_vm_el01_regs(hypctx); + reset_vm_el2_regs(hypctx); + } + + vtimer_vminit(hyp); + vgic_v3_vminit(hyp); + for (i = 0; i < VM_MAXCPU; i++) { + hypctx = &hyp->ctx[i]; + vtimer_cpuinit(hypctx); + last_vcpu = (i == VM_MAXCPU - 1); + vgic_v3_cpuinit(hypctx, last_vcpu); + } + + hypmap_map(hyp_pmap, (vm_offset_t)hyp, sizeof(struct hyp), + VM_PROT_READ | VM_PROT_WRITE); + + return (hyp); +} + +static enum vm_reg_name +get_vm_reg_name(uint32_t reg_nr, uint32_t mode __attribute__((unused))) +{ + switch(reg_nr) { + case 0: + return VM_REG_GUEST_X0; + case 1: + return VM_REG_GUEST_X1; + case 2: + return VM_REG_GUEST_X2; + case 3: + return VM_REG_GUEST_X3; + case 4: + return VM_REG_GUEST_X4; + case 5: + return VM_REG_GUEST_X5; + case 6: + return VM_REG_GUEST_X6; + case 7: + return VM_REG_GUEST_X7; + case 8: + return VM_REG_GUEST_X8; + case 9: + return VM_REG_GUEST_X9; + case 10: + return VM_REG_GUEST_X10; + case 11: + return VM_REG_GUEST_X11; + case 12: + return VM_REG_GUEST_X12; + case 13: + return VM_REG_GUEST_X13; + case 14: + return VM_REG_GUEST_X14; + case 15: + return VM_REG_GUEST_X15; + case 16: + return VM_REG_GUEST_X16; + case 17: + return VM_REG_GUEST_X17; + case 18: + return 
VM_REG_GUEST_X18; + case 19: + return VM_REG_GUEST_X19; + case 20: + return VM_REG_GUEST_X20; + case 21: + return VM_REG_GUEST_X21; + case 22: + return VM_REG_GUEST_X22; + case 23: + return VM_REG_GUEST_X23; + case 24: + return VM_REG_GUEST_X24; + case 25: + return VM_REG_GUEST_X25; + case 26: + return VM_REG_GUEST_X26; + case 27: + return VM_REG_GUEST_X27; + case 28: + return VM_REG_GUEST_X28; + case 29: + return VM_REG_GUEST_X29; + case 30: + return VM_REG_GUEST_LR; + case 31: + return VM_REG_GUEST_SP; + case 32: + return VM_REG_GUEST_ELR; + case 33: + return VM_REG_GUEST_SPSR; + case 34: + return VM_REG_ELR_EL2; + default: + break; + } + + return (VM_REG_LAST); +} + +static inline void +arm64_print_hyp_regs(struct vm_exit *vme) +{ + printf("esr_el2: 0x%08x\n", vme->u.hyp.esr_el2); + printf("far_el2: 0x%016lx\n", vme->u.hyp.far_el2); + printf("hpfar_el2: 0x%016lx\n", vme->u.hyp.hpfar_el2); +} + +static void +arm64_gen_inst_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret) +{ + struct vie *vie; + uint32_t esr_sas, reg_num; + uint64_t page_off; + + /* + * Get bits [47:12] of the IPA from HPFAR_EL2. + * At this point the 'u.hyp' member will be replaced by 'u.inst_emul'. + */ + vme_ret->u.inst_emul.gpa = \ + (vme_ret->u.hyp.hpfar_el2) >> HPFAR_EL2_FIPA_SHIFT; + /* The IPA is the base address of a 4KB page, make bits [11:0] zero. */ + vme_ret->u.inst_emul.gpa = (vme_ret->u.inst_emul.gpa) << PAGE_SHIFT; + /* Bits [11:0] are the same as bits [11:0] from the virtual address. */ + page_off = FAR_EL2_PAGE_OFFSET(vme_ret->u.hyp.far_el2); + vme_ret->u.inst_emul.gpa = vme_ret->u.inst_emul.gpa + page_off; + + esr_sas = (esr_iss & ISS_DATA_SAS_MASK) >> ISS_DATA_SAS_SHIFT; + reg_num = (esr_iss & ISS_DATA_SRT_MASK) >> ISS_DATA_SRT_SHIFT; + + vie = &vme_ret->u.inst_emul.vie; + vie->access_size = 1 << esr_sas; + vie->sign_extend = (esr_iss & ISS_DATA_SSE) ? 1 : 0; + vie->dir = (esr_iss & ISS_DATA_WnR) ? 
VM_DIR_WRITE : VM_DIR_READ; + vie->reg = get_vm_reg_name(reg_num, UNUSED); +} + +static void +arm64_gen_reg_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret) +{ + uint32_t reg_num; + struct vre *vre; + + /* u.hyp member will be replaced by u.reg_emul */ + vre = &vme_ret->u.reg_emul.vre; + + vre->inst_syndrome = esr_iss; + /* ARMv8 Architecture Manual, p. D7-2273: 1 means read */ + vre->dir = (esr_iss & ISS_MSR_DIR) ? VM_DIR_READ : VM_DIR_WRITE; + reg_num = ISS_MSR_Rt(esr_iss); + vre->reg = get_vm_reg_name(reg_num, UNUSED); +} + +//static bool print_stuff = false; + +static int +handle_el1_sync_excp(struct hyp *hyp, int vcpu, struct vm_exit *vme_ret) +{ + uint32_t esr_ec, esr_iss; + + esr_ec = ESR_ELx_EXCEPTION(vme_ret->u.hyp.esr_el2); + esr_iss = vme_ret->u.hyp.esr_el2 & ESR_ELx_ISS_MASK; + + switch(esr_ec) { + case EXCP_UNKNOWN: + eprintf("Unknown exception from guest\n"); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + case EXCP_HVC: + vme_ret->exitcode = VM_EXITCODE_HVC; + break; + case EXCP_MSR: + arm64_gen_reg_emul_data(esr_iss, vme_ret); + vme_ret->exitcode = VM_EXITCODE_REG_EMUL; + break; + + case EXCP_DATA_ABORT_L: + /* Check if instruction syndrome is valid */ + if (!(esr_iss & ISS_DATA_ISV)) { + eprintf("Data abort with invalid instruction syndrome\n"); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + /* + * Check if the data abort was caused by a translation fault. + * Any other type of data fault will be treated as an error. 
+ */ + if (!(ISS_DATA_DFSC_TF(esr_iss))) { + eprintf("Data abort not on a stage 2 translation\n"); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + arm64_gen_inst_emul_data(esr_iss, vme_ret); + vme_ret->exitcode = VM_EXITCODE_INST_EMUL; + break; + + default: + eprintf("Unsupported synchronous exception from guest: 0x%x\n", + esr_ec); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + /* We don't don't do any instruction emulation here */ + return (UNHANDLED); +} + +static int +arm64_handle_world_switch(struct hyp *hyp, int vcpu, struct vm_exit *vme) +{ + int excp_type; + int handled; + + excp_type = vme->u.hyp.exception_nr; + switch (excp_type) { + case EXCP_TYPE_EL1_SYNC: + /* The exit code will be set by handle_el1_sync_excp(). */ + handled = handle_el1_sync_excp(hyp, vcpu, vme); + break; + + case EXCP_TYPE_EL1_IRQ: + case EXCP_TYPE_EL1_FIQ: + /* The host kernel will handle IRQs and FIQs. */ + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + + case EXCP_TYPE_EL1_ERROR: + case EXCP_TYPE_EL2_SYNC: + case EXCP_TYPE_EL2_IRQ: + case EXCP_TYPE_EL2_FIQ: + case EXCP_TYPE_EL2_ERROR: + eprintf("Unhandled exception type: %s\n", __STRING(excp_type)); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + + default: + eprintf("Unknown exception type: %d\n", excp_type); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + } + + return (handled); +} + +static int +arm_vmrun(void *arg, int vcpu, register_t pc, pmap_t pmap, + void *rendezvous_cookie, void *suspend_cookie) +{ + uint64_t excp_type; + int handled; + register_t daif; + struct hyp *hyp; + struct hypctx *hypctx; + struct vm *vm; + struct vm_exit *vme; + + hyp = (struct hyp *)arg; + vm = hyp->vm; + vme = vm_exitinfo(vm, vcpu); + + hypctx = &hyp->ctx[vcpu]; + hypctx->elr_el2 = (uint64_t)pc; + + for (;;) { + daif = intr_disable(); + /* + * TODO: What happens if a timer interrupt is asserted 
exactly + * here, but for the previous VM? + */ + arm64_set_active_vcpu(hypctx); + vgic_v3_sync_hwstate(hypctx); + excp_type = vmm_call_hyp((void *)ktohyp(vmm_enter_guest), + ktohyp(hypctx)); + intr_restore(daif); + + if (excp_type == EXCP_TYPE_MAINT_IRQ) + continue; + + vme->pc = hypctx->elr_el2; + vme->inst_length = INSN_SIZE; + vme->u.hyp.exception_nr = excp_type; + vme->u.hyp.esr_el2 = hypctx->exit_info.esr_el2; + vme->u.hyp.far_el2 = hypctx->exit_info.far_el2; + vme->u.hyp.hpfar_el2 = hypctx->exit_info.hpfar_el2; + + handled = arm64_handle_world_switch(hyp, vcpu, vme); + if (handled == UNHANDLED) + /* Exit loop to emulate instruction. */ + break; + else + /* Resume guest execution from the next instruction. */ + hypctx->elr_el2 += vme->inst_length; + } + + return (0); +} + +static void +arm_vmcleanup(void *arg) +{ + struct hyp *hyp = arg; + struct hypctx *hypctx; + + hypctx = &hyp->ctx[0]; + if (arm64_get_active_vcpu() == hypctx) + arm64_set_active_vcpu(NULL); + + vtimer_vmcleanup(arg); + vgic_v3_detach_from_vm(arg); + + /* Unmap the VM hyp struct from the hyp mode translation table */ + hypmap_map(hyp_pmap, (vm_offset_t)hyp, sizeof(struct hyp), + VM_PROT_NONE); + hypmap_cleanup(hyp->stage2_map); + free(hyp->stage2_map, M_HYP); + free(hyp, M_HYP); +} + +/* + * Return register value. Registers have different sizes and an explicit cast + * must be made to ensure proper conversion. 
+ */ +static void * +hypctx_regptr(struct hypctx *hypctx, int reg) +{ + switch (reg) { + case VM_REG_GUEST_X0: + return (&hypctx->regs.x[0]); + case VM_REG_GUEST_X1: + return (&hypctx->regs.x[1]); + case VM_REG_GUEST_X2: + return (&hypctx->regs.x[2]); + case VM_REG_GUEST_X3: + return (&hypctx->regs.x[3]); + case VM_REG_GUEST_X4: + return (&hypctx->regs.x[4]); + case VM_REG_GUEST_X5: + return (&hypctx->regs.x[5]); + case VM_REG_GUEST_X6: + return (&hypctx->regs.x[6]); + case VM_REG_GUEST_X7: + return (&hypctx->regs.x[7]); + case VM_REG_GUEST_X8: + return (&hypctx->regs.x[8]); + case VM_REG_GUEST_X9: + return (&hypctx->regs.x[9]); + case VM_REG_GUEST_X10: + return (&hypctx->regs.x[10]); + case VM_REG_GUEST_X11: + return (&hypctx->regs.x[11]); + case VM_REG_GUEST_X12: + return (&hypctx->regs.x[12]); + case VM_REG_GUEST_X13: + return (&hypctx->regs.x[13]); + case VM_REG_GUEST_X14: + return (&hypctx->regs.x[14]); + case VM_REG_GUEST_X15: + return (&hypctx->regs.x[15]); + case VM_REG_GUEST_X16: + return (&hypctx->regs.x[16]); + case VM_REG_GUEST_X17: + return (&hypctx->regs.x[17]); + case VM_REG_GUEST_X18: + return (&hypctx->regs.x[18]); + case VM_REG_GUEST_X19: + return (&hypctx->regs.x[19]); + case VM_REG_GUEST_X20: + return (&hypctx->regs.x[20]); + case VM_REG_GUEST_X21: + return (&hypctx->regs.x[21]); + case VM_REG_GUEST_X22: + return (&hypctx->regs.x[22]); + case VM_REG_GUEST_X23: + return (&hypctx->regs.x[23]); + case VM_REG_GUEST_X24: + return (&hypctx->regs.x[24]); + case VM_REG_GUEST_X25: + return (&hypctx->regs.x[25]); + case VM_REG_GUEST_X26: + return (&hypctx->regs.x[26]); + case VM_REG_GUEST_X27: + return (&hypctx->regs.x[27]); + case VM_REG_GUEST_X28: + return (&hypctx->regs.x[28]); + case VM_REG_GUEST_X29: + return (&hypctx->regs.x[29]); + case VM_REG_GUEST_LR: + return (&hypctx->regs.lr); + case VM_REG_GUEST_SP: + return (&hypctx->regs.sp); + case VM_REG_GUEST_ELR: + return (&hypctx->regs.elr); + case VM_REG_GUEST_SPSR: + return (&hypctx->regs.spsr); + 
case VM_REG_ELR_EL2: + return (&hypctx->elr_el2); + default: + break; + } + return (NULL); +} + +static int +arm_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + void *regp; + int running, hostcpu; + struct hyp *hyp = arg; + + running = vcpu_is_running(hyp->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("arm_getreg: %s%d is running", vm_name(hyp->vm), vcpu); + + if ((regp = hypctx_regptr(&hyp->ctx[vcpu], reg)) != NULL) { + if (reg == VM_REG_GUEST_SPSR) + *retval = *(uint32_t *)regp; + else + *retval = *(uint64_t *)regp; + return (0); + } else { + return (EINVAL); + } +} + +static int +arm_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + void *regp; + struct hyp *hyp = arg; + int running, hostcpu; + + running = vcpu_is_running(hyp->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("hyp_setreg: %s%d is running", vm_name(hyp->vm), vcpu); + + if ((regp = hypctx_regptr(&hyp->ctx[vcpu], reg)) != NULL) { + if (reg == VM_REG_GUEST_SPSR) + *(uint32_t *)regp = (uint32_t)val; + else + *(uint64_t *)regp = val; + return (0); + } else { + return (EINVAL); + } +} + +static +void arm_restore(void) +{ + ; +} + +struct vmm_ops vmm_ops_arm = { + arm_init, + arm_cleanup, + arm_restore, + arm_vminit, + arm_vmrun, + arm_vmcleanup, + hypmap_set, + hypmap_get, + arm_getreg, + arm_setreg, + NULL, /* vmi_get_cap_t */ + NULL /* vmi_set_cap_t */ +}; Index: sys/arm64/vmm/hyp.h =================================================================== --- /dev/null +++ sys/arm64/vmm/hyp.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2017 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_HYP_H_ +#define _VMM_HYP_H_ + +/* + * The translation tables for the hypervisor mode will hold mappings for kernel + * virtual addresses and an identity mapping (VA == PA) necessary when + * enabling/disabling the MMU. + * + * When in EL2 exception level the translation table base register is TTBR0_EL2 + * and the virtual addresses generated by the CPU must be at the bottom of the + * memory, with the first 16 bits all set to zero: + * + * 0x0000ffffffffffff End hyp address space + * 0x0000000000000000 Start of hyp address space + * + * To run code in hyp mode we need to convert kernel virtual addresses to + * addresses that fit into this address space. + * + * The kernel virtual address range is: + * + * 0xffff007fffffffff End of KVA + * 0xffff000000000000 Kernel base address & start of KVA + * + * (see /sys/arm64/include/vmparam.h). 
+ * + * We could convert the kernel virtual addresses to valid EL2 addresses by + * setting the first 16 bits to zero and thus mapping the kernel addresses in + * the bottom half of the EL2 address space, but then they might clash with the + * identity mapping addresses. Instead we map the kernel addresses in the upper + * half of the EL2 address space. + * + * The hypervisor address space will look like this: + * + * 0x0000807fffffffff End of KVA mapping + * 0x0000800000000000 Start of KVA mapping + * + * 0x00007fffffffffff End of identity mapping + * 0x0000000000000000 Start of identity mapping + * + * With this scheme we have 47 bits at our disposal for the identity map and + * another 47 bits for the kernel virtual addresses. For a maximum physical + * memory size of 128TB we are guaranteed to not have any clashes between + * addresses. + */ +#define HYP_VM_MIN_ADDRESS 0x0000000000000000 +#define HYP_VM_MAX_ADDRESS 0x0000ffffffffffff + +#define HYP_KVA_OFFSET 0x0000800000000000 +#define HYP_KVA_MASK 0x0000ffffffffffff + +/* + * When taking asynchronous exceptions, or interrupts, with the exception of the + * SError interrupt, the exception syndrome register is not updated with the + * exception code. We need to differentiate between the different exception + * types taken to EL2. + */ +#define EXCP_TYPE_EL1_SYNC 0 +#define EXCP_TYPE_EL1_IRQ 1 +#define EXCP_TYPE_EL1_FIQ 2 +#define EXCP_TYPE_EL1_ERROR 3 + +#define EXCP_TYPE_EL2_SYNC 4 +#define EXCP_TYPE_EL2_IRQ 5 +#define EXCP_TYPE_EL2_FIQ 6 +#define EXCP_TYPE_EL2_ERROR 7 + +#define EXCP_TYPE_MAINT_IRQ 8 + +#define HYP_GET_VECTOR_TABLE -1 + +#endif /* !_VMM_HYP_H_ */ Index: sys/arm64/vmm/hyp.S =================================================================== --- /dev/null +++ sys/arm64/vmm/hyp.S @@ -0,0 +1,384 @@ +/* + * Copyright (C) 2017 Alexandru Elisei + * All rights reserved. + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + + +#include +#include +#include +#include +#include + +#include "hyp_macros.h" +#include "hyp.h" +#include "hyp_assym.h" + + .text + + .globl hyp_code_start + .globl hyp_code_end + + .align 12 +hyp_code_start: + + +ENTRY(vmm_call_hyp) + hvc #0 + ret +END(vmm_call_hyp) + + +.macro vempty + .align 7 + 1: b 1b +.endm + +.macro vector name + .align 7 + b handle_\name +.endm + + .align 11 + .globl hyp_init_vectors +hyp_init_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* Error EL2t */ + + vempty /* Synchronous EL2h */ + vempty /* IRQ EL2h */ + vempty /* FIQ EL2h */ + vempty /* Error EL2h */ + + vector hyp_init /* Synchronous 64-bit EL1 */ + vempty /* IRQ 64-bit EL1 */ + vempty /* FIQ 64-bit EL1 */ + vempty /* Error 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* Error 32-bit EL1 */ + + +/* + * Initialize the hypervisor mode with a new exception vector table, translation + * table and stack. 
/*
 * Initialize the hypervisor mode with a new exception vector table, translation
 * table and stack.
 *
 * Expecting:
 * x0 - the hypervisor exception vectors
 * x1 - translation tables physical address
 * x2 - stack top virtual address
 * x3 - TCR_EL2 value
 * x4 - SCTLR_EL2 value
 * x5 - VTCR_EL2 value
 *
 * Returns 0 in x0. Clobbers x9.
 */
ENTRY(handle_hyp_init)
	/* Install the new exception vectors */
	msr	vbar_el2, x0
	/* Set the stack top address */
	mov	sp, x2
	/* Use the host VTTBR_EL2 to tell the host and the guests apart */
	mov	x9, #VTTBR_HOST
	msr	vttbr_el2, x9
	/* Load the base address for the translation tables */
	msr	ttbr0_el2, x1
	/* Invalidate the TLB */
	tlbi	alle2
	/*
	 * A TLBI is only guaranteed to have completed after a DSB; wait for it
	 * here so no stale entry can be used once SCTLR_EL2 is written below.
	 */
	dsb	sy
	/* Use the same memory attributes as EL1 */
	mrs	x9, mair_el1
	msr	mair_el2, x9
	/* Configure address translation */
	msr	tcr_el2, x3
	isb
	/* Set the system control register for EL2 */
	msr	sctlr_el2, x4
	/* Set the Stage 2 translation control register */
	msr	vtcr_el2, x5
	/* Return success */
	mov	x0, #0
	/* MMU is up and running */
	eret
END(handle_hyp_init)
/*
 * Handlers for exceptions taken from EL2h. Each one expands handle_el2_excp
 * with its own exception type, which is returned in x0.
 *
 * Every END() names the symbol opened by the matching ENTRY().
 */
ENTRY(handle_el2_el2h_sync)
	handle_el2_excp #EXCP_TYPE_EL2_SYNC
END(handle_el2_el2h_sync)

ENTRY(handle_el2_el2h_irq)
	handle_el2_excp #EXCP_TYPE_EL2_IRQ
END(handle_el2_el2h_irq)

ENTRY(handle_el2_el2h_fiq)
	handle_el2_excp #EXCP_TYPE_EL2_FIQ
END(handle_el2_el2h_fiq)

ENTRY(handle_el2_el2h_error)
	handle_el2_excp #EXCP_TYPE_EL2_ERROR
END(handle_el2_el2h_error)
/*
 * IRQ taken from 64-bit EL1 while a guest was running.
 *
 * Switches back to the host context, then returns in x0 either
 * EXCP_TYPE_MAINT_IRQ (ICH_MISR_EL2 is non-zero) or EXCP_TYPE_EL1_IRQ
 * (ICH_MISR_EL2 is zero). x9 is preserved across the status check.
 */
ENTRY(handle_el2_el1_irq64)
	do_world_switch_to_host
	/* Borrow x9 as scratch; it is restored before returning */
	str	x9, [sp, #-16]!
	mrs	x9, ich_misr_el2
	cmp	x9, xzr
	beq	1f
	mov	x0, #EXCP_TYPE_MAINT_IRQ
	b	2f
1:
	mov	x0, #EXCP_TYPE_EL1_IRQ
2:
	ldr	x9, [sp], #16
	eret
END(handle_el2_el1_irq64)
/*
 * Usage:
 * void vmm_cleanup(void *hyp_stub_vectors)
 *
 * Expecting:
 * x0 - physical address of hyp_stub_vectors
 *
 * Points VBAR_EL2 at the stub vectors, clears the M bit in SCTLR_EL2 and
 * returns with eret. Clobbers x2.
 */
ENTRY(vmm_cleanup)
	/* Restore the stub vectors */
	msr	vbar_el2, x0

	/* Disable the MMU */
	dsb	sy
	mrs	x2, sctlr_el2
	bic	x2, x2, #SCTLR_EL2_M
	msr	sctlr_el2, x2

	eret
END(vmm_cleanup)
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arm64.h" + +ASSYM(HYPCTX_REGS_X0, offsetof(struct hypctx, regs) + 0 * 8); +ASSYM(HYPCTX_REGS_X1, offsetof(struct hypctx, regs) + 1 * 8); +ASSYM(HYPCTX_REGS_X2, offsetof(struct hypctx, regs) + 2 * 8); +ASSYM(HYPCTX_REGS_X3, offsetof(struct hypctx, regs) + 3 * 8); +ASSYM(HYPCTX_REGS_X4, offsetof(struct hypctx, regs) + 4 * 8); +ASSYM(HYPCTX_REGS_X5, offsetof(struct hypctx, regs) + 5 * 8); +ASSYM(HYPCTX_REGS_X6, offsetof(struct hypctx, regs) + 6 * 8); +ASSYM(HYPCTX_REGS_X7, offsetof(struct hypctx, regs) + 7 * 8); +ASSYM(HYPCTX_REGS_X8, offsetof(struct hypctx, regs) + 8 * 8); +ASSYM(HYPCTX_REGS_X9, offsetof(struct hypctx, regs) + 9 * 8); +ASSYM(HYPCTX_REGS_X10, offsetof(struct hypctx, regs) + 10 * 8); +ASSYM(HYPCTX_REGS_X11, offsetof(struct hypctx, regs) + 11 * 8); +ASSYM(HYPCTX_REGS_X12, offsetof(struct hypctx, regs) + 12 * 8); +ASSYM(HYPCTX_REGS_X13, offsetof(struct hypctx, regs) + 13 * 8); +ASSYM(HYPCTX_REGS_X14, offsetof(struct hypctx, regs) + 14 * 8); +ASSYM(HYPCTX_REGS_X15, offsetof(struct hypctx, regs) + 15 * 8); +ASSYM(HYPCTX_REGS_X16, offsetof(struct hypctx, regs) + 16 * 8); +ASSYM(HYPCTX_REGS_X17, offsetof(struct hypctx, regs) + 17 * 8); +ASSYM(HYPCTX_REGS_X18, offsetof(struct hypctx, regs) + 18 * 8); +ASSYM(HYPCTX_REGS_X19, offsetof(struct hypctx, regs) + 19 * 8); +ASSYM(HYPCTX_REGS_X20, offsetof(struct hypctx, regs) + 20 * 8); 
+ASSYM(HYPCTX_REGS_X21, offsetof(struct hypctx, regs) + 21 * 8); +ASSYM(HYPCTX_REGS_X22, offsetof(struct hypctx, regs) + 22 * 8); +ASSYM(HYPCTX_REGS_X23, offsetof(struct hypctx, regs) + 23 * 8); +ASSYM(HYPCTX_REGS_X24, offsetof(struct hypctx, regs) + 24 * 8); +ASSYM(HYPCTX_REGS_X25, offsetof(struct hypctx, regs) + 25 * 8); +ASSYM(HYPCTX_REGS_X26, offsetof(struct hypctx, regs) + 26 * 8); +ASSYM(HYPCTX_REGS_X27, offsetof(struct hypctx, regs) + 27 * 8); +ASSYM(HYPCTX_REGS_X28, offsetof(struct hypctx, regs) + 28 * 8); +ASSYM(HYPCTX_REGS_X29, offsetof(struct hypctx, regs) + 29 * 8); +ASSYM(HYPCTX_REGS_LR, offsetof(struct hypctx, regs.lr)); +ASSYM(HYPCTX_REGS_SP, offsetof(struct hypctx, regs.sp)); +ASSYM(HYPCTX_REGS_ELR, offsetof(struct hypctx, regs.elr)); +ASSYM(HYPCTX_REGS_SPSR, offsetof(struct hypctx, regs.spsr)); + +ASSYM(HYPCTX_ACTLR_EL1, offsetof(struct hypctx, actlr_el1)); +ASSYM(HYPCTX_AMAIR_EL1, offsetof(struct hypctx, amair_el1)); +ASSYM(HYPCTX_ELR_EL1, offsetof(struct hypctx, elr_el1)); +ASSYM(HYPCTX_FAR_EL1, offsetof(struct hypctx, far_el1)); +ASSYM(HYPCTX_FP, offsetof(struct hypctx, fp)); +ASSYM(HYPCTX_MAIR_EL1, offsetof(struct hypctx, mair_el1)); +ASSYM(HYPCTX_PAR_EL1, offsetof(struct hypctx, par_el1)); +ASSYM(HYPCTX_SP_EL0, offsetof(struct hypctx, sp_el0)); +ASSYM(HYPCTX_TCR_EL1, offsetof(struct hypctx, tcr_el1)); +ASSYM(HYPCTX_TPIDR_EL0, offsetof(struct hypctx, tpidr_el0)); +ASSYM(HYPCTX_TPIDRRO_EL0, offsetof(struct hypctx, tpidrro_el0)); +ASSYM(HYPCTX_TPIDR_EL1, offsetof(struct hypctx, tpidr_el1)); +ASSYM(HYPCTX_TTBR0_EL1, offsetof(struct hypctx, ttbr0_el1)); +ASSYM(HYPCTX_TTBR1_EL1, offsetof(struct hypctx, ttbr1_el1)); +ASSYM(HYPCTX_VBAR_EL1, offsetof(struct hypctx, vbar_el1)); +ASSYM(HYPCTX_AFSR0_EL1, offsetof(struct hypctx, afsr0_el1)); +ASSYM(HYPCTX_AFSR1_EL1, offsetof(struct hypctx, afsr1_el1)); +ASSYM(HYPCTX_CONTEXTIDR_EL1, offsetof(struct hypctx, contextidr_el1)); +ASSYM(HYPCTX_CPACR_EL1, offsetof(struct hypctx, cpacr_el1)); +ASSYM(HYPCTX_ESR_EL1, 
offsetof(struct hypctx, esr_el1)); +ASSYM(HYPCTX_SCTLR_EL1, offsetof(struct hypctx, sctlr_el1)); +ASSYM(HYPCTX_SPSR_EL1, offsetof(struct hypctx, spsr_el1)); + +ASSYM(HYPCTX_ELR_EL2, offsetof(struct hypctx, elr_el2)); +ASSYM(HYPCTX_HCR_EL2, offsetof(struct hypctx, hcr_el2)); +ASSYM(HYPCTX_VPIDR_EL2, offsetof(struct hypctx, vpidr_el2)); +ASSYM(HYPCTX_VMPIDR_EL2, offsetof(struct hypctx, vmpidr_el2)); +ASSYM(HYPCTX_CPTR_EL2, offsetof(struct hypctx, cptr_el2)); +ASSYM(HYPCTX_SPSR_EL2, offsetof(struct hypctx, spsr_el2)); + +ASSYM(HYPCTX_HYP, offsetof(struct hypctx, hyp)); + +ASSYM(HYP_VTTBR_EL2, offsetof(struct hyp, vttbr_el2)); +ASSYM(HYP_VTIMER_CNTHCTL_EL2, offsetof(struct hyp, vtimer.cnthctl_el2)); +ASSYM(HYP_VTIMER_CNTVOFF_EL2, offsetof(struct hyp, vtimer.cntvoff_el2)); + +ASSYM(HYPCTX_EXIT_INFO_ESR_EL2, offsetof(struct hypctx, exit_info.esr_el2)); +ASSYM(HYPCTX_EXIT_INFO_FAR_EL2, offsetof(struct hypctx, exit_info.far_el2)); +ASSYM(HYPCTX_EXIT_INFO_HPFAR_EL2, offsetof(struct hypctx, exit_info.hpfar_el2)); + +ASSYM(HYPCTX_VGIC_ICH_LR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_lr_el2)); +ASSYM(HYPCTX_VGIC_ICH_LR_NUM, offsetof(struct hypctx, vgic_cpu_if.ich_lr_num)); +ASSYM(HYPCTX_VGIC_ICH_AP0R_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_ap0r_el2)); +ASSYM(HYPCTX_VGIC_ICH_AP0R_NUM, offsetof(struct hypctx, vgic_cpu_if.ich_ap0r_num)); +ASSYM(HYPCTX_VGIC_ICH_AP1R_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_ap1r_el2)); +ASSYM(HYPCTX_VGIC_ICH_AP1R_NUM, offsetof(struct hypctx, vgic_cpu_if.ich_ap1r_num)); +ASSYM(HYPCTX_VGIC_ICH_EISR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_eisr_el2)); +ASSYM(HYPCTX_VGIC_ICH_ELRSR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_elrsr_el2)); +ASSYM(HYPCTX_VGIC_ICH_HCR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_hcr_el2)); +ASSYM(HYPCTX_VGIC_ICH_MISR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_misr_el2)); +ASSYM(HYPCTX_VGIC_ICH_VMCR_EL2, offsetof(struct hypctx, vgic_cpu_if.ich_vmcr_el2)); +ASSYM(HYPCTX_VGIC_ICH_LR_EL2, offsetof(struct 
hypctx, vgic_cpu_if.ich_lr_el2)); + +ASSYM(HYPCTX_VTIMER_CPU_CNTKCTL_EL1, offsetof(struct hypctx, vtimer_cpu.cntkctl_el1)); +ASSYM(HYPCTX_VTIMER_CPU_CNTV_CVAL_EL0, offsetof(struct hypctx, vtimer_cpu.cntv_cval_el0)); +ASSYM(HYPCTX_VTIMER_CPU_CNTV_CTL_EL0, offsetof(struct hypctx, vtimer_cpu.cntv_ctl_el0)); + +#ifdef VFP +ASSYM(HYPCTX_VFPSTATE_Q0, offsetof(struct hypctx, vfpstate.vfp_regs) + 0 * 16); +ASSYM(HYPCTX_VFPSTATE_Q1, offsetof(struct hypctx, vfpstate.vfp_regs) + 1 * 16); +ASSYM(HYPCTX_VFPSTATE_Q2, offsetof(struct hypctx, vfpstate.vfp_regs) + 2 * 16); +ASSYM(HYPCTX_VFPSTATE_Q3, offsetof(struct hypctx, vfpstate.vfp_regs) + 3 * 16); +ASSYM(HYPCTX_VFPSTATE_Q4, offsetof(struct hypctx, vfpstate.vfp_regs) + 4 * 16); +ASSYM(HYPCTX_VFPSTATE_Q5, offsetof(struct hypctx, vfpstate.vfp_regs) + 5 * 16); +ASSYM(HYPCTX_VFPSTATE_Q6, offsetof(struct hypctx, vfpstate.vfp_regs) + 6 * 16); +ASSYM(HYPCTX_VFPSTATE_Q7, offsetof(struct hypctx, vfpstate.vfp_regs) + 7 * 16); +ASSYM(HYPCTX_VFPSTATE_Q8, offsetof(struct hypctx, vfpstate.vfp_regs) + 8 * 16); +ASSYM(HYPCTX_VFPSTATE_Q9, offsetof(struct hypctx, vfpstate.vfp_regs) + 9 * 16); +ASSYM(HYPCTX_VFPSTATE_Q10, offsetof(struct hypctx, vfpstate.vfp_regs) + 10 * 16); +ASSYM(HYPCTX_VFPSTATE_Q11, offsetof(struct hypctx, vfpstate.vfp_regs) + 11 * 16); +ASSYM(HYPCTX_VFPSTATE_Q12, offsetof(struct hypctx, vfpstate.vfp_regs) + 12 * 16); +ASSYM(HYPCTX_VFPSTATE_Q13, offsetof(struct hypctx, vfpstate.vfp_regs) + 13 * 16); +ASSYM(HYPCTX_VFPSTATE_Q14, offsetof(struct hypctx, vfpstate.vfp_regs) + 14 * 16); +ASSYM(HYPCTX_VFPSTATE_Q15, offsetof(struct hypctx, vfpstate.vfp_regs) + 15 * 16); +ASSYM(HYPCTX_VFPSTATE_Q16, offsetof(struct hypctx, vfpstate.vfp_regs) + 16 * 16); +ASSYM(HYPCTX_VFPSTATE_Q17, offsetof(struct hypctx, vfpstate.vfp_regs) + 17 * 16); +ASSYM(HYPCTX_VFPSTATE_Q18, offsetof(struct hypctx, vfpstate.vfp_regs) + 18 * 16); +ASSYM(HYPCTX_VFPSTATE_Q19, offsetof(struct hypctx, vfpstate.vfp_regs) + 19 * 16); +ASSYM(HYPCTX_VFPSTATE_Q20, 
offsetof(struct hypctx, vfpstate.vfp_regs) + 20 * 16); +ASSYM(HYPCTX_VFPSTATE_Q21, offsetof(struct hypctx, vfpstate.vfp_regs) + 21 * 16); +ASSYM(HYPCTX_VFPSTATE_Q22, offsetof(struct hypctx, vfpstate.vfp_regs) + 22 * 16); +ASSYM(HYPCTX_VFPSTATE_Q23, offsetof(struct hypctx, vfpstate.vfp_regs) + 23 * 16); +ASSYM(HYPCTX_VFPSTATE_Q24, offsetof(struct hypctx, vfpstate.vfp_regs) + 24 * 16); +ASSYM(HYPCTX_VFPSTATE_Q25, offsetof(struct hypctx, vfpstate.vfp_regs) + 25 * 16); +ASSYM(HYPCTX_VFPSTATE_Q26, offsetof(struct hypctx, vfpstate.vfp_regs) + 26 * 16); +ASSYM(HYPCTX_VFPSTATE_Q27, offsetof(struct hypctx, vfpstate.vfp_regs) + 27 * 16); +ASSYM(HYPCTX_VFPSTATE_Q28, offsetof(struct hypctx, vfpstate.vfp_regs) + 28 * 16); +ASSYM(HYPCTX_VFPSTATE_Q29, offsetof(struct hypctx, vfpstate.vfp_regs) + 29 * 16); +ASSYM(HYPCTX_VFPSTATE_Q30, offsetof(struct hypctx, vfpstate.vfp_regs) + 30 * 16); +ASSYM(HYPCTX_VFPSTATE_Q31, offsetof(struct hypctx, vfpstate.vfp_regs) + 31 * 16); + + +ASSYM(HYPCTX_VFPSTATE_FPCR, offsetof(struct hypctx, vfpstate.vfp_fpcr)); +ASSYM(HYPCTX_VFPSTATE_FPSR, offsetof(struct hypctx, vfpstate.vfp_fpsr)); +#endif Index: sys/arm64/vmm/hyp_macros.h =================================================================== --- /dev/null +++ sys/arm64/vmm/hyp_macros.h @@ -0,0 +1,687 @@ +/* + * Copyright (C) 2017 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_HYP_MACROS_H_ +#define _VMM_HYP_MACROS_H_ + + +#define PUSH_SYS_REG_PAIR(reg0, reg1) \ + mrs x1, reg0; \ + mrs x2, reg1; \ + stp x2, x1, [sp, #-16]!; + + +#define PUSH_SYS_REG(reg) \ + mrs x1, reg; \ + str x1, [sp, #-16]!; + + +/* + * Push all the host registers before entering the guest. 
+ */ +#define SAVE_HOST_REGS() \ + /* Save the regular registers */ \ + stp x0, x1, [sp, #-16]!; \ + stp x2, x3, [sp, #-16]!; \ + stp x4, x5, [sp, #-16]!; \ + stp x6, x7, [sp, #-16]!; \ + stp x8, x9, [sp, #-16]!; \ + stp x10, x11, [sp, #-16]!; \ + stp x12, x13, [sp, #-16]!; \ + stp x14, x15, [sp, #-16]!; \ + stp x16, x17, [sp, #-16]!; \ + stp x18, x19, [sp, #-16]!; \ + stp x20, x21, [sp, #-16]!; \ + stp x22, x23, [sp, #-16]!; \ + stp x24, x25, [sp, #-16]!; \ + stp x26, x27, [sp, #-16]!; \ + stp x28, x29, [sp, #-16]!; \ + stp lr, fp, [sp, #-16]!; \ + \ + /* Push the system registers */ \ + PUSH_SYS_REG_PAIR(SP_EL0, SP_EL1); \ + PUSH_SYS_REG_PAIR(ACTLR_EL1, AMAIR_EL1); \ + PUSH_SYS_REG_PAIR(ELR_EL1, PAR_EL1); \ + PUSH_SYS_REG_PAIR(MAIR_EL1, TCR_EL1); \ + PUSH_SYS_REG_PAIR(TPIDR_EL0, TPIDRRO_EL0); \ + PUSH_SYS_REG_PAIR(TPIDR_EL1, TTBR0_EL1); \ + PUSH_SYS_REG_PAIR(TTBR1_EL1, VBAR_EL1); \ + PUSH_SYS_REG_PAIR(AFSR0_EL1, AFSR1_EL1); \ + PUSH_SYS_REG_PAIR(CONTEXTIDR_EL1, CPACR_EL1); \ + PUSH_SYS_REG_PAIR(ESR_EL1, FAR_EL1); \ + PUSH_SYS_REG_PAIR(SCTLR_EL1, SPSR_EL1); \ + PUSH_SYS_REG_PAIR(ELR_EL2, HCR_EL2); \ + PUSH_SYS_REG_PAIR(VPIDR_EL2, VMPIDR_EL2); \ + PUSH_SYS_REG_PAIR(CPTR_EL2, SPSR_EL2); \ + PUSH_SYS_REG_PAIR(ICH_HCR_EL2, ICH_VMCR_EL2); \ + PUSH_SYS_REG_PAIR(CNTHCTL_EL2, CNTKCTL_EL1); \ + PUSH_SYS_REG(CNTVOFF_EL2); + + +#define SAVE_HOST_VFP_REGS() \ + stp q0, q1, [sp, #-16 * 2]!; \ + stp q2, q3, [sp, #-16 * 2]!; \ + stp q4, q5, [sp, #-16 * 2]!; \ + stp q6, q7, [sp, #-16 * 2]!; \ + stp q8, q9, [sp, #-16 * 2]!; \ + stp q10, q11, [sp, #-16 * 2]!; \ + stp q12, q13, [sp, #-16 * 2]!; \ + stp q14, q15, [sp, #-16 * 2]!; \ + stp q16, q17, [sp, #-16 * 2]!; \ + stp q18, q19, [sp, #-16 * 2]!; \ + stp q20, q21, [sp, #-16 * 2]!; \ + stp q22, q23, [sp, #-16 * 2]!; \ + stp q24, q25, [sp, #-16 * 2]!; \ + stp q26, q27, [sp, #-16 * 2]!; \ + stp q28, q29, [sp, #-16 * 2]!; \ + stp q30, q31, [sp, #-16 * 2]!; \ + PUSH_SYS_REG_PAIR(FPCR, FPSR); + + +#define POP_SYS_REG_PAIR(reg0, reg1) \ + 
ldp x2, x1, [sp], #16; \ + msr reg1, x2; \ + msr reg0, x1; + + +#define LOAD_HOST_VFP_REGS() \ + POP_SYS_REG_PAIR(FPCR, FPSR); \ + ldp q30, q31, [sp], #16 * 2; \ + ldp q28, q29, [sp], #16 * 2; \ + ldp q26, q27, [sp], #16 * 2; \ + ldp q24, q25, [sp], #16 * 2; \ + ldp q22, q23, [sp], #16 * 2; \ + ldp q20, q21, [sp], #16 * 2; \ + ldp q18, q19, [sp], #16 * 2; \ + ldp q16, q17, [sp], #16 * 2; \ + ldp q14, q15, [sp], #16 * 2; \ + ldp q12, q13, [sp], #16 * 2; \ + ldp q10, q11, [sp], #16 * 2; \ + ldp q8, q9, [sp], #16 * 2; \ + ldp q6, q7, [sp], #16 * 2; \ + ldp q4, q5, [sp], #16 * 2; \ + ldp q2, q3, [sp], #16 * 2; \ + ldp q0, q1, [sp], #16 * 2; \ + + +#define POP_SYS_REG(reg) \ + ldr x1, [sp], #16; \ + msr reg, x1; + + +/* + * Restore all the host registers before entering the host. + */ +#define LOAD_HOST_REGS() \ + /* Pop the system registers first */ \ + POP_SYS_REG(CNTVOFF_EL2); \ + POP_SYS_REG_PAIR(CNTHCTL_EL2, CNTKCTL_EL1); \ + POP_SYS_REG_PAIR(ICH_HCR_EL2, ICH_VMCR_EL2); \ + POP_SYS_REG_PAIR(CPTR_EL2, SPSR_EL2); \ + POP_SYS_REG_PAIR(VPIDR_EL2, VMPIDR_EL2); \ + POP_SYS_REG_PAIR(ELR_EL2, HCR_EL2); \ + POP_SYS_REG_PAIR(SCTLR_EL1, SPSR_EL1); \ + POP_SYS_REG_PAIR(ESR_EL1, FAR_EL1); \ + POP_SYS_REG_PAIR(CONTEXTIDR_EL1, CPACR_EL1); \ + POP_SYS_REG_PAIR(AFSR0_EL1, AFSR1_EL1); \ + POP_SYS_REG_PAIR(TTBR1_EL1, VBAR_EL1); \ + POP_SYS_REG_PAIR(TPIDR_EL1, TTBR0_EL1); \ + POP_SYS_REG_PAIR(TPIDR_EL0, TPIDRRO_EL0); \ + POP_SYS_REG_PAIR(MAIR_EL1, TCR_EL1); \ + POP_SYS_REG_PAIR(ELR_EL1, PAR_EL1); \ + POP_SYS_REG_PAIR(ACTLR_EL1, AMAIR_EL1); \ + POP_SYS_REG_PAIR(SP_EL0, SP_EL1); \ + \ + /* Pop the regular registers */ \ + ldp lr, fp, [sp], #16; \ + ldp x28, x29, [sp], #16; \ + ldp x26, x27, [sp], #16; \ + ldp x24, x25, [sp], #16; \ + ldp x22, x23, [sp], #16; \ + ldp x20, x21, [sp], #16; \ + ldp x18, x19, [sp], #16; \ + ldp x16, x17, [sp], #16; \ + ldp x14, x15, [sp], #16; \ + ldp x12, x13, [sp], #16; \ + ldp x10, x11, [sp], #16; \ + ldp x8, x9, [sp], #16; \ + ldp x6, x7, [sp], #16; \ + 
ldp x4, x5, [sp], #16; \ + ldp x2, x3, [sp], #16; \ + ldp x0, x1, [sp], #16; \ + + +#define SAVE_ARRAY_REG64(reg, dest, remaining) \ + cmp remaining, #0; \ + beq 9f; \ + mrs x7, reg; \ + str x7, [dest]; \ + add dest, dest, #8; \ + sub remaining, remaining, #1; + + +#define SAVE_LR_REGS() \ + /* Load the number of ICH_LR_EL2 regs from memory */ \ + mov x2, #HYPCTX_VGIC_ICH_LR_NUM; \ + ldr x3, [x0, x2]; \ + /* x1 holds the destination address */ \ + mov x1, #HYPCTX_VGIC_ICH_LR_EL2; \ + add x1, x0, x1; \ + SAVE_ARRAY_REG64(ich_lr0_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr1_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr2_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr3_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr4_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr5_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr6_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr7_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr8_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr9_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr10_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr11_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr12_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr13_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr14_el2, x1, x3); \ + SAVE_ARRAY_REG64(ich_lr15_el2, x1, x3); \ +9:; \ + ; + + +#define SAVE_ARRAY_REG32(reg, dest, remaining) \ + cmp remaining, #0; \ + beq 9f; \ + mrs x7, reg; \ + str w7, [dest]; \ + add dest, dest, #4; \ + sub remaining, remaining, #1; + + +#define SAVE_AP0R_REGS() \ + /* Load the number of ICH_AP0R_EL2 regs from memory */ \ + mov x2, #HYPCTX_VGIC_ICH_AP0R_NUM; \ + ldr x3, [x0, x2]; \ + /* x1 holds the destination address */ \ + mov x1, #HYPCTX_VGIC_ICH_AP0R_EL2; \ + add x1, x0, x1; \ + SAVE_ARRAY_REG32(ich_ap0r0_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap0r1_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap0r2_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap0r3_el2, x1, x3); \ +9:; \ + ; + + +#define SAVE_AP1R_REGS() \ + /* Load the number of ICH_AP1R_EL2 regs from memory */ \ + mov x2, #HYPCTX_VGIC_ICH_AP1R_NUM; \ + ldr x3, [x0, x2]; \ + /* x1 
holds the destination address */ \ + mov x1, #HYPCTX_VGIC_ICH_AP1R_EL2; \ + add x1, x0, x1; \ + SAVE_ARRAY_REG32(ich_ap1r0_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap1r1_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap1r2_el2, x1, x3); \ + SAVE_ARRAY_REG32(ich_ap1r3_el2, x1, x3); \ +9:; \ + ; + + +/* + * The STR and LDR instructions take an offset between [-256, 255], but the + * hypctx register offset can be larger than that. To get around this limitation + * we use a temporary register to hold the offset. + */ +#define SAVE_SYS_REG64(prefix, reg) \ + mrs x1, reg; \ + mov x2, prefix ##_ ##reg; \ + str x1, [x0, x2]; + + +#define SAVE_SYS_REG32(prefix, reg) \ + mrs x1, reg; \ + mov x2, prefix ##_ ##reg; \ + str w1, [x0, x2]; + + +#define SAVE_REG(prefix, reg) \ + mov x1, prefix ##_ ##reg; \ + str reg, [x0, x1]; + +/* + * The STP and LDP instructions takes an immediate in the range of [-512, 504] + * when using the post-indexed addressing mode, but the hypctx register offset + * can be larger than that. To get around this limitation we compute the address + * by adding the hypctx base address with the struct member offset. + * + * Using STP/LDP to save/load register pairs to the corresponding struct hypctx + * variables works because the registers are declared as an array and they are + * stored in contiguous memory addresses. + */ + +#define SAVE_REG_PAIR(prefix, reg0, reg1) \ + mov x1, prefix ##_ ##reg0; \ + add x1, x0, x1; \ + stp reg0, reg1, [x1]; + + +/* + * We use x0 to load the hypctx address from TPIDR_EL2 and x1 and x2 as + * temporary registers to compute the hypctx member addresses. To save the guest + * values at first we push them on the stack, use these temporary registers to + * save the rest of the registers and at the end we pop the values from the + * stack and save them. 
+ */ +#define SAVE_GUEST_X_REGS() \ + /* Push x0 */ \ + str x0, [sp, #-16]!; \ + /* Restore hypctx address */ \ + mrs x0, tpidr_el2; \ + /* Push x1 and x2 */ \ + stp x1, x2, [sp, #-16]!; \ + \ + /* Save the other registers */ \ + SAVE_REG_PAIR(HYPCTX_REGS, X3, X4); \ + SAVE_REG_PAIR(HYPCTX_REGS, X5, X6); \ + SAVE_REG_PAIR(HYPCTX_REGS, X7, X8); \ + SAVE_REG_PAIR(HYPCTX_REGS, X9, X10); \ + SAVE_REG_PAIR(HYPCTX_REGS, X11, X12); \ + SAVE_REG_PAIR(HYPCTX_REGS, X13, X14); \ + SAVE_REG_PAIR(HYPCTX_REGS, X15, X16); \ + SAVE_REG_PAIR(HYPCTX_REGS, X17, X18); \ + SAVE_REG_PAIR(HYPCTX_REGS, X19, X20); \ + SAVE_REG_PAIR(HYPCTX_REGS, X21, X22); \ + SAVE_REG_PAIR(HYPCTX_REGS, X23, X24); \ + SAVE_REG_PAIR(HYPCTX_REGS, X25, X26); \ + SAVE_REG_PAIR(HYPCTX_REGS, X27, X28); \ + SAVE_REG(HYPCTX_REGS, X29); \ + SAVE_REG(HYPCTX_REGS, LR); \ + \ + /* Pop and save x1 and x2 */ \ + ldp x1, x2, [sp], #16; \ + mov x3, #HYPCTX_REGS_X1; \ + add x3, x0, x3; \ + stp x1, x2, [x3]; \ + /* Pop and save x0 */ \ + ldr x1, [sp], #16; \ + mov x2, #HYPCTX_REGS_X0; \ + add x2, x2, x0; \ + str x1, [x2]; + + +/* + * Save all the guest registers. Start by saving the regular registers first + * because those will be used as temporary registers for accessing the hypctx + * member addresses. + * + * Expecting: + * TPIDR_EL2 - struct hypctx address + * + * After call: + * x0 - struct hypctx address + */ +#define SAVE_GUEST_REGS() \ + SAVE_GUEST_X_REGS(); \ + \ + SAVE_REG(HYPCTX, FP); \ + \ + SAVE_SYS_REG32(HYPCTX_VTIMER_CPU, CNTKCTL_EL1); \ + SAVE_SYS_REG64(HYPCTX_VTIMER_CPU, CNTV_CVAL_EL0); \ + SAVE_SYS_REG32(HYPCTX_VTIMER_CPU, CNTV_CTL_EL0);\ + \ + /* \ + * ICH_EISR_EL2, ICH_ELRSR_EL2 and ICH_MISR_EL2 are read-only and are \ + * saved because they are modified by the hardware as part of the \ + * interrupt virtualization process and we need to inspect them in \ + * the VGIC driver. 
\ + */ \ + SAVE_SYS_REG32(HYPCTX_VGIC, ICH_EISR_EL2); \ + SAVE_SYS_REG32(HYPCTX_VGIC, ICH_ELRSR_EL2); \ + SAVE_SYS_REG32(HYPCTX_VGIC, ICH_MISR_EL2); \ + SAVE_SYS_REG32(HYPCTX_VGIC, ICH_HCR_EL2); \ + SAVE_SYS_REG32(HYPCTX_VGIC, ICH_VMCR_EL2); \ + \ + SAVE_LR_REGS(); \ + SAVE_AP0R_REGS(); \ + SAVE_AP1R_REGS(); \ + \ + /* Save the stack pointer. */ \ + mrs x1, sp_el1; \ + mov x2, #HYPCTX_REGS_SP; \ + str x1, [x0, x2]; \ + \ + SAVE_SYS_REG64(HYPCTX, ACTLR_EL1); \ + SAVE_SYS_REG64(HYPCTX, AFSR0_EL1); \ + SAVE_SYS_REG64(HYPCTX, AFSR1_EL1); \ + SAVE_SYS_REG64(HYPCTX, AMAIR_EL1); \ + SAVE_SYS_REG64(HYPCTX, CONTEXTIDR_EL1); \ + SAVE_SYS_REG64(HYPCTX, CPACR_EL1); \ + SAVE_SYS_REG64(HYPCTX, ELR_EL1); \ + SAVE_SYS_REG64(HYPCTX, ESR_EL1); \ + SAVE_SYS_REG64(HYPCTX, FAR_EL1); \ + SAVE_SYS_REG64(HYPCTX, MAIR_EL1); \ + SAVE_SYS_REG64(HYPCTX, PAR_EL1); \ + SAVE_SYS_REG64(HYPCTX, SCTLR_EL1); \ + SAVE_SYS_REG64(HYPCTX, SP_EL0); \ + SAVE_SYS_REG64(HYPCTX, TCR_EL1); \ + SAVE_SYS_REG64(HYPCTX, TPIDR_EL0); \ + SAVE_SYS_REG64(HYPCTX, TPIDRRO_EL0); \ + SAVE_SYS_REG64(HYPCTX, TPIDR_EL1); \ + SAVE_SYS_REG64(HYPCTX, TTBR0_EL1); \ + SAVE_SYS_REG64(HYPCTX, TTBR1_EL1); \ + SAVE_SYS_REG64(HYPCTX, VBAR_EL1); \ + \ + SAVE_SYS_REG32(HYPCTX, SPSR_EL1); \ + \ + SAVE_SYS_REG64(HYPCTX, CPTR_EL2); \ + SAVE_SYS_REG64(HYPCTX, ELR_EL2); \ + SAVE_SYS_REG64(HYPCTX, HCR_EL2); \ + SAVE_SYS_REG64(HYPCTX, VPIDR_EL2); \ + SAVE_SYS_REG64(HYPCTX, VMPIDR_EL2); \ + SAVE_SYS_REG32(HYPCTX, SPSR_EL2); + + +#define SAVE_GUEST_VFP_REGS() \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q0, Q1); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q2, Q3); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q4, Q5); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q6, Q7); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q8, Q9); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q10, Q11); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q12, Q13); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q14, Q15); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q16, Q17); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q18, Q19); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q20, 
Q21); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q22, Q23); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q24, Q25); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q26, Q27); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q28, Q29); \ + SAVE_REG_PAIR(HYPCTX_VFPSTATE, Q30, Q31); \ + \ + SAVE_SYS_REG32(HYPCTX_VFPSTATE, FPCR); \ + SAVE_SYS_REG32(HYPCTX_VFPSTATE, FPSR); + + +/* See SAVE_SYS_REG */ +#define LOAD_SYS_REG64(prefix, reg) \ + mov x1, prefix ##_ ##reg; \ + ldr x2, [x0, x1]; \ + msr reg, x2; + + +#define LOAD_SYS_REG32(prefix, reg) \ + mov x1, prefix ##_ ##reg; \ + ldr w2, [x0, x1]; \ + msr reg, x2; + + +/* See SAVE_REG_PAIR */ +#define LOAD_REG_PAIR(prefix, reg0, reg1) \ + mov x1, prefix ##_ ##reg0; \ + add x1, x0, x1; \ + ldp reg0, reg1, [x1]; + + +#define LOAD_GUEST_VFP_REGS() \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q0, Q1); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q2, Q3); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q4, Q5); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q6, Q7); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q8, Q9); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q10, Q11); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q12, Q13); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q14, Q15); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q16, Q17); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q18, Q19); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q20, Q21); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q22, Q23); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q24, Q25); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q26, Q27); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q28, Q29); \ + LOAD_REG_PAIR(HYPCTX_VFPSTATE, Q30, Q31); \ + \ + LOAD_SYS_REG32(HYPCTX_VFPSTATE, FPCR); \ + LOAD_SYS_REG32(HYPCTX_VFPSTATE, FPSR); + + +#define LOAD_REG(prefix, reg) \ + mov x1, prefix ##_ ##reg; \ + ldr reg, [x0, x1]; + + +/* + * We use x1 as a temporary register to store the hypctx member offset and x0 + * to hold the hypctx address. We load the guest x0 and x1 register values in + * registers x2 and x3, push x2 and x3 on the stack and then we restore x0 and + * x1. 
+ */
+#define LOAD_GUEST_X_REGS()	\
+	mov	x1, #HYPCTX_REGS_X0;	\
+	/* x1 now holds the address of hypctx reg x0 */	\
+	add	x1, x1, x0;	\
+	/* Make x2 = guest x0 and x3 = guest x1 */	\
+	ldp	x2, x3, [x1];	\
+	stp	x2, x3, [sp, #-16]!;	\
+	\
+	/* Load the other registers */	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X2, X3);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X4, X5);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X6, X7);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X8, X9);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X10, X11);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X12, X13);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X14, X15);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X16, X17);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X18, X19);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X20, X21);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X22, X23);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X24, X25);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X26, X27);	\
+	LOAD_REG_PAIR(HYPCTX_REGS, X28, X29);	\
+	LOAD_REG(HYPCTX_REGS, LR);	\
+	\
+	/* Pop guest x0 and x1 from the stack */	\
+	ldp	x0, x1, [sp], #16;
+
+
+/*
+ * Restore one 64-bit element of a register array: if 'remaining' is zero
+ * branch forward to local label 9, otherwise load the value at [src] into
+ * the system register 'reg', advance 'src' by 8 and decrement 'remaining'.
+ * Uses x2 as scratch and modifies the condition flags.
+ */
+#define LOAD_ARRAY_REG64(reg, src, remaining)	\
+	cmp	remaining, #0;	\
+	beq	9f;	\
+	ldr	x2, [src];	\
+	msr	reg, x2;	\
+	add	src, src, #8;	\
+	sub	remaining, remaining, #1;
+
+
+/*
+ * Restore the ICH_LR<n>_EL2 list registers from hypctx (address in x0). Only
+ * the first HYPCTX_VGIC_ICH_LR_NUM entries are written; label 9 is the exit
+ * taken by LOAD_ARRAY_REG64 once the count reaches zero. Uses x1, x2 and x3.
+ */
+#define LOAD_LR_REGS()	\
+	/* Load the number of ICH_LR_EL2 regs from memory */	\
+	mov	x2, #HYPCTX_VGIC_ICH_LR_NUM;	\
+	ldr	x3, [x0, x2];	\
+	mov	x1, #HYPCTX_VGIC_ICH_LR_EL2;	\
+	/* x1 holds the load address */	\
+	add	x1, x0, x1;	\
+	LOAD_ARRAY_REG64(ich_lr0_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr1_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr2_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr3_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr4_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr5_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr6_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr7_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr8_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr9_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr10_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr11_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr12_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr13_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr14_el2, x1, x3);	\
+	LOAD_ARRAY_REG64(ich_lr15_el2, x1, x3);	\
+9:;	\
+	;
+
+
+/*
+ * 32-bit variant of LOAD_ARRAY_REG64: loads a word from [src] through w2,
+ * writes it to 'reg' and advances 'src' by 4.
+ */
+#define LOAD_ARRAY_REG32(reg, src, remaining)	\
+	cmp	remaining, #0;	\
+	beq	9f;	\
+	ldr	w2, [src];	\
+	msr	reg, x2;	\
+	add	src, src, #4;	\
+	sub	remaining, remaining, #1;
+
+
+/*
+ * Restore up to four ICH_AP0R<n>_EL2 registers from hypctx (address in x0);
+ * the count is read from HYPCTX_VGIC_ICH_AP0R_NUM. Uses x1, x2 and x3.
+ */
+#define LOAD_AP0R_REGS()	\
+	/* Load the number of ICH_AP0R_EL2 regs from memory */	\
+	mov	x2, #HYPCTX_VGIC_ICH_AP0R_NUM;	\
+	ldr	x3, [x0, x2];	\
+	/* x1 holds the load address */	\
+	mov	x1, #HYPCTX_VGIC_ICH_AP0R_EL2;	\
+	add	x1, x0, x1;	\
+	LOAD_ARRAY_REG32(ich_ap0r0_el2, x1, x3);	\
+	LOAD_ARRAY_REG32(ich_ap0r1_el2, x1, x3);	\
+	LOAD_ARRAY_REG32(ich_ap0r2_el2, x1, x3);	\
+	LOAD_ARRAY_REG32(ich_ap0r3_el2, x1, x3);	\
+9:;	\
+	;
+
+
+/*
+ * Restore up to four ICH_AP1R<n>_EL2 registers from hypctx (address in x0);
+ * the count is read from HYPCTX_VGIC_ICH_AP1R_NUM. Uses x1, x2 and x3.
+ */
+#define LOAD_AP1R_REGS()	\
+	/* Load the number of ICH_AP1R_EL2 regs from memory */	\
+	mov	x2, #HYPCTX_VGIC_ICH_AP1R_NUM;	\
+	ldr	x3, [x0, x2];	\
+	/* x1 holds the load address */	\
+	mov	x1, #HYPCTX_VGIC_ICH_AP1R_EL2;	\
+	add	x1, x0, x1;	\
+	LOAD_ARRAY_REG32(ich_ap1r0_el2, x1, x3);	\
+	LOAD_ARRAY_REG32(ich_ap1r1_el2, x1, x3);	\
+	LOAD_ARRAY_REG32(ich_ap1r2_el2, x1, x3);	\
+	LOAD_ARRAY_REG32(ich_ap1r3_el2, x1, x3);	\
+9:;	\
+	;
+
+
+
+/*
+ * Rewrite the address held in 'reg' by masking it with HYP_KVA_MASK and
+ * OR-ing in HYP_KVA_OFFSET. Uses x7 as scratch.
+ */
+#define KTOHYP_REG(reg)	\
+	mov	x7, HYP_KVA_MASK;	\
+	and	reg, reg, x7;	\
+	mov	x7, HYP_KVA_OFFSET;	\
+	orr	reg, reg, x7;
+
+
+/* Load a register from struct hyp *hyp member of hypctx. */
+#define LOAD_HYP_REG(prefix, reg)	\
+	/* Compute VA of hyp member in x1 */	\
+	mov	x1, #HYPCTX_HYP;	\
+	add	x1, x1, x0;	\
+	/* Get hyp address in x2 */	\
+	ldr	x2, [x1];	\
+	/* Transform hyp kernel VA into an EL2 VA */	\
+	KTOHYP_REG(x2);	\
+	/* Get register offset inside struct hyp */	\
+	mov	x1, prefix ##_ ##reg;	\
+	/* Compute register address */	\
+	add	x2, x2, x1;	\
+	/* Load the register */	\
+	ldr	x1, [x2];	\
+	msr	reg, x1;
+
+
+/*
+ * Restore all the guest registers to their original values.
+ * + * Expecting: + * x0 - struct hypctx address + * + * After call: + * tpidr_el2 - struct hypctx address + */ +#define LOAD_GUEST_REGS() \ + LOAD_SYS_REG64(HYPCTX, ACTLR_EL1); \ + LOAD_SYS_REG64(HYPCTX, AFSR0_EL1); \ + LOAD_SYS_REG64(HYPCTX, AFSR1_EL1); \ + LOAD_SYS_REG64(HYPCTX, AMAIR_EL1); \ + LOAD_SYS_REG64(HYPCTX, CONTEXTIDR_EL1); \ + LOAD_SYS_REG64(HYPCTX, CPACR_EL1); \ + LOAD_SYS_REG64(HYPCTX, ELR_EL1); \ + LOAD_SYS_REG64(HYPCTX, ESR_EL1); \ + LOAD_SYS_REG64(HYPCTX, FAR_EL1); \ + LOAD_SYS_REG64(HYPCTX, MAIR_EL1); \ + LOAD_SYS_REG64(HYPCTX, PAR_EL1); \ + LOAD_SYS_REG64(HYPCTX, SCTLR_EL1); \ + LOAD_SYS_REG64(HYPCTX, SP_EL0); \ + LOAD_SYS_REG64(HYPCTX, TCR_EL1); \ + LOAD_SYS_REG64(HYPCTX, TPIDR_EL0); \ + LOAD_SYS_REG64(HYPCTX, TPIDRRO_EL0); \ + LOAD_SYS_REG64(HYPCTX, TPIDR_EL1); \ + LOAD_SYS_REG64(HYPCTX, TTBR0_EL1); \ + LOAD_SYS_REG64(HYPCTX, TTBR1_EL1); \ + LOAD_SYS_REG64(HYPCTX, VBAR_EL1); \ + LOAD_SYS_REG32(HYPCTX, SPSR_EL1); \ + \ + LOAD_SYS_REG64(HYPCTX, CPTR_EL2); \ + LOAD_SYS_REG64(HYPCTX, ELR_EL2); \ + LOAD_SYS_REG64(HYPCTX, HCR_EL2); \ + LOAD_SYS_REG64(HYPCTX, VPIDR_EL2); \ + LOAD_SYS_REG64(HYPCTX, VMPIDR_EL2); \ + LOAD_SYS_REG32(HYPCTX, SPSR_EL2); \ + \ + LOAD_SYS_REG32(HYPCTX_VGIC, ICH_HCR_EL2); \ + LOAD_SYS_REG32(HYPCTX_VGIC, ICH_VMCR_EL2); \ + \ + LOAD_SYS_REG32(HYPCTX_VTIMER_CPU, CNTKCTL_EL1); \ + LOAD_SYS_REG64(HYPCTX_VTIMER_CPU, CNTV_CVAL_EL0); \ + LOAD_SYS_REG32(HYPCTX_VTIMER_CPU, CNTV_CTL_EL0); \ + \ + LOAD_REG(HYPCTX, FP); \ + \ + LOAD_HYP_REG(HYP, VTTBR_EL2); \ + LOAD_HYP_REG(HYP_VTIMER, CNTHCTL_EL2); \ + LOAD_HYP_REG(HYP_VTIMER, CNTVOFF_EL2); \ + \ + LOAD_LR_REGS(); \ + LOAD_AP0R_REGS(); \ + LOAD_AP1R_REGS(); \ + \ + /* Load the guest EL1 stack pointer */ \ + mov x1, #HYPCTX_REGS_SP; \ + add x1, x1, x0; \ + ldr x2, [x1]; \ + msr sp_el1, x2; \ + \ + LOAD_GUEST_X_REGS(); \ + + +/* + * Save exit information + * + * Expecting: + * x0 - struct hypctx address + */ +#define SAVE_EXIT_INFO() \ + SAVE_SYS_REG64(HYPCTX_EXIT_INFO, ESR_EL2); \ + 
SAVE_SYS_REG64(HYPCTX_EXIT_INFO, FAR_EL2); \ + SAVE_SYS_REG64(HYPCTX_EXIT_INFO, HPFAR_EL2); \ + +#endif /* !_VMM_HYP_MACROS_H_ */ Index: sys/arm64/vmm/io/vgic_v3.h =================================================================== --- /dev/null +++ sys/arm64/vmm/io/vgic_v3.h @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#ifndef _VMM_VGIC_V3_H_
+#define _VMM_VGIC_V3_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+/* Number of interrupt IDs in each class; private = SGI + PPI, shared = SPI. */
+#define VGIC_SGI_NUM		(GIC_LAST_SGI - GIC_FIRST_SGI + 1)
+#define VGIC_PPI_NUM		(GIC_LAST_PPI - GIC_FIRST_PPI + 1)
+#define VGIC_SPI_NUM		(GIC_LAST_SPI - GIC_FIRST_SPI + 1)
+#define VGIC_PRV_I_NUM		(VGIC_SGI_NUM + VGIC_PPI_NUM)
+#define VGIC_SHR_I_NUM		(VGIC_SPI_NUM)
+
+/* Capacity of the statically sized per-CPU ICH_LR/ICH_AP0R/ICH_AP1R arrays. */
+#define VGIC_ICH_LR_NUM_MAX	16
+#define VGIC_ICH_AP0R_NUM_MAX	4
+#define VGIC_ICH_AP1R_NUM_MAX	VGIC_ICH_AP0R_NUM_MAX
+
+/* Order matters, a lower value means a higher precedence */
+enum vgic_v3_irqtype {
+	VGIC_IRQ_MAXPRIO,
+	VGIC_IRQ_CLK,
+	VGIC_IRQ_VIRTIO,
+	VGIC_IRQ_MISC,
+	VGIC_IRQ_INVALID,
+};
+
+/* An address range together with its read and write handlers. */
+struct vgic_mmio_region {
+	vm_offset_t start;
+	vm_offset_t end;
+	mem_region_read_t read;
+	mem_region_write_t write;
+};
+
+struct vm;
+struct vm_exit;
+struct hyp;
+
+/* Per-VM state of the emulated Distributor. */
+struct vgic_v3_dist {
+	struct mtx dist_mtx;
+
+	uint64_t start;
+	size_t end;
+	size_t nirqs;
+
+	uint32_t gicd_ctlr;	/* Distributor Control Register */
+	uint32_t gicd_typer;	/* Interrupt Controller Type Register */
+	uint32_t gicd_pidr2;	/* Distributor Peripheral ID2 Register */
+	/* Interrupt Configuration Registers. */
+	uint32_t *gicd_icfgr;
+	/* Interrupt Priority Registers. */
+	uint32_t *gicd_ipriorityr;
+	/* Interrupt Routing Registers. */
+	uint64_t *gicd_irouter;
+	/* Interrupt Clear-Enable and Set-Enable Registers. */
+	uint32_t *gicd_ixenabler;
+};
+
+/*
+ * Non-zero when GICD_CTLR.ARE_NS is set in the distributor 'distp' points to.
+ * The argument is parenthesized so that expressions such as
+ * aff_routing_en(&hyp->vgic_dist) expand correctly.
+ */
+#define aff_routing_en(distp)	((distp)->gicd_ctlr & GICD_CTLR_ARE_NS)
+
+/* Per-vCPU state of the emulated Redistributor. */
+struct vgic_v3_redist {
+	uint64_t start;
+	uint64_t end;
+
+	uint64_t gicr_typer;	/* Redistributor Type Register */
+	uint32_t gicr_ctlr;	/* Redistributor Control Register */
+	uint32_t gicr_ixenabler0;
+	/* Interrupt Priority Registers. */
+	uint32_t gicr_ipriorityr[VGIC_PRV_I_NUM / 4];
+	/* Interrupt Configuration Registers */
+	uint32_t gicr_icfgr0, gicr_icfgr1;
+};
+
+struct vgic_v3_irq;
+struct vgic_v3_cpu_if {
+	uint32_t ich_eisr_el2;	/* End of Interrupt Status Register */
+	uint32_t ich_elrsr_el2;	/* Empty List register Status Register (ICH_ELRSR_EL2) */
+	uint32_t ich_hcr_el2;	/* Hyp Control Register */
+	uint32_t ich_misr_el2;	/* Maintenance Interrupt State Register */
+	uint32_t ich_vmcr_el2;	/* Virtual Machine Control Register */
+
+	/*
+	 * The List Registers are part of the VM context and are modified on a
+	 * world switch. They need to be allocated statically so they are
+	 * mapped in the EL2 translation tables when struct hypctx is mapped.
+	 */
+	uint64_t ich_lr_el2[VGIC_ICH_LR_NUM_MAX];
+	size_t ich_lr_num;
+
+	/*
+	 * We need a mutex for accessing the list registers because they are
+	 * modified asynchronously by the virtual timer.
+	 *
+	 * Note that the mutex *MUST* be a spin mutex because an interrupt can
+	 * be injected by a callout callback function, thereby modifying the
+	 * list registers from a context where sleeping is forbidden.
+ */ + struct mtx lr_mtx; + + /* Active Priorities Registers for Group 0 and 1 interrupts */ + uint32_t ich_ap0r_el2[VGIC_ICH_AP0R_NUM_MAX]; + size_t ich_ap0r_num; + uint32_t ich_ap1r_el2[VGIC_ICH_AP1R_NUM_MAX]; + size_t ich_ap1r_num; + + struct vgic_v3_irq *irqbuf; + size_t irqbuf_size; + size_t irqbuf_num; +}; + +int vgic_v3_attach_to_vm(void *arg, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size); +void vgic_v3_detach_from_vm(void *arg); +void vgic_v3_init(uint64_t ich_vtr_el2); +void vgic_v3_vminit(void *arg); +void vgic_v3_cpuinit(void *arg, bool last_vcpu); +void vgic_v3_sync_hwstate(void *arg); + +void vgic_v3_mmio_init(struct hyp *hyp); +void vgic_v3_mmio_destroy(struct hyp *hyp); + +int vgic_v3_vcpu_pending_irq(void *arg); +int vgic_v3_inject_irq(void *arg, uint32_t irq, + enum vgic_v3_irqtype irqtype); +int vgic_v3_remove_irq(void *arg, uint32_t irq, bool ignore_state); + +void vgic_v3_group_toggle_enabled(bool enabled, struct hyp *hyp); +int vgic_v3_irq_toggle_enabled(uint32_t irq, bool enabled, + struct hyp *hyp, int vcpuid); + +DECLARE_CLASS(arm_vgic_driver); + +#endif /* !_VMM_VGIC_V3_H_ */ Index: sys/arm64/vmm/io/vgic_v3.c =================================================================== --- /dev/null +++ sys/arm64/vmm/io/vgic_v3.c @@ -0,0 +1,983 @@ +/* + * Copyright (C) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "vgic_v3.h" +#include "vgic_v3_reg.h" + +#define VGIC_V3_DEVNAME "vgic" +#define VGIC_V3_DEVSTR "ARM Virtual Generic Interrupt Controller v3" + +#define RES0 0UL + +#define IRQBUF_SIZE_MIN 32 +#define IRQBUF_SIZE_MAX (1 << 10) + +#define IRQ_SCHEDULED (GIC_LAST_SPI + 1) + +#define lr_pending(lr) \ + (ICH_LR_EL2_STATE(lr) == ICH_LR_EL2_STATE_PENDING) +#define lr_inactive(lr) \ + (ICH_LR_EL2_STATE(lr) == ICH_LR_EL2_STATE_INACTIVE) +#define lr_active(lr) \ + (ICH_LR_EL2_STATE(lr) == ICH_LR_EL2_STATE_ACTIVE) +#define lr_pending_active(lr) \ + (ICH_LR_EL2_STATE(lr) == ICH_LR_EL2_STATE_PENDING_ACTIVE) +#define lr_not_active(lr) (!lr_active(lr) && !lr_pending_active(lr)) + +#define lr_clear_irq(lr) ((lr) &= ~ICH_LR_EL2_STATE_MASK) + +MALLOC_DEFINE(M_VGIC_V3, "ARM VMM VGIC V3", "ARM VMM VGIC V3"); + +struct vgic_v3_virt_features { + uint8_t min_prio; + size_t ich_lr_num; + size_t 
ich_ap0r_num; + size_t ich_ap1r_num; +}; + +struct vgic_v3_ro_regs { + uint32_t gicd_icfgr0; + uint32_t gicd_pidr2; + uint32_t gicd_typer; +}; + +struct vgic_v3_irq { + uint32_t irq; + enum vgic_v3_irqtype irqtype; + uint8_t enabled; + uint8_t priority; +}; + +#define vip_to_lr(vip, lr) \ +do { \ + lr = ICH_LR_EL2_STATE_PENDING; \ + lr |= ICH_LR_EL2_GROUP1; \ + lr |= (uint64_t)vip->priority << ICH_LR_EL2_PRIO_SHIFT; \ + lr |= vip->irq; \ +} while (0) + +#define lr_to_vip(lr, vip) \ +do { \ + (vip)->irq = ICH_LR_EL2_VINTID(lr); \ + (vip)->priority = \ + (uint8_t)(((lr) & ICH_LR_EL2_PRIO_MASK) >> ICH_LR_EL2_PRIO_SHIFT); \ +} while (0) + +static struct vgic_v3_virt_features virt_features; +static struct vgic_v3_ro_regs ro_regs; + +static struct gic_v3_softc *gic_sc; + +void +vgic_v3_cpuinit(void *arg, bool last_vcpu) +{ + struct hypctx *hypctx = arg; + struct vgic_v3_cpu_if *cpu_if = &hypctx->vgic_cpu_if; + struct vgic_v3_redist *redist = &hypctx->vgic_redist; + uint64_t aff, vmpidr_el2; + int i; + + vmpidr_el2 = hypctx->vmpidr_el2; + KASSERT(vmpidr_el2 != 0, + ("Trying to init this CPU's vGIC before the vCPU")); + /* + * Get affinity for the current CPU. The guest CPU affinity is taken + * from VMPIDR_EL2. The Redistributor corresponding to this CPU is + * the Redistributor with the same affinity from GICR_TYPER. + */ + aff = (CPU_AFF3(vmpidr_el2) << 24) | (CPU_AFF2(vmpidr_el2) << 16) | + (CPU_AFF1(vmpidr_el2) << 8) | CPU_AFF0(vmpidr_el2); + + /* Set up GICR_TYPER. */ + redist->gicr_typer = aff << GICR_TYPER_AFF_SHIFT; + /* Redistributor doesn't support virtual or physical LPIS. */ + redist->gicr_typer &= ~GICR_TYPER_VLPIS; + redist->gicr_typer &= ~GICR_TYPER_PLPIS; + + if (last_vcpu) + /* Mark the last Redistributor */ + redist->gicr_typer |= GICR_TYPER_LAST; + + /* + * Configure the Redistributor Control Register. 
+ * + * ~GICR_CTLR_LPI_ENABLE: LPIs are disabled + */ + redist->gicr_ctlr = 0 & ~GICR_CTLR_LPI_ENABLE; + + mtx_init(&cpu_if->lr_mtx, "VGICv3 ICH_LR_EL2 lock", NULL, MTX_SPIN); + + /* + * Configure the Interrupt Controller Hyp Control Register. + * + * ICH_HCR_EL2_En: enable virtual CPU interface. + * + * Maintenance interrupts are disabled. + */ + cpu_if->ich_hcr_el2 = ICH_HCR_EL2_En; + + /* + * Configure the Interrupt Controller Virtual Machine Control Register. + * + * ICH_VMCR_EL2_VPMR: lowest priority mask for the VCPU interface + * ICH_VMCR_EL2_VBPR1_NO_PREEMPTION: disable interrupt preemption for + * Group 1 interrupts + * ICH_VMCR_EL2_VBPR0_NO_PREEMPTION: disable interrupt preemption for + * Group 0 interrupts + * ~ICH_VMCR_EL2_VEOIM: writes to EOI registers perform priority drop + * and interrupt deactivation. + * ICH_VMCR_EL2_VENG0: virtual Group 0 interrupts enabled. + * ICH_VMCR_EL2_VENG1: virtual Group 1 interrupts enabled. + */ + cpu_if->ich_vmcr_el2 = \ + (virt_features.min_prio << ICH_VMCR_EL2_VPMR_SHIFT) | \ + ICH_VMCR_EL2_VBPR1_NO_PREEMPTION | ICH_VMCR_EL2_VBPR0_NO_PREEMPTION; + cpu_if->ich_vmcr_el2 &= ~ICH_VMCR_EL2_VEOIM; + cpu_if->ich_vmcr_el2 |= ICH_VMCR_EL2_VENG0 | ICH_VMCR_EL2_VENG1; + + cpu_if->ich_lr_num = virt_features.ich_lr_num; + for (i = 0; i < cpu_if->ich_lr_num; i++) + cpu_if->ich_lr_el2[i] = 0UL; + + cpu_if->ich_ap0r_num = virt_features.ich_ap0r_num; + cpu_if->ich_ap1r_num = virt_features.ich_ap1r_num; + + cpu_if->irqbuf = malloc(IRQBUF_SIZE_MIN * sizeof(*cpu_if->irqbuf), + M_VGIC_V3, M_WAITOK | M_ZERO); + cpu_if->irqbuf_size = IRQBUF_SIZE_MIN; + cpu_if->irqbuf_num = 0; +} + +void +vgic_v3_vminit(void *arg) +{ + struct hyp *hyp = arg; + struct vgic_v3_dist *dist = &hyp->vgic_dist; + + /* + * Configure the Distributor control register. The register resets to an + * architecturally UNKNOWN value, so we reset to 0 to disable all + * functionality controlled by the register. 
+ * + * The exception is GICD_CTLR.DS, which is RA0/WI when the Distributor + * supports one security state (ARM GIC Architecture Specification for + * GICv3 and GICv4, p. 4-464) + */ + dist->gicd_ctlr = GICD_CTLR_DS; + + dist->gicd_typer = ro_regs.gicd_typer; + dist->nirqs = GICD_TYPER_I_NUM(dist->gicd_typer); + dist->gicd_pidr2 = ro_regs.gicd_pidr2; + + mtx_init(&dist->dist_mtx, "VGICv3 Distributor lock", NULL, MTX_SPIN); +} + +int +vgic_v3_attach_to_vm(void *arg, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size) +{ + struct hyp *hyp = arg; + struct vgic_v3_dist *dist = &hyp->vgic_dist; + struct vgic_v3_redist *redist; + int i; + + /* Set the distributor address and size for trapping guest access. */ + dist->start = dist_start; + dist->end = dist_start + dist_size; + + for (i = 0; i < VM_MAXCPU; i++) { + redist = &hyp->ctx[i].vgic_redist; + /* Set the redistributor address and size. */ + redist->start = redist_start; + redist->end = redist_start + redist_size; + } + vgic_v3_mmio_init(hyp); + + hyp->vgic_attached = true; + + return (0); +} + +void +vgic_v3_detach_from_vm(void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vgic_v3_cpu_if *cpu_if; + int i; + + hyp = arg; + + for (i = 0; i < VM_MAXCPU; i++) { + hypctx = & hyp->ctx[i]; + cpu_if = &hypctx->vgic_cpu_if; + free(cpu_if->irqbuf, M_VGIC_V3); + } + + vgic_v3_mmio_destroy(hyp); +} + +int +vgic_v3_vcpu_pending_irq(void *arg) +{ + struct hypctx *hypctx = arg; + struct vgic_v3_cpu_if *cpu_if = &hypctx->vgic_cpu_if; + + return (cpu_if->irqbuf_num); +} + +/* Removes ALL instances of interrupt 'irq' */ +static int +vgic_v3_irqbuf_remove_nolock(uint32_t irq, struct vgic_v3_cpu_if *cpu_if) +{ + size_t dest = 0; + size_t from = cpu_if->irqbuf_num; + + while (dest < cpu_if->irqbuf_num) { + if (cpu_if->irqbuf[dest].irq == irq) { + for (from = dest + 1; from < cpu_if->irqbuf_num; from++) { + if (cpu_if->irqbuf[from].irq == irq) + continue; + cpu_if->irqbuf[dest++] = 
cpu_if->irqbuf[from];
+			}
+			cpu_if->irqbuf_num = dest;
+		} else {
+			dest++;
+		}
+	}
+
+	return (from - dest);
+}
+
+/*
+ * Drop interrupt 'irq' from the vCPU described by 'arg' (a struct hypctx):
+ * clear the state bits of every list register that holds it and is not
+ * active (or regardless of state when 'ignore_state' is true), then remove
+ * all buffered instances. Returns 1 if 'irq' is out of range, 0 otherwise.
+ */
+int
+vgic_v3_remove_irq(void *arg, uint32_t irq, bool ignore_state)
+{
+	struct hypctx *hypctx = arg;
+	struct vgic_v3_cpu_if *cpu_if = &hypctx->vgic_cpu_if;
+	struct vgic_v3_dist *dist = &hypctx->hyp->vgic_dist;
+	size_t i;
+
+	if (irq >= dist->nirqs) {
+		eprintf("Malformed IRQ %u.\n", irq);
+		return (1);
+	}
+
+	mtx_lock_spin(&cpu_if->lr_mtx);
+
+	for (i = 0; i < cpu_if->ich_lr_num; i++) {
+		if (ICH_LR_EL2_VINTID(cpu_if->ich_lr_el2[i]) == irq &&
+		    (lr_not_active(cpu_if->ich_lr_el2[i]) || ignore_state))
+			lr_clear_irq(cpu_if->ich_lr_el2[i]);
+	}
+	vgic_v3_irqbuf_remove_nolock(irq, cpu_if);
+
+	mtx_unlock_spin(&cpu_if->lr_mtx);
+
+	return (0);
+}
+
+/*
+ * Reserve the next free slot in the buffered-interrupt array and return a
+ * pointer to it. When the array is full its capacity is doubled first.
+ *
+ * Returns NULL, leaving the buffer untouched, when doubling would exceed
+ * IRQBUF_SIZE_MAX or when the allocation fails.
+ */
+static struct vgic_v3_irq *
+vgic_v3_irqbuf_add_nolock(struct vgic_v3_cpu_if *cpu_if)
+{
+	struct vgic_v3_irq *new_irqbuf, *old_irqbuf;
+	size_t new_size;
+
+	if (cpu_if->irqbuf_num == cpu_if->irqbuf_size) {
+		/* Double the size of the buffered interrupts list */
+		new_size = cpu_if->irqbuf_size << 1;
+		if (new_size > IRQBUF_SIZE_MAX)
+			return (NULL);
+
+		/*
+		 * Make a single non-sleeping attempt and report failure to the
+		 * caller; retrying M_NOWAIT in a loop can spin forever when
+		 * memory is short.
+		 *
+		 * NOTE(review): the _nolock suffix suggests the caller holds
+		 * lr_mtx, a spin mutex. Confirm whether malloc(9) may be
+		 * called at all in that context; growing the buffer outside
+		 * the lock may be required.
+		 */
+		new_irqbuf = malloc(new_size * sizeof(*cpu_if->irqbuf),
+		    M_VGIC_V3, M_NOWAIT | M_ZERO);
+		if (new_irqbuf == NULL)
+			return (NULL);
+		memcpy(new_irqbuf, cpu_if->irqbuf,
+		    cpu_if->irqbuf_size * sizeof(*cpu_if->irqbuf));
+
+		old_irqbuf = cpu_if->irqbuf;
+		cpu_if->irqbuf = new_irqbuf;
+		cpu_if->irqbuf_size = new_size;
+		free(old_irqbuf, M_VGIC_V3);
+	}
+
+	cpu_if->irqbuf_num++;
+
+	return (&cpu_if->irqbuf[cpu_if->irqbuf_num - 1]);
+}
+
+/*
+ * Report whether interrupt 'irq' is routed to the vCPU described by 'hypctx'.
+ * Private interrupts (irq <= GIC_LAST_PPI) always are.
+ */
+static bool
+vgic_v3_int_target(uint32_t irq, struct hypctx *hypctx)
+{
+	struct vgic_v3_dist *dist = &hypctx->hyp->vgic_dist;
+	struct vgic_v3_redist *redist = &hypctx->vgic_redist;
+	uint64_t irouter;
+	uint64_t aff;
+	uint32_t irq_off, irq_mask;
+	int n;
+
+	if (irq <= GIC_LAST_PPI)
+		return (true);
+
+	/* XXX Affinity routing disabled not implemented */
+	if 
(!aff_routing_en(dist)) + return (true); + + irq_off = irq % 32; + irq_mask = 1 << irq_off; + n = irq / 32; + + irouter = dist->gicd_irouter[irq]; + /* Check if 1-of-N routing is active */ + if (irouter & GICD_IROUTER_IRM) + /* Check if the VCPU is participating */ + return (redist->gicr_ctlr & GICR_CTLR_DPG1NS ? true : false); + + aff = redist->gicr_typer >> GICR_TYPER_AFF_SHIFT; + /* Affinity in format for comparison with irouter */ + aff = GICR_TYPER_AFF0(redist->gicr_typer) | \ + (GICR_TYPER_AFF1(redist->gicr_typer) << 8) | \ + (GICR_TYPER_AFF2(redist->gicr_typer) << 16) | \ + (GICR_TYPER_AFF3(redist->gicr_typer) << 32); + if ((irouter & aff) == aff) + return (true); + else + return (false); +} + +static uint8_t +vgic_v3_get_priority(uint32_t irq, struct hypctx *hypctx) +{ + struct vgic_v3_dist *dist = &hypctx->hyp->vgic_dist; + struct vgic_v3_redist *redist = &hypctx->vgic_redist; + size_t n; + uint32_t off, mask; + uint8_t priority; + + n = irq / 4; + off = n % 4; + mask = 0xff << off; + /* + * When affinity routing is enabled, the Redistributor is used for + * SGIs and PPIs and the Distributor for SPIs. When affinity routing + * is not enabled, the Distributor registers are used for all + * interrupts. 
+ */ + if (aff_routing_en(dist) && (n <= 7)) + priority = (redist->gicr_ipriorityr[n] & mask) >> off; + else + priority = (dist->gicd_ipriorityr[n] & mask) >> off; + + return (priority); +} + +static bool +vgic_v3_intid_enabled(uint32_t irq, struct hypctx *hypctx) +{ + struct vgic_v3_dist *dist; + struct vgic_v3_redist *redist; + uint32_t irq_off, irq_mask; + int n; + + irq_off = irq % 32; + irq_mask = 1 << irq_off; + n = irq / 32; + + if (irq <= GIC_LAST_PPI) { + redist = &hypctx->vgic_redist; + if (!(redist->gicr_ixenabler0 & irq_mask)) + return (false); + } else { + dist = &hypctx->hyp->vgic_dist; + if (!(dist->gicd_ixenabler[n] & irq_mask)) + return (false); + } + + return (true); +} + +static inline bool +dist_group_enabled(struct vgic_v3_dist *dist) +{ + return ((dist->gicd_ctlr & GICD_CTLR_G1A) != 0); +} + +int +vgic_v3_inject_irq(void *arg, uint32_t irq, enum vgic_v3_irqtype irqtype) +{ + struct hypctx *hypctx = arg; + struct vgic_v3_dist *dist = &hypctx->hyp->vgic_dist; + struct vgic_v3_cpu_if *cpu_if = &hypctx->vgic_cpu_if; + struct vgic_v3_irq *vip; + int error; + int i; + uint8_t priority; + bool enabled; + + KASSERT(irq > GIC_LAST_SGI, ("SGI interrupts not implemented")); + + if (irq >= dist->nirqs || irqtype >= VGIC_IRQ_INVALID) { + eprintf("Malformed IRQ %u.\n", irq); + return (1); + } + + error = 0; + mtx_lock_spin(&dist->dist_mtx); + + enabled = dist_group_enabled(&hypctx->hyp->vgic_dist) && + vgic_v3_intid_enabled(irq, hypctx) && + vgic_v3_int_target(irq, hypctx); + priority = vgic_v3_get_priority(irq, hypctx); + + mtx_lock_spin(&cpu_if->lr_mtx); + + /* + * If the guest is running behind timer interrupts, don't swamp it with + * one interrupt after another. However, if the timer interrupt is being + * serviced by the guest (it is in a state other than pending, either + * active or pending and active), then add it to the buffer to be + * injected later. 
Otherwise, the timer would stop working because we + * disable the timer in the host interrupt handler. + */ + if (irqtype == VGIC_IRQ_CLK) { + for (i = 0; i < cpu_if->ich_lr_num; i++) + if (ICH_LR_EL2_VINTID(cpu_if->ich_lr_el2[i]) == irq && + lr_pending(cpu_if->ich_lr_el2[i])) + goto out; + for (i = 0; i < cpu_if->irqbuf_num; i++) + if (cpu_if->irqbuf[i].irq == irq) + goto out; + } + + vip = vgic_v3_irqbuf_add_nolock(cpu_if); + if (!vip) { + eprintf("Error adding IRQ %u to the IRQ buffer.\n", irq); + error = 1; + goto out; + } + vip->irq = irq; + vip->irqtype = irqtype; + vip->enabled = enabled; + vip->priority = priority; + +out: + mtx_unlock_spin(&cpu_if->lr_mtx); + mtx_unlock_spin(&dist->dist_mtx); + + return (error); +} + +void +vgic_v3_group_toggle_enabled(bool enabled, struct hyp *hyp) +{ + struct hypctx *hypctx; + struct vgic_v3_cpu_if *cpu_if; + struct vgic_v3_irq *vip; + int i, j; + + for (i = 0; i < VM_MAXCPU; i++) { + hypctx = &hyp->ctx[i]; + cpu_if = &hypctx->vgic_cpu_if; + + mtx_lock_spin(&cpu_if->lr_mtx); + + for (j = 0; j < cpu_if->irqbuf_num; j++) { + vip = &cpu_if->irqbuf[j]; + if (!enabled) + vip->enabled = 0; + else if (vgic_v3_intid_enabled(vip->irq, hypctx)) + vip->enabled = 1; + } + + mtx_unlock_spin(&cpu_if->lr_mtx); + } +} + +static int +vgic_v3_irq_toggle_enabled_vcpu(uint32_t irq, bool enabled, + struct vgic_v3_cpu_if *cpu_if) +{ + int i; + + mtx_lock_spin(&cpu_if->lr_mtx); + + if (enabled) { + /* + * Enable IRQs that were injected when the interrupt ID was + * disabled + */ + for (i = 0; i < cpu_if->irqbuf_num; i++) + if (cpu_if->irqbuf[i].irq == irq) + cpu_if->irqbuf[i].enabled = true; + } else { + /* Remove the disabled IRQ from the LR regs if it is pending */ + for (i = 0; i < cpu_if->ich_lr_num; i++) + if (lr_pending(cpu_if->ich_lr_el2[i]) && + ICH_LR_EL2_VINTID(cpu_if->ich_lr_el2[i]) == irq) + lr_clear_irq(cpu_if->ich_lr_el2[i]); + + /* Remove the IRQ from the interrupt buffer */ + vgic_v3_irqbuf_remove_nolock(irq, cpu_if); + } + + 
mtx_unlock_spin(&cpu_if->lr_mtx); + + return (0); +} + +int +vgic_v3_irq_toggle_enabled(uint32_t irq, bool enabled, + struct hyp *hyp, int vcpuid) +{ + struct vgic_v3_cpu_if *cpu_if; + int error; + int i; + + if (irq <= GIC_LAST_PPI) { + cpu_if = &hyp->ctx[vcpuid].vgic_cpu_if; + return (vgic_v3_irq_toggle_enabled_vcpu(irq, enabled, cpu_if)); + } else { + /* TODO: Update irqbuf for all VCPUs, not just VCPU 0 */ + for (i = 0; i < 1; i++) { + cpu_if = &hyp->ctx[i].vgic_cpu_if; + error = vgic_v3_irq_toggle_enabled_vcpu(irq, enabled, cpu_if); + if (error) + return (error); + } + } + + return (0); +} + +static int +irqbuf_highest_priority(struct vgic_v3_cpu_if *cpu_if, int start, int end, + struct hypctx *hypctx) +{ + uint32_t irq; + int i, max_idx; + uint8_t priority, max_priority; + uint8_t vpmr; + + vpmr = (cpu_if->ich_vmcr_el2 & ICH_VMCR_EL2_VPMR_MASK) >> \ + ICH_VMCR_EL2_VPMR_SHIFT; + + max_idx = -1; + max_priority = 0xff; + for (i = start; i < end; i++) { + irq = cpu_if->irqbuf[i].irq; + /* Check that the interrupt hasn't been already scheduled */ + if (irq == IRQ_SCHEDULED) + continue; + + if (!dist_group_enabled(&hypctx->hyp->vgic_dist)) + continue; + if (!vgic_v3_int_target(irq, hypctx)) + continue; + + priority = cpu_if->irqbuf[i].priority; + if (priority >= vpmr) + continue; + + if (max_idx == -1) { + max_idx = i; + max_priority = priority; + } else if (priority > max_priority) { + max_idx = i; + max_priority = priority; + } else if (priority == max_priority && + cpu_if->irqbuf[i].irqtype < cpu_if->irqbuf[max_idx].irqtype) { + max_idx = i; + max_priority = priority; + } + } + + return (max_idx); +} + +static inline bool +cpu_if_group_enabled(struct vgic_v3_cpu_if *cpu_if) +{ + return ((cpu_if->ich_vmcr_el2 & ICH_VMCR_EL2_VENG1) != 0); +} + +static inline int +irqbuf_next_enabled(struct vgic_v3_irq *irqbuf, int start, int end, + struct hypctx *hypctx, struct vgic_v3_cpu_if *cpu_if) +{ + int i; + + if (!cpu_if_group_enabled(cpu_if)) + return (-1); + + for (i = 
/*
 * Scan the empty-List-Register status bitmap for the first set bit whose
 * position lies in [start, end).
 *
 * Returns the position of that bit, or -1 if no bit in the range is set.
 */
static inline int
vgic_v3_lr_next_empty(uint32_t ich_elrsr_el2, int start, int end)
{
	int lr;

	for (lr = start; lr < end; lr++) {
		if (((ich_elrsr_el2 >> lr) & 1U) != 0)
			return (lr);
	}

	return (-1);
}
+ */ +static bool +clk_irq_in_lr(struct vgic_v3_cpu_if *cpu_if) +{ + uint64_t lr; + int i; + + for (i = 0; i < cpu_if->ich_lr_num; i++) { + lr = cpu_if->ich_lr_el2[i]; + if (ICH_LR_EL2_VINTID(lr) == GT_VIRT_IRQ && + (lr_active(lr) || lr_pending_active(lr))) + return (true); + } + + return (false); +} + +static void +vgic_v3_irqbuf_to_lr(struct hypctx *hypctx, struct vgic_v3_cpu_if *cpu_if, + bool by_priority) +{ + struct vgic_v3_irq *vip; + int irqbuf_idx; + int lr_idx; + bool clk_present; + + clk_present = clk_irq_in_lr(cpu_if); + + irqbuf_idx = 0; + lr_idx = 0; + for (;;) { + if (by_priority) + irqbuf_idx = irqbuf_highest_priority(cpu_if, + irqbuf_idx, cpu_if->irqbuf_num, hypctx); + else + irqbuf_idx = irqbuf_next_enabled(cpu_if->irqbuf, + irqbuf_idx, cpu_if->irqbuf_num, hypctx, cpu_if); + if (irqbuf_idx == -1) + break; + + lr_idx = vgic_v3_lr_next_empty(cpu_if->ich_elrsr_el2, + lr_idx, cpu_if->ich_lr_num); + if (lr_idx == -1) + break; + + vip = &cpu_if->irqbuf[irqbuf_idx]; + if (vip->irqtype == VGIC_IRQ_CLK && clk_present) { + /* Skip injecting timer interrupt. */ + irqbuf_idx++; + continue; + } + + vip_to_lr(vip, cpu_if->ich_lr_el2[lr_idx]); + vip->irq = IRQ_SCHEDULED; + irqbuf_idx++; + lr_idx++; + } + + /* Remove all interrupts that were just scheduled. */ + vgic_v3_irqbuf_remove_nolock(IRQ_SCHEDULED, cpu_if); +} + +void +vgic_v3_sync_hwstate(void *arg) +{ + struct hypctx *hypctx; + struct vgic_v3_cpu_if *cpu_if; + int lr_free; + int i; + bool by_priority; + bool en_underflow_intr; + + hypctx = arg; + cpu_if = &hypctx->vgic_cpu_if; + + /* + * All Distributor writes have been executed at this point, do not + * protect Distributor reads with a mutex. + * + * This is callled with all interrupts disabled, so there is no need for + * a List Register spinlock either. 
+ */ + mtx_lock_spin(&cpu_if->lr_mtx); + + /* Exit early if there are no buffered interrupts */ + if (cpu_if->irqbuf_num == 0) { + cpu_if->ich_hcr_el2 &= ~ICH_HCR_EL2_UIE; + goto out; + } + + /* Test if all buffered interrupts can fit in the LR regs */ + lr_free = 0; + for (i = 0; i < cpu_if->ich_lr_num; i++) + if (cpu_if->ich_elrsr_el2 & (1U << i)) + lr_free++; + + by_priority = (lr_free <= cpu_if->ich_lr_num); + vgic_v3_irqbuf_to_lr(hypctx, cpu_if, by_priority); + + lr_free = 0; + for (i = 0; i < cpu_if->ich_lr_num; i++) + if (cpu_if->ich_elrsr_el2 & (1U << i)) + lr_free++; + + en_underflow_intr = false; + if (cpu_if->irqbuf_num > 0) + for (i = 0; i < cpu_if->irqbuf_num; i++) + if (cpu_if->irqbuf[i].irqtype != VGIC_IRQ_CLK) { + en_underflow_intr = true; + break; + } + if (en_underflow_intr) { + cpu_if->ich_hcr_el2 |= ICH_HCR_EL2_UIE; + } else { + cpu_if->ich_hcr_el2 &= ~ICH_HCR_EL2_UIE; + } + +out: + mtx_unlock_spin(&cpu_if->lr_mtx); +} + +static void +vgic_v3_get_ro_regs() +{ + /* GICD_ICFGR0 configures SGIs and it is read-only. */ + ro_regs.gicd_icfgr0 = gic_d_read(gic_sc, 4, GICD_ICFGR(0)); + + /* + * Configure the GIC type register for the guest. + * + * ~GICD_TYPER_SECURITYEXTN: disable security extensions. + * ~GICD_TYPER_DVIS: direct injection for virtual LPIs not supported. + * ~GICD_TYPER_LPIS: LPIs not supported. + */ + ro_regs.gicd_typer = gic_d_read(gic_sc, 4, GICD_TYPER); + ro_regs.gicd_typer &= ~GICD_TYPER_SECURITYEXTN; + ro_regs.gicd_typer &= ~GICD_TYPER_DVIS; + ro_regs.gicd_typer &= ~GICD_TYPER_LPIS; + + /* + * XXX. Guest reads of GICD_PIDR2 should return the same ArchRev as + * specified in the guest FDT. 
+ */ + ro_regs.gicd_pidr2 = gic_d_read(gic_sc, 4, GICD_PIDR2); +} + +void +vgic_v3_init(uint64_t ich_vtr_el2) { + uint32_t pribits, prebits; + + KASSERT(gic_sc != NULL, ("GIC softc is NULL")); + + vgic_v3_get_ro_regs(); + + pribits = ICH_VTR_EL2_PRIBITS(ich_vtr_el2); + switch (pribits) { + case 5: + virt_features.min_prio = 0xf8; + case 6: + virt_features.min_prio = 0xfc; + case 7: + virt_features.min_prio = 0xfe; + case 8: + virt_features.min_prio = 0xff; + } + + prebits = ICH_VTR_EL2_PREBITS(ich_vtr_el2); + switch (prebits) { + case 5: + virt_features.ich_ap0r_num = 1; + virt_features.ich_ap1r_num = 1; + case 6: + virt_features.ich_ap0r_num = 2; + virt_features.ich_ap1r_num = 2; + case 7: + virt_features.ich_ap0r_num = 4; + virt_features.ich_ap1r_num = 4; + } + + virt_features.ich_lr_num = ICH_VTR_EL2_LISTREGS(ich_vtr_el2); +} + +static int +vgic_v3_maint_intr(void *arg) +{ + printf("MAINTENANCE INTERRUPT\n"); + + return (FILTER_HANDLED); +} + +/* + * TODO: Look at how gic_v3_fdt.c adds the gic driver. + * + * 1. In probe they set the device description. + * 2. In attach they create children devices for the GIC (in + * gic_v3_ofw_bus_attach). + * 3. There is no identify function being called. + * + * On the other hand, in man 9 DEVICE_IDENTIFY it is stated that a new device + * instance is created by the identify function. 
/*
 * Copy the Redistributor field 'src' of VCPU 'vcpuid' into '*destp'.
 *
 * No trailing semicolon after "while (0)": the caller supplies it, which
 * keeps the macro usable as the body of an unbraced if/else.
 */
#define	redist_simple_read(src, destp, vm, vcpuid)			\
do {									\
	struct hyp *hyp = vm_get_cookie(vm);				\
	struct vgic_v3_redist *redist = &hyp->ctx[vcpuid].vgic_redist;	\
	*(destp) = redist->src;						\
} while (0)

/* Store 'src' into the Redistributor field 'dest' of VCPU 'vcpuid'. */
#define	redist_simple_write(src, dest, vm, vcpuid)			\
do {									\
	struct hyp *hyp = vm_get_cookie(vm);				\
	struct vgic_v3_redist *redist = &hyp->ctx[vcpuid].vgic_redist;	\
	redist->dest = (src);						\
} while (0)
+ */ +#define FIRST_REDIST_MMIO_REGION VGIC_GICR_CTLR + +MALLOC_DEFINE(M_VGIC_V3_MMIO, "ARM VMM VGIC DIST MMIO", "ARM VMM VGIC DIST MMIO"); + +static int +dist_ctlr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_ctlr; + mtx_unlock_spin(&dist->dist_mtx); + + /* Writes are never pending */ + *rval &= ~GICD_CTLR_RWP; + + *retu = false; + return (0); +} + +static int +dist_ctlr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + /* GICD_CTLR.DS is RAO/WI when only one security state is supported. */ + wval |= GICD_CTLR_DS; + + mtx_lock_spin(&dist->dist_mtx); + + if (reg_changed(wval, dist->gicd_ctlr, GICD_CTLR_G1A)) { + if (!(wval & GICD_CTLR_G1A)) + vgic_v3_group_toggle_enabled(false, hyp); + else + vgic_v3_group_toggle_enabled(true, hyp); + } + dist->gicd_ctlr = wval; + + mtx_unlock_spin(&dist->dist_mtx); + + *retu = false; + return (0); +} + +static int +dist_typer_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + *rval = dist->gicd_typer; + + *retu = false; + return (0); +} + +static int +dist_typer_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + eprintf("Warning: Attempted write to read-only register GICD_TYPER.\n"); + + *retu = false; + return (0); +} + +/* Only group 1 interrupts are supported. Treat IGROUPR as RA0/WI. 
/*
 * Compare the old and new value of a 32-bit enable register and toggle every
 * interrupt whose enable bit changed.
 *
 * 'irq' is the interrupt number that corresponds to bit 0 of the register.
 * A failure to toggle an interrupt is only reported, not propagated.
 */
static void
mmio_update_int_enabled(uint32_t new_ixenabler, uint32_t old_ixenabler,
    uint32_t irq, struct hyp *hyp, int vcpuid)
{
	uint32_t changed, bit;
	unsigned int pos;

	changed = new_ixenabler ^ old_ixenabler;
	for (pos = 0; pos < 32; pos++) {
		bit = (uint32_t)1 << pos;
		if ((changed & bit) == 0)
			continue;

		if (vgic_v3_irq_toggle_enabled(irq + pos,
		    (new_ixenabler & bit) != 0, hyp, vcpuid) != 0)
			eprintf("Warning: error while toggling IRQ %u\n",
			    irq + pos);
	}
}
make the field RAZ/WI in this + * case". + */ + if (n == 0 && aff_routing_en(dist)) { + *rval = RES0; + goto out; + } + + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_ixenabler[n]; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_ixenabler_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + void *arg, enum vgic_mmio_region_name name) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + uint32_t old_ixenabler; + size_t n; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[name]); + /* See dist_ixenabler_read() */ + if (n == 0 && aff_routing_en(dist)) + /* Ignore writes */ + goto out; + + mtx_lock_spin(&dist->dist_mtx); + + old_ixenabler = dist->gicd_ixenabler[n]; + if (name == VGIC_GICD_ICENABLER) + dist->gicd_ixenabler[n] &= ~wval; + else + dist->gicd_ixenabler[n] |= wval; + mmio_update_int_enabled(dist->gicd_ixenabler[n], old_ixenabler, n * 32, + hyp, vcpuid); + + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_isenabler_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + return (dist_ixenabler_read(vm, vcpuid, fault_ipa, rval, arg, + VGIC_GICD_ISENABLER)); +} + +static int +dist_isenabler_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + return (dist_ixenabler_write(vm, vcpuid, fault_ipa, wval, arg, + VGIC_GICD_ISENABLER)); +} + +static int +dist_icenabler_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + return (dist_ixenabler_read(vm, vcpuid, fault_ipa, rval, arg, + VGIC_GICD_ICENABLER)); +} + +static int +dist_icenabler_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + return (dist_ixenabler_write(vm, vcpuid, fault_ipa, wval, arg, + VGIC_GICD_ICENABLER)); +} + +/* XXX: Registers are byte accessible. 
*/ +static int +dist_ipriorityr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + size_t n; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_IPRIORITYR]); + /* + * GIC Architecture specification, p 8-483: when affinity + * routing is enabled, GICD_IPRIORITYR is RAZ/WI for + * n = 0 to 7. + */ + if (aff_routing_en(dist) && n <= 7) { + *rval = RES0; + goto out; + } + + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_ipriorityr[n]; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); + +} + +static int +dist_ipriorityr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + size_t n; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_IPRIORITYR]); + /* See dist_ipriorityr_read() */ + if (aff_routing_en(dist) && n <= 7) + /* Ignore writes */ + goto out; + + mtx_lock_spin(&dist->dist_mtx); + dist->gicd_ipriorityr[n] = wval; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_icfgr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + size_t n; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_ICFGR]); + /* + * ARM GIC Architecture Specification, p 8-472: "For SGIs, + * Int_config fields are RO, meaning that GICD_ICFGR0 is RO." 
+ */ + if (n == 0) { + *rval = RES0; + goto out; + } + + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_icfgr[n]; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); + +} + +static int +dist_icfgr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + size_t n; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_ICFGR]); + if (n == 0) + /* Ignore writes */ + goto out; + + mtx_lock_spin(&dist->dist_mtx); + dist->gicd_icfgr[n] = wval; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_irouter_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + size_t n; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg64_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_IROUTER]); + /* GIC Architecture Manual, p 8-485: registers 0 to 31 are reserved */ + if (n <= 31) { + eprintf("Warning: Read from register GICD_IROUTER%zu\n", n); + *rval = RES0; + goto out; + } + + /* + * GIC Architecture Manual, p 8-485: when affinity routing is not + * enabled, the registers are RAZ/WI. 
+ */ + if (!aff_routing_en(dist)) { + *rval = RES0; + goto out; + } + + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_irouter[n]; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_irouter_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + size_t n; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + n = reg64_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICD_IROUTER]); + if (n <= 31) { + eprintf("Warning: Write to register GICD_IROUTER%zu\n", n); + goto out; + } + + /* See dist_irouter_read() */ + if (!aff_routing_en(dist)) + /* Ignore writes */ + goto out; + + mtx_lock_spin(&dist->dist_mtx); + dist->gicd_irouter[n] = wval; + mtx_unlock_spin(&dist->dist_mtx); + +out: + *retu = false; + return (0); +} + +static int +dist_pidr2_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + *rval = dist->gicd_pidr2; + + *retu = false; + return (0); +} + +static int +dist_pidr2_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + eprintf("Warning: Attempted write to read-only register GICD_PIDR2.\n"); + + *retu = false; + return (0); +} + +static int +redist_ctlr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + redist_simple_read(gicr_ctlr, rval, vm, vcpuid); + /* Writes are never pending */ + *rval &= ~GICR_CTLR_RWP & ~GICR_CTLR_UWP; + +#if (DEBUG > 0) + eprintf("\n"); +#endif + + *retu = false; + return (0); +} + +static int +redist_ctlr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + redist_simple_write(wval, gicr_ctlr, vm, vcpuid); + +#if (DEBUG > 0) + 
/*
 * Emulate a guest write to GICR_WAKER.  The written value is discarded; the
 * handler only reports that no return to userland is needed.
 */
static int
redist_waker_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval,
    int size, void *arg)
{
	bool *need_userland = arg;

#if (DEBUG > 0)
	eprintf("\n");
#endif

	/* Ignore writes */
	*need_userland = false;

	return (0);
}
*/ +static int +redist_igroupr0_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + if (wval == 0UL) + printf("Warning: Interrupts marked as group 0, ignoring\n"); + + *retu = false; + return (0); +} + +static int +redist_ixenabler_read(void *vm, int vcpuid, uint64_t *rval, void *arg, + enum vgic_mmio_region_name reg) +{ + struct hyp *hyp; + struct vgic_v3_redist *redist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + redist = &hyp->ctx[vcpuid].vgic_redist; + + *rval = redist->gicr_ixenabler0; + + *retu = false; + return (0); +} + +static int +redist_ixenabler_write(void *vm, int vcpuid, uint64_t wval, void *arg, + enum vgic_mmio_region_name reg) +{ + struct hyp *hyp; + struct vgic_v3_redist *redist; + uint32_t old_ixenabler0, new_ixenabler0; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + redist = &hyp->ctx[vcpuid].vgic_redist; + + old_ixenabler0 = redist->gicr_ixenabler0; + if (reg == VGIC_GICR_ICENABLER0) + new_ixenabler0 = old_ixenabler0 & ~wval; + else + new_ixenabler0 = old_ixenabler0 | wval; + mmio_update_int_enabled(new_ixenabler0, old_ixenabler0, 0, hyp, vcpuid); + redist->gicr_ixenabler0 = new_ixenabler0; + + *retu = false; + return (0); +} + + +static int +redist_isenabler0_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ +#if (DEBUG > 0) + eprintf("\n"); +#endif + return (redist_ixenabler_read(vm, vcpuid, rval, arg, + VGIC_GICR_ISENABLER0)); +} + +static int +redist_isenabler0_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ +#if (DEBUG > 0) + eprintf("\n"); +#endif + return (redist_ixenabler_write(vm, vcpuid, wval, arg, + VGIC_GICR_ISENABLER0)); +} + +static int +redist_icenabler0_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ +#if (DEBUG > 0) + eprintf("\n"); +#endif + return (redist_ixenabler_read(vm, vcpuid, rval, arg, + VGIC_GICR_ICENABLER0)); +} + +static int 
+redist_icenabler0_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ +#if (DEBUG > 0) + eprintf("\n"); +#endif + return (redist_ixenabler_write(vm, vcpuid, wval, arg, + VGIC_GICR_ICENABLER0)); +} + +static int +redist_ipriorityr_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_redist *redist; + size_t n; + bool *retu = arg; + +#if (DEBUG > 0) + eprintf("\n"); +#endif + + hyp = vm_get_cookie(vm); + redist = &hyp->ctx[vcpuid].vgic_redist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICR_IPRIORITYR]); + *rval = redist->gicr_ipriorityr[n]; + + *retu = false; + return (0); +} + +static int +redist_ipriorityr_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_redist *redist; + size_t n; + bool *retu = arg; + +#if (DEBUG > 0) + eprintf("\n"); +#endif + + hyp = vm_get_cookie(vm); + redist = &hyp->ctx[vcpuid].vgic_redist; + + n = reg32_idx(fault_ipa, hyp->vgic_mmio_regions[VGIC_GICR_IPRIORITYR]); + redist->gicr_ipriorityr[n] = wval; + + *retu = false; + return (0); +} + +static int +redist_pidr2_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct vgic_v3_dist *dist; + bool *retu = arg; + + hyp = vm_get_cookie(vm); + dist = &hyp->vgic_dist; + + /* GICR_PIDR2 has the same value as GICD_PIDR2 */ + *rval = dist->gicd_pidr2; +#if (DEBUG > 0) + eprintf("\n"); +#endif + + *retu = false; + return (0); +} + +static int +redist_pidr2_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + bool *retu = arg; + + eprintf("Warning: Attempted write to read-only register GICR_PIDR2.\n"); + + *retu = false; + return (0); +} + +static int +redist_icfgr0_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + bool *retu = arg; + + redist_simple_read(gicr_icfgr0, 
rval, vm, vcpuid);
+
+	*retu = false;
+	return (0);
+}
+
+/*
+ * MMIO write handler for GICR_ICFGR0: store 'wval' in the VCPU's gicr_icfgr0.
+ * Clears the 'retu' flag passed in 'arg' and always returns 0.
+ */
+static int
+redist_icfgr0_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval,
+    int size, void *arg)
+{
+	bool *retu = arg;
+
+	redist_simple_write(wval, gicr_icfgr0, vm, vcpuid);
+
+	*retu = false;
+	return (0);
+}
+
+/*
+ * MMIO read handler for GICR_ICFGR1: return the VCPU's gicr_icfgr1.
+ * Clears the 'retu' flag passed in 'arg' and always returns 0.
+ */
+static int
+redist_icfgr1_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval,
+    int size, void *arg)
+{
+	bool *retu = arg;
+
+	/* ICFGR1 has its own backing field; it must not alias ICFGR0. */
+	redist_simple_read(gicr_icfgr1, rval, vm, vcpuid);
+
+	*retu = false;
+	return (0);
+}
+
+/*
+ * MMIO write handler for GICR_ICFGR1: store 'wval' in the VCPU's gicr_icfgr1.
+ * Clears the 'retu' flag passed in 'arg' and always returns 0.
+ */
+static int
+redist_icfgr1_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval,
+    int size, void *arg)
+{
+	bool *retu = arg;
+
+	/* ICFGR1 has its own backing field; it must not alias ICFGR0. */
+	redist_simple_write(wval, gicr_icfgr1, vm, vcpuid);
+
+	*retu = false;
+	return (0);
+}
+
+/*
+ * Allocate a zeroed array of 'num' registers, store the pointer in 'regs'
+ * and the array size in bytes in 'size'. 'regs' and 'size' must be lvalues.
+ */
+#define alloc_registers(regs, num, size) \
+do { \
+	(size) = (num) * sizeof(*(regs)); \
+	(regs) = malloc((size), M_VGIC_V3_MMIO, M_WAITOK | M_ZERO); \
+} while (0)
+
+/* Integer division of 'n' by 'div', rounded up. */
+#define div_round_up(n, div) (((n) + (div) - 1) / (div))
+
+/*
+ * Fill in entry 'regidx' of hyp->vgic_mmio_regions: the region begins at
+ * 'start', its end is recorded as 'start + size', and accesses are
+ * dispatched to 'read_fn' and 'write_fn'.
+ */
+static inline void
+init_mmio_region(struct hyp *hyp, size_t regidx, vm_offset_t start,
+    size_t size, mem_region_read_t read_fn, mem_region_write_t write_fn)
+{
+	hyp->vgic_mmio_regions[regidx] = (struct vgic_mmio_region) {
+		.start = start,
+		.end = start + size,
+		.read = read_fn,
+		.write = write_fn,
+	};
+}
+
+static void
+dist_mmio_init_regions(struct vgic_v3_dist *dist, struct hyp *hyp)
+{
+	size_t n;
+	size_t region_size;
+
+	init_mmio_region(hyp, VGIC_GICD_CTLR, dist->start + GICD_CTLR,
+	    sizeof(dist->gicd_ctlr), dist_ctlr_read, dist_ctlr_write);
+	init_mmio_region(hyp, VGIC_GICD_TYPER, dist->start + GICD_TYPER,
+	    sizeof(dist->gicd_typer), dist_typer_read, dist_typer_write);
+
+	n = div_round_up(dist->nirqs, 32);
+	init_mmio_region(hyp, VGIC_GICD_IGROUPR, dist->start + GICD_IGROUPR_BASE,
+	    n * sizeof(uint32_t), dist_igroupr_read, dist_igroupr_write);
+
+	/* ARM GIC Architecture Specification, page 8-471. 
*/
+	n = (dist->gicd_typer & GICD_TYPER_ITLINESNUM_MASK) + 1;
+	alloc_registers(dist->gicd_ixenabler, n, region_size);
+	/* ISENABLER and ICENABLER are two views of the same backing array. */
+	init_mmio_region(hyp, VGIC_GICD_ISENABLER, dist->start + GICD_ISENABLER_BASE,
+	    region_size, dist_isenabler_read, dist_isenabler_write);
+	init_mmio_region(hyp, VGIC_GICD_ICENABLER, dist->start + GICD_ICENABLER_BASE,
+	    region_size, dist_icenabler_read, dist_icenabler_write);
+
+	/* ARM GIC Architecture Specification, page 8-483. */
+	n = 8 * ((dist->gicd_typer & GICD_TYPER_ITLINESNUM_MASK) + 1);
+	alloc_registers(dist->gicd_ipriorityr, n, region_size);
+	init_mmio_region(hyp, VGIC_GICD_IPRIORITYR, dist->start + GICD_IPRIORITYR_BASE,
+	    region_size, dist_ipriorityr_read, dist_ipriorityr_write);
+
+	n = div_round_up(dist->nirqs, 16);
+	alloc_registers(dist->gicd_icfgr, n, region_size);
+	init_mmio_region(hyp, VGIC_GICD_ICFGR, dist->start + GICD_ICFGR_BASE,
+	    region_size, dist_icfgr_read, dist_icfgr_write);
+
+	/*
+	 * ARM GIC Architecture Specification, page 8-485.
+	 *
+	 * The ITLinesNumber field must be masked out before adding one: '+'
+	 * binds tighter than '&', so without the inner parentheses the mask
+	 * itself would be incremented and the count would be wrong.
+	 */
+	n = 32 * ((dist->gicd_typer & GICD_TYPER_ITLINESNUM_MASK) + 1) - 1;
+	alloc_registers(dist->gicd_irouter, n, region_size);
+	init_mmio_region(hyp, VGIC_GICD_IROUTER, dist->start + GICD_IROUTER_BASE,
+	    region_size, dist_irouter_read, dist_irouter_write);
+
+	init_mmio_region(hyp, VGIC_GICD_PIDR2, dist->start + GICD_PIDR2,
+	    sizeof(dist->gicd_pidr2), dist_pidr2_read, dist_pidr2_write);
+}
+
+static void
+redist_mmio_init_regions(struct hyp *hyp, int vcpuid)
+{
+	struct vgic_v3_redist *redist;
+	vm_offset_t start;
+
+	redist = &hyp->ctx[vcpuid].vgic_redist;
+	start = redist->start + GICR_FRAME_RD + GICR_CTLR;
+	/*
+	hyp->vgic_mmio_regions[VGIC_GICR_CTLR] = (struct vgic_mmio_region) {
+		.start = start,
+		.end = start + sizeof(redist->gicr_ctlr),
+		.read = redist_ctlr_read,
+		.write = redist_ctlr_write,
+	};
+	*/
+	init_mmio_region(hyp, VGIC_GICR_CTLR, start, sizeof(redist->gicr_ctlr),
+	    redist_ctlr_read, redist_ctlr_write);
+
+	start = redist->start + GICR_FRAME_RD + GICR_TYPER;
+	
init_mmio_region(hyp, VGIC_GICR_TYPER, start, sizeof(redist->gicr_typer),
+	    redist_typer_read, redist_typer_write);
+
+	/* GICR_WAKER and GICR_PIDR2 are registered as 4-byte regions. */
+	start = redist->start + GICR_FRAME_RD + GICR_WAKER;
+	init_mmio_region(hyp, VGIC_GICR_WAKER, start, 4, redist_waker_read,
+	    redist_waker_write);
+
+	start = redist->start + GICR_FRAME_RD + GICR_PIDR2;
+	init_mmio_region(hyp, VGIC_GICR_PIDR2, start, 4, redist_pidr2_read,
+	    redist_pidr2_write);
+
+	/* The remaining registers are offsets into the GICR_FRAME_SGI frame. */
+	start = redist->start + GICR_FRAME_SGI + GICR_IGROUPR0;
+	init_mmio_region(hyp, VGIC_GICR_IGROUPR0, start,
+	    sizeof(uint32_t), redist_igroupr0_read, redist_igroupr0_write);
+
+	/* ISENABLER0 and ICENABLER0 are both sized after gicr_ixenabler0. */
+	start = redist->start + GICR_FRAME_SGI + GICR_ISENABLER0;
+	init_mmio_region(hyp, VGIC_GICR_ISENABLER0, start,
+	    sizeof(redist->gicr_ixenabler0), redist_isenabler0_read,
+	    redist_isenabler0_write);
+
+	start = redist->start + GICR_FRAME_SGI + GICR_ICENABLER0;
+	init_mmio_region(hyp, VGIC_GICR_ICENABLER0, start,
+	    sizeof(redist->gicr_ixenabler0), redist_icenabler0_read,
+	    redist_icenabler0_write);
+
+	start = redist->start + GICR_FRAME_SGI + GICR_IPRIORITYR_BASE;
+	init_mmio_region(hyp, VGIC_GICR_IPRIORITYR, start,
+	    sizeof(redist->gicr_ipriorityr), redist_ipriorityr_read,
+	    redist_ipriorityr_write);
+
+	start = redist->start + GICR_FRAME_SGI + GICR_ICFGR0_BASE;
+	init_mmio_region(hyp, VGIC_GICR_ICFGR0, start,
+	    sizeof(redist->gicr_icfgr0), redist_icfgr0_read, redist_icfgr0_write);
+
+	start = redist->start + GICR_FRAME_SGI + GICR_ICFGR1_BASE;
+	init_mmio_region(hyp, VGIC_GICR_ICFGR1, start,
+	    sizeof(redist->gicr_icfgr1), redist_icfgr1_read, redist_icfgr1_write);
+}
+
+void
+vgic_v3_mmio_init(struct hyp *hyp)
+{
+	struct vgic_v3_dist *dist = &hyp->vgic_dist;
+	int redist_region_num, dist_region_num, region_num;
+	/* Redistributor regions are only accounted for a single VCPU. */
+	int ncpus = 1;
+
+	/*
+	 * Region indices below FIRST_REDIST_MMIO_REGION are counted as
+	 * distributor regions; the rest, up to VGIC_MMIO_REGIONS_NUM, are
+	 * counted once per VCPU.
+	 */
+	dist_region_num = FIRST_REDIST_MMIO_REGION;
+	redist_region_num = \
+	    ncpus * (VGIC_MMIO_REGIONS_NUM - FIRST_REDIST_MMIO_REGION);
+	region_num = dist_region_num + redist_region_num;
+
+	hyp->vgic_mmio_regions = \
+	    malloc(region_num * 
sizeof(*hyp->vgic_mmio_regions),
+	    M_VGIC_V3_MMIO, M_WAITOK | M_ZERO);
+	hyp->vgic_mmio_regions_num = region_num;
+
+	dist_mmio_init_regions(dist, hyp);
+
+	/* TODO: Do it for all VCPUs */
+	redist_mmio_init_regions(hyp, 0);
+}
+
+/*
+ * Release the MMIO region table and the distributor register arrays.
+ *
+ * Does nothing when the region table is not allocated. Every pointer is
+ * cleared after it is freed so that calling this function a second time is
+ * harmless instead of a double free.
+ */
+void
+vgic_v3_mmio_destroy(struct hyp *hyp)
+{
+	struct vgic_v3_dist *dist = &hyp->vgic_dist;
+
+	if (!hyp->vgic_mmio_regions)
+		return;
+	free(hyp->vgic_mmio_regions, M_VGIC_V3_MMIO);
+	hyp->vgic_mmio_regions = NULL;
+	hyp->vgic_mmio_regions_num = 0;
+
+	free(dist->gicd_ixenabler, M_VGIC_V3_MMIO);
+	dist->gicd_ixenabler = NULL;
+	free(dist->gicd_ipriorityr, M_VGIC_V3_MMIO);
+	dist->gicd_ipriorityr = NULL;
+	free(dist->gicd_icfgr, M_VGIC_V3_MMIO);
+	dist->gicd_icfgr = NULL;
+	free(dist->gicd_irouter, M_VGIC_V3_MMIO);
+	dist->gicd_irouter = NULL;
+}
Index: sys/arm64/vmm/io/vgic_v3_reg.h
===================================================================
--- /dev/null
+++ sys/arm64/vmm/io/vgic_v3_reg.h
@@ -0,0 +1,97 @@
+#ifndef _VGIC_V3_REG_H_
+#define _VGIC_V3_REG_H_
+
+/* Interrupt Controller End of Interrupt Status Register */
+#define ICH_EISR_EL2_STATUS_MASK	0xffff
+#define ICH_EISR_EL2_EOI_NOT_HANDLED(lr)	((1 << (lr)) & ICH_EISR_EL2_STATUS_MASK)
+
+/* Interrupt Controller Empty List Register Status Register */
+#define ICH_ELRSR_EL2_STATUS_MASK	0xffff
+#define ICH_ELRSR_EL2_LR_EMPTY(x)	((1 << (x)) & ICH_ELRSR_EL2_STATUS_MASK)
+
+/* Interrupt Controller Hyp Control Register */
+#define ICH_HCR_EL2_EOICOUNT_SHIFT	27
+#define ICH_HCR_EL2_EOICOUNT_MASK	(0x1fU << ICH_HCR_EL2_EOICOUNT_SHIFT)
+#define ICH_HCR_EL2_TDIR	(1 << 14) /* Trap non-secure EL1 writes to IC{C, V}_DIR_EL1 */
+#define ICH_HCR_EL2_TSEI	(1 << 13) /* Trap System Error Interrupts (SEI) to EL2 */
+#define ICH_HCR_EL2_TALL1	(1 << 12) /* Trap non-secure EL1 accesses to IC{C, V}_* for Group 1 interrupts */
+#define ICH_HCR_EL2_TALL0	(1 << 11) /* Trap non-secure EL1 accesses to IC{C, V}_* for Group 0 interrupts */
+#define ICH_HCR_EL2_TC		(1 << 10) /* Trap non-secure EL1 accesses to common IC{C, V}_* registers */
+#define ICH_HCR_EL2_VGRP1DIE	(1 << 7) /* VM Group 1 Disabled Interrupt Enable */
+#define ICH_HCR_EL2_VGRP1EIE	(1 << 6) /* VM Group 1 Enabled Interrupt 
Enable */ +#define ICH_HCR_EL2_VGRP0DIE (1 << 5) /* VM Group 0 Disabled Interrupt Enable */ +#define ICH_HCR_EL2_VGRP0EIE (1 << 4) /* VM Group 0 Enabled Interrupt Enable */ +#define ICH_HCR_EL2_NPIE (1 << 3) /* No Pending Interrupt Enable */ +#define ICH_HCR_EL2_LRENPIE (1 << 2) /* List Register Entry Not Present Interrupt Enable */ +#define ICH_HCR_EL2_UIE (1 << 1) /* Underflow Interrupt Enable */ +#define ICH_HCR_EL2_En (1 << 0) /* Global enable for the virtual CPU interface */ + +/* Interrupt Controller List Registers */ +#define ICH_LR_EL2_VINTID_MASK 0xffffffff +#define ICH_LR_EL2_VINTID(x) ((x) & ICH_LR_EL2_VINTID_MASK) +#define ICH_LR_EL2_PINTID_SHIFT 32 +#define ICH_LR_EL2_PINTID_MASK (0x3fUL << ICH_LR_EL2_PINTID_SHIFT) +#define ICH_LR_EL2_PRIO_SHIFT 48 +#define ICH_LR_EL2_PRIO_MASK (0xffUL << ICH_LR_EL2_PRIO_SHIFT) +#define ICH_LR_EL2_GROUP_SHIFT 60 +#define ICH_LR_EL2_GROUP1 (1UL << ICH_LR_EL2_GROUP_SHIFT) +#define ICH_LR_EL2_HW (1UL << 61) +#define ICH_LR_EL2_STATE_SHIFT 62 +#define ICH_LR_EL2_STATE_MASK (0x3UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE(x) ((x) & ICH_LR_EL2_STATE_MASK) +#define ICH_LR_EL2_STATE_INACTIVE (0x0UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE_PENDING (0x1UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE_ACTIVE (0x2UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE_PENDING_ACTIVE (0x3UL << ICH_LR_EL2_STATE_SHIFT) + +/* Interrupt Controller Maintenance Interrupt State Register */ +#define ICH_MISR_EL2_VGRP1D (1 << 7) /* vPE Group 1 Disabled */ +#define ICH_MISR_EL2_VGRP1E (1 << 6) /* vPE Group 1 Enabled */ +#define ICH_MISR_EL2_VGRP0D (1 << 5) /* vPE Group 0 Disabled */ +#define ICH_MISR_EL2_VGRP0E (1 << 4) /* vPE Group 0 Enabled */ +#define ICH_MISR_EL2_NP (1 << 3) /* No Pending */ +#define ICH_MISR_EL2_LRENP (1 << 2) /* List Register Entry Not Present */ +#define ICH_MISR_EL2_U (1 << 1) /* Underflow */ +#define ICH_MISR_EL2_EOI (1 << 0) /* End Of Interrupt */ + +/* Interrupt Controller Virtual 
Machine Control Register */ +#define ICH_VMCR_EL2_VPMR_SHIFT 24 +#define ICH_VMCR_EL2_VPMR_MASK (0xff << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VPMR_PRIO_LOWEST (0xff << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VPMR_PRIO_HIGHEST (0x00 << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VBPR0_SHIFT 21 +#define ICH_VMCR_EL2_VBPR0_MASK (0x7 << ICH_VMCR_EL2_VBPR0_SHIFT) +#define ICH_VMCR_EL2_VBPR0_NO_PREEMPTION \ + (0x7 << ICH_VMCR_EL2_VBPR0_SHIFT) +#define ICH_VMCR_EL2_VBPR1_SHIFT 18 +#define ICH_VMCR_EL2_VBPR1_MASK (0x7 << ICH_VMCR_EL2_VBPR1_SHIFT) +#define ICH_VMCR_EL2_VBPR1_NO_PREEMPTION \ + (0x7 << ICH_VMCR_EL2_VBPR1_SHIFT) +#define ICH_VMCR_EL2_VEOIM (1 << 9) /* Virtual EOI mode */ +#define ICH_VMCR_EL2_VCBPR (1 << 4) /* Virtual Common binary Point Register */ +#define ICH_VMCR_EL2_VFIQEN (1 << 3) /* Virtual FIQ enable */ +#define ICH_VMCR_EL2_VACKCTL (1 << 2) /* Virtual AckCtl */ +#define ICH_VMCR_EL2_VENG1 (1 << 1) /* Virtual Group 1 Interrupt Enable */ +#define ICH_VMCR_EL2_VENG0 (1 << 0) /* Virtual Group 0 Interrupt Enable */ + +/* Interrupt Controller VGIC Type Register */ +#define ICH_VTR_EL2_PRIBITS_SHIFT 29 +#define ICH_VTR_EL2_PRIBITS_MASK (0x7 << ICH_VTR_EL2_PRIBITS_SHIFT) +#define ICH_VTR_EL2_PRIBITS(x) \ + ((((x) & ICH_VTR_EL2_PRIBITS_MASK) >> ICH_VTR_EL2_PRIBITS_SHIFT) + 1) +#define ICH_VTR_EL2_PREBITS_SHIFT 26 +#define ICH_VTR_EL2_PREBITS_MASK (0x7 << ICH_VTR_EL2_PREBITS_SHIFT) +#define ICH_VTR_EL2_PREBITS(x) \ + (((x) & ICH_VTR_EL2_PREBITS_MASK) >> ICH_VTR_EL2_PREBITS_SHIFT) +#define ICH_VTR_EL2_SEIS (1 << 22) /* System Error Interrupt (SEI) Support */ +#define ICH_VTR_EL2_A3V (1 << 21) /* Affinity 3 Valid */ +#define ICH_VTR_EL2_NV4 (1 << 20) /* Direct injection of virtual interrupts. RES1 for GICv3 */ +#define ICH_VTR_EL2_TDS (1 << 19) /* Implementation supports ICH_HCR_EL2.TDIR */ +#define ICH_VTR_EL2_LISTREGS_MASK 0x1f +/* + * ICH_VTR_EL2.ListRegs holds the number of list registers, minus one. 
Add one + * to get the actual number of list registers. + */ +#define ICH_VTR_EL2_LISTREGS(x) (((x) & ICH_VTR_EL2_LISTREGS_MASK) + 1) + +#endif /* !_VGIC_V3_REG_H_ */ Index: sys/arm64/vmm/io/vtimer.h =================================================================== --- /dev/null +++ sys/arm64/vmm/io/vtimer.h @@ -0,0 +1,112 @@ +/*- + * Copyright (c) 2017 The FreeBSD Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_VTIMER_H_ +#define _VMM_VTIMER_H_ + +#define GT_PHYS_NS_IRQ 30 +#define GT_VIRT_IRQ 27 + +#define CNTP_CTL_EL0_OP0 0b11 +#define CNTP_CTL_EL0_OP2 0b001 +#define CNTP_CTL_EL0_OP1 0b011 +#define CNTP_CTL_EL0_CRn 0b1110 +#define CNTP_CTL_EL0_CRm 0b0010 +#define ISS_CNTP_CTL_EL0 \ + (CNTP_CTL_EL0_OP0 << ISS_MSR_OP0_SHIFT | \ + CNTP_CTL_EL0_OP2 << ISS_MSR_OP2_SHIFT | \ + CNTP_CTL_EL0_OP1 << ISS_MSR_OP1_SHIFT | \ + CNTP_CTL_EL0_CRn << ISS_MSR_CRn_SHIFT | \ + CNTP_CTL_EL0_CRm << ISS_MSR_CRm_SHIFT) + +#define CNTP_CVAL_EL0_OP0 0b11 +#define CNTP_CVAL_EL0_OP1 0b011 +#define CNTP_CVAL_EL0_OP2 0b010 +#define CNTP_CVAL_EL0_CRn 0b1110 +#define CNTP_CVAL_EL0_CRm 0b0010 +#define ISS_CNTP_CVAL_EL0 \ + (CNTP_CVAL_EL0_OP0 << ISS_MSR_OP0_SHIFT | \ + CNTP_CVAL_EL0_OP2 << ISS_MSR_OP2_SHIFT | \ + CNTP_CVAL_EL0_OP1 << ISS_MSR_OP1_SHIFT | \ + CNTP_CVAL_EL0_CRn << ISS_MSR_CRn_SHIFT | \ + CNTP_CVAL_EL0_CRm << ISS_MSR_CRm_SHIFT) + +#define CNTP_TVAL_EL0_OP0 0b11 +#define CNTP_TVAL_EL0_OP1 0b011 +#define CNTP_TVAL_EL0_OP2 0b000 +#define CNTP_TVAL_EL0_CRn 0b1110 +#define CNTP_TVAL_EL0_CRm 0b0010 +#define ISS_CNTP_TVAL_EL0 \ + (CNTP_TVAL_EL0_OP0 << ISS_MSR_OP0_SHIFT | \ + CNTP_TVAL_EL0_OP2 << ISS_MSR_OP2_SHIFT | \ + CNTP_TVAL_EL0_OP1 << ISS_MSR_OP1_SHIFT | \ + CNTP_TVAL_EL0_CRn << ISS_MSR_CRn_SHIFT | \ + CNTP_TVAL_EL0_CRm << ISS_MSR_CRm_SHIFT) + +struct vtimer +{ + uint64_t cnthctl_el2; + uint64_t cntvoff_el2; +}; + +struct vtimer_cpu +{ + struct callout callout; + uint32_t cntkctl_el1; + /* + * Emulated registers: + * + * CNTP_CTL_EL0: Counter-timer Physical Timer Control Register + * CNTP_CVAL_EL0: Counter-timer Physical Timer CompareValue Register + */ + uint64_t cntp_cval_el0; + uint32_t cntp_ctl_el0; + /* + * The virtual machine has full access to the virtual timer. 
The + * following registers are part of the VM context for the current CPU: + * + * CNTV_CTL_EL0: Counter-timer Virtuel Timer Control Register + * CNTV_CVAL_EL0: Counter-timer Virtual Timer CompareValue Register + */ + uint64_t cntv_cval_el0; + uint32_t cntv_ctl_el0; +}; + +int vtimer_init(uint64_t cnthctl_el2); +void vtimer_vminit(void *arg); +void vtimer_cpuinit(void *arg); +void vtimer_vmcleanup(void *arg); +void vtimer_cleanup(void); + +int vtimer_phys_ctl_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vtimer_phys_ctl_write(void *vm, int vcpuid, uint64_t wval, void *arg); +int vtimer_phys_cval_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vtimer_phys_cval_write(void *vm, int vcpuid, uint64_t wval, void *arg); +int vtimer_phys_tval_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vtimer_phys_tval_write(void *vm, int vcpuid, uint64_t wval, void *arg); +#endif Index: sys/arm64/vmm/io/vtimer.c =================================================================== --- /dev/null +++ sys/arm64/vmm/io/vtimer.c @@ -0,0 +1,407 @@ +/*- + * Copyright (c) 2017 The FreeBSD Foundation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "vgic_v3.h" +#include "vtimer.h" + +#define RES1 0xffffffffffffffffUL + +#define timer_enabled(ctl) \ + (!((ctl) & CNTP_CTL_IMASK) && ((ctl) & CNTP_CTL_ENABLE)) + +static uint64_t cnthctl_el2_reg; +static uint32_t tmr_frq; + +#define timer_condition_met(ctl) ((ctl) & CNTP_CTL_ISTATUS) + +static int +vtimer_virtual_timer_intr(void *arg) +{ + struct hypctx *hypctx; + uint32_t cntv_ctl; + + /* + * TODO everything here is very strange. The relantionship between the + * hardware value and the value in memory is not clear at all. + */ + + hypctx = arm64_get_active_vcpu(); + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + + if (!hypctx) { + /* vm_destroy() was called. 
*/
+		eprintf("No active vcpu\n");
+		cntv_ctl = READ_SPECIALREG(cntv_ctl_el0);
+		goto out;
+	}
+	if (!timer_enabled(cntv_ctl)) {
+		eprintf("Timer not enabled\n");
+		goto out;
+	}
+	if (!timer_condition_met(cntv_ctl)) {
+		eprintf("Timer condition not met\n");
+		goto out;
+	}
+
+	vgic_v3_inject_irq(hypctx, GT_VIRT_IRQ, VGIC_IRQ_CLK);
+
+	/* Continue with the saved control value, with the enable bit cleared. */
+	hypctx->vtimer_cpu.cntv_ctl_el0 &= ~CNTP_CTL_ENABLE;
+	cntv_ctl = hypctx->vtimer_cpu.cntv_ctl_el0;
+
+out:
+	/*
+	 * Disable the timer interrupt. This will prevent the interrupt from
+	 * being reasserted as soon as we exit the handler and getting stuck
+	 * in an infinite loop.
+	 *
+	 * This is safe to do because the guest disabled the timer, and then
+	 * enables it as part of the interrupt handling routine.
+	 */
+	cntv_ctl &= ~CNTP_CTL_ENABLE;
+	WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl);
+
+	return (FILTER_HANDLED);
+}
+
+/*
+ * Global timer setup: remember the CNTHCTL_EL2 value passed in, read the
+ * timer frequency from CNTFRQ_EL0 and install vtimer_virtual_timer_intr()
+ * as the handler for the GT_VIRT timer interrupt.
+ *
+ * A failure to install the handler is only reported with a warning; the
+ * function always returns 0.
+ */
+int
+vtimer_init(uint64_t cnthctl_el2)
+{
+	int error;
+
+	cnthctl_el2_reg = cnthctl_el2;
+	/*
+	 * The guest *MUST* use the same timer frequency as the host. The
+	 * register CNTFRQ_EL0 is accessible to the guest and a different value
+	 * in the guest dts file might have unforeseen consequences.
+	 */
+	tmr_frq = READ_SPECIALREG(cntfrq_el0);
+
+	error = arm_tmr_setup_intr(GT_VIRT, vtimer_virtual_timer_intr, NULL, NULL);
+	if (error) {
+		printf("WARNING: arm_tmr_setup_intr() error: %d\n", error);
+		printf("WARNING: Expect reduced performance\n");
+	}
+
+	return (0);
+}
+
+void
+vtimer_vminit(void *arg)
+{
+	struct hyp *hyp;
+	uint64_t now;
+
+	hyp = (struct hyp *)arg;
+	/*
+	 * Configure the Counter-timer Hypervisor Control Register for the VM. 
+	 *
+	 * ~CNTHCTL_EL1PCEN: trap access to CNTP_{CTL, CVAL, TVAL}_EL0 from EL1
+	 * CNTHCTL_EL1PCTEN: don't trap access to CNTPCT_EL0
+	 */
+	hyp->vtimer.cnthctl_el2 = cnthctl_el2_reg & ~CNTHCTL_EL1PCEN;
+	hyp->vtimer.cnthctl_el2 |= CNTHCTL_EL1PCTEN;
+
+	/* The VM's virtual counter offset is the current physical count. */
+	now = READ_SPECIALREG(cntpct_el0);
+	hyp->vtimer.cntvoff_el2 = now;
+
+	return;
+}
+
+/*
+ * Per-VCPU timer setup. 'arg' is the VCPU's struct hypctx. Sets the initial
+ * value of the emulated CNTP_CTL_EL0 and initializes the callout kept in
+ * the VCPU's vtimer_cpu.
+ */
+void
+vtimer_cpuinit(void *arg)
+{
+	struct hypctx *hypctx;
+	struct vtimer_cpu *vtimer_cpu;
+
+	hypctx = (struct hypctx *)arg;
+	vtimer_cpu = &hypctx->vtimer_cpu;
+	/*
+	 * Configure physical timer interrupts for the VCPU.
+	 *
+	 * CNTP_CTL_IMASK: mask interrupts
+	 * ~CNTP_CTL_ENABLE: disable the timer
+	 *
+	 * NOTE(review): if IMASK and ENABLE are distinct bits, as the two
+	 * lines above suggest, this expression is simply CNTP_CTL_IMASK.
+	 */
+	vtimer_cpu->cntp_ctl_el0 = CNTP_CTL_IMASK & ~CNTP_CTL_ENABLE;
+	/*
+	 * Callout function is MP_SAFE because the VGIC uses a spin
+	 * mutex when modifying the list registers.
+	 */
+	callout_init(&vtimer_cpu->callout, 1);
+}
+
+void
+vtimer_vmcleanup(void *arg)
+{
+	struct hyp *hyp;
+	struct hypctx *hypctx;
+	struct vtimer *vtimer;
+	struct vtimer_cpu *vtimer_cpu;
+	uint32_t cntv_ctl;
+	int i;
+
+	hyp = arg;
+	vtimer = &hyp->vtimer;
+
+	hypctx = arm64_get_active_vcpu();
+	if (!hypctx) {
+		/* The active VM was destroyed, stop the timer. 
*/
+		cntv_ctl = READ_SPECIALREG(cntv_ctl_el0);
+		cntv_ctl &= ~CNTP_CTL_ENABLE;
+		WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl);
+	}
+
+	for (i = 0; i < VM_MAXCPU; i++) {
+		vtimer_cpu = &hyp->ctx[i].vtimer_cpu;
+		callout_drain(&vtimer_cpu->callout);
+	}
+}
+
+/*
+ * Global timer teardown: remove the GT_VIRT interrupt handler. A failure is
+ * only reported with a warning.
+ */
+void
+vtimer_cleanup(void)
+{
+	int error;
+
+	error = arm_tmr_teardown_intr(GT_VIRT);
+	if (error)
+		printf("WARNING: arm_tmr_teardown_intr() error: %d\n", error);
+
+}
+
+/*
+ * Callout handler: inject the physical timer interrupt into the VCPU whose
+ * struct hypctx is passed as 'context'.
+ */
+static void
+vtimer_inject_irq_callout_func(void *context)
+{
+	struct hypctx *hypctx;
+
+	hypctx = context;
+	vgic_v3_inject_irq(hypctx, GT_PHYS_NS_IRQ, VGIC_IRQ_CLK);
+}
+
+
+/*
+ * Arrange for the emulated physical timer interrupt to be delivered when the
+ * physical counter reaches the VCPU's CNTP_CVAL_EL0.
+ *
+ * If the compare value is already in the past the interrupt is injected
+ * immediately, otherwise the VCPU's callout is armed for the remaining time.
+ */
+static void
+vtimer_schedule_irq(struct vtimer_cpu *vtimer_cpu, struct hypctx *hypctx)
+{
+	sbintime_t time;
+	uint64_t cntpct_el0;
+	uint64_t diff, secs, rem;
+
+	cntpct_el0 = READ_SPECIALREG(cntpct_el0);
+	if (vtimer_cpu->cntp_cval_el0 < cntpct_el0) {
+		/* Timer set in the past, trigger interrupt */
+		vgic_v3_inject_irq(hypctx, GT_PHYS_NS_IRQ, VGIC_IRQ_CLK);
+		return;
+	}
+
+	/*
+	 * Convert counter ticks to sbintime_t. Computing diff * SBT_1S
+	 * directly overflows 64 bits once the deadline is 2^32 or more ticks
+	 * away, so whole seconds and the sub-second remainder are converted
+	 * separately. 'rem' is smaller than the 32-bit frequency, which keeps
+	 * rem * SBT_1S within 64 bits. Deadlines too distant to represent
+	 * are clamped to SBT_MAX.
+	 */
+	diff = vtimer_cpu->cntp_cval_el0 - cntpct_el0;
+	secs = diff / tmr_frq;
+	rem = diff % tmr_frq;
+	if (secs >= SBT_MAX / SBT_1S)
+		time = SBT_MAX;
+	else
+		time = secs * SBT_1S + rem * SBT_1S / tmr_frq;
+	callout_reset_sbt(&vtimer_cpu->callout, time, 0,
+	    vtimer_inject_irq_callout_func, hypctx, 0);
+}
+
+/*
+ * Cancel a scheduled physical timer interrupt for the VCPU: wait for the
+ * callout to finish or be cancelled, then remove the interrupt from the
+ * VGIC.
+ */
+static void
+vtimer_remove_irq(struct hypctx *hypctx)
+{
+	struct vtimer_cpu *vtimer_cpu;
+
+	vtimer_cpu = &hypctx->vtimer_cpu;
+
+	callout_drain(&vtimer_cpu->callout);
+	/*
+	 * The interrupt needs to be deactivated here regardless of the callout
+	 * function having been executed. The timer interrupt can be masked with
+	 * the CNTP_CTL_EL0.IMASK bit instead of reading the IAR register.
+	 * Masking the interrupt doesn't remove it from the list registers.
+	 */
+	vgic_v3_remove_irq(hypctx, GT_PHYS_NS_IRQ, true);
+}
+
+/*
+ * Timer emulation functions.
+ *
+ * The guest dts is configured to use the physical timer because the Generic
+ * Timer can only trap physical timer accesses. This is why we always read the
+ * physical counter value when programming the time for the timer interrupt in
+ * the guest. 
+ */
+
+/*
+ * Emulate a guest read of CNTP_CTL_EL0. The saved control value is returned
+ * with ISTATUS set when the saved compare value is below the current
+ * physical count, and cleared otherwise.
+ *
+ * Clears the 'retu' flag passed in 'arg' and always returns 0.
+ */
+int
+vtimer_phys_ctl_read(void *vm, int vcpuid, uint64_t *rval, void *arg)
+{
+	struct hyp *hyp;
+	struct vtimer_cpu *vtimer_cpu;
+	uint64_t cntpct_el0;
+	bool *retu = arg;
+
+	hyp = vm_get_cookie(vm);
+	vtimer_cpu = &hyp->ctx[vcpuid].vtimer_cpu;
+
+	cntpct_el0 = READ_SPECIALREG(cntpct_el0);
+	if (vtimer_cpu->cntp_cval_el0 < cntpct_el0)
+		/* Timer condition met */
+		*rval = vtimer_cpu->cntp_ctl_el0 | CNTP_CTL_ISTATUS;
+	else
+		*rval = vtimer_cpu->cntp_ctl_el0 & ~CNTP_CTL_ISTATUS;
+
+	*retu = false;
+	return (0);
+}
+
+/*
+ * Emulate a guest write of CNTP_CTL_EL0.
+ *
+ * When the write turns the timer on (enabled and unmasked) the interrupt is
+ * scheduled. When it turns the timer off, the pending callout and any
+ * interrupt already handed to the VGIC are cancelled, so the guest does not
+ * receive an interrupt from a timer it has disabled or masked.
+ *
+ * Clears the 'retu' flag passed in 'arg' and always returns 0.
+ */
+int
+vtimer_phys_ctl_write(void *vm, int vcpuid, uint64_t wval, void *arg)
+{
+	struct hyp *hyp;
+	struct hypctx *hypctx;
+	struct vtimer_cpu *vtimer_cpu;
+	uint64_t ctl_el0;
+	bool timer_toggled_on, timer_toggled_off;
+	bool *retu = arg;
+
+	hyp = vm_get_cookie(vm);
+	hypctx = &hyp->ctx[vcpuid];
+	vtimer_cpu = &hypctx->vtimer_cpu;
+
+	timer_toggled_on = false;
+	timer_toggled_off = false;
+	ctl_el0 = vtimer_cpu->cntp_ctl_el0;
+
+	if (!timer_enabled(ctl_el0) && timer_enabled(wval))
+		timer_toggled_on = true;
+	else if (timer_enabled(ctl_el0) && !timer_enabled(wval))
+		timer_toggled_off = true;
+
+	vtimer_cpu->cntp_ctl_el0 = wval;
+
+	if (timer_toggled_on)
+		vtimer_schedule_irq(vtimer_cpu, hypctx);
+	else if (timer_toggled_off)
+		vtimer_remove_irq(hypctx);
+
+	*retu = false;
+	return (0);
+}
+
+/*
+ * Emulate a guest read of CNTP_CVAL_EL0: return the saved compare value.
+ * Clears the 'retu' flag passed in 'arg' and always returns 0.
+ */
+int
+vtimer_phys_cval_read(void *vm, int vcpuid, uint64_t *rval, void *arg)
+{
+	struct hyp *hyp;
+	struct vtimer_cpu *vtimer_cpu;
+	bool *retu = arg;
+
+	hyp = vm_get_cookie(vm);
+	vtimer_cpu = &hyp->ctx[vcpuid].vtimer_cpu;
+
+	*rval = vtimer_cpu->cntp_cval_el0;
+
+	*retu = false;
+	return (0);
+}
+
+/*
+ * Emulate a guest write of CNTP_CVAL_EL0: save the new compare value and,
+ * if the timer is enabled and unmasked, replace the scheduled interrupt
+ * with one for the new deadline.
+ *
+ * Clears the 'retu' flag passed in 'arg' and always returns 0.
+ */
+int
+vtimer_phys_cval_write(void *vm, int vcpuid, uint64_t wval, void *arg)
+{
+	struct hyp *hyp;
+	struct hypctx *hypctx;
+	struct vtimer_cpu *vtimer_cpu;
+	bool *retu = arg;
+
+	hyp = vm_get_cookie(vm);
+	hypctx = &hyp->ctx[vcpuid];
+	vtimer_cpu = &hypctx->vtimer_cpu;
+
+	vtimer_cpu->cntp_cval_el0 = wval;
+
+	if (timer_enabled(vtimer_cpu->cntp_ctl_el0)) {
+		vtimer_remove_irq(hypctx);
+		vtimer_schedule_irq(vtimer_cpu, hypctx);
+	}
+
+	*retu = false;
+	return (0);
+}
+
+int
+vtimer_phys_tval_read(void *vm, int vcpuid, uint64_t *rval, void *arg)
+{
+	struct hyp *hyp;
+	struct vtimer_cpu *vtimer_cpu;
+	uint64_t cntpct_el0;
+	bool *retu = arg;
+
+	hyp = vm_get_cookie(vm);
+	vtimer_cpu = &hyp->ctx[vcpuid].vtimer_cpu;
+
+	if (!(vtimer_cpu->cntp_ctl_el0 & CNTP_CTL_ENABLE)) {
+		/*
+		 * ARMv8 Architecture Manual, p. D7-2702: the result of reading
+		 * TVAL when the timer is disabled is UNKNOWN. I have chosen to
+		 * return the maximum value possible on 32 bits which means the
+		 * timer will fire very far into the future.
+		 */
+		*rval = (uint32_t)RES1;
+	} else {
+		/*
+		 * TVAL is the low 32 bits of (CVAL - counter). The counter
+		 * must be read at full width and the difference truncated
+		 * afterwards; truncating the counter first leaves garbage in
+		 * the upper half of the result.
+		 */
+		cntpct_el0 = READ_SPECIALREG(cntpct_el0);
+		*rval = (uint32_t)(vtimer_cpu->cntp_cval_el0 - cntpct_el0);
+	}
+
+	*retu = false;
+	return (0);
+}
+
+/*
+ * Emulate a guest write of CNTP_TVAL_EL0: the compare value becomes the
+ * current physical count plus the low 32 bits of 'wval' taken as a signed
+ * number. If the timer is enabled and unmasked, the scheduled interrupt is
+ * replaced with one for the new deadline.
+ *
+ * Clears the 'retu' flag passed in 'arg' and always returns 0.
+ */
+int
+vtimer_phys_tval_write(void *vm, int vcpuid, uint64_t wval, void *arg)
+{
+	struct hyp *hyp;
+	struct hypctx *hypctx;
+	struct vtimer_cpu *vtimer_cpu;
+	uint64_t cntpct_el0;
+	bool *retu = arg;
+
+	hyp = vm_get_cookie(vm);
+	hypctx = &hyp->ctx[vcpuid];
+	vtimer_cpu = &hypctx->vtimer_cpu;
+
+	cntpct_el0 = READ_SPECIALREG(cntpct_el0);
+	vtimer_cpu->cntp_cval_el0 = (int32_t)wval + cntpct_el0;
+
+	if (timer_enabled(vtimer_cpu->cntp_ctl_el0)) {
+		vtimer_remove_irq(hypctx);
+		vtimer_schedule_irq(vtimer_cpu, hypctx);
+	}
+
+	*retu = false;
+	return (0);
+}
Index: sys/arm64/vmm/mmu.h
===================================================================
--- /dev/null
+++ sys/arm64/vmm/mmu.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2017 Alexandru Elisei
+ * All rights reserved.
+ *
+ * This software was developed by Alexandru Elisei under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_MMU_H_ +#define _VMM_MMU_H_ + +#include +#include + +#include "hyp.h" + +#define ktohyp(kva) (((vm_offset_t)(kva) & HYP_KVA_MASK) | \ + HYP_KVA_OFFSET) +#define ipatok(ipa, hypmap) (PHYS_TO_DMAP(pmap_extract(hypmap, (ipa)))) +#define gtoipa(gva) ((gva) - KERNBASE + VM_GUEST_BASE_IPA) + +#define page_aligned(x) (((vm_offset_t)(x) & PAGE_MASK) == 0) + +void hypmap_init(pmap_t map, enum pmap_stage pm_stage); +void hypmap_map(pmap_t map, vm_offset_t va, size_t len, + vm_prot_t prot); +void hypmap_map_identity(pmap_t map, vm_offset_t va, size_t len, + vm_prot_t prot); +void hypmap_set(void *arg, vm_offset_t va, vm_offset_t pa, + size_t len, vm_prot_t prot); +vm_paddr_t hypmap_get(void *arg, vm_offset_t va); +void hypmap_cleanup(pmap_t map); + +#endif Index: sys/arm64/vmm/mmu.c =================================================================== --- /dev/null +++ sys/arm64/vmm/mmu.c @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2017 Alexandru Elisei + * All rights reserved. 
+ * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "mmu.h"
+#include "arm64.h"
+
+MALLOC_DECLARE(M_HYP);
+
+/*
+ * Initialize 'map' for translation stage 'pm_stage': set up its mutex and
+ * call pmap_pinit_stage() on it.
+ *
+ * NOTE(review): the meaning of the literal 4 passed to pmap_pinit_stage()
+ * is not visible here; presumably the number of translation table levels.
+ */
+void
+hypmap_init(pmap_t map, enum pmap_stage pm_stage)
+{
+	mtx_init(&map->pm_mtx, "hypmap_pm_mtx", NULL, MTX_DEF);
+	pmap_pinit_stage(map, pm_stage, 4);
+}
+
+/*
+ * Enter into 'map', with protection 'prot', every page that holds a byte of
+ * the virtual range [va, va + len).
+ *
+ * Each page is mapped to the physical page that currently backs it.
+ * Addresses at or above VM_MIN_KERNEL_ADDRESS are entered at their ktohyp()
+ * translation; lower addresses are entered unchanged. A zero 'len' maps
+ * nothing.
+ */
+void
+hypmap_map(pmap_t map, vm_offset_t va, size_t len, vm_prot_t prot)
+{
+	vm_offset_t va_end, hypva;
+	vm_page_t dummy_page;
+
+	dummy_page = malloc(sizeof(*dummy_page), M_HYP, M_WAITOK | M_ZERO);
+	dummy_page->oflags = VPO_UNMANAGED;
+	dummy_page->md.pv_memattr = VM_MEMATTR_DEFAULT;
+
+	/*
+	 * Add the physical pages which correspond to the specified virtual
+	 * addresses. The virtual addresses span contiguous virtual pages, but
+	 * they might not reside in contiguous physical pages.
+	 *
+	 * va_end is the address of the last byte, so the page that contains
+	 * it must be mapped too: the comparison is inclusive.
+	 */
+	va_end = va + len - 1;
+	va = trunc_page(va);
+	while (len != 0 && va <= va_end) {
+		dummy_page->phys_addr = vtophys(va);
+		hypva = (va >= VM_MIN_KERNEL_ADDRESS) ? ktohyp(va) : va;
+		pmap_enter(map, hypva, dummy_page, prot, PMAP_ENTER_WIRED, 0);
+		/* Stop on the last page; va must not wrap around. */
+		if (va_end - va < PAGE_SIZE)
+			break;
+		va += PAGE_SIZE;
+	}
+
+	free(dummy_page, M_HYP);
+}
+
+void
+hypmap_map_identity(pmap_t map, vm_offset_t va, size_t len,
+    vm_prot_t prot)
+{
+	vm_offset_t va_end;
+	vm_page_t dummy_page;
+
+	dummy_page = malloc(sizeof(*dummy_page), M_HYP, M_WAITOK | M_ZERO);
+	dummy_page->oflags = VPO_UNMANAGED;
+	dummy_page->md.pv_memattr = VM_MEMATTR_DEFAULT;
+
+	/*
+	 * The virtual addresses span contiguous virtual pages, but they might
+	 * not reside in contiguous physical pages. For each virtual page we
+	 * get the physical page address and use that for the mapping. 
+ */ + va_end = va + len - 1; + va = trunc_page(va); + while (va < va_end) { + dummy_page->phys_addr = vtophys(va); + pmap_enter(map, dummy_page->phys_addr, dummy_page, + prot, PMAP_ENTER_WIRED, 0); + va += PAGE_SIZE; + } + + free(dummy_page, M_HYP); +} + +/* + * Map 'len' bytes starting at virtual address 'va' to 'len' bytes + * starting at physical address 'pa' + */ +void +hypmap_set(void *arg, vm_offset_t va, vm_offset_t pa, size_t len, + vm_prot_t prot) +{ + vm_offset_t va_end, hypva; + vm_page_t dummy_page; + struct hyp *hyp; + pmap_t map; + + hyp = (struct hyp *)arg; + map = hyp->stage2_map; + + dummy_page = malloc(sizeof(*dummy_page), M_HYP, M_WAITOK | M_ZERO); + dummy_page->oflags = VPO_UNMANAGED; + dummy_page->md.pv_memattr = VM_MEMATTR_DEFAULT; + + va_end = va + len - 1; + va = trunc_page(va); + dummy_page->phys_addr = trunc_page(pa); + while (va < va_end) { + hypva = (va >= VM_MIN_KERNEL_ADDRESS) ? ktohyp(va) : va; + pmap_enter(map, hypva, dummy_page, prot, PMAP_ENTER_WIRED, 0); + va += PAGE_SIZE; + dummy_page->phys_addr += PAGE_SIZE; + } + + free(dummy_page, M_HYP); +} + +/* + * Return the physical address associated with virtual address 'va' + */ +vm_paddr_t +hypmap_get(void *arg, vm_offset_t va) +{ + struct hyp *hyp; + pmap_t map; + + hyp = (struct hyp *)arg; + map = hyp->stage2_map; + + return pmap_extract(map, va); +} + +/* + * Remove all the mappings from the hyp translation tables + */ +void +hypmap_cleanup(pmap_t map) +{ + pmap_remove(map, HYP_VM_MIN_ADDRESS, HYP_VM_MAX_ADDRESS); + mtx_destroy(&map->pm_mtx); + pmap_release(map); +} Index: sys/arm64/vmm/psci.h =================================================================== --- /dev/null +++ sys/arm64/vmm/psci.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _PSCI_H_ +#define _PSCI_H_ + +#include "arm64.h" + +int psci_handle_call(struct vm *vm, int vcpuid, struct vm_exit *vme, + bool *retu); + +#endif Index: sys/arm64/vmm/psci.c =================================================================== --- /dev/null +++ sys/arm64/vmm/psci.c @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include + +#include "arm64.h" +#include "psci.h" + +#define PSCI_VERSION_0_2 0x2 + +static int +psci_version(struct hypctx *hypctx, bool *retu) +{ + + hypctx->regs.x[0] = PSCI_VERSION_0_2; + + *retu = false; + return (0); +} + +static int +psci_system_off(struct vm_exit *vme, bool *retu) +{ + vme->u.suspended.how = VM_SUSPEND_POWEROFF; + vme->exitcode = VM_EXITCODE_SUSPENDED; + + *retu = true; + return (0); +} + +int +psci_handle_call(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu) +{ + struct hyp *hyp; + struct hypctx *hypctx; + uint64_t func_id; + uint32_t esr_el2, esr_iss; + int error; + + hyp = vm_get_cookie(vm); + hypctx = &hyp->ctx[vcpuid]; + + esr_el2 = hypctx->exit_info.esr_el2; + esr_iss = esr_el2 & ESR_ELx_ISS_MASK; + + if (esr_iss != 0) { + eprintf("Malformed HVC instruction with immediate: 0x%x\n", + esr_iss); + error = 1; + goto out; + } + + func_id = hypctx->regs.x[0]; + switch (func_id) { + case PSCI_FNID_VERSION: + 
error = psci_version(hypctx, retu); + break; + case PSCI_FNID_SYSTEM_OFF: + error = psci_system_off(vme, retu); + break; + default: + eprintf("Unimplemented PSCI function: 0x%016lx\n", func_id); + hypctx->regs.x[0] = PSCI_RETVAL_NOT_SUPPORTED; + error = 1; + } + +out: + return (error); +} Index: sys/arm64/vmm/reset.h =================================================================== --- /dev/null +++ sys/arm64/vmm/reset.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _VMM_RESET_H_ +#define _VMM_RESET_H_ + +void reset_vm_el01_regs(void *vcpu); +void reset_vm_el2_regs(void *vcpu); + +#endif Index: sys/arm64/vmm/reset.c =================================================================== --- /dev/null +++ sys/arm64/vmm/reset.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include + +#include "arm64.h" +#include "reset.h" + +/* + * Make the architecturally UNKNOWN value 0. As a bonus, we don't have to + * manually set all those RES0 fields. 
+ */ +#define ARCH_UNKNOWN 0 +#define set_arch_unknown(reg) (memset(&(reg), ARCH_UNKNOWN, sizeof(reg))) + +void +reset_vm_el01_regs(void *vcpu) +{ + struct hypctx *el2ctx; + + el2ctx = vcpu; + + set_arch_unknown(el2ctx->regs); + + set_arch_unknown(el2ctx->actlr_el1); + set_arch_unknown(el2ctx->afsr0_el1); + set_arch_unknown(el2ctx->afsr1_el1); + set_arch_unknown(el2ctx->amair_el1); + set_arch_unknown(el2ctx->contextidr_el1); + set_arch_unknown(el2ctx->cpacr_el1); + set_arch_unknown(el2ctx->elr_el1); + set_arch_unknown(el2ctx->esr_el1); + set_arch_unknown(el2ctx->far_el1); + set_arch_unknown(el2ctx->mair_el1); + set_arch_unknown(el2ctx->par_el1); + + /* + * Guest starts with: + * ~SCTLR_M: MMU off + * ~SCTLR_C: data cache off + * SCTLR_CP15BEN: memory barrier instruction enable from EL0; RAO/WI + * ~SCTLR_I: instruction cache off + */ + el2ctx->sctlr_el1 = SCTLR_RES1; + el2ctx->sctlr_el1 &= ~SCTLR_M & ~SCTLR_C & ~SCTLR_I; + el2ctx->sctlr_el1 |= SCTLR_CP15BEN; + + set_arch_unknown(el2ctx->sp_el0); + set_arch_unknown(el2ctx->tcr_el1); + set_arch_unknown(el2ctx->tpidr_el0); + set_arch_unknown(el2ctx->tpidr_el1); + set_arch_unknown(el2ctx->tpidrro_el0); + set_arch_unknown(el2ctx->ttbr0_el1); + set_arch_unknown(el2ctx->ttbr1_el1); + set_arch_unknown(el2ctx->vbar_el1); + set_arch_unknown(el2ctx->spsr_el1); +} + +void +reset_vm_el2_regs(void *vcpu) +{ + struct hypctx *el2ctx; + uint64_t cpu_aff; + + el2ctx = vcpu; + + /* + * Set the Hypervisor Configuration Register: + * + * HCR_RW: use AArch64 for EL1 + * HCR_BSU_IS: barrier instructions apply to the inner shareable + * domain + * HCR_SWIO: turn set/way invalidate into set/way clean and + * invalidate + * HCR_FB: broadcast maintenance operations + * HCR_AMO: route physical SError interrupts to EL2 + * HCR_IMO: route physical IRQ interrupts to EL2 + * HCR_FMO: route physical FIQ interrupts to EL2 + * HCR_VM: use stage 2 translation + */ + el2ctx->hcr_el2 = HCR_RW | HCR_BSU_IS | HCR_SWIO | HCR_FB | \ + HCR_VM | HCR_AMO | 
HCR_IMO | HCR_FMO; + + el2ctx->vmpidr_el2 = VMPIDR_EL2_RES1; + /* The guest will detect a multi-core, single-threaded CPU */ + el2ctx->vmpidr_el2 &= ~VMPIDR_EL2_U & ~VMPIDR_EL2_MT; + /* Only 24 bits of affinity, for a grand total of 16,777,216 cores. */ + cpu_aff = el2ctx->vcpu & (CPU_AFF0_MASK | CPU_AFF1_MASK | CPU_AFF2_MASK); + el2ctx->vmpidr_el2 |= cpu_aff; + + /* Use the same CPU identification information as the host */ + el2ctx->vpidr_el2 = CPU_IMPL_TO_MIDR(CPU_IMPL_ARM); + el2ctx->vpidr_el2 |= CPU_VAR_TO_MIDR(0); + el2ctx->vpidr_el2 |= CPU_ARCH_TO_MIDR(0xf); + el2ctx->vpidr_el2 |= CPU_PART_TO_MIDR(CPU_PART_FOUNDATION); + el2ctx->vpidr_el2 |= CPU_REV_TO_MIDR(0); + + /* + * Don't trap accesses to CPACR_EL1, trace, SVE, Advanced SIMD + * and floating point functionality to EL2. + */ + el2ctx->cptr_el2 = CPTR_RES1; + /* + * Disable interrupts in the guest. The guest OS will re-enable + * them. + */ + el2ctx->spsr_el2 = PSR_D | PSR_A | PSR_I | PSR_F; + /* Use the EL1 stack when taking exceptions to EL1 */ + el2ctx->spsr_el2 |= PSR_M_EL1h; +} Index: sys/arm64/vmm/vmm.c =================================================================== --- /dev/null +++ sys/arm64/vmm/vmm.c @@ -0,0 +1,895 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
/*
 * Per-vcpu bookkeeping kept by the machine-independent part of vmm.
 * One instance exists for every slot of vm->vcpu[].
 */
struct vcpu {
	int		flags;		/* NOTE(review): no user visible in this file */
	enum vcpu_state	state;		/* run state, updated by vcpu_set_state_locked() */
	struct mtx	mtx;		/* spin mutex (see vcpu_lock_init()) taken around state changes */
	int		hostcpu;	/* curcpu while the state is VCPU_RUNNING, NOCPU otherwise */
	int		vcpuid;		/* index of this vcpu in vm->vcpu[], set by vcpu_init() */
	void		*stats;		/* returned by vcpu_stats() */
	struct vm_exit	exitinfo;	/* exit information, returned by vm_exitinfo() */
	uint64_t	nextpc;	/* (x) next instruction to execute */
};
*cookie; + struct vcpu vcpu[VM_MAXCPU]; + int num_mem_segs; + struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; + char name[VM_MAX_NAMELEN]; + /* + * Set of active vcpus. + * An active vcpu is one that has been started implicitly (BSP) or + * explicitly (AP) by sending it a startup ipi. + */ + cpuset_t active_cpus; +}; + +static bool vmm_initialized = false; + +static struct vmm_ops *ops = NULL; + +#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) + +#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) +#define VMRUN(vmi, vcpu, pc, pmap, rvc, sc) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, pc, pmap, rvc, sc) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMMMAP_SET(vmi, ipa, pa, len, prot) \ + (ops != NULL ? \ + (*ops->vmmapset)(vmi, ipa, pa, len, prot) : ENXIO) +#define VMMMAP_GET(vmi, gpa) \ + (ops != NULL ? (*ops->vmmapget)(vmi, gpa) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) + +#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) +#define fpu_stop_emulating() clts() + +static int vm_handle_wfi(struct vm *vm, int vcpuid, + struct vm_exit *vme, bool *retu); + +static MALLOC_DEFINE(M_VMM, "vmm", "vmm"); + +/* statistics */ +static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. 
+ */ +static int halt_detection_enabled = 1; +SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, + &halt_detection_enabled, 0, + "Halt VM if all vcpus execute HLT with interrupts disabled"); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +static int trace_guest_exceptions; +SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, + &trace_guest_exceptions, 0, + "Trap into hypervisor on all guest exceptions and reflect them back"); + +static void +vcpu_cleanup(struct vm *vm, int i, bool destroy) +{ +// struct vcpu *vcpu = &vm->vcpu[i]; +} + +static void +vcpu_init(struct vm *vm, uint32_t vcpu_id) +{ + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpu_id]; + + vcpu_lock_init(vcpu); + vcpu->hostcpu = NOCPU; + vcpu->vcpuid = vcpu_id; +} + +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= VM_MAXCPU) + panic("vm_exitinfo: invalid cpuid %d", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + +static int +vmm_init(void) +{ + ops = &vmm_ops_arm; + + return (VMM_INIT(0)); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = true; + break; + case MOD_UNLOAD: + error = vmmdev_cleanup(); + if (error == 0 && vmm_initialized) { + error = VMM_CLEANUP(); + if (error) + vmm_initialized = false; + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - HYP initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). 
+ */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +int +vm_create(const char *name, struct vm **retvm) +{ + struct vm *vm; + int i; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. + */ + if (!vmm_initialized) + return (ENXIO); + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (EINVAL); + + vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->cookie = VMINIT(vm); + + for (i = 0; i < VM_MAXCPU; i++) + vcpu_init(vm, i); + + vm_activate_cpu(vm, BSP); + + *retvm = vm; + return (0); +} + +static void +vm_cleanup(struct vm *vm, bool destroy) +{ + VMCLEANUP(vm->cookie); +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); + free(vm, M_VMM); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +#include +#include + +static caddr_t +search_by_type(const char *type, caddr_t preload_metadata) +{ + caddr_t curp, lname; + uint32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + lname = NULL; + for (;;) { + hdr = (uint32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* remember the start of each record */ + if (hdr[0] == MODINFO_NAME) + lname = curp; + + /* Search for a MODINFO_TYPE field */ + if ((hdr[0] == MODINFO_TYPE) && + !strcmp(type, curp + sizeof(uint32_t) * 2)) + return(lname); + + /* skip to next field */ + next = sizeof(uint32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +static int +vm_handle_reg_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct hyp *hyp; + struct vm_exit *vme; + struct vre *vre; + reg_read_t rread; + reg_write_t rwrite; + uint32_t iss_reg; + int error; + + hyp = (struct hyp *)vm->cookie; + vme = vm_exitinfo(vm, vcpuid); + vre = &vme->u.reg_emul.vre; + + iss_reg = vre->inst_syndrome & ISS_MSR_REG_MASK; + switch (iss_reg) { + case ISS_CNTP_CTL_EL0: + 
rread = vtimer_phys_ctl_read; + rwrite = vtimer_phys_ctl_write; + break; + case ISS_CNTP_CVAL_EL0: + rread = vtimer_phys_cval_read; + rwrite = vtimer_phys_cval_write; + break; + case ISS_CNTP_TVAL_EL0: + rread = vtimer_phys_tval_read; + rwrite = vtimer_phys_tval_write; + break; + default: + goto out_user; + } + + error = vmm_emulate_register(vm, vcpuid, vre, rread, rwrite, retu); + + return (error); + +out_user: + *retu = true; + return (0); +} + +static int +vm_mmio_region_match(const void *key, const void *memb) +{ + const uint64_t *addr = key; + const struct vgic_mmio_region *vmr = memb; + + if (*addr < vmr->start) + return (-1); + else if (*addr >= vmr->start && *addr < vmr->end) + return (0); + else + return (1); +} + +static int +vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct vm_exit *vme; + struct vie *vie; + struct hyp *hyp = vm->cookie; + uint64_t fault_ipa; + struct vgic_mmio_region *vmr; + int error; + + if (!hyp->vgic_attached) + goto out_user; + + vme = vm_exitinfo(vm, vcpuid); + vie = &vme->u.inst_emul.vie; + + fault_ipa = vme->u.inst_emul.gpa; + + vmr = bsearch(&fault_ipa, hyp->vgic_mmio_regions, + hyp->vgic_mmio_regions_num, sizeof(struct vgic_mmio_region), + vm_mmio_region_match); + if (!vmr) + goto out_user; + + error = vmm_emulate_instruction(vm, vcpuid, fault_ipa, vie, + vmr->read, vmr->write, retu); + + return (error); + +out_user: + *retu = true; + return (0); +} + +static int +vm_handle_poweroff(struct vm *vm, int vcpuid) +{ + return (0); +} + +static int +vm_handle_psci_call(struct vm *vm, int vcpuid, bool *retu) +{ + struct vm_exit *vme; + enum vm_suspend_how how; + int error; + + vme = vm_exitinfo(vm, vcpuid); + + error = psci_handle_call(vm, vcpuid, vme, retu); + if (error) + goto out; + + if (vme->exitcode == VM_EXITCODE_SUSPENDED) { + how = vme->u.suspended.how; + switch (how) { + case VM_SUSPEND_POWEROFF: + vm_handle_poweroff(vm, vcpuid); + break; + default: + /* Nothing to do */ + ; + } + } + +out: + return 
(error); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + int error, vcpuid; + register_t pc; + struct vm_exit *vme; + bool retu; + void *rvc, *sc; + + vcpuid = vmrun->cpuid; + pc = vmrun->pc; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + rvc = sc = NULL; +restart: + critical_enter(); + error = VMRUN(vm->cookie, vcpuid, pc, NULL, rvc, sc); + critical_exit(); + + vme = vm_exitinfo(vm, vcpuid); + if (error == 0) { + retu = false; + switch (vme->exitcode) { + case VM_EXITCODE_INST_EMUL: + pc = vme->pc + vme->inst_length; + error = vm_handle_inst_emul(vm, vcpuid, &retu); + break; + + case VM_EXITCODE_REG_EMUL: + pc = vme->pc + vme->inst_length; + error = vm_handle_reg_emul(vm, vcpuid, &retu); + break; + + case VM_EXITCODE_HVC: + /* + * The HVC instruction saves the address for the + * next instruction as the return address. + */ + pc = vme->pc; + /* + * The PSCI call can change the exit information in the + * case of suspend/reset/poweroff/cpu off/cpu on. 
+ */ + error = psci_handle_call(vm, vcpuid, vme, &retu); + break; + + case VM_EXITCODE_WFI: + pc = vme->pc + vme->inst_length; + error = vm_handle_wfi(vm, vcpuid, vme, &retu); + break; + + default: + /* Handle in userland */ + retu = true; + break; + } + } + + if (error == 0 && retu == false) + goto restart; + + /* Copy the exit information */ + bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); + + return (error); +} + +int +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EBUSY); + + CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); + return (0); + +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} + +static int +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, + bool from_idle) +{ + int error; + + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. 
+ */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error) + return (EBUSY); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE) + wakeup(&vcpu->state); + + return (0); +} + +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + int error; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vcpu, newstate, from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) +{ + struct vcpu *vcpu; + enum vcpu_state state; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +uint64_t +vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t len) +{ + uint64_t nextpage; + + nextpage = 
trunc_page(gpa + PAGE_SIZE); + if (len > nextpage - gpa) + panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%zu", gpa, len); + + return (VMMMAP_GET(vm->cookie, gpa)); +} + +int +vm_gpabase2memseg(struct vm *vm, uint64_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + *seg = vm->mem_segs[i]; + return (0); + } + } + return (-1); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error) + return (error); + + vcpu = &vm->vcpu[vcpuid]; + vcpu->nextpc = val; + + return(0); +} + +void * +vm_get_cookie(struct vm *vm) +{ + return vm->cookie; +} + +static void +vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) +{ + size_t len; + uint64_t hpa; + + len = 0; + while (len < seg->len) { + hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE); + if (hpa == (uint64_t)-1) { + panic("vm_free_mem_segs: cannot free hpa " + "associated with gpa 0x%016lx", seg->gpa + len); + } + + vmm_mem_free(hpa, PAGE_SIZE); + + len += PAGE_SIZE; + } + + bzero(seg, sizeof(struct vm_memory_segment)); +} + +/* + * Return true if 'gpa' is available for allocation, false otherwise + */ +static bool +vm_ipa_available(struct vm *vm, uint64_t ipa) +{ + uint64_t ipabase, ipalimit; + int i; + + if (!page_aligned(ipa)) + panic("vm_ipa_available: ipa (0x%016lx) not page aligned", ipa); + + for (i = 0; i < vm->num_mem_segs; i++) { + ipabase = vm->mem_segs[i].gpa; + ipalimit = ipabase + vm->mem_segs[i].len; + if (ipa >= ipabase && ipa < ipalimit) + return 
(false); + } + + return (true); +} + +/* + * Allocate 'len' bytes for the virtual machine starting at address 'ipa' + */ +int +vm_malloc(struct vm *vm, uint64_t ipa, size_t len) +{ + struct vm_memory_segment *seg; + int error, available, allocated; + uint64_t ipa2; + vm_paddr_t pa; + + if (!page_aligned(ipa) != 0 || !page_aligned(len) || len == 0) + return (EINVAL); + + available = allocated = 0; + ipa2 = ipa; + while (ipa2 < ipa + len) { + if (vm_ipa_available(vm, ipa2)) + available++; + else + allocated++; + ipa2 += PAGE_SIZE; + } + + /* + * If there are some allocated and some available pages in the address + * range then it is an error. + */ + if (allocated != 0 && available != 0) + return (EINVAL); + + /* + * If the entire address range being requested has already been + * allocated then there isn't anything more to do. + */ + if (allocated != 0 && available == 0) + return (0); + + if (vm->num_mem_segs == VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + + seg = &vm->mem_segs[vm->num_mem_segs]; + error = 0; + seg->gpa = ipa; + seg->len = 0; + while (seg->len < len) { + pa = vmm_mem_alloc(PAGE_SIZE); + if (pa == 0) { + error = ENOMEM; + break; + } + VMMMAP_SET(vm->cookie, ipa, pa, PAGE_SIZE, VM_PROT_ALL); + + seg->len += PAGE_SIZE; + ipa += PAGE_SIZE; + } + vm->num_mem_segs++; + + return (0); +} + +int +vm_attach_vgic(struct vm *vm, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size) +{ + int error; + + error = vgic_v3_attach_to_vm(vm->cookie, dist_start, dist_size, + redist_start, redist_size); + + return (error); +} + +int +vm_assert_irq(struct vm *vm, uint32_t irq) +{ + struct hyp *hyp = (struct hyp *)vm->cookie; + int error; + + /* TODO: this is crap, send the vcpuid as an argument to vm_assert_irq */ + error = vgic_v3_inject_irq(&hyp->ctx[0], irq, VGIC_IRQ_VIRTIO); + + return (error); +} + +int +vm_deassert_irq(struct vm *vm, uint32_t irq) +{ + int error; + + error = vgic_v3_remove_irq(vm->cookie, irq, false); + + return (error); 
+} + +static int +vm_handle_wfi(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu) +{ + struct vcpu *vcpu; + struct hypctx *hypctx; + bool intr_disabled; + + vcpu = &vm->vcpu[vcpuid]; + hypctx = vme->u.wfi.hypctx; + intr_disabled = !(hypctx->regs.spsr & PSR_I); + + vcpu_lock(vcpu); + while (1) { + if (!intr_disabled && vgic_v3_vcpu_pending_irq(hypctx)) + break; + + if (vcpu_should_yield(vm, vcpuid)) + break; + + vcpu_set_state_locked(vcpu, VCPU_SLEEPING, false); + msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz); + vcpu_set_state_locked(vcpu, VCPU_FROZEN, false); + } + vcpu_unlock(vcpu); + + *retu = false; + return (0); +} Index: sys/arm64/vmm/vmm_dev.c =================================================================== --- /dev/null +++ sys/arm64/vmm/vmm_dev.c @@ -0,0 +1,404 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+/* Per-VM state behind each vmm character device. */
+struct vmmdev_softc {
+	struct vm	*vm;		/* vm instance cookie */
+	struct cdev	*cdev;
+	SLIST_ENTRY(vmmdev_softc) link;
+	int		flags;
+};
+#define	VSC_LINKED		0x01	/* softc is on the 'head' list */
+
+static SLIST_HEAD(, vmmdev_softc) head;
+
+static struct mtx vmmdev_mtx;
+
+static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
+
+SYSCTL_DECL(_hw_vmm);
+
+/*
+ * Find the softc whose VM is called 'name'.  Returns NULL when no entry on
+ * the list matches.  Callers in this file hold 'vmmdev_mtx' around the call.
+ */
+static struct vmmdev_softc *
+vmmdev_lookup(const char *name)
+{
+	struct vmmdev_softc *sc;
+
+#ifdef notyet	/* XXX kernel is not compiled with invariants */
+	mtx_assert(&vmmdev_mtx, MA_OWNED);
+#endif
+
+	SLIST_FOREACH(sc, &head, link) {
+		if (strcmp(name, vm_name(sc->vm)) == 0)
+			break;
+	}
+
+	return (sc);
+}
+
+/*
+ * Return the softc stored in the cdev's 'si_drv1' (NULL until
+ * sysctl_vmm_create() has filled it in).
+ */
+static struct vmmdev_softc *
+vmmdev_lookup2(struct cdev *cdev)
+{
+
+	return (cdev->si_drv1);
+}
+
+/*
+ * read(2)/write(2) entry point: transfers no data and reports success.
+ */
+static int
+vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
+{
+	int error = 0;
+
+	return (error);
+}
+
+/*
+ * ioctl(2) entry point for a vmm device.
+ *
+ * The first switch freezes the vcpu(s) a command needs stopped; the second
+ * switch dispatches the command; afterwards the frozen vcpus are returned to
+ * VCPU_IDLE.  Every command must be dispatched from the second switch, whose
+ * default case reports ENOTTY.
+ *
+ * Returns ENXIO when the cdev has no softc, EINVAL for an out-of-range vcpu,
+ * ENOTTY for an unknown command and the handler's result otherwise.
+ */
+static int
+vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
+    struct thread *td)
+{
+	int error, vcpu, state_changed;
+	struct vmmdev_softc *sc;
+	struct vm_run *vmrun;
+	struct vm_memory_segment *seg;
+	struct vm_register *vmreg;
+	struct vm_activate_cpu *vac;
+	struct vm_attach_vgic *vav;
+	struct vm_irq *vi;
+
+	sc = vmmdev_lookup2(cdev);
+	if (sc == NULL)
+		return (ENXIO);
+
+	error = 0;
+	vcpu = -1;
+	state_changed = 0;
+
+	/*
+	 * Some VMM ioctls can operate only on vcpus that are not running.
+	 */
+	switch (cmd) {
+	case VM_RUN:
+	case VM_GET_REGISTER:
+	case VM_SET_REGISTER:
+		/*
+		 * XXX fragile, handle with care
+		 * Assumes that the first field of the ioctl data is the vcpu.
+		 */
+		vcpu = *(int *)data;
+		if (vcpu < 0 || vcpu >= VM_MAXCPU) {
+			error = EINVAL;
+			goto done;
+		}
+
+		error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
+		if (error)
+			goto done;
+
+		state_changed = 1;
+		break;
+
+	case VM_MAP_MEMORY:
+	case VM_ATTACH_VGIC:
+		/*
+		 * ioctls that operate on the entire virtual machine must
+		 * prevent all vcpus from running.
+		 */
+		error = 0;
+		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
+			error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
+			if (error)
+				break;
+		}
+
+		if (error) {
+			/* Thaw only the vcpus frozen before the failure. */
+			vcpu--;
+			while (vcpu >= 0) {
+				vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
+				vcpu--;
+			}
+			goto done;
+		}
+
+		state_changed = 2;
+		break;
+	default:
+		break;
+	}
+
+	switch (cmd) {
+	case VM_RUN:
+		vmrun = (struct vm_run *)data;
+		error = vm_run(sc->vm, vmrun);
+		break;
+	case VM_MAP_MEMORY:
+		seg = (struct vm_memory_segment *)data;
+		error = vm_malloc(sc->vm, seg->gpa, seg->len);
+		break;
+	case VM_GET_MEMORY_SEG:
+		seg = (struct vm_memory_segment *)data;
+		seg->len = 0;
+		(void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
+		error = 0;
+		break;
+	case VM_GET_REGISTER:
+		vmreg = (struct vm_register *)data;
+		error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+		    &vmreg->regval);
+		break;
+	case VM_SET_REGISTER:
+		vmreg = (struct vm_register *)data;
+		error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+		    vmreg->regval);
+		break;
+	case VM_ACTIVATE_CPU:
+		vac = (struct vm_activate_cpu *)data;
+		error = vm_activate_cpu(sc->vm, vac->vcpuid);
+		break;
+	case VM_ATTACH_VGIC:
+		vav = (struct vm_attach_vgic *)data;
+		error = vm_attach_vgic(sc->vm, vav->dist_start, vav->dist_size,
+		    vav->redist_start, vav->redist_size);
+		break;
+	/*
+	 * The IRQ commands are dispatched here, not in the freeze switch
+	 * above, so that their result is not replaced by the default case.
+	 */
+	case VM_ASSERT_IRQ:
+		vi = (struct vm_irq *)data;
+		error = vm_assert_irq(sc->vm, vi->irq);
+		break;
+	case VM_DEASSERT_IRQ:
+		vi = (struct vm_irq *)data;
+		error = vm_deassert_irq(sc->vm, vi->irq);
+		break;
+	default:
+		error = ENOTTY;
+		break;
+	}
+
+	if (state_changed == 1) {
+		vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
+	} else if (state_changed == 2) {
+		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
+			vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
+	}
+
+done:
+	/* Make sure that no handler returns a bogus value like ERESTART */
+	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
+	return (error);
+}
+
+/*
+ * mmap(2) entry point: translate the guest physical address 'offset' with
+ * vm_gpa2hpa() and store the result in '*paddr'.  Executable mappings are
+ * refused.  Returns 0 on success and -1 otherwise.
+ */
+static int
+vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
+    int nprot, vm_memattr_t *memattr)
+{
+	int error;
+	struct vmmdev_softc *sc;
+
+	error = -1;
+	mtx_lock(&vmmdev_mtx);
+
+	sc = vmmdev_lookup2(cdev);
+	if (sc != NULL && !(nprot & PROT_EXEC)) {
+		*paddr = (vm_paddr_t)vm_gpa2hpa(sc->vm, (vm_paddr_t)offset,
+		    PAGE_SIZE);
+		if (*paddr != (vm_paddr_t)-1)
+			error = 0;
+	}
+
+	mtx_unlock(&vmmdev_mtx);
+
+	return (error);
+}
+
+/*
+ * Tear down a softc: destroy its cdev and VM when present, unlink it from
+ * the list if it was linked, and free it.  'arg' is the softc.
+ */
+static void
+vmmdev_destroy(void *arg)
+{
+
+	struct vmmdev_softc *sc = arg;
+
+	if (sc->cdev != NULL)
+		destroy_dev(sc->cdev);
+
+	if (sc->vm != NULL)
+		vm_destroy(sc->vm);
+
+	if ((sc->flags & VSC_LINKED) != 0) {
+		mtx_lock(&vmmdev_mtx);
+		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
+		mtx_unlock(&vmmdev_mtx);
+	}
+
+	free(sc, M_VMMDEV);
+}
+
+static int
+sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	char buf[VM_MAX_NAMELEN];
+	struct vmmdev_softc *sc;
+	struct cdev *cdev;
+
+	strlcpy(buf, "beavis", sizeof(buf));
+	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	mtx_lock(&vmmdev_mtx);
+	sc = vmmdev_lookup(buf);
+	if (sc == NULL || sc->cdev == NULL) {
+		mtx_unlock(&vmmdev_mtx);
+		return (EINVAL);
+	}
+
+	/*
+	 * The 'cdev' will be destroyed asynchronously when 'si_threadcount'
+	 * goes down to 0 so we should not do
it again in the callback.
+	 */
+	cdev = sc->cdev;
+	sc->cdev = NULL;
+	mtx_unlock(&vmmdev_mtx);
+
+	/*
+	 * Schedule the 'cdev' to be destroyed:
+	 *
+	 * - any new operations on this 'cdev' will return an error (ENXIO).
+	 *
+	 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will
+	 *   be destroyed and the callback will be invoked in a taskqueue
+	 *   context.
+	 */
+	destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
+
+	return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
+    NULL, 0, sysctl_vmm_destroy, "A", NULL);
+
+/* Character device entry points shared by every vmm device. */
+static struct cdevsw vmmdevsw = {
+	.d_name		= "vmmdev",
+	.d_version	= D_VERSION,
+	.d_ioctl	= vmmdev_ioctl,
+	.d_mmap		= vmmdev_mmap,
+	.d_read		= vmmdev_rw,
+	.d_write	= vmmdev_rw,
+};
+
+/*
+ * Handler for the 'create' sysctl: writing a name creates a virtual machine
+ * of that name together with its "vmm/<name>" device node.
+ *
+ * Returns the sysctl_handle_string() result when it fails or when no new
+ * value was supplied, EEXIST when a VM of that name is already on the list,
+ * and otherwise the vm_create()/make_dev_p() error or 0 on success.
+ */
+static int
+sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	struct vm *vm;
+	struct cdev *cdev;
+	struct vmmdev_softc *sc, *sc2;
+	char buf[VM_MAX_NAMELEN];
+
+	/* "beavis" is the value reported when the sysctl is only read. */
+	strlcpy(buf, "beavis", sizeof(buf));
+	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	mtx_lock(&vmmdev_mtx);
+	sc = vmmdev_lookup(buf);
+	mtx_unlock(&vmmdev_mtx);
+	if (sc != NULL)
+		return (EEXIST);
+
+	error = vm_create(buf, &vm);
+	if (error != 0)
+		return (error);
+
+	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
+	sc->vm = vm;
+
+	/*
+	 * Lookup the name again just in case somebody sneaked in when we
+	 * dropped the lock.
+	 */
+	mtx_lock(&vmmdev_mtx);
+	sc2 = vmmdev_lookup(buf);
+	if (sc2 == NULL) {
+		SLIST_INSERT_HEAD(&head, sc, link);
+		sc->flags |= VSC_LINKED;
+	}
+	mtx_unlock(&vmmdev_mtx);
+
+	/* Lost the race: drop the VM and softc that were just set up. */
+	if (sc2 != NULL) {
+		vmmdev_destroy(sc);
+		return (EEXIST);
+	}
+
+	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
+	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
+	if (error != 0) {
+		vmmdev_destroy(sc);
+		return (error);
+	}
+
+	/* Tie the device node and the softc together. */
+	mtx_lock(&vmmdev_mtx);
+	sc->cdev = cdev;
+	sc->cdev->si_drv1 = sc;
+	mtx_unlock(&vmmdev_mtx);
+
+	return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
+    NULL, 0, sysctl_vmm_create, "A", NULL);
+
+/*
+ * Initialize the mutex that protects the list of vmm devices.
+ */
+void
+vmmdev_init(void)
+{
+	mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
+}
+
+/*
+ * Report whether the vmm device layer can go away: returns 0 when no softc
+ * is left on the list and EBUSY otherwise.
+ */
+int
+vmmdev_cleanup(void)
+{
+	int error;
+
+	if (SLIST_EMPTY(&head))
+		error = 0;
+	else
+		error = EBUSY;
+
+	return (error);
+}
Index: sys/arm64/vmm/vmm_instruction_emul.c
===================================================================
--- /dev/null
+++ sys/arm64/vmm/vmm_instruction_emul.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2015 Mihai Carabas
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#else
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#endif
+
+#include
+
+/*
+ * Emulate a decoded memory access at guest physical address 'gpa'.
+ *
+ * For VM_DIR_READ the value obtained from 'memread' is stored in register
+ * 'vie->reg'; for any other direction the register is fetched and handed to
+ * 'memwrite'.  The first non-zero error from either step is returned.
+ */
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+    mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
+{
+	uint64_t data;
+	int rc;
+
+	if (vie->dir != VM_DIR_READ) {
+		/* Register -> memory. */
+		rc = vm_get_register(vm, vcpuid, vie->reg, &data);
+		if (rc != 0)
+			return (rc);
+		return (memwrite(vm, vcpuid, gpa, data, vie->access_size,
+		    memarg));
+	}
+
+	/* Memory -> register. */
+	rc = memread(vm, vcpuid, gpa, &data, vie->access_size, memarg);
+	if (rc != 0)
+		return (rc);
+	return (vm_set_register(vm, vcpuid, vie->reg, data));
+}
+
+/*
+ * Emulate a decoded register access.
+ *
+ * For VM_DIR_READ the value obtained from 'regread' is stored in register
+ * 'vre->reg'; for any other direction the register is fetched and handed to
+ * 'regwrite'.  The first non-zero error from either step is returned.
+ */
+int
+vmm_emulate_register(void *vm, int vcpuid, struct vre *vre, reg_read_t regread,
+    reg_write_t regwrite, void *regarg)
+{
+	uint64_t data;
+	int rc;
+
+	if (vre->dir != VM_DIR_READ) {
+		/* Guest register -> emulated register. */
+		rc = vm_get_register(vm, vcpuid, vre->reg, &data);
+		if (rc != 0)
+			return (rc);
+		return (regwrite(vm, vcpuid, data, regarg));
+	}
+
+	/* Emulated register -> guest register. */
+	rc = regread(vm, vcpuid, &data, regarg);
+	if (rc != 0)
+		return (rc);
+	return (vm_set_register(vm, vcpuid, vre->reg, data));
+}
Index: sys/arm64/vmm/vmm_mem.h
===================================================================
--- /dev/null
+++ sys/arm64/vmm/vmm_mem.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2015 Mihai Carabas
+ * All rights
reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef	_VMM_MEM_H_
+#define	_VMM_MEM_H_
+
+/* Page-granular allocator for guest memory; see vmm_mem.c. */
+int		vmm_mem_init(void);
+vm_paddr_t	vmm_mem_alloc(size_t size);
+void		vmm_mem_free(vm_paddr_t start, size_t size);
+vm_paddr_t	vmm_mem_maxaddr(void);
+
+#endif
Index: sys/arm64/vmm/vmm_mem.c
===================================================================
--- /dev/null
+++ sys/arm64/vmm/vmm_mem.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2015 Mihai Carabas
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "vmm_mem.h"
+
+SYSCTL_DECL(_hw_vmm);
+
+/* Number of pages currently handed out by vmm_mem_alloc(). */
+static u_long pages_allocated;
+SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
+    &pages_allocated, 0, "4KB pages allocated");
+
+/*
+ * Adjust the allocation counter by 'howmany' pages (negative to release).
+ */
+static void
+update_pages_allocated(int howmany)
+{
+	pages_allocated += howmany;	/* XXX locking? */
+}
+
+/*
+ * Nothing to set up; always returns 0.
+ */
+int
+vmm_mem_init(void)
+{
+
+	return (0);
+}
+
+/*
+ * Allocate one wired, zeroed page and return its physical address.
+ * 'size' must be exactly PAGE_SIZE, anything else panics.  The call waits
+ * in vm_wait() and retries for as long as no page can be allocated.
+ */
+vm_paddr_t
+vmm_mem_alloc(size_t size)
+{
+	vm_page_t pg;
+	int req;
+
+	if (size != PAGE_SIZE)
+		panic("vmm_mem_alloc: invalid allocation size %zu", size);
+
+	req = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+	    VM_ALLOC_ZERO;
+
+	/*
+	 * XXX need policy to determine when to back off the allocation
+	 */
+	while ((pg = vm_page_alloc(NULL, 0, req)) == NULL)
+		vm_wait(NULL);
+
+	/* VM_ALLOC_ZERO is only a preference; zero the page if needed. */
+	if ((pg->flags & PG_ZERO) == 0)
+		pmap_zero_page(pg);
+
+	pg->valid = VM_PAGE_BITS_ALL;
+	update_pages_allocated(1);
+
+	return (VM_PAGE_TO_PHYS(pg));
+}
+
+/*
+ * Release one page previously obtained from vmm_mem_alloc().  'base' must be
+ * page aligned and 'length' must be exactly PAGE_SIZE, anything else panics.
+ */
+void
+vmm_mem_free(vm_paddr_t base, size_t length)
+{
+	vm_page_t pg;
+
+	if ((base & PAGE_MASK) != 0)
+		panic("vmm_mem_free: base 0x%0lx must be aligned on a "
+		    "0x%0x boundary\n", base, PAGE_SIZE);
+
+	if (length != PAGE_SIZE)
+		panic("vmm_mem_free: invalid length %zu", length);
+
+	pg = PHYS_TO_VM_PAGE(base);
+	vm_page_unwire_noq(pg);
+	vm_page_free(pg);
+
+	update_pages_allocated(-1);
+}
+
+/*
+ * Return 'Maxmem' converted from pages to a byte address.
+ */
+vm_paddr_t
+vmm_mem_maxaddr(void)
+{
+
+	return (ptoa(Maxmem));
+}
Index: sys/arm64/vmm/vmm_stat.h
===================================================================
--- /dev/null
+++ sys/arm64/vmm/vmm_stat.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2015 Mihai Carabas
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _VMM_STAT_H_
+#define	_VMM_STAT_H_
+
+struct vm;
+
+#define	MAX_VMM_STAT_ELEMS	64		/* arbitrary */
+
+enum vmm_stat_scope {
+	VMM_STAT_SCOPE_ANY,
+	VMM_STAT_SCOPE_INTEL,		/* Intel VMX specific statistic */
+	VMM_STAT_SCOPE_AMD,		/* AMD SVM specific statistic */
+};
+
+struct vmm_stat_type;
+/* Optional callback run by vmm_stat_copy() before the stats are copied. */
+typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu,
+    struct vmm_stat_type *stat);
+
+struct vmm_stat_type {
+	int	index;			/* position in the stats buffer */
+	int	nelems;			/* standalone or array */
+	const char *desc;		/* description of statistic */
+	vmm_stat_func_t func;
+	enum vmm_stat_scope scope;
+};
+
+void	vmm_stat_register(void *arg);
+
+/*
+ * Define a statistic type (index -1 until registered) and arrange for
+ * vmm_stat_register() to be called on it through SYSINIT.
+ */
+#define	VMM_STAT_FDEFINE(type, nelems, desc, func, scope)		\
+	struct vmm_stat_type type[1] = {				\
+		{ -1, nelems, desc, func, scope }			\
+	};								\
+	SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type)
+
+#define	VMM_STAT_DEFINE(type, nelems, desc, scope)			\
+	VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope)
+
+#define	VMM_STAT_DECLARE(type)						\
+	extern struct vmm_stat_type type[1]
+
+#define	VMM_STAT(type, desc)		\
+	VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY)
+#define	VMM_STAT_INTEL(type, desc)	\
+	VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL)
+#define	VMM_STAT_AMD(type, desc)	\
+	VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD)
+
+#define	VMM_STAT_FUNC(type, desc, func)	\
+	VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY)
+
+#define	VMM_STAT_ARRAY(type, nelems, desc)	\
+	VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY)
+
+void	*vmm_stat_alloc(void);
+void	vmm_stat_init(void *vp);
+void	vmm_stat_free(void *vp);
+
+/*
+ * 'buf' must have room for at least 'MAX_VMM_STAT_ELEMS' entries.
+ */
+int	vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf);
+int	vmm_stat_desc_copy(int index, char *buf, int buflen);
+
+/*
+ * Add 'x' to element 'statidx' of statistic 'vst' for the given vcpu.
+ * Unregistered statistics (index < 0) and out-of-range element indices are
+ * ignored.  Compiles to nothing unless VMM_KEEP_STATS is defined.
+ */
+static void __inline
+vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst,
+    int statidx, uint64_t x)
+{
+#ifdef VMM_KEEP_STATS
+	uint64_t *stats;
+
+	stats = vcpu_stats(vm, vcpu);
+
+	/* 'statidx' is signed: a negative value must not index the buffer. */
+	if (vst->index >= 0 && statidx >= 0 && statidx < vst->nelems)
+		stats[vst->index + statidx] += x;
+#endif
+}
+
+/*
+ * Set element 'statidx' of statistic 'vst' for the given vcpu to 'val'.
+ * Unregistered statistics (index < 0) and out-of-range element indices are
+ * ignored.  Compiles to nothing unless VMM_KEEP_STATS is defined.
+ */
+static void __inline
+vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst,
+    int statidx, uint64_t val)
+{
+#ifdef VMM_KEEP_STATS
+	uint64_t *stats;
+
+	stats = vcpu_stats(vm, vcpu);
+
+	/* 'statidx' is signed: a negative value must not index the buffer. */
+	if (vst->index >= 0 && statidx >= 0 && statidx < vst->nelems)
+		stats[vst->index + statidx] = val;
+#endif
+}
+
+/*
+ * Add 'x' to the first (or only) element of statistic 'vst'.
+ */
+static void __inline
+vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
+{
+
+#ifdef VMM_KEEP_STATS
+	vmm_stat_array_incr(vm, vcpu, vst, 0, x);
+#endif
+}
+
+/*
+ * Set the first (or only) element of statistic 'vst' to 'val'.
+ */
+static void __inline
+vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val)
+{
+
+#ifdef VMM_KEEP_STATS
+	vmm_stat_array_set(vm, vcpu, vst, 0, val);
+#endif
+}
+
+VMM_STAT_DECLARE(VCPU_MIGRATIONS);
+VMM_STAT_DECLARE(VMEXIT_COUNT);
+VMM_STAT_DECLARE(VMEXIT_EXTINT);
+VMM_STAT_DECLARE(VMEXIT_HLT);
+VMM_STAT_DECLARE(VMEXIT_CR_ACCESS);
+VMM_STAT_DECLARE(VMEXIT_RDMSR);
+VMM_STAT_DECLARE(VMEXIT_WRMSR);
+VMM_STAT_DECLARE(VMEXIT_MTRAP);
+VMM_STAT_DECLARE(VMEXIT_PAUSE);
+VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW);
+VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW);
+VMM_STAT_DECLARE(VMEXIT_INOUT);
+VMM_STAT_DECLARE(VMEXIT_CPUID);
+VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT);
+VMM_STAT_DECLARE(VMEXIT_INST_EMUL);
+VMM_STAT_DECLARE(VMEXIT_UNKNOWN);
+VMM_STAT_DECLARE(VMEXIT_ASTPENDING);
+VMM_STAT_DECLARE(VMEXIT_USERSPACE);
+VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS);
+VMM_STAT_DECLARE(VMEXIT_EXCEPTION);
+#endif
Index: sys/arm64/vmm/vmm_stat.c
===================================================================
--- /dev/null
+++ sys/arm64/vmm/vmm_stat.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2015 Mihai Carabas
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include "vmm_stat.h"
+
+/*
+ * 'vst_num_elems' is the total number of addressable statistic elements
+ * 'vst_num_types' is the number of unique statistic types
+ *
+ * It is always true that 'vst_num_elems' is greater than or equal to
+ * 'vst_num_types'. This is because a stat type may represent more than
+ * one element (for e.g. VMM_STAT_ARRAY).
+ */
+static int vst_num_elems, vst_num_types;
+static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS];
+
+static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
+
+/* Size in bytes of one per-vcpu statistics buffer. */
+#define	vst_size	((size_t)vst_num_elems * sizeof(uint64_t))
+
+/*
+ * Register the statistic type passed in 'arg': assign it the next free
+ * range of 'nelems' slots in the stats buffer and record it in 'vsttab'.
+ * Types without a description, without elements, or that do not fit in
+ * MAX_VMM_STAT_ELEMS are left unregistered (their index stays untouched).
+ */
+void
+vmm_stat_register(void *arg)
+{
+	struct vmm_stat_type *vst = arg;
+
+	/* We require all stats to identify themselves with a description */
+	if (vst->desc == NULL)
+		return;
+
+	/*
+	 * Every registered type must own at least one element: this keeps
+	 * 'vst_num_types' <= 'vst_num_elems', which is what bounds the index
+	 * into 'vsttab' below, and keeps 'vst_num_elems' from going down.
+	 */
+	if (vst->nelems <= 0)
+		return;
+
+	/* A type that exactly fills the table still fits. */
+	if (vst_num_elems + vst->nelems > MAX_VMM_STAT_ELEMS) {
+		printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc);
+		return;
+	}
+
+	vst->index = vst_num_elems;
+	vst_num_elems += vst->nelems;
+
+	vsttab[vst_num_types++] = vst;
+}
+
+/*
+ * Copy all statistics of 'vcpu' into 'buf' and store the number of copied
+ * elements in '*num_stats'.  Types with an update callback get it invoked
+ * first.  Returns EINVAL when 'vcpu' is out of range and 0 otherwise.
+ */
+int
+vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
+{
+	struct vmm_stat_type *vst;
+	uint64_t *stats;
+	int i;
+
+	if (vcpu < 0 || vcpu >= VM_MAXCPU)
+		return (EINVAL);
+
+	/* Let stats functions update their counters */
+	for (i = 0; i < vst_num_types; i++) {
+		vst = vsttab[i];
+		if (vst->func != NULL)
+			(*vst->func)(vm, vcpu, vst);
+	}
+
+	/* Copy over the stats */
+	stats = vcpu_stats(vm, vcpu);
+	for (i = 0; i < vst_num_elems; i++)
+		buf[i] = stats[i];
+	*num_stats = vst_num_elems;
+	return (0);
+}
+
+/*
+ * Allocate an uninitialized statistics buffer; see vmm_stat_init().
+ */
+void *
+vmm_stat_alloc(void)
+{
+
+	return (malloc(vst_size, M_VMM_STAT, M_WAITOK));
+}
+
+/*
+ * Zero a statistics buffer obtained from vmm_stat_alloc().
+ */
+void
+vmm_stat_init(void *vp)
+{
+
+	bzero(vp, vst_size);
+}
+
+/*
+ * Free a statistics buffer obtained from vmm_stat_alloc().
+ */
+void
+vmm_stat_free(void *vp)
+{
+	free(vp, M_VMM_STAT);
+}
+
+/*
+ * Copy the description of the statistic element at 'index' into 'buf'.
+ * Elements of array statistics are reported as "desc[n]".  Returns 0 when
+ * the element exists and EINVAL otherwise.
+ */
+int
+vmm_stat_desc_copy(int index, char *buf, int bufsize)
+{
+	int i;
+	struct vmm_stat_type *vst;
+
+	for (i = 0; i < vst_num_types; i++) {
+		vst = vsttab[i];
+		if (index >= vst->index && index < vst->index + vst->nelems) {
+			if (vst->nelems > 1) {
+				snprintf(buf, bufsize, "%s[%d]",
+				    vst->desc, index - vst->index);
+			} else {
+				strlcpy(buf, vst->desc, bufsize);
+			}
+			return (0);	/* found it */
+		}
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * global statistics
+ *
+ * NOTE(review): several of these describe x86 exit reasons (hlt, %cr,
+ * rdmsr/wrmsr, cpuid, in/out); confirm which ones the arm64 code updates.
+ */
+VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
+VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt");
+VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted");
+VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted");
+VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted");
+VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted");
+VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits");
+VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted");
+VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening");
+VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening");
+VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted");
+VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted");
+VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault");
+VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation");
+VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason");
+VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit");
+VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace");
+VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit");
+VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions");
Index: sys/conf/files.arm64
===================================================================
--- sys/conf/files.arm64
+++ sys/conf/files.arm64
@@ -159,6 +159,7 @@
 arm64/arm64/identcpu.c		standard
 arm64/arm64/in_cksum.c		optional	inet | inet6
 arm64/arm64/locore.S		standard no-obj
+arm64/arm64/hyp_stub.S		standard
 arm64/arm64/machdep.c		standard
arm64/arm64/machdep_boot.c standard arm64/arm64/mem.c standard @@ -360,6 +361,7 @@ dev/vnic/thunder_mdio_fdt.c optional vnic fdt dev/vnic/thunder_mdio.c optional vnic dev/vnic/lmac_if.m optional inet | inet6 | vnic +dev/bvm/bvm_console.c optional bvmconsole kern/kern_clocksource.c standard kern/msi_if.m optional intrng kern/pic_if.m optional intrng Index: sys/dev/bvm/bvm_console.c =================================================================== --- sys/dev/bvm/bvm_console.c +++ sys/dev/bvm/bvm_console.c @@ -40,6 +40,11 @@ #include #include +#if defined(__aarch64__) +#include +#include +#endif + #include #include @@ -66,8 +71,13 @@ static int alt_break_state; #endif +#if defined(__i386__) || defined(__amd64__) #define BVM_CONS_PORT 0x220 -static int bvm_cons_port = BVM_CONS_PORT; +#elif defined(__aarch64__) +#define BVM_CONS_PORT 0x090000 +#endif + +static vm_offset_t bvm_cons_port = BVM_CONS_PORT; #define BVM_CONS_SIG ('b' << 8 | 'v') @@ -88,7 +98,12 @@ { int c; +#if defined(__i386__) || defined(__amd64__) c = inl(bvm_cons_port); +#elif defined(__arm__) || defined(__aarch64__) + c = *(int *)bvm_cons_port; +#endif + if (c != -1) { *ch = (u_char)c; return (0); @@ -99,8 +114,11 @@ static void bvm_wcons(u_char ch) { - +#if defined(__i386__) || defined(__amd64__) outl(bvm_cons_port, ch); +#elif defined(__arm__) || defined(__aarch64__) + *(int *)bvm_cons_port = ch; +#endif } static void @@ -170,7 +188,10 @@ static void bvm_cnprobe(struct consdev *cp) { - int disabled, port; + int disabled; +#if defined(__i386__) || defined(__amd64__) + int port; +#endif disabled = 0; cp->cn_pri = CN_DEAD; @@ -178,11 +199,17 @@ resource_int_value("bvmconsole", 0, "disabled", &disabled); if (!disabled) { +#if defined(__i386__) || defined(__amd64__) if (resource_int_value("bvmconsole", 0, "port", &port) == 0) bvm_cons_port = port; if (inw(bvm_cons_port) == BVM_CONS_SIG) +#elif defined(__arm__) || defined(__aarch64__) + bvm_cons_port = (vm_offset_t)pmap_mapdev(bvm_cons_port, 0x1000); + if 
((*(short *)bvm_cons_port) == BVM_CONS_SIG) { +#endif cp->cn_pri = CN_REMOTE; + } } } Index: sys/dts/Makefile =================================================================== --- sys/dts/Makefile +++ sys/dts/Makefile @@ -1,5 +1,5 @@ # $FreeBSD$ -SUBDIR=arm mips powerpc +SUBDIR=arm arm64 mips powerpc .include Index: sys/kern/kern_cons.c =================================================================== --- sys/kern/kern_cons.c +++ sys/kern/kern_cons.c @@ -136,7 +136,6 @@ * Check if we should mute the console (for security reasons perhaps) * It can be changes dynamically using sysctl kern.consmute * once we are up and going. - * */ cn_mute = ((boothowto & (RB_MUTE |RB_SINGLE @@ -174,6 +173,7 @@ cnadd(cn); } } + if (best_cn == NULL) return; if ((boothowto & RB_MULTIPLE) == 0) { Index: sys/modules/Makefile =================================================================== --- sys/modules/Makefile +++ sys/modules/Makefile @@ -600,6 +600,9 @@ _armv8crypto= armv8crypto _em= em _rockchip= rockchip +.if ${MK_BHYVE} != "no" || defined(ALL_MODULES) +_vmm= vmm +.endif .endif .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" @@ -767,6 +770,11 @@ .if ${MACHINE_ARCH} == "armv7" _cfi= cfi _cpsw= cpsw + +.if ${MK_BHYVE} != "no" || defined(ALL_MODULES) +_vmm= vmm +.endif + .endif .if ${MACHINE_CPUARCH} == "powerpc" Index: sys/modules/vmm/Makefile =================================================================== --- sys/modules/vmm/Makefile +++ sys/modules/vmm/Makefile @@ -4,87 +4,10 @@ KMOD= vmm -SRCS= opt_acpi.h opt_bhyve_snapshot.h opt_ddb.h -SRCS+= device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h vnode_if.h -DPSRCS+= vmx_assym.h svm_assym.h -DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc +SRCS= opt_acpi.h opt_ddb.h device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h -CFLAGS+= -DVMM_KEEP_STATS -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/io -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd 
+CFLAGS+= -DVMM_KEEP_STATS -DSMP -# generic vmm support -.PATH: ${SRCTOP}/sys/amd64/vmm -SRCS+= vmm.c \ - vmm_dev.c \ - vmm_host.c \ - vmm_instruction_emul.c \ - vmm_ioport.c \ - vmm_lapic.c \ - vmm_mem.c \ - vmm_stat.c \ - vmm_util.c \ - x86.c - -.PATH: ${SRCTOP}/sys/amd64/vmm/io -SRCS+= iommu.c \ - ppt.c \ - vatpic.c \ - vatpit.c \ - vhpet.c \ - vioapic.c \ - vlapic.c \ - vpmtmr.c \ - vrtc.c - -# intel-specific files -.PATH: ${SRCTOP}/sys/amd64/vmm/intel -SRCS+= ept.c \ - vmcs.c \ - vmx_msr.c \ - vmx_support.S \ - vmx.c \ - vtd.c - -# amd-specific files -.PATH: ${SRCTOP}/sys/amd64/vmm/amd -SRCS+= vmcb.c \ - svm.c \ - svm_support.S \ - npt.c \ - ivrs_drv.c \ - amdvi_hw.c \ - svm_msr.c - -.if ${KERN_OPTS:MBHYVE_SNAPSHOT} != "" -SRCS+= vmm_snapshot.c -.endif - -CLEANFILES= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o - -OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h -OBJS_DEPEND_GUESS.svm_support.o+= svm_assym.h - -vmx_assym.h: vmx_genassym.o - sh ${SYSDIR}/kern/genassym.sh vmx_genassym.o > ${.TARGET} - -svm_assym.h: svm_genassym.o - sh ${SYSDIR}/kern/genassym.sh svm_genassym.o > ${.TARGET} - -vmx_support.o: - ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ - ${.IMPSRC} -o ${.TARGET} - -svm_support.o: - ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ - ${.IMPSRC} -o ${.TARGET} - -vmx_genassym.o: offset.inc - ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} - -svm_genassym.o: offset.inc - ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} +.include .include Index: sys/modules/vmm/Makefile.amd64 =================================================================== --- /dev/null +++ sys/modules/vmm/Makefile.amd64 @@ -0,0 +1,77 @@ +# $FreeBSD$ + +DPSRCS+= vmx_assym.h svm_assym.h +DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc + +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/io +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd + +# generic vmm support +.PATH: 
${SRCTOP}/sys/amd64/vmm
+SRCS+=	vmm.c			\
+	vmm_dev.c		\
+	vmm_host.c		\
+	vmm_instruction_emul.c	\
+	vmm_ioport.c		\
+	vmm_lapic.c		\
+	vmm_mem.c		\
+	vmm_stat.c		\
+	vmm_util.c		\
+	x86.c
+
+.PATH: ${SRCTOP}/sys/amd64/vmm/io
+SRCS+=	iommu.c		\
+	ppt.c		\
+	vatpic.c	\
+	vatpit.c	\
+	vhpet.c		\
+	vioapic.c	\
+	vlapic.c	\
+	vpmtmr.c	\
+	vrtc.c
+
+# intel-specific files
+.PATH: ${SRCTOP}/sys/amd64/vmm/intel
+SRCS+=	ept.c		\
+	vmcs.c		\
+	vmx_msr.c	\
+	vmx_support.S	\
+	vmx.c		\
+	vtd.c
+
+# amd-specific files
+.PATH: ${SRCTOP}/sys/amd64/vmm/amd
+SRCS+=	vmcb.c		\
+	svm.c		\
+	svm_support.S	\
+	npt.c		\
+	ivrs_drv.c	\
+	amdvi_hw.c	\
+	svm_msr.c
+
+CLEANFILES=	vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o
+
+OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h
+OBJS_DEPEND_GUESS.svm_support.o+= svm_assym.h
+
+vmx_assym.h:	vmx_genassym.o
+	sh ${SYSDIR}/kern/genassym.sh vmx_genassym.o > ${.TARGET}
+
+svm_assym.h:	svm_genassym.o
+	sh ${SYSDIR}/kern/genassym.sh svm_genassym.o > ${.TARGET}
+
+vmx_support.o:
+	${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
+	    ${.IMPSRC} -o ${.TARGET}
+
+svm_support.o:
+	${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
+	    ${.IMPSRC} -o ${.TARGET}
+# Keep -fcommon on the genassym objects, as the pre-split Makefile did.
+vmx_genassym.o: offset.inc
+	${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC}
+
+svm_genassym.o: offset.inc
+	${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC}
Index: sys/modules/vmm/Makefile.arm64
===================================================================
--- /dev/null
+++ sys/modules/vmm/Makefile.arm64
@@ -0,0 +1,30 @@
+DPSRCS+=	hyp_assym.h
+DPSRCS+=	hyp_genassym.c
+
+CFLAGS+=	-I${.CURDIR}/../../arm64/vmm -I${.CURDIR}/../../arm64/include
+
+# generic vmm support
+.PATH: ${.CURDIR}/../../arm64/vmm
+SRCS+=	vmm.c			\
+	vmm_dev.c		\
+	vmm_instruction_emul.c	\
+	vmm_mem.c		\
+	mmu.c			\
+	vmm_stat.c		\
+	arm64.c			\
+	psci.c			\
+	reset.c			\
+	hyp.S
+
+.PATH: ${.CURDIR}/../../arm64/vmm/io
+SRCS+=	vgic_v3.c	\
+	vgic_v3_mmio.c	\
+	vtimer.c
+
+CLEANFILES=	hyp_assym.h hyp_genassym.o
+
+hyp_assym.h: hyp_genassym.o
+	sh
${SYSDIR}/kern/genassym.sh hyp_genassym.o > ${.TARGET}
+
+hyp_genassym.o:
+	${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC}
Index: sys/sys/bitstring.h
===================================================================
--- sys/sys/bitstring.h
+++ sys/sys/bitstring.h
@@ -155,6 +155,24 @@
 	_bitstr[_bit_idx(_bit)] &= ~_bit_mask(_bit);
 }
 
+/* Set bit N and return what bit_test() reported for it first (not atomic). */
+static inline int
+bit_test_and_set(bitstr_t *_bitstr, int _bit)
+{
+	int _result = bit_test(_bitstr, _bit);
+	bit_set(_bitstr, _bit);
+	return _result;
+}
+
+/* Clear bit N and return what bit_test() reported for it first (not atomic). */
+static inline int
+bit_test_and_clear(bitstr_t *_bitstr, int _bit)
+{
+	int _result = bit_test(_bitstr, _bit);
+	bit_clear(_bitstr, _bit);
+	return _result;
+}
+
 /* Set bits start ... stop inclusive in bit string. */
 static inline void
 bit_nset(bitstr_t *_bitstr, int _start, int _stop)
@@ -419,4 +437,35 @@
 	*_result = _value;
 }
 
+/* dst = src1 & src2 over _nbits bits; a trailing partial word is masked. */
+static inline void
+bitstr_and(bitstr_t *_dstbitstr, const bitstr_t *_src1bitstr,
+    const bitstr_t *_src2bitstr, int _nbits)
+{
+	bitstr_t _mask;
+	while (_nbits >= (int)_BITSTR_BITS) {
+		*_dstbitstr++ = *_src1bitstr++ & *_src2bitstr++;
+		_nbits -= _BITSTR_BITS;
+	}
+	if (_nbits > 0) {
+		_mask = _bit_make_mask(0, _bit_offset(_nbits - 1));
+		*_dstbitstr = (*_src1bitstr & *_src2bitstr) & _mask;
+	}
+}
+
+/* dst = src1 | src2 over _nbits bits; a trailing partial word is masked. */
+static inline void
+bitstr_or(bitstr_t *_dstbitstr, const bitstr_t *_src1bitstr,
+    const bitstr_t *_src2bitstr, int _nbits)
+{
+	bitstr_t _mask;
+	while (_nbits >= (int)_BITSTR_BITS) {
+		*_dstbitstr++ = *_src1bitstr++ | *_src2bitstr++;
+		_nbits -= _BITSTR_BITS;
+	}
+	if (_nbits > 0) {
+		_mask = _bit_make_mask(0, _bit_offset(_nbits - 1));
+		*_dstbitstr = (*_src1bitstr | *_src2bitstr) & _mask;
+	}
+}
 #endif /* _SYS_BITSTRING_H_ */
Index: usr.sbin/Makefile.arm
===================================================================
--- usr.sbin/Makefile.arm
+++ usr.sbin/Makefile.arm
@@
-2,3 +2,8 @@ SUBDIR+= kgmon SUBDIR+= ofwdump + +.if ${MK_BHYVE} != "no" +SUBDIR+= bhyve +SUBDIR+= bhyveload +.endif Index: usr.sbin/Makefile.arm64 =================================================================== --- usr.sbin/Makefile.arm64 +++ usr.sbin/Makefile.arm64 @@ -4,3 +4,9 @@ SUBDIR+= acpi .endif SUBDIR+= ofwdump + +.if ${MK_BHYVE} != "no" +SUBDIR+= bhyve +SUBDIR+= bhyveload +SUBDIR+= bhyvectl +.endif Index: usr.sbin/bhyve/Makefile =================================================================== --- usr.sbin/bhyve/Makefile +++ usr.sbin/bhyve/Makefile @@ -3,124 +3,7 @@ # .include -CFLAGS+=-I${.CURDIR}/../../contrib/lib9p -CFLAGS+=-I${SRCTOP}/sys -.PATH: ${SRCTOP}/sys/cam/ctl -PROG= bhyve -PACKAGE= bhyve - -MAN= bhyve.8 - -BHYVE_SYSDIR?=${SRCTOP} - -SRCS= \ - atkbdc.c \ - acpi.c \ - audio.c \ - bhyvegc.c \ - bhyverun.c \ - block_if.c \ - bootrom.c \ - console.c \ - consport.c \ - ctl_util.c \ - ctl_scsi_all.c \ - dbgport.c \ - fwctl.c \ - gdb.c \ - hda_codec.c \ - inout.c \ - ioapic.c \ - kernemu_dev.c \ - mem.c \ - mevent.c \ - mptbl.c \ - net_backends.c \ - net_utils.c \ - pci_ahci.c \ - pci_e82545.c \ - pci_emul.c \ - pci_hda.c \ - pci_fbuf.c \ - pci_hostbridge.c \ - pci_irq.c \ - pci_lpc.c \ - pci_nvme.c \ - pci_passthru.c \ - pci_virtio_9p.c \ - pci_virtio_block.c \ - pci_virtio_console.c \ - pci_virtio_net.c \ - pci_virtio_rnd.c \ - pci_virtio_scsi.c \ - pci_uart.c \ - pci_xhci.c \ - pm.c \ - post.c \ - ps2kbd.c \ - ps2mouse.c \ - rfb.c \ - rtc.c \ - smbiostbl.c \ - sockstream.c \ - task_switch.c \ - uart_emul.c \ - usb_emul.c \ - usb_mouse.c \ - virtio.c \ - vga.c \ - vmgenc.c \ - xmsr.c \ - spinup_ap.c \ - iov.c - -.if ${MK_BHYVE_SNAPSHOT} != "no" -SRCS+= snapshot.c -.endif - -CFLAGS.kernemu_dev.c+= -I${SRCTOP}/sys/amd64 - -.PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm -SRCS+= vmm_instruction_emul.c - -LIBADD= vmmapi md pthread z util sbuf cam 9p casper cap_pwd cap_grp -.if ${MK_BHYVE_SNAPSHOT} != "no" -LIBADD+= ucl xo -.endif - -.if ${MK_INET_SUPPORT} != 
"no" -CFLAGS+=-DINET -.endif -.if ${MK_INET6_SUPPORT} != "no" -CFLAGS+=-DINET6 -.endif -.if ${MK_NETGRAPH_SUPPORT} != "no" -CFLAGS+=-DNETGRAPH -LIBADD+= netgraph -.endif -.if ${MK_OPENSSL} == "no" -CFLAGS+=-DNO_OPENSSL -.else -LIBADD+= crypto -.endif - -CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 -CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii -CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller -.if ${MK_BHYVE_SNAPSHOT} != "no" -CFLAGS+= -I${SRCTOP}/contrib/libucl/include - -# Temporary disable capsicum, until we integrate checkpoint code with it. -CFLAGS+= -DWITHOUT_CAPSICUM - -CFLAGS+= -DBHYVE_SNAPSHOT -.endif - -.ifdef GDB_LOG -CFLAGS+=-DGDB_LOG -.endif - -WARNS?= 2 +.include .include Index: usr.sbin/bhyve/Makefile.amd64 =================================================================== --- /dev/null +++ usr.sbin/bhyve/Makefile.amd64 @@ -0,0 +1,124 @@ +# +# $FreeBSD$ +# + +CFLAGS+=-I${.CURDIR}/../../contrib/lib9p +CFLAGS+=-I${SRCTOP}/sys +.PATH: ${SRCTOP}/sys/cam/ctl + +PROG= bhyve +PACKAGE= bhyve + +MAN= bhyve.8 + +BHYVE_SYSDIR?=${SRCTOP} + +SRCS= \ + atkbdc.c \ + acpi.c \ + audio.c \ + bhyvegc.c \ + bhyverun.c \ + block_if.c \ + bootrom.c \ + console.c \ + consport.c \ + ctl_util.c \ + ctl_scsi_all.c \ + dbgport.c \ + fwctl.c \ + gdb.c \ + hda_codec.c \ + inout.c \ + ioapic.c \ + kernemu_dev.c \ + mem.c \ + mevent.c \ + mptbl.c \ + net_backends.c \ + net_utils.c \ + pci_ahci.c \ + pci_e82545.c \ + pci_emul.c \ + pci_hda.c \ + pci_fbuf.c \ + pci_hostbridge.c \ + pci_irq.c \ + pci_lpc.c \ + pci_nvme.c \ + pci_passthru.c \ + pci_virtio_9p.c \ + pci_virtio_block.c \ + pci_virtio_console.c \ + pci_virtio_net.c \ + pci_virtio_rnd.c \ + pci_virtio_scsi.c \ + pci_uart.c \ + pci_xhci.c \ + pm.c \ + post.c \ + ps2kbd.c \ + ps2mouse.c \ + rfb.c \ + rtc.c \ + smbiostbl.c \ + sockstream.c \ + task_switch.c \ + uart_emul.c \ + usb_emul.c \ + usb_mouse.c \ + pci_virtio.c \ + vga.c \ + vmgenc.c \ + xmsr.c \ + spinup_ap.c \ + iov.c + +.if ${MK_BHYVE_SNAPSHOT} != "no" +SRCS+= snapshot.c 
+.endif + +CFLAGS.kernemu_dev.c+= -I${SRCTOP}/sys/amd64 + +.PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm +SRCS+= vmm_instruction_emul.c + +LIBADD= vmmapi md pthread z util sbuf cam 9p casper cap_pwd cap_grp +.if ${MK_BHYVE_SNAPSHOT} != "no" +LIBADD+= ucl xo +.endif + +.if ${MK_INET_SUPPORT} != "no" +CFLAGS+=-DINET +.endif +.if ${MK_INET6_SUPPORT} != "no" +CFLAGS+=-DINET6 +.endif +.if ${MK_NETGRAPH_SUPPORT} != "no" +CFLAGS+=-DNETGRAPH +LIBADD+= netgraph +.endif +.if ${MK_OPENSSL} == "no" +CFLAGS+=-DNO_OPENSSL +.else +LIBADD+= crypto +.endif + +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller +.if ${MK_BHYVE_SNAPSHOT} != "no" +CFLAGS+= -I${SRCTOP}/contrib/libucl/include + +# Temporary disable capsicum, until we integrate checkpoint code with it. +CFLAGS+= -DWITHOUT_CAPSICUM + +CFLAGS+= -DBHYVE_SNAPSHOT +.endif + +.ifdef GDB_LOG +CFLAGS+=-DGDB_LOG +.endif + +WARNS?= 2 + Index: usr.sbin/bhyve/Makefile.arm64 =================================================================== --- /dev/null +++ usr.sbin/bhyve/Makefile.arm64 @@ -0,0 +1,56 @@ +# +# $FreeBSD$ +# + +CFLAGS+=-I${SRCTOP}/sys +.PATH: ${SRCTOP}/sys/cam/ctl + +PROG= bhyve +PACKAGE= bhyve + +MAN= bhyve.8 + +BHYVE_SYSDIR?=${SRCTOP} +BHYVE_SRCTOP?=${.CURDIR} + +SRCS= \ + block_if.c \ + iov.c \ + mevent.c \ + net_backends.c \ + sockstream.c + +CFLAGS+= -DWITHOUT_CAPSICUM +.include "${BHYVE_SRCTOP}/arm64/Makefile.inc" +.include "${BHYVE_SRCTOP}/mmio/Makefile.inc" + +LIBADD= vmmapi md pthread + +.if ${MK_INET_SUPPORT} != "no" +CFLAGS+=-DINET +.endif +.if ${MK_INET6_SUPPORT} != "no" +CFLAGS+=-DINET6 +.endif +.if ${MK_NETGRAPH_SUPPORT} != "no" +CFLAGS+=-DNETGRAPH +LIBADD+= netgraph +.endif +.if ${MK_OPENSSL} == "no" +CFLAGS+=-DNO_OPENSSL +.endif + +.PATH: ${BHYVE_SYSDIR}/sys/arm64/vmm +SRCS+= vmm_instruction_emul.c + +CFLAGS+= -I${BHYVE_SRCTOP} +CFLAGS+= -I${BHYVE_SRCTOP}/arm64 +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/virtio +CFLAGS+= 
-I${BHYVE_SYSDIR}/sys/dev/virtio/console + +.ifdef GDB_LOG +CFLAGS+=-DGDB_LOG +.endif + +WARNS?= 2 + Index: usr.sbin/bhyve/acpi.c =================================================================== --- usr.sbin/bhyve/acpi.c +++ usr.sbin/bhyve/acpi.c @@ -608,9 +608,9 @@ EFPRINTF(fp, "\n"); EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); - EFPRINTF(fp, "[0002]\t\tSegment Group Number : 0000\n"); - EFPRINTF(fp, "[0001]\t\tStart Bus Number : 00\n"); - EFPRINTF(fp, "[0001]\t\tEnd Bus Number : FF\n"); + EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); + EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); + EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); EFFLUSH(fp); return (0); Index: usr.sbin/bhyve/arm64/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/Makefile.inc @@ -0,0 +1,17 @@ +# +# $FreeBSD$ +# +.PATH: ${BHYVE_SRCTOP}/arm64/ +SRCS+= \ + arm64/bhyverun.c \ + arm64/mem.c \ + arm64/consport.c \ + arm64/reset.c + +.PATH: ${BHYVE_SYSDIR}/sys/${BHYVE_ARCH}/vmm + +MK_MAN=no + +BHYVE_BUS= mmio + +CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/virtio/mmio Index: usr.sbin/bhyve/arm64/bhyverun.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/bhyverun.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/usr.sbin/bhyve/arm/bhyverun.h 4 2017-04-18 20:28:32Z mihai.carabas $ + */ + +#ifndef _FBSDRUN_H_ +#define _FBSDRUN_H_ + +#ifndef CTASSERT /* Allow lint to override */ +#define CTASSERT(x) _CTASSERT(x, __LINE__) +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] +#endif + +struct vmctx; +extern int guest_ncpus; +extern char *vmname; + +void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); + +void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip); +int fbsdrun_muxed(void); +int fbsdrun_vmexit_on_hlt(void); +int fbsdrun_vmexit_on_pause(void); +int fbsdrun_disable_x2apic(void); +int fbsdrun_virtio_msix(void); +#endif Index: usr.sbin/bhyve/arm64/bhyverun.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/bhyverun.c @@ -0,0 +1,468 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "bhyverun.h" +#include "../mmio/mmio_emul.h" +#include "../mmio/mmio_irq.h" +#include "mem.h" +#include "mevent.h" + +/* Exit codes. 
*/
+#define EXIT_REBOOT	0
+#define EXIT_POWEROFF	1
+#define EXIT_HALT	2
+#define EXIT_ERROR	4
+
+#define GUEST_NIO_PORT	0x488	/* guest upcalls via i/o port */
+
+#define VMEXIT_SWITCH	0	/* force vcpu switch in mux mode */
+#define VMEXIT_CONTINUE	1	/* continue from next instruction */
+#define VMEXIT_RESTART	2	/* restart current instruction */
+#define VMEXIT_ABORT	3	/* abort the vm run loop */
+#define VMEXIT_RESET	4	/* guest machine has reset */
+
+#define MB		(1024UL * 1024)
+#define GB		(1024UL * MB)
+
+typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+
+char *vmname;
+
+int guest_ncpus;
+
+int raw_stdio = 0;
+
+static int foundcpus;
+
+static char *progname;
+static const int BSP = 0;
+/* TODO Change this to cpuset_t */
+static int cpumask;
+
+static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t pc);
+
+struct vm_exit vmexit[VM_MAXCPU];
+
+struct bhyvestats {
+	uint64_t	vmexit_bogus;
+	uint64_t	vmexit_inst_emul;
+} stats;
+
+struct mt_vmm_info {
+	pthread_t	mt_thr;
+	struct vmctx	*mt_ctx;
+	int		mt_vcpu;
+} mt_vmm_info[VM_MAXCPU];
+
+static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
+
+/*
+ * Print the command line synopsis to stderr and exit with 'code'.
+ * The option list mirrors the getopt() string used in main().
+ */
+static void
+usage(int code)
+{
+
+	fprintf(stderr,
+	    "Usage: %s [-bh] [-c vcpus] [-e membase] [-m memsize]\n"
+	    "       [-p vcpu:hostcpu] [-s <devemu>] <vmname>\n"
+	    "       -b: use bvmconsole\n"
+	    "       -c: # cpus (default 1)\n"
+	    "       -e: guest memory base address\n"
+	    "       -m: guest memory size (default 128MB)\n"
+	    "       -p: pin 'vcpu' to 'hostcpu'\n"
+	    "       -s: device emulation config\n"
+	    "       -h: help\n",
+	    progname);
+
+	exit(code);
+}
+
+/*
+ * Parse a "vcpu:hostcpu" pinning request and add 'hostcpu' to the set
+ * recorded for 'vcpu' in vcpumap[].  Returns 0 on success, -1 (after
+ * printing a diagnostic) on a malformed or out-of-range request.
+ */
+static int
+pincpu_parse(const char *opt)
+{
+	int vcpu, pcpu;
+
+	if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
+		fprintf(stderr, "invalid format: %s\n", opt);
+		return (-1);
+	}
+
+	if (vcpu < 0 || vcpu >= VM_MAXCPU) {
+		fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
+		    vcpu, VM_MAXCPU - 1);
+		return (-1);
+	}
+
+	if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
+		fprintf(stderr, "hostcpu '%d' outside valid range from "
+		    "0 to %d\n", pcpu, CPU_SETSIZE - 1);
+		return (-1);
+	}
+
+	if (vcpumap[vcpu] == NULL) {
+		if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) {
+			perror("malloc");
+			return (-1);
+		}
+		CPU_ZERO(vcpumap[vcpu]);
+	}
+	CPU_SET(pcpu, vcpumap[vcpu]);
+	return (0);
+}
+
+/* Thin wrapper: hands the guest address range to vm_map_ipa(). */
+void *
+paddr_guest2host(struct vmctx *ctx, uintptr_t iaddr, size_t len)
+{
+
+	return (vm_map_ipa(ctx, iaddr, len));
+}
+
+/* Always reports 0 on this platform. */
+int
+fbsdrun_virtio_msix(void)
+{
+
+	return 0;
+}
+
+/* pthread entry point for a vCPU: name the thread, then run vm_loop(). */
+static void *
+fbsdrun_start_thread(void *param)
+{
+	char tname[MAXCOMLEN + 1];
+	struct mt_vmm_info *mtp;
+	int vcpu;
+
+	mtp = param;
+	vcpu = mtp->mt_vcpu;
+
+	snprintf(tname, sizeof(tname), "%s vcpu %d", vmname, vcpu);
+	/*
+	 * Use pthread_self(): mtp->mt_thr is stored by pthread_create() in
+	 * the creating thread and may not be written yet when we get here.
+	 */
+	pthread_set_name_np(pthread_self(), tname);
+
+	vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].pc);
+
+	/* not reached */
+	return (NULL);
+}
+
+/*
+ * Record 'vcpu' as present and set the PC it will start from.  Only the
+ * BSP gets a thread created here.  Exits if the vcpu was already added.
+ */
+void
+fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t pc)
+{
+	int error;
+
+	/* vcpu indexes vmexit[] and mt_vmm_info[] below. */
+	assert(vcpu >= 0 && vcpu < VM_MAXCPU);
+
+	if (cpumask & (1 << vcpu)) {
+		fprintf(stderr, "addcpu: attempting to add existing cpu %d\n",
+		    vcpu);
+		exit(EXIT_ERROR);
+	}
+
+	cpumask |= 1 << vcpu;
+	foundcpus++;
+
+	/*
+	 * Set up the vmexit struct to allow execution to start
+	 * at the given PC
+	 */
+	vmexit[vcpu].pc = pc;
+	vmexit[vcpu].inst_length = 0;
+
+	if (vcpu == BSP) {
+		mt_vmm_info[vcpu].mt_ctx = ctx;
+		mt_vmm_info[vcpu].mt_vcpu = vcpu;
+
+		error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
+		    fbsdrun_start_thread, &mt_vmm_info[vcpu]);
+		assert(error == 0);
+	}
+}
+
+static int
+fbsdrun_get_next_cpu(int curcpu)
+{
+
+	/*
+	 * Get the next available CPU. Assumes they arrive
+	 * in ascending order with no gaps.
+	 */
+	return ((curcpu + 1) % foundcpus);
+}
+
+/* Dump the exit record to stderr and ask the run loop to abort. */
+static int
+vmexit_hyp(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+	fprintf(stderr, "vm exit[%d]\n", *pvcpu);
+	fprintf(stderr, "\treason\t\tHYP\n");
+	fprintf(stderr, "\tpc\t\t0x%016lx\n", vmexit->pc);
+	fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
+
+	return (VMEXIT_ABORT);
+}
+
+/* Count the exit and re-run the same instruction. */
+static int
+vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+	stats.vmexit_bogus++;
+
+	return (VMEXIT_RESTART);
+}
+
+/*
+ * Forward a trapped guest memory access to emulate_mem().  Any failure
+ * is reported on stderr and aborts the run loop.
+ */
+static int
+vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	int err;
+	struct vie *vie;
+
+	stats.vmexit_inst_emul++;
+
+	vie = &vmexit->u.inst_emul.vie;
+	err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, vie);
+
+	if (err) {
+		if (err == ESRCH) {
+			fprintf(stderr, "Unhandled memory access to 0x%lx\n",
+			    vmexit->u.inst_emul.gpa);
+		}
+
+		fprintf(stderr, "Failed to emulate instruction at 0x%lx\n",
+		    vmexit->pc);
+		return (VMEXIT_ABORT);
+	}
+	return (VMEXIT_CONTINUE);
+}
+
+/* Terminate the process with an exit code matching the suspend reason. */
+static int
+vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	enum vm_suspend_how how;
+
+	how = vmexit->u.suspended.how;
+
+	switch (how) {
+	case VM_SUSPEND_POWEROFF:
+		exit(EXIT_POWEROFF);
+	case VM_SUSPEND_RESET:
+		exit(EXIT_REBOOT);
+	case VM_SUSPEND_HALT:
+		exit(EXIT_HALT);
+	case VM_SUSPEND_TRIPLEFAULT:
+		/* Not implemented yet. */
+		exit(EXIT_ERROR);
+	default:
+		fprintf(stderr, "vmexit_suspend: invalid or unimplemented reason %d\n", how);
+		exit(100);
+	}
+
+}
+
+static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
+	[VM_EXITCODE_BOGUS] = vmexit_bogus,
+	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
+	[VM_EXITCODE_REG_EMUL] = vmexit_hyp,
+	[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
+	[VM_EXITCODE_HYP] = vmexit_hyp,
+};
+
+/*
+ * Run 'vcpu' starting at 'pc' and dispatch every exit through handler[].
+ * Applies the affinity recorded by -p first.  Only returns (after
+ * printing the error) when vm_run() fails with something other than EBUSY.
+ */
+static void
+vm_loop(struct vmctx *ctx, int vcpu, uint64_t pc)
+{
+	int error, rc;
+	enum vm_exitcode exitcode;
+
+	if (vcpumap[vcpu] != NULL) {
+		error = pthread_setaffinity_np(pthread_self(),
+		    sizeof(cpuset_t), vcpumap[vcpu]);
+		assert(error == 0);
+	}
+
+	while (1) {
+
+		error = vm_run(ctx, vcpu, pc, &vmexit[vcpu]);
+
+		if (error != 0) {
+			/*
+			 * It is possible that 'vmmctl' or some other process
+			 * has transitioned the vcpu to CANNOT_RUN state right
+			 * before we tried to transition it to RUNNING.
+			 *
+			 * This is expected to be temporary so just retry.
+			 */
+			if (errno == EBUSY)
+				continue;
+			else
+				break;
+		}
+
+		exitcode = vmexit[vcpu].exitcode;
+		if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
+			fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
+			    exitcode);
+			exit(EXIT_ERROR);
+		}
+
+		rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
+
+		switch (rc) {
+		case VMEXIT_CONTINUE:
+			pc = vmexit[vcpu].pc + vmexit[vcpu].inst_length;
+			break;
+		case VMEXIT_RESTART:
+			pc = vmexit[vcpu].pc;
+			break;
+		case VMEXIT_RESET:
+			exit(EXIT_REBOOT);
+		default:
+			exit(EXIT_ERROR);
+		}
+	}
+	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
+}
+
+static int
+num_vcpus_allowed(struct vmctx *ctx)
+{
+	return (VM_MAXCPU);
+}
+
+/*
+ * Parse options, open the already-created VM, set up guest memory and
+ * MMIO device emulation, start the BSP and enter the event loop.
+ */
+int
+main(int argc, char *argv[])
+{
+	int c, error;
+	bool bvmcons;
+	int max_vcpus;
+	struct vmctx *ctx;
+	uint64_t pc;
+	uint64_t memory_base_address, mem_size;
+
+	bvmcons = false;
+	memory_base_address = VM_GUEST_BASE_IPA;
+	mem_size = 128 * MB;
+	progname = basename(argv[0]);
+	guest_ncpus = 1;
+
+	while ((c = getopt(argc, argv, "bhp:c:s:e:m:")) != -1) {
+		switch (c) {
+		case 'b':
+			bvmcons = true;
+			break;
+		case 'e':
+			memory_base_address = strtoul(optarg, NULL, 0);
+			break;
+		case 'p':
+			if (pincpu_parse(optarg) != 0) {
+				errx(EX_USAGE, "invalid vcpu pinning "
+				    "configuration '%s'", optarg);
+			}
+			break;
+		case 'c':
+			/* Range-checked against max_vcpus further down. */
+			guest_ncpus = atoi(optarg);
+			break;
+		case 'm':
+			error = vm_parse_memsize(optarg, &mem_size);
+			if (error) {
+				fprintf(stderr, "Invalid memsize '%s'\n", optarg);
+				exit(1);
+			}
+			break;
+		case 's':
+			if (mmio_parse_opts(optarg) != 0)
+				exit(1);
+			break;
+		case 'h':
+			usage(0);
+		default:
+			usage(EXIT_ERROR);
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (argc != 1)
+		usage(EXIT_ERROR);
+
+	vmname = argv[0];
+
+	/* The VM must be created by bhyveload first. */
+	ctx = vm_open(vmname);
+	if (ctx == NULL) {
+		perror("vm_open");
+		exit(1);
+	}
+
+	max_vcpus = num_vcpus_allowed(ctx);
+	/* atoi() yields 0 for garbage, so reject anything below one vCPU. */
+	if (guest_ncpus < 1 || guest_ncpus > max_vcpus) {
+		fprintf(stderr, "%d vCPUs requested, valid range is 1-%d\n",
+		    guest_ncpus, max_vcpus);
+		exit(1);
+	}
+
+	error = vm_setup_memory(ctx, memory_base_address, mem_size, VM_MMAP_ALL);
+	if (error != 0) {
+		fprintf(stderr, "Unable to setup memory (%d)\n", error);
+		exit(1);
+	}
+
+	init_mem();
+	mmio_irq_init(ctx);
+
+	if (init_mmio(ctx) != 0) {
+		fprintf(stderr, "Failed to initialize device emulation\n");
+		exit(1);
+	}
+
+	if (bvmcons)
+		init_bvmcons();
+
+	error = vm_get_register(ctx, BSP, VM_REG_ELR_EL2, &pc);
+	assert(error == 0);
+	/*
+	 * Add CPU 0
+	 */
+	fbsdrun_addcpu(ctx, BSP, pc);
+
+	/*
+	 * Head off to the main event dispatch loop
+	 */
+	mevent_dispatch();
+
+	exit(1);
+}
Index: usr.sbin/bhyve/arm64/consport.c
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/arm64/consport.c
@@ -0,0 +1,142 @@
+/* * Copyright (C) 2015 Mihai Carabas * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include <sys/types.h>
+#include <sys/select.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#include "mem.h"
+
+#define	BVM_CONS_PORT	0x090000
+#define	BVM_CONS_SIG	('b' << 8 | 'v')
+
+static struct termios tio_orig, tio_new;
+
+/* atexit() hook: put stdin back to the settings saved by ttyopen(). */
+static void
+ttyclose(void)
+{
+	tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+/*
+ * Switch stdin to raw mode and arrange for the saved settings to be
+ * restored at exit.
+ */
+static void
+ttyopen(void)
+{
+	/* Nothing to reconfigure if the settings cannot be read. */
+	if (tcgetattr(STDIN_FILENO, &tio_orig) != 0)
+		return;
+
+	/*
+	 * cfmakeraw() only adjusts flags in place, so start from the
+	 * current settings rather than from a zeroed structure.
+	 */
+	tio_new = tio_orig;
+	cfmakeraw(&tio_new);
+	tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+	atexit(ttyclose);
+}
+
+/* Poll stdin with a zero timeout; true when select() reports it readable. */
+static bool
+tty_char_available(void)
+{
+	fd_set rfds;
+	struct timeval tv;
+
+	FD_ZERO(&rfds);
+	FD_SET(STDIN_FILENO, &rfds);
+	tv.tv_sec = 0;
+	tv.tv_usec = 0;
+	if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+		return (true);
+	} else {
+		return (false);
+	}
+}
+
+/* Return the next input byte (0-255), or -1 when none could be read. */
+static int
+ttyread(void)
+{
+	unsigned char rb;
+
+	if (!tty_char_available())
+		return (-1);
+	/* EOF or a read error must not hand back an uninitialized byte. */
+	if (read(STDIN_FILENO, &rb, 1) != 1)
+		return (-1);
+	return (rb);
+}
+
+/* Best-effort write of one byte to stdout. */
+static void
+ttywrite(unsigned char wb)
+{
+	(void) write(STDOUT_FILENO, &wb, 1);
+}
+
+/*
+ * MMIO handler for the bvm console register.
+ *
+ * 2-byte reads return the BVM_CONS_SIG signature and 1-byte reads return
+ * 0xff.  4-byte accesses are the data path: a read polls the terminal
+ * (ttyread() yields -1 when nothing is pending) and a write outputs the
+ * low byte of *val.  Any other access is rejected with -1.
+ */
+static int
+console_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size,
+    uint64_t *val, void *arg1, long arg2)
+{
+	static int opened;
+
+	if (size == 2 && dir == MEM_F_READ) {
+		*val = BVM_CONS_SIG;
+		return (0);
+	}
+
+	/*
+	 * Guests might probe this port to look for old ISA devices
+	 * using single-byte reads.  Return 0xff for those.
+	 */
+	if (size == 1 && dir == MEM_F_READ) {
+		*val = 0xff;
+		return (0);
+	}
+
+	if (size != 4)
+		return (-1);
+
+	/* Put the terminal in raw mode on the first data access only. */
+	if (!opened) {
+		ttyopen();
+		opened = 1;
+	}
+
+	if (dir == MEM_F_READ)
+		*val = ttyread();
+	else
+		ttywrite(*val);
+	return (0);
+}
+
+struct mem_range consport = {
+	.name = "bvmcons",
+	.flags = MEM_F_RW,
+	.handler = console_handler,
+	.arg1 = NULL,
+	.arg2 = 0,
+	.base = BVM_CONS_PORT,
+	.size = sizeof(int)
+};
+
+/* Register the console register range with the MMIO dispatcher. */
+void
+init_bvmcons(void)
+{
+	register_mem(&consport);
+}
Index: usr.sbin/bhyve/arm64/mem.h
===================================================================
--- /dev/null
+++ usr.sbin/bhyve/arm64/mem.h
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: src/usr.sbin/bhyve/arm/mem.h 38 2017-06-13 13:34:14Z darius.mihai $ + */ + +#ifndef _MEM_H_ +#define _MEM_H_ + +#include + +struct vmctx; + +typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2); + +struct mem_range { + const char *name; + int flags; + mem_func_t handler; + void *arg1; + long arg2; + uint64_t base; + uint64_t size; +}; +#define MEM_F_READ 0x1 +#define MEM_F_WRITE 0x2 +#define MEM_F_RW 0x3 + +void init_mem(void); +int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, void *vie); +int register_mem(struct mem_range *memp); +int register_mem_fallback(struct mem_range *memp); +int unregister_mem(struct mem_range *memp); + +void init_bvmcons(void); +#endif /* _MEM_H_ */ Index: usr.sbin/bhyve/arm64/mem.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/mem.c @@ -0,0 +1,271 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/usr.sbin/bhyve/arm/mem.c 4 2017-04-18 20:28:32Z mihai.carabas $ + */ + +/* + * Memory ranges are represented with an RB tree. On insertion, the range + * is checked for overlaps. On lookup, the key has the same base and limit + * so it can be searched within the range. + */ + +#include +__FBSDID("$FreeBSD: src/usr.sbin/bhyve/arm/mem.c 4 2017-04-18 20:28:32Z mihai.carabas $"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mem.h" + +struct mmio_rb_range { + RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ + struct mem_range mr_param; + uint64_t mr_base; + uint64_t mr_end; +}; + +struct mmio_rb_tree; +RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback; + +/* + * Per-vCPU cache. Since most accesses from a vCPU will be to + * consecutive addresses in a range, it makes sense to cache the + * result of a lookup. 
+ */ +static struct mmio_rb_range *mmio_hint[VM_MAXCPU]; + +static pthread_rwlock_t mmio_rwlock; + +static int +mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b) +{ + if (a->mr_end < b->mr_base) + return (-1); + else if (a->mr_base > b->mr_end) + return (1); + return (0); +} + +static int +mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr, + struct mmio_rb_range **entry) +{ + struct mmio_rb_range find, *res; + + find.mr_base = find.mr_end = addr; + + res = RB_FIND(mmio_rb_tree, rbt, &find); + + if (res != NULL) { + *entry = res; + return (0); + } + + return (ENOENT); +} + +static int +mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) +{ + struct mmio_rb_range *overlap; + + overlap = RB_INSERT(mmio_rb_tree, rbt, new); + + if (overlap != NULL) { +#ifdef RB_DEBUG + printf("overlap detected: new %lx:%lx, tree %lx:%lx\n", + new->mr_base, new->mr_end, + overlap->mr_base, overlap->mr_end); +#endif + + return (EEXIST); + } + + return (0); +} + +#if 0 +static void +mmio_rb_dump(struct mmio_rb_tree *rbt) +{ + struct mmio_rb_range *np; + + pthread_rwlock_rdlock(&mmio_rwlock); + RB_FOREACH(np, mmio_rb_tree, rbt) { + printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, + np->mr_param.name); + } + pthread_rwlock_unlock(&mmio_rwlock); +} +#endif + +RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +static int +mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size, + rval, mr->arg1, mr->arg2); + return (error); +} + +static int +mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) +{ + int error; + struct mem_range *mr = arg; + + error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size, + &wval, mr->arg1, mr->arg2); + return (error); +} + +int +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, void *vie) +{ + struct mmio_rb_range *entry; + int err; + + 
pthread_rwlock_rdlock(&mmio_rwlock);
+	/*
+	 * First check the per-vCPU cache
+	 */
+	if (mmio_hint[vcpu] &&
+	    paddr >= mmio_hint[vcpu]->mr_base &&
+	    paddr <= mmio_hint[vcpu]->mr_end) {
+		entry = mmio_hint[vcpu];
+	} else
+		entry = NULL;
+
+	if (entry == NULL) {
+		if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) {
+			/* Update the per-vCPU cache */
+			mmio_hint[vcpu] = entry;
+		} else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) {
+			/* Neither tree claims this address. */
+			pthread_rwlock_unlock(&mmio_rwlock);
+			return (ESRCH);
+		}
+	}
+
+	assert(entry != NULL);
+	/*
+	 * The read lock stays held across the emulation: unregister_mem()
+	 * needs the write lock before it removes and frees a range.
+	 */
+	err = vmm_emulate_instruction(ctx, vcpu, paddr, vie,
+	    mem_read, mem_write, &entry->mr_param);
+
+	pthread_rwlock_unlock(&mmio_rwlock);
+
+	return (err);
+}
+
+/*
+ * Copy *memp into a newly allocated node and insert it into the tree rbt.
+ *
+ * Returns 0 on success, ENOMEM if the node cannot be allocated, or the
+ * error from mmio_rb_add() if the insert itself fails.  If memp->base
+ * already falls inside a range that is in the tree, nothing is inserted
+ * and 0 is still returned (unchanged from the previous behaviour); the
+ * unused node is now freed instead of being leaked.
+ */
+static int
+register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp)
+{
+	struct mmio_rb_range *entry, *mrp;
+	int err, linked;
+
+	mrp = malloc(sizeof(*mrp));
+	if (mrp == NULL)
+		return (ENOMEM);
+
+	mrp->mr_param = *memp;
+	mrp->mr_base = memp->base;
+	mrp->mr_end = memp->base + memp->size - 1;
+
+	err = 0;
+	linked = 0;
+	pthread_rwlock_wrlock(&mmio_rwlock);
+	if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) {
+		err = mmio_rb_add(rbt, mrp);
+		linked = (err == 0);
+	}
+	pthread_rwlock_unlock(&mmio_rwlock);
+
+	/* The tree owns the node only if the insert actually happened. */
+	if (!linked)
+		free(mrp);
+
+	return (err);
+}
+
+/* Register memp in the primary MMIO range tree. */
+int
+register_mem(struct mem_range *memp)
+{
+
+	return (register_mem_int(&mmio_rb_root, memp));
+}
+
+/* Register memp in the fallback tree, consulted when the primary misses. */
+int
+register_mem_fallback(struct mem_range *memp)
+{
+
+	return (register_mem_int(&mmio_rb_fallback, memp));
+}
+
+/*
+ * Remove the range containing memp->base from the primary tree and drop
+ * any per-vCPU hint that still points at it.  Returns the lookup error
+ * (ENOENT) when no such range is registered.
+ */
+int
+unregister_mem(struct mem_range *memp)
+{
+	struct mem_range *mr;
+	struct mmio_rb_range *entry = NULL;
+	int err, i;
+
+	pthread_rwlock_wrlock(&mmio_rwlock);
+	err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry);
+	if (err == 0) {
+		mr = &entry->mr_param;
+		assert(mr->name == memp->name);
+		assert(mr->base == memp->base && mr->size == memp->size);
+		RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);
+
+		/* flush Per-vCPU cache */
+		for (i=0; i < VM_MAXCPU; i++) {
+			if (mmio_hint[i] == entry)
+
mmio_hint[i] = NULL; + } + } + pthread_rwlock_unlock(&mmio_rwlock); + + if (entry) + free(entry); + + return (err); +} + +void +init_mem(void) +{ + RB_INIT(&mmio_rb_root); + RB_INIT(&mmio_rb_fallback); + pthread_rwlock_init(&mmio_rwlock, NULL); +} Index: usr.sbin/bhyve/arm64/mevent_test.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/mevent_test.c @@ -0,0 +1,256 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/usr.sbin/bhyve/arm/mevent_test.c 4 2017-04-18 20:28:32Z mihai.carabas $ + */ + +/* + * Test program for the micro event library. Set up a simple TCP echo + * service. 
+ * + * cc mevent_test.c mevent.c -lpthread + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mevent.h" + +#define TEST_PORT 4321 + +static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER; + +static struct mevent *tevp; + +char *vmname = "test vm"; + + +#define MEVENT_ECHO + +/* Number of timer events to capture */ +#define TEVSZ 4096 +uint64_t tevbuf[TEVSZ]; + +static void +timer_print(void) +{ + uint64_t min, max, diff, sum, tsc_freq; + size_t len; + int j; + + min = UINT64_MAX; + max = 0; + sum = 0; + + len = sizeof(tsc_freq); + sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0); + + for (j = 1; j < TEVSZ; j++) { + /* Convert a tsc diff into microseconds */ + diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq; + sum += diff; + if (min > diff) + min = diff; + if (max < diff) + max = diff; + } + + printf("timers done: usecs, min %ld, max %ld, mean %ld\n", min, max, + sum/(TEVSZ - 1)); +} + +static void +timer_callback(int fd, enum ev_type type, void *param) +{ + static int i; + + if (i >= TEVSZ) + abort(); + + tevbuf[i++] = rdtsc(); + + if (i == TEVSZ) { + mevent_delete(tevp); + timer_print(); + } +} + + +#ifdef MEVENT_ECHO +struct esync { + pthread_mutex_t e_mt; + pthread_cond_t e_cond; +}; + +static void +echoer_callback(int fd, enum ev_type type, void *param) +{ + struct esync *sync = param; + + pthread_mutex_lock(&sync->e_mt); + pthread_cond_signal(&sync->e_cond); + pthread_mutex_unlock(&sync->e_mt); +} + +static void * +echoer(void *param) +{ + struct esync sync; + struct mevent *mev; + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + pthread_mutex_init(&sync.e_mt, NULL); + pthread_cond_init(&sync.e_cond, NULL); + + pthread_mutex_lock(&sync.e_mt); + + mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); + if (mev == NULL) { + printf("Could not allocate echoer event\n"); + exit(1); + } 
+ + while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { + len = read(fd, buf, sizeof(buf)); + if (len > 0) { + write(fd, buf, len); + write(0, buf, len); + } else { + break; + } + } + + mevent_delete_close(mev); + + pthread_mutex_unlock(&sync.e_mt); + pthread_mutex_destroy(&sync.e_mt); + pthread_cond_destroy(&sync.e_cond); + + return (NULL); +} + +#else + +static void * +echoer(void *param) +{ + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + while ((len = read(fd, buf, sizeof(buf))) > 0) { + write(1, buf, len); + } + + return (NULL); +} +#endif /* MEVENT_ECHO */ + +static void +acceptor_callback(int fd, enum ev_type type, void *param) +{ + pthread_mutex_lock(&accept_mutex); + pthread_cond_signal(&accept_condvar); + pthread_mutex_unlock(&accept_mutex); +} + +static void * +acceptor(void *param) +{ + struct sockaddr_in sin; + pthread_t tid; + int news; + int s; + static int first; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(TEST_PORT); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + exit(1); + } + + if (listen(s, 1) < 0) { + perror("listen"); + exit(1); + } + + (void) mevent_add(s, EVF_READ, acceptor_callback, NULL); + + pthread_mutex_lock(&accept_mutex); + + while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) { + news = accept(s, NULL, NULL); + if (news < 0) { + perror("accept error"); + } else { + static int first = 1; + + if (first) { + /* + * Start a timer + */ + first = 0; + tevp = mevent_add(1, EVF_TIMER, timer_callback, + NULL); + } + + printf("incoming connection, spawning thread\n"); + pthread_create(&tid, NULL, echoer, + (void *)(uintptr_t)news); + } + } + + return (NULL); +} + +main() +{ + pthread_t tid; + + pthread_create(&tid, NULL, acceptor, NULL); + + mevent_dispatch(); +} Index: usr.sbin/bhyve/arm64/reset.h 
=================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/reset.h @@ -0,0 +1,12 @@ +#ifndef _RESET_H_ +#define _RESET_H_ + +#define RESET_MAGIC 0xDEAD9731 + +#endif /* _RESET_H_ */ +#ifndef _RESET_H_ +#define _RESET_H_ + +#define RESET_MAGIC 0xDEAD9731 + +#endif /* _RESET_H_ */ Index: usr.sbin/bhyve/arm64/reset.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/arm64/reset.c @@ -0,0 +1,32 @@ +#include +#include + +#include "mem.h" +#include "reset.h" +#include "vmmapi.h" + +#define RESET_PORT 0x1c090100 + +static int +reset_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) +{ + vm_destroy(ctx); + + return (RESET_MAGIC); +} + +struct mem_range resetport ={ + "reset", + 0, + reset_handler, + NULL, + 0, + RESET_PORT, + sizeof(int) +}; + +void +init_reset(void) +{ + register_mem(&resetport); +} Index: usr.sbin/bhyve/bhyverun.c =================================================================== --- usr.sbin/bhyve/bhyverun.c +++ usr.sbin/bhyve/bhyverun.c @@ -1163,11 +1163,8 @@ break; #endif case 's': - if (strncmp(optarg, "help", strlen(optarg)) == 0) { - pci_print_supported_devices(); - exit(0); - } else if (pci_parse_slot(optarg) != 0) - exit(4); + if (pci_parse_slot(optarg) != 0) + exit(1); else break; case 'S': @@ -1303,10 +1300,8 @@ /* * Exit if a device emulation finds an error in its initilization */ - if (init_pci(ctx) != 0) { - perror("device emulation initialization error"); - exit(4); - } + if (init_pci(ctx) != 0) + exit(1); /* * Initialize after PCI, to allow a bootrom file to reserve the high Index: usr.sbin/bhyve/block_if.c =================================================================== --- usr.sbin/bhyve/block_if.c +++ usr.sbin/bhyve/block_if.c @@ -58,7 +58,10 @@ #include #include + +#ifdef BHYVE_SNAPSHOT #include +#endif #include "bhyverun.h" #include "debug.h" Index: 
usr.sbin/bhyve/mmio/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/Makefile.inc @@ -0,0 +1,17 @@ +# +# $FreeBSD$ +# + +.PATH: ${BHYVE_SRCTOP}/mmio/ +SRCS+= \ + mmio/mmio_virtio_block.c \ + mmio/mmio_virtio_console.c \ + mmio/mmio_virtio_net.c \ + mmio/mmio_virtio_rnd.c \ + mmio/mmio_emul.c \ + mmio/mmio_irq.c \ + mmio/net_utils.c \ + mmio/mmio_virtio.c + + +CFLAGS+= -I${BHYVE_SRCTOP}/mmio Index: usr.sbin/bhyve/mmio/mmio_emul.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_emul.h @@ -0,0 +1,116 @@ +#ifndef _EMUL_H_ +#define _EMUL_H_ + +#include + +#include + +struct vmctx; +struct mmio_devinst; + +// TODO suggestive naming +struct mmio_devemu { + char *de_emu; /* Device emulation name */ + + /* Instance creation */ + int (*de_init)(struct vmctx *ctx, struct mmio_devinst *di, + char *opts); + + /* Read / Write callbacks */ + void (*de_write)(struct vmctx *ctx, int vcpu, + struct mmio_devinst *di, int baridx, + uint64_t offset, int size, uint64_t val); + + uint64_t (*de_read)(struct vmctx *ctx, int vcpu, + struct mmio_devinst *di, int baridx, + uint64_t offset, int size); +}; + +#define MMIO_EMUL_SET(x) DATA_SET(mmio_set, x); +#define DI_NAMESZ 40 +#define MMIO_REGMAX 0xff +#define MMIO_REGNUM (MMIO_REGMAX + 1) + +struct devinst_addr { + uint64_t baddr; + uint64_t size; +}; + +enum lintr_stat { + IDLE, + ASSERTED, + PENDING +}; + +// TODO suggestive naming +struct mmio_devinst { + struct mmio_devemu *pi_d; /* Back ref to device */ + struct vmctx *pi_vmctx; /* Owner VM context */ + /* unused for mmio device emulation; may be used as uniquifiers */ + int pi_slot, di_func; + + char pi_name[DI_NAMESZ]; /* Instance name */ + + struct { + enum lintr_stat state; + int64_t irq; + pthread_mutex_t lock; + } di_lintr; + + void *pi_arg; /* Private data */ + + u_char pi_cfgregs[MMIO_REGNUM];/* Config regsters */ + + struct devinst_addr addr; /* 
Address info */ +}; + +int mmio_parse_opts(const char *args); +int mmio_alloc_mem(struct mmio_devinst *di); +int init_mmio(struct vmctx *ctx); +void mmio_lintr_request(struct mmio_devinst *di); +void mmio_lintr_assert(struct mmio_devinst *di); +void mmio_lintr_deassert(struct mmio_devinst *di); + +static __inline void +mmio_set_cfgreg8(struct mmio_devinst *di, size_t offset, uint32_t val) +{ + assert(offset <= MMIO_REGMAX); + *(uint32_t *)(di->pi_cfgregs + offset) = val; +} + +static __inline void +mmio_set_cfgreg16(struct mmio_devinst *di, size_t offset, uint32_t val) +{ + assert(offset <= (MMIO_REGMAX - 1) && (offset & 1) == 0); + *(uint32_t *)(di->pi_cfgregs + offset) = val; +} + +static __inline void +mmio_set_cfgreg32(struct mmio_devinst *di, size_t offset, uint32_t val) +{ + assert(offset <= (MMIO_REGMAX - 3) && (offset & 3) == 0); + *(uint32_t *)(di->pi_cfgregs + offset) = val; +} + +static __inline uint8_t +mmio_get_cfgreg8(struct mmio_devinst *di, size_t offset) +{ + assert(offset <= MMIO_REGMAX); + return (*(uint32_t *)(di->pi_cfgregs + offset)); +} + +static __inline uint16_t +mmio_get_cfgreg16(struct mmio_devinst *di, size_t offset) +{ + assert(offset <= (MMIO_REGMAX - 1) && (offset & 1) == 0); + return (*(uint32_t *)(di->pi_cfgregs + offset)); +} + +static __inline uint32_t +mmio_get_cfgreg32(struct mmio_devinst *di, size_t offset) +{ + assert(offset <= (MMIO_REGMAX - 3) && (offset & 3) == 0); + return (*(uint32_t *)(di->pi_cfgregs + offset)); +} + +#endif /* _EMUL_H_ */ Index: usr.sbin/bhyve/mmio/mmio_emul.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_emul.c @@ -0,0 +1,440 @@ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "arm64/mem.h" +#include "mmio_emul.h" +#include "mmio_irq.h" + +#define DEVEMU_MEMLIMIT 0xFD00000000UL +#define DEVEMU_MEMBASE 0xD000000000UL +#define MEM_ROUNDUP (1 << 20) +#ifndef max 
+# define max(A, B) ((A) > (B) ? (A) : (B))
+#endif
+
+static uint64_t mmio_membase;
+
+SET_DECLARE(mmio_set, struct mmio_devemu);
+
+static struct mmio_devemu *mmio_finddev(char *name);
+static void mmio_lintr_route(struct mmio_devinst *di);
+
+/* One record per device option accepted by mmio_parse_opts(). */
+static struct mmio_emul_info {
+	uint64_t size;			/* address size */
+	uint64_t baddr;			/* address */
+	int64_t irq;			/* device interrupt number */
+	char *name;			/* device name */
+	char *arg;			/* device arguments */
+	struct mmio_emul_info *next;	/* pointer for linked list */
+	struct mmio_devinst *di;	/* pointer to device instance */
+} *mmio_emul_info_head = NULL;
+
+/*
+ * MMIO options are in the form:
+ *
+ *	<size>@<base_addr>#<irq>:<emul>[,<config>]
+ *
+ * - size is the number of bytes required for the device mmio
+ * - base_addr is the base address for the MMIO mapped device;
+ * - irq specifies the device interrupt number the value MUST be a DECIMAL
+ *   integer; if the device does not use interrupts, use -1
+ * - emul is a string describing the type of device - e.g., virtio-net;
+ * - config is an optional string, depending on the device, that is used
+ *   for configuration
+ *
+ * Examples of use:
+ *	0x200@0x100000#25:virtio-net,tap0
+ *	0x100@0x200000#-1:dummy
+ */
+static void
+mmio_parse_opts_usage(const char *args)
+{
+	fprintf(stderr, "Invalid mmio arguments \"%s\"\r\n", args);
+}
+
+/*
+ * Report whether the regions [pa, pa + sa) and [pb, pb + sb) intersect.
+ *
+ * A base address of 0 means "no address assigned", so such a region never
+ * overlaps anything.  Returns 1 on overlap and 0 otherwise.
+ */
+static int
+mmio_mem_overlap(uint64_t pa, uint64_t sa, uint64_t pb, uint64_t sb)
+{
+	if ((pa == 0) || (pb == 0))
+		return (0);
+
+	/*
+	 * Measure the distance from the lower base to the higher one and
+	 * compare it with the lower region's size; this form cannot wrap
+	 * around the way "base + size" could.
+	 */
+	if (pa <= pb)
+		return (pb - pa < sa);
+
+	return (pa - pb < sb);
+}
+
+/*
+ * Parse one MMIO device option string (format described above) and record
+ * it in the mmio_emul_info list for init_mmio().
+ *
+ * Returns 0 on success, -1 if the string is malformed, EINVAL if the
+ * requested region overlaps one that was already recorded, and ENOMEM if
+ * the record cannot be allocated.
+ */
+int
+mmio_parse_opts(const char *args)
+{
+	char *emul, *config, *str;
+	uint64_t size, baddr;
+	int64_t irq;
+	int error;
+	struct mmio_emul_info *dif;
+
+
error = -1; + emul = config = NULL; + baddr = 0, size = 0; + str = strdup(args); + + if ((emul = strchr(str, ':')) != NULL) { + *emul++ = '\0'; + + /* @# */ + if (sscanf(str, "%jx@%jx#%jd", &size, &baddr, &irq) != 3 && + sscanf(str, "%jx@%jx#%jd", &size, &baddr, &irq) != 3) { + mmio_parse_opts_usage(str); + goto parse_error; + } + } else { + mmio_parse_opts_usage(str); + goto parse_error; + } + + if ((config = strchr(emul, ',')) != NULL) + *config++ = '\0'; + + /* + * check if the required address can be obtained; + * if an address has not been requested, ignore the checks + * (however, an address will have to be later identified) + */ + if (baddr != 0) { + for (dif = mmio_emul_info_head; dif != NULL; dif = dif->next) + if (mmio_mem_overlap(dif->baddr, dif->size, + baddr, size)) + break; + + if (dif != NULL) { + fprintf(stderr, "The requested address 0x%jx is " + "already bound or overlapping\r\n", baddr); + error = EINVAL; + goto parse_error; + } + } + + dif = calloc(1, sizeof(struct mmio_emul_info)); + if (dif == NULL) { + error = ENOMEM; + goto parse_error; + } + + dif->next = mmio_emul_info_head; + mmio_emul_info_head = dif; + + dif->size = size; + dif->baddr = baddr; + dif->irq = irq; + if ((emul != NULL) && (strlen(emul)) > 0) + dif->name = strdup(emul); + else + dif->name = NULL; + if ((config != NULL) && (strlen(config)) > 0) + dif->arg = strdup(config); + else + dif->arg = NULL; + + error = 0; + +parse_error: + free(str); + + return error; +} + +static int +mmio_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + struct mmio_devinst *di = arg1; + struct mmio_devemu *de = di->pi_d; + uint64_t offset; + int bidx = (int) arg2; + + assert(di->addr.baddr <= addr && + addr + size <= di->addr.baddr + di->addr.size); + + offset = addr - di->addr.baddr; + + if (dir == MEM_F_WRITE) { + if (size == 8) { + (*de->de_write)(ctx, vcpu, di, bidx, offset, + 4, *val & 0xffffffff); + (*de->de_write)(ctx, 
vcpu, di, bidx, offset + 4, + 4, *val >> 32); + } else { + (*de->de_write)(ctx, vcpu, di, bidx, offset, + size, *val); + } + } else { + if (size == 8) { + *val = (*de->de_read)(ctx, vcpu, di, bidx, + offset, 4); + *val |= (*de->de_read)(ctx, vcpu, di, bidx, + offset + 4, 4) << 32; + } else { + *val = (*de->de_read)(ctx, vcpu, di, bidx, + offset, size); + } + } + + return (0); +} + +static void +modify_mmio_registration(struct mmio_devinst *di, int registration) +{ + int error; + struct mem_range mr; + + bzero(&mr, sizeof(struct mem_range)); + mr.name = di->pi_name; + mr.base = di->addr.baddr; + mr.size = di->addr.size; + if (registration) { + mr.flags = MEM_F_RW; + mr.handler = mmio_mem_handler; + mr.arg1 = di; + mr.arg2 = 0; + error = register_mem(&mr); + } else { + error = unregister_mem(&mr); + } + + assert(error == 0); +} + +static void +register_mmio(struct mmio_devinst *di) +{ + return modify_mmio_registration(di, 1); +} + +static void +unregister_mmio(struct mmio_devinst *di) +{ + return modify_mmio_registration(di, 0); +} + +/* + * Update the MMIO address that is decoded + */ +static void +update_mem_address(struct mmio_devinst *di, uint64_t addr) +{ + /* TODO: check if the decoding is running */ + unregister_mmio(di); + + di->addr.baddr = addr; + + register_mmio(di); +} + +static int +mmio_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, + uint64_t *addr) +{ + uint64_t base; + + assert((size & (size - 1)) == 0); /* must be a power of 2 */ + + base = roundup2(*baseptr, size); + + if (base + size <= limit) { + *addr = base; + *baseptr = base + size; + return (0); + } else + return (-1); +} + +int +mmio_alloc_mem(struct mmio_devinst *di) +{ + int error; + uint64_t *baseptr, limit, addr, size; + + baseptr = &di->addr.baddr; + size = di->addr.size; + limit = DEVEMU_MEMLIMIT; + + if ((size & (size - 1)) != 0) + /* Round up to a power of 2 */ + size = 1UL << flsl(size); + + error = mmio_alloc_resource(baseptr, limit, size, &addr); + if (error != 
0) + return (error); + + di->addr.baddr = addr; + + register_mmio(di); + + return (0); +} + +static struct mmio_devemu * +mmio_finddev(char *name) +{ + struct mmio_devemu **dpp, *dp; + + SET_FOREACH(dpp, mmio_set) { + dp = *dpp; + if (!strcmp(dp->de_emu, name)) + return (dp); + } + + return (NULL); +} + +static int +mmio_init(struct vmctx *ctx, struct mmio_devemu *de, struct mmio_emul_info *dif) +{ + struct mmio_devinst *di; + int error; + + di = calloc(1, sizeof(struct mmio_devinst)); + if (di == NULL) + return (ENOMEM); + + di->pi_d = de; + di->pi_vmctx = ctx; + snprintf(di->pi_name, DI_NAMESZ, "%s-mmio", de->de_emu); + di->di_lintr.state = IDLE; + di->di_lintr.irq = dif->irq; + pthread_mutex_init(&di->di_lintr.lock, NULL); + di->addr.baddr = dif->baddr; + di->addr.size = dif->size; + /* some devices (e.g., virtio-net) use these as uniquifiers; irq number + * should be unique and sufficient */ + di->pi_slot = dif->irq; + di->di_func = dif->irq; + + error = (*de->de_init)(ctx, di, dif->arg); + + if (error == 0) { + dif->di = di; + } else { + fprintf(stderr, "Device \"%s\": initialization failed\r\n", + di->pi_name); + fprintf(stderr, "Device arguments were: %s\r\n", dif->arg); + free(di); + } + + return (error); +} + +static void +init_mmio_error(const char *name) +{ + struct mmio_devemu **mdpp, *mdp; + + fprintf(stderr, "Device \"%s\" does not exist\r\n", name); + fprintf(stderr, "The following devices are available:\r\n"); + + SET_FOREACH(mdpp, mmio_set) { + mdp = *mdpp; + fprintf(stderr, "\t%s\r\n", mdp->de_emu); + } +} + +int init_mmio(struct vmctx *ctx) +{ + struct mmio_devemu *de; + struct mmio_emul_info *dif; + int error; + + mmio_membase = DEVEMU_MEMBASE; + + for (dif = mmio_emul_info_head; dif != NULL; dif = dif->next) { + if (dif->name == NULL) + continue; + + de = mmio_finddev(dif->name); + if (de == NULL) { + init_mmio_error(dif->name); + return (1); + } + + error = mmio_init(ctx, de, dif); + if (error != 0) + return (error); + + /* + * as specified in 
the amd64 implementation, add some + * slop to the memory resources decoded, in order to + * give the guest some flexibility to reprogram the addresses + */ + mmio_membase += MEM_ROUNDUP; + mmio_membase = roundup2(mmio_membase, MEM_ROUNDUP); + } + + /* activate the interrupts */ + for (dif = mmio_emul_info_head; dif != NULL; dif = dif->next) + if (dif->di != NULL) + mmio_lintr_route(dif->di); + + /* TODO: register fallback handlers? */ + + return (0); +} + +void +mmio_lintr_request(struct mmio_devinst *di) +{ + /* do nothing */ +} + +static void +mmio_lintr_route(struct mmio_devinst *di) +{ + /* do nothing */ +} + +void +mmio_lintr_assert(struct mmio_devinst *di) +{ + pthread_mutex_lock(&di->di_lintr.lock); + if (di->di_lintr.state == IDLE) { + di->di_lintr.state = ASSERTED; + mmio_irq_assert(di); + } + pthread_mutex_unlock(&di->di_lintr.lock); +} + +void +mmio_lintr_deassert(struct mmio_devinst *di) +{ + pthread_mutex_lock(&di->di_lintr.lock); + if (di->di_lintr.state == ASSERTED) { + mmio_irq_deassert(di); + di->di_lintr.state = IDLE; + } else if (di->di_lintr.state == PENDING) { + di->di_lintr.state = IDLE; + } + pthread_mutex_unlock(&di->di_lintr.lock); +} + +/* TODO: Add dummy? 
*/ Index: usr.sbin/bhyve/mmio/mmio_irq.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_irq.h @@ -0,0 +1,12 @@ +#ifndef __MMIO_IRQ_H__ +#define __MMIO_IRQ_H__ + +struct mmio_devinst; + +void mmio_irq_init(struct vmctx *ctx); +void mmio_irq_reserve(int irq); +void mmio_irq_use(int irq); +void mmio_irq_assert(struct mmio_devinst *di); +void mmio_irq_deassert(struct mmio_devinst *di); + +#endif Index: usr.sbin/bhyve/mmio/mmio_irq.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_irq.c @@ -0,0 +1,113 @@ +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include + +#include "mmio_emul.h" +#include "mmio_irq.h" +#include "mmio_virtio.h" + +/* IRQ count to disable IRQ */ +#define IRQ_DISABLED 0xff + +static struct mmio_irq { + uint32_t use_count; /* number of binds */ + uint32_t active_count; /* number of asserts */ + uint32_t active; /* irq active */ + pthread_mutex_t lock; +} irqs[50]; + +void +mmio_irq_reserve(int irq) +{ + assert(irq >= 0 && irq < nitems(irqs)); + assert(irqs[irq].active == 0 || irqs[irq].active == IRQ_DISABLED); + irqs[irq].active = IRQ_DISABLED; +} + +void +mmio_irq_use(int irq) { + assert(irq >= 0 && irq < nitems(irqs)); + assert(irqs[irq].active != IRQ_DISABLED); + irqs[irq].active++; +} + +void +mmio_irq_init(struct vmctx *ctx) +{ + int i; + + for (i = 0; i < nitems(irqs); ++i) { + irqs[i].use_count = 0; + irqs[i].active_count = 0; + irqs[i].active = 0; + pthread_mutex_init(&irqs[i].lock, NULL); + } +} + +void +mmio_irq_assert(struct mmio_devinst *di) +{ + struct mmio_irq *irq; + uint32_t irq_status; + + assert(di->di_lintr.irq <= nitems(irqs)); + if (di->di_lintr.irq < 0) + return; + + irq = &irqs[di->di_lintr.irq]; + + pthread_mutex_lock(&irq->lock); + irq->active_count++; + + pthread_mutex_lock(&di->di_lintr.lock); + + irq_status = mmio_get_cfgreg32(di, 
VIRTIO_MMIO_INTERRUPT_STATUS); + irq_status |= VIRTIO_MMIO_INT_VRING; + mmio_set_cfgreg32(di, VIRTIO_MMIO_INTERRUPT_STATUS, irq_status); + + if (irq->active_count == 1) + vm_assert_irq(di->pi_vmctx, di->di_lintr.irq); + + pthread_mutex_unlock(&di->di_lintr.lock); + + pthread_mutex_unlock(&irq->lock); +} + +void +mmio_irq_deassert(struct mmio_devinst *di) +{ + struct mmio_irq *irq; + uint32_t irq_status; + + assert(di->di_lintr.irq <= nitems(irqs)); + if (di->di_lintr.irq < 0) + return; + + irq = &irqs[di->di_lintr.irq]; + + pthread_mutex_lock(&irq->lock); + irq->active_count--; + + pthread_mutex_lock(&di->di_lintr.lock); + + irq_status = mmio_get_cfgreg32(di, VIRTIO_MMIO_INTERRUPT_STATUS); + irq_status &= ~VIRTIO_MMIO_INT_VRING; + mmio_set_cfgreg32(di, VIRTIO_MMIO_INTERRUPT_STATUS, irq_status); + +#if 0 + /* MMIO devices do not require deassertions */ + if (irq->active_count == 0) + vm_deassert_irq(di->di_vmctx, di->di_lintr.irq); +#endif + + pthread_mutex_unlock(&di->di_lintr.lock); + + pthread_mutex_unlock(&irq->lock); +} Index: usr.sbin/bhyve/mmio/mmio_virtio.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio.h @@ -0,0 +1,484 @@ +/*- + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_H_ +#define _VIRTIO_H_ + +#include + +/* + * These are derived from several virtio specifications. + * + * Some useful links: + * https://github.com/rustyrussell/virtio-spec + * http://people.redhat.com/pbonzini/virtio-spec.pdf + */ + +/* + * A virtual device has zero or more "virtual queues" (virtqueue). + * Each virtqueue uses at least two 4096-byte pages, laid out thus: + * + * +-----------------------------------------------+ + * | "desc": descriptors, 16 bytes each | + * | ----------------------------------------- | + * | "avail": 2 uint16; uint16; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * | "used": 2 x uint16; elems; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * + * The number that appears here is always a power of two and is + * limited to no more than 32768 (as it must fit in a 16-bit field). + * If is sufficiently large, the above will occupy more than + * two pages. 
In any case, all pages must be physically contiguous
+ * within the guest's physical address space.
+ *
+ * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
+ * physical address <addr>, a 32-bit length <len>, a 16-bit
+ * <flags>, and a 16-bit <next> field (all in guest byte order).
+ *
+ * There are three flags that may be set :
+ *	NEXT    descriptor is chained, so use its "next" field
+ *	WRITE   descriptor is for host to write into guest RAM
+ *		(else host is to read from guest RAM)
+ *	INDIRECT   descriptor address field is (guest physical)
+ *		address of a linear array of descriptors
+ *
+ * Unless INDIRECT is set, <len> is the number of bytes that may
+ * be read/written from guest physical address <addr>.  If
+ * INDIRECT is set, WRITE is ignored and <len> provides the length
+ * of the indirect descriptors (and <len> must be a multiple of
+ * 16).  Note that NEXT may still be set in the main descriptor
+ * pointing to the indirect, and should be set in each indirect
+ * descriptor that uses the next descriptor (these should generally
+ * be numbered sequentially).  However, INDIRECT must not be set
+ * in the indirect descriptors.  Upon reaching an indirect descriptor
+ * without a NEXT bit, control returns to the direct descriptors.
+ *
+ * Except inside an indirect, each <next> value must be in the
+ * range [0 .. N) (i.e., the half-open interval).  (Inside an
+ * indirect, each <next> must be in the range [0 .. <len>/16).)
+ *
+ * The "avail" data structures reside in the same pages as the
+ * "desc" structures since both together are used by the device to
+ * pass information to the hypervisor's virtual driver.  These
+ * begin with a 16-bit <flags> field and 16-bit index <idx>, then
+ * have <N> 16-bit <ring> values, followed by one final 16-bit
+ * field <used_event>.  The <N> <ring> entries are simply
+ * indices into the descriptor ring (and thus must meet the same
+ * constraints as each <next> value).  However, <idx> is counted
+ * up from 0 (initially) and simply wraps around after 65535; it
+ * is taken mod <N> to find the next available entry.
+ *
+ * The "used" ring occupies a separate page or pages, and contains
+ * values written from the virtual driver back to the guest OS.
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there
+ * are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
+ * The <N> "vring_used" elements consist of a 32-bit <id> and a
+ * 32-bit <len> (vu_tlen below).  The <id> is simply the index of
+ * the head of a descriptor chain the guest made available
+ * earlier, and the <len> is the number of bytes actually written,
+ * e.g., in the case of a network driver that provided a large
+ * receive buffer but received only a small amount of data.
+ *
+ * The two event fields, <used_event> and <avail_event>, in the
+ * avail and used rings (respectively -- note the reversal!), are
+ * always provided, but are used only if the virtual device
+ * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
+ * negotiation.  Similarly, both rings provide a flag --
+ * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
+ * their <flags> field, indicating that the guest does not need an
+ * interrupt, or that the hypervisor driver does not need a
+ * notify, when descriptors are added to the corresponding ring.
+ * (These are provided only for interrupt optimization and need
+ * not be implemented.)
+ */ + +#define VIRTIO_MMIO_MAGIC_VALUE 0x000 /* VIRTIO_MMIO_* offsets below 0x100 are the generic registers switched on in vi_mmio_read()/vi_mmio_write() */ +#define VIRTIO_MMIO_VERSION 0x004 +#define VIRTIO_MMIO_DEVICE_ID 0x008 +#define VIRTIO_MMIO_VENDOR_ID 0x00c +#define VIRTIO_MMIO_HOST_FEATURES 0x010 +#define VIRTIO_MMIO_HOST_FEATURES_SEL 0x014 +#define VIRTIO_MMIO_GUEST_FEATURES 0x020 +#define VIRTIO_MMIO_GUEST_FEATURES_SEL 0x024 +#define VIRTIO_MMIO_GUEST_PAGE_SIZE 0x028 +#define VIRTIO_MMIO_QUEUE_SEL 0x030 +#define VIRTIO_MMIO_QUEUE_NUM_MAX 0x034 +#define VIRTIO_MMIO_QUEUE_NUM 0x038 +#define VIRTIO_MMIO_QUEUE_ALIGN 0x03c +#define VIRTIO_MMIO_QUEUE_PFN 0x040 +#define VIRTIO_MMIO_QUEUE_NOTIFY 0x050 +#define VIRTIO_MMIO_INTERRUPT_STATUS 0x060 +#define VIRTIO_MMIO_INTERRUPT_ACK 0x064 +#define VIRTIO_MMIO_STATUS 0x070 +#define VIRTIO_MMIO_CONFIG 0x100 /* offsets >= this are passed to vc_cfgread/vc_cfgwrite, rebased to 0 */ +#define VIRTIO_MMIO_INT_VRING (1 << 0) +#define VIRTIO_MMIO_INT_CONFIG (1 << 1) +#define VIRTIO_MMIO_VRING_ALIGN 4096 + +#define VRING_ALIGN 4096 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +struct virtio_desc { /* AKA vring_desc */ + uint64_t vd_addr; /* guest physical address */ + uint32_t vd_len; /* length of scatter/gather seg */ + uint16_t vd_flags; /* VRING_DESC_F_* */ + uint16_t vd_next; /* next desc if VRING_DESC_F_NEXT */ +} __packed; + +struct virtio_used { /* AKA vring_used_elem */ + uint32_t vu_idx; /* head of used descriptor chain */ + uint32_t vu_tlen; /* length written-to */ +} __packed; + +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +struct vring_avail { + uint16_t va_flags; /* VRING_AVAIL_F_* */ + uint16_t va_idx; /* counts to 65535, then cycles */ + uint16_t va_ring[]; /* size N, reported in QNUM value */ +/* uint16_t va_used_event; -- after N ring entries */ +} __packed; + +#define VRING_USED_F_NO_NOTIFY 1 +struct vring_used { + uint16_t vu_flags; /* VRING_USED_F_* */ + uint16_t vu_idx; /* counts to 65535, then cycles */ + struct virtio_used vu_ring[]; /* size N */ +/* uint16_t vu_avail_event; -- after N ring entries */ +} __packed; + +/* + * The 
address of any given virtual queue is determined by a single + * Page Frame Number register. The guest writes the PFN into the + * PCI config space. However, a device that has two or more + * virtqueues can have a different PFN, and size, for each queue. + * The number of queues is determinable via the PCI config space + * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means + * queue #0, 1 means queue #1, etc. Once a queue is selected, the + * remaining PFN and QNUM registers refer to that queue. + * + * QNUM is a read-only register containing a nonzero power of two + * that indicates the (hypervisor's) queue size. Or, if reading it + * produces zero, the hypervisor does not have a corresponding + * queue. (The number of possible queues depends on the virtual + * device. The block device has just one; the network device + * provides either two -- 0 = receive, 1 = transmit -- or three, + * with 2 = control.) + * + * PFN is a read/write register giving the physical page address of + * the virtqueue in guest memory (the guest must allocate enough space + * based on the hypervisor's provided QNUM). + * + * QNOTIFY is effectively write-only: when the guest writes a queue + * number to the register, the hypervisor should scan the specified + * virtqueue. (Reading QNOTIFY currently always gets 0.) 
+ */ + +/* + * PFN register shift amount (vi_vq_init() in mmio_virtio.c scales the PFN by vs_guest_page_size rather than using this shift) + */ +#define VRING_PFN 12 + +/* + * Virtio device types (the "type" argument of vi_devemu_init(), which maps a subset of them to VIRTIO_ID_* values) + * + * XXX Should really be merged with <dev/virtio/virtio.h> defines + */ +#define VIRTIO_TYPE_NET 1 +#define VIRTIO_TYPE_BLOCK 2 +#define VIRTIO_TYPE_CONSOLE 3 +#define VIRTIO_TYPE_ENTROPY 4 +#define VIRTIO_TYPE_BALLOON 5 +#define VIRTIO_TYPE_IOMEMORY 6 +#define VIRTIO_TYPE_RPMSG 7 +#define VIRTIO_TYPE_SCSI 8 +#define VIRTIO_TYPE_9P 9 + +/* experimental IDs start at 65535 and work down */ + +/* + * PCI vendor/device IDs (VIRTIO_VENDOR is also what vi_devemu_init() stores in VIRTIO_MMIO_VENDOR_ID) + */ +#define VIRTIO_VENDOR 0x1AF4 +#define VIRTIO_DEV_NET 0x1000 +#define VIRTIO_DEV_BLOCK 0x1001 +#define VIRTIO_DEV_CONSOLE 0x1003 +#define VIRTIO_DEV_RANDOM 0x1005 + +#define VIRTIO_MMIO_MAGIC_NUM 0x74726976 /* the bytes 'v','i','r','t' when stored little-endian */ +#define VIRTIO_MMIO_VERSION_NUM 0x1 /* value vi_devemu_init() stores in VIRTIO_MMIO_VERSION */ + +/* + * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, + * but a guest writing 0 to this register means "please reset". + */ +#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */ +#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */ +#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */ +#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */ + +/* + * Bits in VTCFG_R_ISR. These apply only if not using MSI-X. + * + * (We don't [yet?] ever use CONF_CHANGED.) + */ +#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ +#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ + +#define VIRTIO_MSI_NO_VECTOR 0xFFFF + +/* + * Feature flags. + * Note: bits 0 through 23 are reserved to each device type. 
+ */ +#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28) +#define VIRTIO_RING_F_EVENT_IDX (1 << 29) + +/* From section 2.3, "Virtqueue Configuration", of the virtio specification: returns the bytes needed for a ring of qsz entries (desc + avail padded to align, then used padded to align); align is handed to roundup2(), so presumably it must be a power of two -- TODO confirm */ +static inline size_t +vring_size(u_int qsz, uint32_t align) +{ + size_t size; + + /* constant 3 below = va_flags, va_idx, va_used_event */ + size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz); + size = roundup2(size, align); + + /* constant 3 below = vu_flags, vu_idx, vu_avail_event */ + size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz; + size = roundup2(size, align); + + return (size); +} + +struct vmctx; +struct mmio_devinst; +struct vqueue_info; + +/* + * A virtual device, with some number (possibly 0) of virtual + * queues and some size (possibly 0) of configuration-space + * registers private to the device. The virtio_softc should come + * at the front of each "derived class", so that a pointer to the + * virtio_softc is also a pointer to the more specific, derived- + * from-virtio driver's softc. + * + * Note: inside each hypervisor virtio driver, changes to these + * data structures must be locked against other threads, if any. + * Except for PCI config space register read/write, we assume each + * driver does the required locking, but we need a pointer to the + * lock (if there is one) for PCI config space read/write ops. + * + * When the guest reads or writes the device's config space, the + * generic layer checks for operations on the special registers + * described above. If the offset of the register(s) being read + * or written is past the CFG area (CFG0 or CFG1; in this MMIO variant, at or above VIRTIO_MMIO_CONFIG), the request is + * passed on to the virtual device, after subtracting off the + * generic-layer size. (So, drivers can just use the offset as + * an offset into "struct config", for instance.) + * + * (The virtio layer also makes sure that the read or write is to/ + * from a "good" config offset, hence vc_cfgsize, and on BAR #0. 
+ * However, the driver must verify the read or write size and offset + * and that no one is writing a readonly register.) + * + * The BROKED flag ("this thing done gone and broked") is for future + * use. + */ +#define VIRTIO_USE_MSIX 0x01 +#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ +#define VIRTIO_BROKED 0x08 /* reserved for future use (see above) */ + +struct virtio_softc { + struct virtio_consts *vs_vc; /* constants (see below) */ + int vs_flags; /* VIRTIO_* flags from above */ + pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ + struct mmio_devinst *vs_di; /* device instance */ + uint32_t vs_negotiated_caps; /* negotiated capabilities */ + uint32_t vs_align; /* virtual queue alignment (written by the guest via VIRTIO_MMIO_QUEUE_ALIGN) */ + struct vqueue_info *vs_queues; /* one per vc_nvq */ + int vs_curq; /* current queue (selected by the guest via VIRTIO_MMIO_QUEUE_SEL) */ + int irq; /* interrupt */ + uint8_t vs_status; /* value from last status write */ + uint32_t vs_guest_page_size; /* size of guest page in bytes */ +}; + +#define VS_LOCK(vs) /* take vs_mtx if the device has one; the argument is parenthesized so any pointer expression is safe */ \ +do { \ + if ((vs)->vs_mtx) \ + pthread_mutex_lock((vs)->vs_mtx); \ +} while (0) + +#define VS_UNLOCK(vs) /* release vs_mtx if the device has one */ \ +do { \ + if ((vs)->vs_mtx) \ + pthread_mutex_unlock((vs)->vs_mtx); \ +} while (0) + +struct virtio_consts { + const char *vc_name; /* name of driver (for diagnostics) */ + int vc_nvq; /* number of virtual queues */ + size_t vc_cfgsize; /* size of dev-specific config regs */ + void (*vc_reset)(void *); /* called on virtual device reset */ + void (*vc_qnotify)(void *, struct vqueue_info *); + /* called on QNOTIFY if no VQ notify */ + int (*vc_cfgread)(void *, int, int, uint32_t *); + /* called to read config regs */ + int (*vc_cfgwrite)(void *, int, int, uint32_t); + /* called to write config regs */ + void (*vc_apply_features)(void *, uint64_t); + /* called to apply negotiated features */ + uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ +}; + +/* + * Data structure allocated (statically) per virtual queue. + * + * Drivers may change vq_qsize after a reset. 
When the guest OS + * requests a device reset, the hypervisor first calls + * vs->vs_vc->vc_reset(); then the data structure below is + * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). + * + * The remaining fields should only be fussed-with by the generic + * code. + * + * Note: the addresses of vq_desc, vq_avail, and vq_used are all + * computable from each other, but it's a lot simpler if we just + * keep a pointer to each one. The event indices are similarly + * (but more easily) computable, and this time we'll compute them: + * they're just XX_ring[N]. + */ +#define VQ_ALLOC 0x01 /* set once we have a pfn (by vi_vq_init(); cleared by vi_reset_dev()) */ +#define VQ_BROKED 0x02 /* ??? */ +struct vqueue_info { + uint16_t vq_qsize; /* size of this queue (a power of 2) */ + void (*vq_notify)(void *, struct vqueue_info *); + /* called instead of vc_qnotify, if not NULL */ + + struct virtio_softc *vq_vs; /* backpointer to softc */ + uint16_t vq_num; /* we're the num'th queue in the softc */ + + uint16_t vq_flags; /* flags (see above) */ + uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */ + uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */ + + uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ + + volatile struct virtio_desc *vq_desc; /* descriptor array */ + volatile struct vring_avail *vq_avail; /* the "avail" ring */ + volatile struct vring_used *vq_used; /* the "used" ring */ +}; +/* as noted above, these are sort of backwards, name-wise: each macro reads the 16-bit slot just past the N ring entries of the other ring */ +#define VQ_AVAIL_EVENT_IDX(vq) \ + (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize]) +#define VQ_USED_EVENT_IDX(vq) \ + ((vq)->vq_avail->va_ring[(vq)->vq_qsize]) + +/* + * Is this ring ready for I/O? Returns nonzero once VQ_ALLOC is set in vq_flags. + */ +static inline int +vq_ring_ready(struct vqueue_info *vq) +{ + + return (vq->vq_flags & VQ_ALLOC); +} + +/* + * Are there "available" descriptors? (This does not count + * how many, just returns True if there are some.) 
+ */ +static inline int +vq_has_descs(struct vqueue_info *vq) +{ + + return (vq_ring_ready(vq) && vq->vq_last_avail != + vq->vq_avail->va_idx); +} + +/* + * Signal the guest for the given virtual queue: calls mmio_lintr_assert() + * on the device instance while holding the softc lock (vq itself is currently unused). + */ +static inline void +vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) +{ + VS_LOCK(vs); + mmio_lintr_assert(vs->vs_di); + VS_UNLOCK(vs); +} + +static inline void +vq_kick_enable(struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; + /* + * Full memory barrier to make sure the store to vu_flags + * happens before the load from va_idx, which results from + * a subsequent call to vq_has_descs(). + */ + atomic_thread_fence_seq_cst(); +} + +static inline void +vq_kick_disable(struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; /* sets the no-notify flag in the used ring; no barrier is issued here */ +} + +struct iovec; +void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct mmio_devinst *di, + struct vqueue_info *queues); +int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); +void vi_reset_dev(struct virtio_softc *); +void vi_set_io_res(struct virtio_softc *, int); + +int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, + struct iovec *iov, int n_iov, uint16_t *flags); +void vq_retchains(struct vqueue_info *vq, uint16_t n_chains); +void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); +void vq_endchains(struct vqueue_info *vq, int used_all_avail); + +uint64_t vi_mmio_read(struct vmctx *ctx, int vcpu, struct mmio_devinst *di, + int baridx, uint64_t offset, int size); +void vi_mmio_write(struct vmctx *ctx, int vcpu, struct mmio_devinst *di, + int baridx, uint64_t offset, int size, uint64_t value); +void vi_devemu_init(struct mmio_devinst *di, uint32_t type); +#endif /* _VIRTIO_H_ */ Index: usr.sbin/bhyve/mmio/mmio_virtio.c =================================================================== --- /dev/null +++ 
usr.sbin/bhyve/mmio/mmio_virtio.c @@ -0,0 +1,707 @@ +/*- + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/uio.h> + +#include <stdio.h> +#include <stdint.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "bhyverun.h" +#include "mmio_emul.h" +#include "mmio_virtio.h" +#include "virtio_ids.h" + +static int debug_virtio = 0; /* set nonzero to trace register accesses through DPRINTF/CFG_RW_DBG */ + +#define DPRINTF(fmt, ...) 
do { if (debug_virtio) printf(fmt, ##__VA_ARGS__); } while (0) /* do/while keeps the macro a single statement, safe inside if/else */ +#define CFG_RW_DBG(offset, value) /* no trailing ';' here: every call site supplies its own */ \ + DPRINTF("{device} | %-60s | %-35s | %-30s (%jx): value = %jx\r\n", \ + __FILE__, __func__, #offset, (uintmax_t)(offset), (uintmax_t)(value)) + +/* + * Functions for dealing with generalized "virtual devices" as + * defined by the virtio specification + */ + +/* + * In case we decide to relax the "virtio softc comes at the + * front of virtio-based device softc" constraint, let's use + * this to convert. + */ +#define DEV_SOFTC(vs) ((void *)(vs)) + +/* + * Link a virtio_softc to its constants, the device softc, and + * the MMIO device instance (vs is stored in di->pi_arg, where vi_mmio_read()/vi_mmio_write() find it). + */ +void +vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct mmio_devinst *di, + struct vqueue_info *queues) +{ + int i; + + /* vs and dev_softc addresses must match */ + assert((void *)vs == dev_softc); + vs->vs_vc = vc; + vs->vs_di = di; + di->pi_arg = vs; + + vs->vs_queues = queues; + for (i = 0; i < vc->vc_nvq; i++) { + queues[i].vq_vs = vs; + queues[i].vq_num = i; + } +} + +/* + * Reset device (device-wide). This erases all queues, i.e., + * all the queues become invalid (though we don't wipe out the + * internal pointers, we just clear the VQ_ALLOC flag). + * + * It resets negotiated features to "none". 
+ */ +void +vi_reset_dev(struct virtio_softc *vs) +{ + struct vqueue_info *vq; + int i, nvq; + + if (vs->vs_mtx) + assert(pthread_mutex_isowned_np(vs->vs_mtx)); /* caller must already hold vs_mtx when the device has one */ + + nvq = vs->vs_vc->vc_nvq; + for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { + vq->vq_flags = 0; + vq->vq_last_avail = 0; + vq->vq_save_used = 0; + vq->vq_pfn = 0; + } + vs->vs_negotiated_caps = 0; + vs->vs_curq = 0; + /* vs->vs_status = 0; -- redundant */ + mmio_lintr_deassert(vs->vs_di); +} + +void +vi_set_io_res(struct virtio_softc *vs, int barnum) +{ /* delegates to mmio_alloc_mem() for the device instance; barnum is not used */ + mmio_alloc_mem(vs->vs_di); +} + +/* + * Initialize interrupts for MMIO (barnum and use_msix are ignored; always returns 0) + */ +int +vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) +{ + /* activate interrupts */ + mmio_lintr_request(vs->vs_di); + + return (0); +} + +/* + * Initialize the currently-selected virtio queue (vs->vs_curq). + * The guest just gave us a page frame number, from which we can + * calculate the addresses of the queue. NOTE(review): vs_curq is not range-checked here, the pointer from paddr_guest2host() is used unchecked, and vs_align/vs_guest_page_size are whatever the guest last wrote -- confirm callers validate them. + */ +void +vi_vq_init(struct virtio_softc *vs, uint32_t pfn) +{ + struct vqueue_info *vq; + uint64_t phys; + size_t size; + char *base; + + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_pfn = pfn; + phys = (uint64_t)pfn * vs->vs_guest_page_size; + size = vring_size(vq->vq_qsize, vs->vs_align); + base = paddr_guest2host(vs->vs_di->pi_vmctx, phys, size); + + /* First page(s) are descriptors... */ + vq->vq_desc = (struct virtio_desc *)base; + base += vq->vq_qsize * sizeof(struct virtio_desc); + + /* ... immediately followed by "avail" ring (entirely uint16_t's) */ + vq->vq_avail = (struct vring_avail *)base; + base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + + /* Then it's rounded up to the next vs_align boundary... */ + base = (char *)roundup2((uintptr_t)base, vs->vs_align); + + /* ... and the last page(s) are the used ring. */ + vq->vq_used = (struct vring_used *)base; + + /* Mark queue as allocated, and start at 0 when we use it. 
*/ + vq->vq_flags = VQ_ALLOC; + vq->vq_last_avail = 0; + vq->vq_save_used = 0; +} + +/* + * Helper inline for vq_getchain(): record the i'th "real" + * descriptor. Does nothing when i >= n_iov; flags may be NULL. + */ +static inline void +_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, + struct iovec *iov, int n_iov, uint16_t *flags) { + + if (i >= n_iov) + return; + iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); + iov[i].iov_len = vd->vd_len; + if (flags != NULL) + flags[i] = vd->vd_flags; +} +#define VQ_MAX_DESCRIPTORS 512 /* see below */ + +/* + * Examine the chain of descriptors starting at the "next one" to + * make sure that they describe a sensible request. If so, return + * the number of "real" descriptors that would be needed/used in + * acting on this request. This may be smaller than the number of + * available descriptors, e.g., if there are two available but + * they are two separate requests, this just returns 1. Or, it + * may be larger: if there are indirect descriptors involved, + * there may only be one descriptor available but it may be an + * indirect pointing to eight more. We return 8 in this case, + * i.e., we do not count the indirect descriptors, only the "real" + * ones. + * + * Basically, this vets the vd_flags and vd_next field of each + * descriptor and tells you how many are involved. Since some may + * be indirect, this also needs the vmctx (in the mmio_devinst + * at vs->vs_di) so that it can find indirect descriptors. + * + * As we process each descriptor, we copy and adjust it (guest to + * host address wise, also using the vmctx) into the given iov[] + * array (of the given size). If the array overflows, we stop + * placing values into the array but keep processing descriptors, + * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. 
+ * So you, the caller, must not assume that iov[] is as big as the + * return value (you can process the same thing twice to allocate + * a larger iov array if needed, or supply a zero length to find + * out how much space is needed). + * + * If you want to verify the WRITE flag on each descriptor, pass a + * non-NULL "flags" pointer to an array of "uint16_t" of the same size + * as n_iov and we'll copy each vd_flags field after unwinding any + * indirects. + * + * If some descriptor(s) are invalid, this prints a diagnostic message + * and returns -1. If no descriptors are ready now it simply returns 0. + * + * You are assumed to have done a vq_ring_ready() if needed (note + * that vq_has_descs() does one). + */ +int +vq_getchain(struct vqueue_info *vq, uint16_t *pidx, + struct iovec *iov, int n_iov, uint16_t *flags) +{ + int i; + u_int ndesc, n_indir; + u_int idx, next; + volatile struct virtio_desc *vdir, *vindir, *vp; + struct vmctx *ctx; + struct virtio_softc *vs; + const char *name; + + vs = vq->vq_vs; + name = vs->vs_vc->vc_name; + + /* + * Note: it's the responsibility of the guest not to + * update vq->vq_avail->va_idx until all of the descriptors + * the guest has written are valid (including all their + * vd_next fields and vd_flags). + * + * Compute (va_idx - last_avail) in integers mod 2**16. This is + * the number of descriptors the guest has made available + * since the last time we updated vq->vq_last_avail. + * + * We just need to do the subtraction as an unsigned int, + * then trim off excess bits. + */ + idx = vq->vq_last_avail; + ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); + if (ndesc == 0) + return (0); + if (ndesc > vq->vq_qsize) { + /* XXX need better way to diagnose issues */ + fprintf(stderr, + "%s: ndesc (%u) out of range, driver confused?\r\n", + name, (u_int)ndesc); + return (-1); + } + + /* + * Now count/parse "involved" descriptors starting from + * the head of the chain. 
+ * + * To prevent loops, we could be more complicated and + * check whether we're re-visiting a previously visited + * index, but we just abort if the count gets excessive. + */ + ctx = vs->vs_di->pi_vmctx; + *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; + vq->vq_last_avail++; /* NOTE(review): advanced before the chain is validated, so the -1 returns below leave this entry consumed -- confirm that is intended */ + for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { + if (next >= vq->vq_qsize) { + fprintf(stderr, + "%s: descriptor index %u out of range, " + "driver confused?\r\n", + name, next); + return (-1); + } + vdir = &vq->vq_desc[next]; + if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { + _vq_record(i, vdir, ctx, iov, n_iov, flags); + i++; + } else if ((vs->vs_vc->vc_hv_caps & + VIRTIO_RING_F_INDIRECT_DESC) == 0) { + fprintf(stderr, + "%s: descriptor has forbidden INDIRECT flag, " + "driver confused?\r\n", + name); + return (-1); + } else { + n_indir = vdir->vd_len / 16; /* 16 bytes per indirect descriptor, as described in the header's ring-layout comment */ + if ((vdir->vd_len & 0xf) || n_indir == 0) { + fprintf(stderr, + "%s: invalid indir len 0x%x, " + "driver confused?\r\n", + name, (u_int)vdir->vd_len); + return (-1); + } + vindir = paddr_guest2host(ctx, + vdir->vd_addr, vdir->vd_len); + /* + * Indirects start at the 0th, then follow + * their own embedded "next"s until those run + * out. Each one's indirect flag must be off + * (we don't really have to check, could just + * ignore errors...). + */ + next = 0; + for (;;) { + vp = &vindir[next]; + if (vp->vd_flags & VRING_DESC_F_INDIRECT) { + fprintf(stderr, + "%s: indirect desc has INDIR flag," + " driver confused?\r\n", + name); + return (-1); + } + _vq_record(i, vp, ctx, iov, n_iov, flags); + if (++i > VQ_MAX_DESCRIPTORS) + goto loopy; + if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + next = vp->vd_next; + if (next >= n_indir) { + fprintf(stderr, + "%s: invalid next %u > %u, " + "driver confused?\r\n", + name, (u_int)next, n_indir); + return (-1); + } + } + } + if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) + return (i); + } +loopy: + fprintf(stderr, + "%s: descriptor loop? 
count > %d - driver confused?\r\n", + name, i); + return (-1); +} + +/* + * Return the n_chains most recently fetched request chains back to the available queue (by rewinding vq_last_avail). + * + * (These chains are the ones you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_retchains(struct vqueue_info *vq, uint16_t n_chains) +{ + + vq->vq_last_avail -= n_chains; +} + +/* + * Return specified request chain to the guest, setting its I/O length + * to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +{ + uint16_t uidx, mask; + volatile struct vring_used *vuh; + volatile struct virtio_used *vue; + + /* + * Notes: + * - mask is N-1 where N is a power of 2 so computes x % N + * - vuh points to the "used" data shared with guest + * - vue points to the "used" ring entry we want to update + * - idx is the chain head index that vq_getchain() stored in *pidx. + * + * (I apologize for the two fields named vu_idx; the + * virtio spec calls the one that vue points to, "id"...) + */ + mask = vq->vq_qsize - 1; + vuh = vq->vq_used; + + uidx = vuh->vu_idx; + vue = &vuh->vu_ring[uidx++ & mask]; + vue->vu_idx = idx; + vue->vu_tlen = iolen; + vuh->vu_idx = uidx; +} + +/* + * Driver has finished processing "available" chains and calling + * vq_relchain on each one. If driver used all the available + * chains, used_all_avail should be set. + * + * If the "used" index moved we may need to inform the guest, i.e., + * deliver an interrupt. Even if the used index did NOT move we + * may need to deliver an interrupt, if the avail ring is empty and + * we are supposed to interrupt on empty. + * + * Note that used_all_avail is provided by the caller because it's + * a snapshot of the ring state when the caller decided to finish interrupt + * processing -- it's possible that descriptors became available after + * that point. 
(It's also typically a constant 1/True as well.) + */ +void +vq_endchains(struct vqueue_info *vq, int used_all_avail) +{ + struct virtio_softc *vs; + uint16_t event_idx, new_idx, old_idx; + int intr; + + /* + * Interrupt generation: if we're using EVENT_IDX, + * interrupt if we've crossed the event threshold. + * Otherwise interrupt is generated if we added "used" entries, + * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. + * + * In any case, though, if NOTIFY_ON_EMPTY is set and the + * entire avail was processed, we need to interrupt always. + */ + vs = vq->vq_vs; + old_idx = vq->vq_save_used; + vq->vq_save_used = new_idx = vq->vq_used->vu_idx; + if (used_all_avail && + (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) + intr = 1; + else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { + event_idx = VQ_USED_EVENT_IDX(vq); + /* + * This calculation is per docs and the kernel + * (see src/sys/dev/virtio/virtio_ring.h). + */ + intr = (uint16_t)(new_idx - event_idx - 1) < + (uint16_t)(new_idx - old_idx); + } else { + intr = new_idx != old_idx && + !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); + } + if (intr) + vq_interrupt(vs, vq); +} + +/* + * Handle pci config space reads. + * If it's to the interrupt system, do that + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +uint64_t +vi_mmio_read(struct vmctx *ctx, int vcpu, struct mmio_devinst *di, + int baridx, uint64_t offset, int size) +{ + struct virtio_softc *vs = di->pi_arg; + struct virtio_consts *vc; + const char *name; + uint64_t sel; + uint32_t value; + int error; + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + value = size == 1 ? 0xff : size == 2 ? 
0xffff : 0xffffffff; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (offset >= VIRTIO_MMIO_CONFIG) { + error = (*vc->vc_cfgread)(DEV_SOFTC(vs), + offset - VIRTIO_MMIO_CONFIG, + size, + &value); + if (error) + goto bad; + + CFG_RW_DBG(offset, value); + goto done; + } + + switch (offset) { + case VIRTIO_MMIO_MAGIC_VALUE: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_MAGIC_VALUE, value); + break; + case VIRTIO_MMIO_VERSION: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_VERSION, value); + break; + case VIRTIO_MMIO_DEVICE_ID: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_DEVICE_ID, value); + break; + case VIRTIO_MMIO_VENDOR_ID: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_VENDOR_ID, value); + break; + case VIRTIO_MMIO_INTERRUPT_STATUS: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_INTERRUPT_STATUS, value); + break; + case VIRTIO_MMIO_STATUS: + value = mmio_get_cfgreg32(di, offset); + CFG_RW_DBG(VIRTIO_MMIO_STATUS, value); + break; + case VIRTIO_MMIO_HOST_FEATURES: + sel = mmio_get_cfgreg32(di, VIRTIO_MMIO_HOST_FEATURES_SEL); + value = (vc->vc_hv_caps >> (32 * sel)) & 0xffffffff; + CFG_RW_DBG(VIRTIO_MMIO_HOST_FEATURES, value); + break; + case VIRTIO_MMIO_QUEUE_NUM_MAX: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_qsize : 0; + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_NUM_MAX, value); + break; + case VIRTIO_MMIO_QUEUE_PFN: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_pfn : 0; + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_PFN, value); + break; + default: + CFG_RW_DBG(offset, value); + goto bad; + break; + } + + goto done; + +bad: + fprintf(stderr, "%s: read from bad offset/size: %jd/%d\r\n", + name, (uintmax_t)offset, size); + +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); + return (value); +} + +/* + * Handle pci config space writes. + * If it's to the MSI-X info, do that. 
+ * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +void +vi_mmio_write(struct vmctx *ctx, int vcpu, struct mmio_devinst *di, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct virtio_softc *vs = di->pi_arg; + struct vqueue_info *vq; + struct virtio_consts *vc; + const char *name; + int error; + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (offset >= VIRTIO_MMIO_CONFIG) { + error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), + offset - VIRTIO_MMIO_CONFIG, + size, value); + if (error) + goto bad; + + CFG_RW_DBG(offset, value); + goto done; + } + + switch (offset) { + case VIRTIO_MMIO_HOST_FEATURES_SEL: + CFG_RW_DBG(VIRTIO_MMIO_HOST_FEATURES_SEL, value); + mmio_set_cfgreg32(di, offset, value); + break; + case VIRTIO_MMIO_GUEST_FEATURES_SEL: + CFG_RW_DBG(VIRTIO_MMIO_GUEST_FEATURES_SEL, value); + mmio_set_cfgreg32(di, offset, value); + break; + case VIRTIO_MMIO_INTERRUPT_ACK: + CFG_RW_DBG(VIRTIO_MMIO_INTERRUPT_ACK, value); + mmio_lintr_deassert(di); + mmio_set_cfgreg32(di, offset, value); + break; + case VIRTIO_MMIO_STATUS: + CFG_RW_DBG(VIRTIO_MMIO_STATUS, value); + mmio_set_cfgreg32(di, offset, value); + vs->vs_status = value; + if (value == 0) + (*vc->vc_reset)(DEV_SOFTC(vs)); + break; + case VIRTIO_MMIO_QUEUE_NUM: + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_NUM, value); + mmio_set_cfgreg32(di, offset, value); + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_qsize = value; + break; + case VIRTIO_MMIO_GUEST_FEATURES: + CFG_RW_DBG(VIRTIO_MMIO_GUEST_FEATURES, value); + mmio_set_cfgreg32(di, offset, value); + vs->vs_negotiated_caps = value & vc->vc_hv_caps; + if (vc->vc_apply_features) + (*vc->vc_apply_features)(DEV_SOFTC(vs), + vs->vs_negotiated_caps); + break; + case VIRTIO_MMIO_GUEST_PAGE_SIZE: + mmio_set_cfgreg32(di, offset, value); + vs->vs_guest_page_size = value; + break; + case VIRTIO_MMIO_QUEUE_SEL: + 
CFG_RW_DBG(VIRTIO_MMIO_QUEUE_SEL, value); + mmio_set_cfgreg32(di, offset, value); + /* + * Note that the guest is allowed to select an + * invalid queue; we just need to return a QNUM + * of 0 while the bad queue is selected. + */ + vs->vs_curq = value; + break; + case VIRTIO_MMIO_QUEUE_ALIGN: + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_ALIGN, value); + mmio_set_cfgreg32(di, offset, value); + vs->vs_align = value; + break; + case VIRTIO_MMIO_QUEUE_PFN: + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_PFN, value); + mmio_set_cfgreg32(di, offset, value); + if (vs->vs_curq >= vc->vc_nvq) + fprintf(stderr, "%s: curq %d >= max %d\r\n", + name, vs->vs_curq, vc->vc_nvq); + else + vi_vq_init(vs, value); + break; + case VIRTIO_MMIO_QUEUE_NOTIFY: + CFG_RW_DBG(VIRTIO_MMIO_QUEUE_NOTIFY, value); + if (value >= vc->vc_nvq) { + fprintf(stderr, "%s: queue %d notify out of range\r\n", + name, (int)value); + break; + } + mmio_set_cfgreg32(di, offset, value); + vq = &vs->vs_queues[value]; + if (vq->vq_notify) + (*vq->vq_notify)(DEV_SOFTC(vs), vq); + else if (vc->vc_qnotify) + (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); + else + fprintf(stderr, + "%s: qnotify queue %d: missing vq/vc notify\r\n", + name, (int)value); + break; + default: + CFG_RW_DBG(offset, value); + goto bad; + break; + } + + goto done; + +bad: + fprintf(stderr, "%s: write to bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); +} + +void +vi_devemu_init(struct mmio_devinst *di, uint32_t type) +{ + uint32_t id; + + switch (type) { + case VIRTIO_TYPE_NET: + id = VIRTIO_ID_NETWORK; + break; + case VIRTIO_TYPE_BLOCK: + id = VIRTIO_ID_BLOCK; + break; + case VIRTIO_TYPE_CONSOLE: + id = VIRTIO_ID_CONSOLE; + break; + case VIRTIO_TYPE_ENTROPY: + id = VIRTIO_ID_ENTROPY; + break; + default: + return; + } + + mmio_set_cfgreg32(di, VIRTIO_MMIO_MAGIC_VALUE, VIRTIO_MMIO_MAGIC_NUM); + mmio_set_cfgreg32(di, VIRTIO_MMIO_VERSION, VIRTIO_MMIO_VERSION_NUM); + mmio_set_cfgreg32(di, VIRTIO_MMIO_DEVICE_ID, 
id); + mmio_set_cfgreg32(di, VIRTIO_MMIO_VENDOR_ID, VIRTIO_VENDOR); +} Index: usr.sbin/bhyve/mmio/mmio_virtio_block.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio_block.c @@ -0,0 +1,424 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" + +#include "block_if.h" + +#define VTBLK_RINGSZ 128 + +_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); + +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 +#define VTBLK_S_UNSUPP 2 + +#define VTBLK_BLK_ID_BYTES 20 + 1 + +/* Capability bits */ +#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ +#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ + +/* + * Host capabilities + */ +#define VTBLK_S_HOSTCAPS \ + ( VTBLK_F_SEG_MAX | \ + VTBLK_F_BLK_SIZE | \ + VTBLK_F_FLUSH | \ + VTBLK_F_TOPOLOGY | \ + VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ + +/* + * Config space "registers" + */ +struct vtblk_config { + uint64_t vbc_capacity; + uint32_t vbc_size_max; + uint32_t vbc_seg_max; + struct { + uint16_t cylinders; + uint8_t heads; + uint8_t sectors; + } vbc_geometry; + uint32_t vbc_blk_size; + struct { + uint8_t physical_block_exp; + uint8_t alignment_offset; + uint16_t min_io_size; + uint32_t opt_io_size; + } vbc_topology; + uint8_t vbc_writeback; +} __packed; + +/* + * Fixed-size block header + */ +struct virtio_blk_hdr { +#define VBH_OP_READ 0 +#define VBH_OP_WRITE 1 +#define VBH_OP_FLUSH 4 +#define VBH_OP_FLUSH_OUT 5 +#define VBH_OP_IDENT 8 +#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ + uint32_t vbh_type; + uint32_t vbh_ioprio; + uint64_t vbh_sector; +} __packed; + +/* + * Debug printf + */ +static int pci_vtblk_debug; +#define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params +#define WPRINTF(params) PRINTLN 
params + +struct pci_vtblk_ioreq { + struct blockif_req io_req; + struct pci_vtblk_softc *io_sc; + uint8_t *io_status; + uint16_t io_idx; +}; + +/* + * Per-device softc + */ +struct pci_vtblk_softc { + struct virtio_softc vbsc_vs; + pthread_mutex_t vsc_mtx; + struct vqueue_info vbsc_vq; + struct vtblk_config vbsc_cfg; + struct blockif_ctxt *bc; + char vbsc_ident[VTBLK_BLK_ID_BYTES]; + struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; +}; + +static void pci_vtblk_reset(void *); +static void pci_vtblk_notify(void *, struct vqueue_info *); +static int pci_vtblk_cfgread(void *, int, int, uint32_t *); +static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); + +static struct virtio_consts vtblk_vi_consts = { + "vtblk", /* our name */ + 1, /* we support 1 virtqueue */ + sizeof(struct vtblk_config), /* config reg size */ + pci_vtblk_reset, /* reset */ + pci_vtblk_notify, /* device-wide qnotify */ + pci_vtblk_cfgread, /* read PCI config */ + pci_vtblk_cfgwrite, /* write PCI config */ + NULL, /* apply negotiated features */ + VTBLK_S_HOSTCAPS, /* our capabilities */ +}; + +static void +pci_vtblk_reset(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device reset requested !")); + vi_reset_dev(&sc->vbsc_vs); +} + +static void +pci_vtblk_done(struct blockif_req *br, int err) +{ + struct pci_vtblk_ioreq *io = br->br_param; + struct pci_vtblk_softc *sc = io->io_sc; + + /* convert errno into a virtio block error return */ + if (err == EOPNOTSUPP || err == ENOSYS) + *io->io_status = VTBLK_S_UNSUPP; + else if (err != 0) + *io->io_status = VTBLK_S_IOERR; + else + *io->io_status = VTBLK_S_OK; + + /* + * Return the descriptor back to the host. + * We wrote 1 byte (our status) to host. 
+ */ + pthread_mutex_lock(&sc->vsc_mtx); + vq_relchain(&sc->vbsc_vq, io->io_idx, 1); + vq_endchains(&sc->vbsc_vq, 0); + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) +{ + struct virtio_blk_hdr *vbh; + struct pci_vtblk_ioreq *io; + int i, n; + int err; + ssize_t iolen; + int writeop, type; + struct iovec iov[BLOCKIF_IOV_MAX + 2]; + uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; + + n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); + + /* + * The first descriptor will be the read-only fixed header, + * and the last is for status (hence +2 above and below). + * The remaining iov's are the actual data I/O vectors. + * + * XXX - note - this fails on crash dump, which does a + * VIRTIO_BLK_T_FLUSH with a zero transfer length + */ + assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); + + io = &sc->vbsc_ios[idx]; + assert((flags[0] & VRING_DESC_F_WRITE) == 0); + assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); + vbh = iov[0].iov_base; + memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); + io->io_req.br_iovcnt = n - 2; + io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE; + io->io_status = iov[--n].iov_base; + assert(iov[n].iov_len == 1); + assert(flags[n] & VRING_DESC_F_WRITE); + + /* + * XXX + * The guest should not be setting the BARRIER flag because + * we don't advertise the capability. + */ + type = vbh->vbh_type & ~VBH_FLAG_BARRIER; + writeop = (type == VBH_OP_WRITE); + + iolen = 0; + for (i = 1; i < n; i++) { + /* + * - write op implies read-only descriptor, + * - read/ident op implies write-only descriptor, + * therefore test the inverse of the descriptor bit + * to the op. + */ + assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); + iolen += iov[i].iov_len; + } + io->io_req.br_resid = iolen; + + DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %lld", + writeop ? 
"write" : "read/ident", iolen, i - 1, + (long long) io->io_req.br_offset)); + + switch (type) { + case VBH_OP_READ: + err = blockif_read(sc->bc, &io->io_req); + break; + case VBH_OP_WRITE: + err = blockif_write(sc->bc, &io->io_req); + break; + case VBH_OP_FLUSH: + case VBH_OP_FLUSH_OUT: + err = blockif_flush(sc->bc, &io->io_req); + break; + case VBH_OP_IDENT: + /* Assume a single buffer */ + /* S/n equal to buffer is not zero-terminated. */ + memset(iov[1].iov_base, 0, iov[1].iov_len); + strncpy(iov[1].iov_base, sc->vbsc_ident, + MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); + pci_vtblk_done(&io->io_req, 0); + return; + default: + pci_vtblk_done(&io->io_req, EOPNOTSUPP); + return; + } + assert(err == 0); +} + +static void +pci_vtblk_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtblk_softc *sc = vsc; + + while (vq_has_descs(vq)) + pci_vtblk_proc(sc, vq); +} + +static int +pci_vtblk_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; + MD5_CTX mdctx; + u_char digest[16]; + struct pci_vtblk_softc *sc; + off_t size; + int i, sectsz, sts, sto; + + if (opts == NULL) { + WPRINTF(("virtio-block: backing device required")); + return (1); + } + + /* + * The supplied backing file has to exist + */ + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->di_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + perror("Could not open backing file"); + return (1); + } + + size = blockif_size(bctxt); + sectsz = blockif_sectsz(bctxt); + blockif_psectsz(bctxt, &sts, &sto); + + sc = calloc(1, sizeof(struct pci_vtblk_softc)); + sc->bc = bctxt; + for (i = 0; i < VTBLK_RINGSZ; i++) { + struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; + io->io_req.br_callback = pci_vtblk_done; + io->io_req.br_param = io; + io->io_sc = sc; + io->io_idx = i; + } + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* init virtio softc and virtqueues */ + vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, 
pi, &sc->vbsc_vq); + sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; + + sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; + /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ + + /* + * Create an identifier for the backing file. Use parts of the + * md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, strlen(opts)); + MD5Final(digest, &mdctx); + snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); + + /* setup virtio block config space */ + sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ + sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ + + /* + * If Linux is presented with a seg_max greater than the virtio queue + * size, it can stumble into situations where it violates its own + * invariants and panics. For safety, we keep seg_max clamped, paying + * heed to the two extra descriptors needed for the header and status + * of a request. + */ + sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); + sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geometry.heads = 0; + sc->vbsc_cfg.vbc_geometry.sectors = 0; + sc->vbsc_cfg.vbc_blk_size = sectsz; + sc->vbsc_cfg.vbc_topology.physical_block_exp = + (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; + sc->vbsc_cfg.vbc_topology.alignment_offset = + (sto != 0) ? ((sts - sto) / sectsz) : 0; + sc->vbsc_cfg.vbc_topology.min_io_size = 0; + sc->vbsc_cfg.vbc_topology.opt_io_size = 0; + sc->vbsc_cfg.vbc_writeback = 0; + + /* + * Should we move some of this into virtio.c? Could + * have the device, class, and subdev_0 as fields in + * the virtio constants structure. 
+ */ + vi_devemu_init(pi, VIRTIO_TYPE_BLOCK); + + if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { + blockif_close(sc->bc); + free(sc); + return (1); + } + vi_set_io_res(&sc->vbsc_vs, 0); + return (0); +} + +static int +pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + + DPRINTF(("vtblk: write to readonly reg %d", offset)); + return (1); +} + +static int +pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtblk_softc *sc = vsc; + void *ptr; + + /* our caller has already verified offset and size */ + ptr = (uint8_t *)&sc->vbsc_cfg + offset; + memcpy(retval, ptr, size); + return (0); +} + +struct mmio_devemu pci_de_vblk = { + .de_emu = "virtio-blk", + .de_init = pci_vtblk_init, + .de_write = vi_mmio_write, + .de_read = vi_mmio_read +}; +MMIO_EMUL_SET(pci_de_vblk); Index: usr.sbin/bhyve/mmio/mmio_virtio_console.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio_console.c @@ -0,0 +1,681 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 iXsystems Inc. + * All rights reserved. + * + * This software was developed by Jakub Klama + * under sponsorship from iXsystems Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" + +#include "mevent.h" +#include "sockstream.h" + +#define VTCON_RINGSZ 64 +#define VTCON_MAXPORTS 16 +#define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) + +#define VTCON_DEVICE_READY 0 +#define VTCON_DEVICE_ADD 1 +#define VTCON_DEVICE_REMOVE 2 +#define VTCON_PORT_READY 3 +#define VTCON_CONSOLE_PORT 4 +#define VTCON_CONSOLE_RESIZE 5 +#define VTCON_PORT_OPEN 6 +#define VTCON_PORT_NAME 7 + +#define VTCON_F_SIZE 0 +#define VTCON_F_MULTIPORT 1 +#define VTCON_F_EMERG_WRITE 2 +#define VTCON_S_HOSTCAPS \ + (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) + +static int pci_vtcon_debug; +#define DPRINTF(params) if (pci_vtcon_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params + +struct pci_vtcon_softc; +struct pci_vtcon_port; +struct pci_vtcon_config; +typedef void 
(pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *, + int); + +struct pci_vtcon_port { + struct pci_vtcon_softc * vsp_sc; + int vsp_id; + const char * vsp_name; + bool vsp_enabled; + bool vsp_console; + bool vsp_rx_ready; + bool vsp_open; + int vsp_rxq; + int vsp_txq; + void * vsp_arg; + pci_vtcon_cb_t * vsp_cb; +}; + +struct pci_vtcon_sock +{ + struct pci_vtcon_port * vss_port; + const char * vss_path; + struct mevent * vss_server_evp; + struct mevent * vss_conn_evp; + int vss_server_fd; + int vss_conn_fd; + bool vss_open; +}; + +struct pci_vtcon_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTCON_MAXQ]; + pthread_mutex_t vsc_mtx; + uint64_t vsc_cfg; + uint64_t vsc_features; + char * vsc_rootdir; + int vsc_kq; + int vsc_nports; + bool vsc_ready; + struct pci_vtcon_port vsc_control_port; + struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS]; + struct pci_vtcon_config *vsc_config; +}; + +struct pci_vtcon_config { + uint16_t cols; + uint16_t rows; + uint32_t max_nr_ports; + uint32_t emerg_wr; +} __attribute__((packed)); + +struct pci_vtcon_control { + uint32_t id; + uint16_t event; + uint16_t value; +} __attribute__((packed)); + +struct pci_vtcon_console_resize { + uint16_t cols; + uint16_t rows; +} __attribute__((packed)); + +static void pci_vtcon_reset(void *); +static void pci_vtcon_notify_rx(void *, struct vqueue_info *); +static void pci_vtcon_notify_tx(void *, struct vqueue_info *); +static int pci_vtcon_cfgread(void *, int, int, uint32_t *); +static int pci_vtcon_cfgwrite(void *, int, int, uint32_t); +static void pci_vtcon_neg_features(void *, uint64_t); +static void pci_vtcon_sock_accept(int, enum ev_type, void *); +static void pci_vtcon_sock_rx(int, enum ev_type, void *); +static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *, + int); +static void pci_vtcon_control_send(struct pci_vtcon_softc *, + struct pci_vtcon_control *, const void *, size_t); +static void pci_vtcon_announce_port(struct 
pci_vtcon_port *); +static void pci_vtcon_open_port(struct pci_vtcon_port *, bool); + +static struct virtio_consts vtcon_vi_consts = { + "vtcon", /* our name */ + VTCON_MAXQ, /* we support VTCON_MAXQ virtqueues */ + sizeof(struct pci_vtcon_config), /* config reg size */ + pci_vtcon_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtcon_cfgread, /* read virtio config */ + pci_vtcon_cfgwrite, /* write virtio config */ + pci_vtcon_neg_features, /* apply negotiated features */ + VTCON_S_HOSTCAPS, /* our capabilities */ +}; + + +static void +pci_vtcon_reset(void *vsc) +{ + struct pci_vtcon_softc *sc; + + sc = vsc; + + DPRINTF(("vtcon: device reset requested!")); + vi_reset_dev(&sc->vsc_vs); +} + +static void +pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtcon_softc *sc = vsc; + + sc->vsc_features = negotiated_features; +} + +static int +pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtcon_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline struct pci_vtcon_port * +pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq) +{ + uint16_t num = vq->vq_num; + + if (num == 0 || num == 1) + return (&sc->vsc_ports[0]); + + if (num == 2 || num == 3) + return (&sc->vsc_control_port); + + return (&sc->vsc_ports[(num / 2) - 1]); +} + +static inline struct vqueue_info * +pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue) +{ + int qnum; + + qnum = tx_queue ? 
port->vsp_txq : port->vsp_rxq; + return (&port->vsp_sc->vsc_queues[qnum]); +} + +static struct pci_vtcon_port * +pci_vtcon_port_add(struct pci_vtcon_softc *sc, const char *name, + pci_vtcon_cb_t *cb, void *arg) +{ + struct pci_vtcon_port *port; + + if (sc->vsc_nports == VTCON_MAXPORTS) { + errno = EBUSY; + return (NULL); + } + + port = &sc->vsc_ports[sc->vsc_nports++]; + port->vsp_id = sc->vsc_nports - 1; + port->vsp_sc = sc; + port->vsp_name = name; + port->vsp_cb = cb; + port->vsp_arg = arg; + + if (port->vsp_id == 0) { + /* port0 */ + port->vsp_txq = 0; + port->vsp_rxq = 1; + } else { + port->vsp_txq = sc->vsc_nports * 2; + port->vsp_rxq = port->vsp_txq + 1; + } + + port->vsp_enabled = true; + return (port); +} + +static int +pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name, + const char *path) +{ + struct pci_vtcon_sock *sock; + struct sockaddr_un sun; + char *pathcopy; + int s = -1, fd = -1, error = 0; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + sock = calloc(1, sizeof(struct pci_vtcon_sock)); + if (sock == NULL) { + error = -1; + goto out; + } + + s = socket(AF_UNIX, SOCK_STREAM, 0); + if (s < 0) { + error = -1; + goto out; + } + + pathcopy = strdup(path); + if (pathcopy == NULL) { + error = -1; + goto out; + } + + fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY); + if (fd < 0) { + free(pathcopy); + error = -1; + goto out; + } + + sun.sun_family = AF_UNIX; + sun.sun_len = sizeof(struct sockaddr_un); + strcpy(pathcopy, path); + strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); + free(pathcopy); + + if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { + error = -1; + goto out; + } + + if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) { + error = -1; + goto out; + } + + if (listen(s, 1) < 0) { + error = -1; + goto out; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for 
sandbox"); +#endif + + sock->vss_port = pci_vtcon_port_add(sc, name, pci_vtcon_sock_tx, sock); + if (sock->vss_port == NULL) { + error = -1; + goto out; + } + + sock->vss_open = false; + sock->vss_conn_fd = -1; + sock->vss_server_fd = s; + sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept, + sock); + + if (sock->vss_server_evp == NULL) { + error = -1; + goto out; + } + +out: + if (fd != -1) + close(fd); + + if (error != 0) { + if (s != -1) + close(s); + free(sock); + } + + return (error); +} + +static void +pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + int s; + + s = accept(sock->vss_server_fd, NULL, NULL); + if (s < 0) + return; + + if (sock->vss_open) { + close(s); + return; + } + + sock->vss_open = true; + sock->vss_conn_fd = s; + sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock); + + pci_vtcon_open_port(sock->vss_port, true); +} + +static void +pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_port *port; + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + struct vqueue_info *vq; + struct iovec iov; + static char dummybuf[2048]; + int len, n; + uint16_t idx; + + port = sock->vss_port; + vq = pci_vtcon_port_to_vq(port, true); + + if (!sock->vss_open || !port->vsp_rx_ready) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + if (len == 0) + goto close; + + return; + } + + if (!vq_has_descs(vq)) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + vq_endchains(vq, 1); + if (len == 0) + goto close; + + return; + } + + do { + n = vq_getchain(vq, &idx, &iov, 1, NULL); + len = readv(sock->vss_conn_fd, &iov, n); + + if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) { + vq_retchains(vq, 1); + vq_endchains(vq, 0); + if (len == 0) + goto close; + + return; + } + + vq_relchain(vq, idx, len); + } while (vq_has_descs(vq)); + + vq_endchains(vq, 1); + +close: + 
mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; +} + +static void +pci_vtcon_sock_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_sock *sock; + int i, ret; + + sock = (struct pci_vtcon_sock *)arg; + + if (sock->vss_conn_fd == -1) + return; + + for (i = 0; i < niov; i++) { + ret = stream_write(sock->vss_conn_fd, iov[i].iov_base, + iov[i].iov_len); + if (ret <= 0) + break; + } + + if (ret <= 0) { + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; + } +} + +static void +pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *tmp; + struct pci_vtcon_control resp, *ctrl; + int i; + + assert(niov == 1); + + sc = port->vsp_sc; + ctrl = (struct pci_vtcon_control *)iov->iov_base; + + switch (ctrl->event) { + case VTCON_DEVICE_READY: + sc->vsc_ready = true; + /* set port ready events for registered ports */ + for (i = 0; i < VTCON_MAXPORTS; i++) { + tmp = &sc->vsc_ports[i]; + if (tmp->vsp_enabled) + pci_vtcon_announce_port(tmp); + + if (tmp->vsp_open) + pci_vtcon_open_port(tmp, true); + } + break; + + case VTCON_PORT_READY: + if (ctrl->id >= sc->vsc_nports) { + WPRINTF(("VTCON_PORT_READY event for unknown port %d", + ctrl->id)); + return; + } + + tmp = &sc->vsc_ports[ctrl->id]; + if (tmp->vsp_console) { + resp.event = VTCON_CONSOLE_PORT; + resp.id = ctrl->id; + resp.value = 1; + pci_vtcon_control_send(sc, &resp, NULL, 0); + } + break; + } +} + +static void +pci_vtcon_announce_port(struct pci_vtcon_port *port) +{ + struct pci_vtcon_control event; + + event.id = port->vsp_id; + event.event = VTCON_DEVICE_ADD; + event.value = 1; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); + + event.event = VTCON_PORT_NAME; + pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name, + strlen(port->vsp_name)); +} + +static void 
+pci_vtcon_open_port(struct pci_vtcon_port *port, bool open) +{ + struct pci_vtcon_control event; + + if (!port->vsp_sc->vsc_ready) { + port->vsp_open = true; + return; + } + + event.id = port->vsp_id; + event.event = VTCON_PORT_OPEN; + event.value = (int)open; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); +} + +static void +pci_vtcon_control_send(struct pci_vtcon_softc *sc, + struct pci_vtcon_control *ctrl, const void *payload, size_t len) +{ + struct vqueue_info *vq; + struct iovec iov; + uint16_t idx; + int n; + + vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true); + + if (!vq_has_descs(vq)) + return; + + n = vq_getchain(vq, &idx, &iov, 1, NULL); + + assert(n == 1); + + memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control)); + if (payload != NULL && len > 0) + memcpy(iov.iov_base + sizeof(struct pci_vtcon_control), + payload, len); + + vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len); + vq_endchains(vq, 1); +} + + +static void +pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + struct iovec iov[1]; + uint16_t idx, n; + uint16_t flags[8]; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, 1, flags); + assert(n >= 1); + if (port != NULL) + port->vsp_cb(port, port->vsp_arg, iov, 1); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, 0); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. 
*/ +} + +static void +pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + if (!port->vsp_rx_ready) { + port->vsp_rx_ready = 1; + vq_kick_disable(vq); + } +} + +static int +pci_vtcon_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + struct pci_vtcon_softc *sc; + char *portname = NULL; + char *portpath = NULL; + char *opt; + int i; + + sc = calloc(1, sizeof(struct pci_vtcon_softc)); + sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config)); + sc->vsc_config->max_nr_ports = VTCON_MAXPORTS; + sc->vsc_config->cols = 80; + sc->vsc_config->rows = 25; + + vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + for (i = 0; i < VTCON_MAXQ; i++) { + sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ; + sc->vsc_queues[i].vq_notify = i % 2 == 0 + ? pci_vtcon_notify_rx + : pci_vtcon_notify_tx; + } + + /* initialize config space */ + vi_devemu_init(pi, VIRTIO_TYPE_CONSOLE); + + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_res(&sc->vsc_vs, 0); + + /* create control port */ + sc->vsc_control_port.vsp_sc = sc; + sc->vsc_control_port.vsp_txq = 2; + sc->vsc_control_port.vsp_rxq = 3; + sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx; + sc->vsc_control_port.vsp_enabled = true; + + while ((opt = strsep(&opts, ",")) != NULL) { + portname = strsep(&opt, "="); + portpath = opt; + + /* create port */ + if (pci_vtcon_sock_add(sc, portname, portpath) < 0) { + EPRINTLN("cannot create port %s: %s", + portname, strerror(errno)); + return (1); + } + } + + return (0); +} + +struct mmio_devemu pci_de_vcon = { + .de_emu = "virtio-console", + .de_init = pci_vtcon_init, + .de_write = vi_mmio_write, + .de_read = vi_mmio_read +}; +MMIO_EMUL_SET(pci_de_vcon); Index: usr.sbin/bhyve/mmio/mmio_virtio_net.c =================================================================== --- 
/dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio_net.c @@ -0,0 +1,697 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" + +#include "mevent.h" +#include "net_utils.h" +#include "net_backends.h" +#include "iov.h" + +#define VTNET_RINGSZ 1024 + +#define VTNET_MAXSEGS 256 + +#define VTNET_MAX_PKT_LEN (65536 + 64) + +#define VTNET_S_HOSTCAPS \ + ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \ + VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) + +/* + * PCI config-space "registers" + */ +struct virtio_net_config { + uint8_t mac[6]; + uint16_t status; +} __packed; + +/* + * Queue definitions. + */ +#define VTNET_RXQ 0 +#define VTNET_TXQ 1 +#define VTNET_CTLQ 2 /* NB: not yet supported */ + +#define VTNET_MAXQ 3 + +/* + * Debug printf + */ +static int pci_vtnet_debug; +#define DPRINTF(params) if (pci_vtnet_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params + +/* + * Per-device softc + */ +struct pci_vtnet_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; + pthread_mutex_t vsc_mtx; + + net_backend_t *vsc_be; + + int resetting; /* protected by tx_mtx */ + + uint64_t vsc_features; /* negotiated features */ + + pthread_mutex_t rx_mtx; + int rx_merge; /* merged rx bufs in use */ + + pthread_t tx_tid; + pthread_mutex_t tx_mtx; + pthread_cond_t tx_cond; + int tx_in_progress; + + size_t vhdrlen; + size_t be_vhdrlen; + + struct virtio_net_config vsc_config; + struct virtio_consts vsc_consts; +}; + +static void pci_vtnet_reset(void *); +/* static void pci_vtnet_notify(void *, struct vqueue_info *); */ +static int pci_vtnet_cfgread(void *, int, int, uint32_t *); +static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); +static void pci_vtnet_neg_features(void *, uint64_t); + 
/*
 * Strip `hlen' header bytes from the front of an iovec array.
 *
 * The header has to fit entirely inside iov[0].  iov[0] is shortened in
 * place; when it becomes empty, *iovcnt is decremented and the returned
 * array starts at iov[1] instead.
 *
 * Returns the first iovec holding payload space, or NULL when iov[0] is
 * shorter than the header or when nothing but the header fits.
 */
static __inline struct iovec *
iov_trim_hdr(struct iovec *iov, int *iovcnt, unsigned int hlen)
{
	struct iovec *first = &iov[0];

	/* The whole header must live in the first fragment. */
	if (first->iov_len < hlen)
		return (NULL);

	first->iov_len -= hlen;
	if (first->iov_len != 0) {
		/* Payload space remains here: step over the header. */
		first->iov_base = (char *)first->iov_base + hlen;
		return (first);
	}

	/* The first fragment held only the header: drop it. */
	*iovcnt -= 1;
	if (*iovcnt == 0)
		return (NULL);

	return (&iov[1]);
}
*/ + vq_kick_disable(vq); + continue; + } + assert(n >= 1 && riov_len + n <= VTNET_MAXSEGS); + riov_len += n; + if (!sc->rx_merge) { + n_chains = 1; + break; + } + info[n_chains].len = (uint32_t)count_iov(riov, n); + riov_bytes += info[n_chains].len; + riov += n; + n_chains++; + } while (riov_bytes < plen && riov_len < VTNET_MAXSEGS); + + riov = iov; + hdr = riov[0].iov_base; + if (prepend_hdr_len > 0) { + /* + * The frontend uses a virtio-net header, but the + * backend does not. We need to prepend a zeroed + * header. + */ + riov = iov_trim_hdr(riov, &riov_len, prepend_hdr_len); + if (riov == NULL) { + /* + * The first collected chain is nonsensical, + * as it is not even enough to store the + * virtio-net header. Just drop it. + */ + vq_relchain(vq, info[0].idx, 0); + vq_retchains(vq, n_chains - 1); + continue; + } + memset(hdr, 0, prepend_hdr_len); + } + + rlen = netbe_recv(sc->vsc_be, riov, riov_len); + + if (rlen != plen - prepend_hdr_len) { + /* + * No more packets (len == 0), or backend errored + * (err < 0). Return unused available buffers + * and stop. + */ + vq_retchains(vq, n_chains); + /* Interrupt if needed/appropriate and stop. */ + vq_endchains(vq, /*used_all_avail=*/0); + return; + } + + ulen = (uint32_t)plen; /* avoid too many casts below */ + + /* Publish the used buffers to the guest. */ + if (!sc->rx_merge) { + vq_relchain(vq, info[0].idx, ulen); + } else { + uint32_t iolen; + int i = 0; + + do { + iolen = info[i].len; + if (iolen > ulen) { + iolen = ulen; + } + vq_relchain(vq, info[i].idx, iolen); + ulen -= iolen; + i++; + } while (ulen > 0); + + hdr->vrh_bufs = i; + // TODO add publish for arm64 + //vq_relchain_publish(vq); + vq_retchains(vq, n_chains - i); + } + } + +} + +/* + * Called when there is read activity on the backend file descriptor. + * Each buffer posted by the guest is assumed to be able to contain + * an entire ethernet frame + rx header. 
+ */ +static void +pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) +{ + struct pci_vtnet_softc *sc = param; + + pthread_mutex_lock(&sc->rx_mtx); + pci_vtnet_rx(sc); + pthread_mutex_unlock(&sc->rx_mtx); + +} + +/* Called on RX kick. */ +static void +pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * A qnotify means that the rx process can now begin. + */ + pthread_mutex_lock(&sc->rx_mtx); + vq_kick_disable(vq); + netbe_rx_enable(sc->vsc_be); + pthread_mutex_unlock(&sc->rx_mtx); +} + +/* TX virtqueue processing, called by the TX thread. */ +static void +pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) +{ + struct iovec iov[VTNET_MAXSEGS + 1]; + struct iovec *siov = iov; + uint16_t idx; + ssize_t len; + int n; + + /* + * Obtain chain of descriptors. The first descriptor also + * contains the virtio-net header. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + + if (sc->vhdrlen != sc->be_vhdrlen) { + /* + * The frontend uses a virtio-net header, but the backend + * does not. We simply strip the header and ignore it, as + * it should be zero-filled. + */ + siov = iov_trim_hdr(siov, &n, sc->vhdrlen); + } + + if (siov == NULL) { + /* The chain is nonsensical. Just drop it. */ + len = 0; + } else { + len = netbe_send(sc->vsc_be, siov, n); + if (len < 0) { + /* + * If send failed, report that 0 bytes + * were read. + */ + len = 0; + } + } + + /* + * Return the processed chain to the guest, reporting + * the number of bytes that we read. + */ + vq_relchain(vq, idx, len > 0 ? len : 0); +} + +/* Called on TX kick. */ +static void +pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtnet_softc *sc = vsc; + + /* + * Any ring entries to process? 
+ */ + if (!vq_has_descs(vq)) + return; + + /* Signal the tx thread for processing */ + pthread_mutex_lock(&sc->tx_mtx); + vq_kick_disable(vq); + if (sc->tx_in_progress == 0) + pthread_cond_signal(&sc->tx_cond); + pthread_mutex_unlock(&sc->tx_mtx); +} + +/* + * Thread which will handle processing of TX desc + */ +static void * +pci_vtnet_tx_thread(void *param) +{ + struct pci_vtnet_softc *sc = param; + struct vqueue_info *vq; + int error; + + vq = &sc->vsc_queues[VTNET_TXQ]; + + /* + * Let us wait till the tx queue pointers get initialised & + * first tx signaled + */ + pthread_mutex_lock(&sc->tx_mtx); + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + + for (;;) { + /* note - tx mutex is locked here */ + while (sc->resetting || !vq_has_descs(vq)) { + vq_kick_enable(vq); + if (!sc->resetting && vq_has_descs(vq)) + break; + + sc->tx_in_progress = 0; + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + } + vq_kick_disable(vq); + sc->tx_in_progress = 1; + pthread_mutex_unlock(&sc->tx_mtx); + + do { + /* + * Run through entries, placing them into + * iovecs and sending when an end-of-packet + * is found + */ + pci_vtnet_proctx(sc, vq); + } while (vq_has_descs(vq)); + + /* + * Generate an interrupt if needed. + */ + vq_endchains(vq, /*used_all_avail=*/1); + + pthread_mutex_lock(&sc->tx_mtx); + } +} + +#ifdef notyet +static void +pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) +{ + + DPRINTF(("vtnet: control qnotify!")); +} +#endif + +static int +pci_vtnet_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + struct pci_vtnet_softc *sc; + char tname[MAXCOMLEN + 1]; + int mac_provided; + + /* + * Allocate data structures for further virtio initializations. + * sc also contains a copy of vtnet_vi_consts, since capabilities + * change depending on the backend. 
+ */ + sc = calloc(1, sizeof(struct pci_vtnet_softc)); + + sc->vsc_consts = vtnet_vi_consts; + pthread_mutex_init(&sc->vsc_mtx, NULL); + + sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; + sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; +#ifdef notyet + sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; +#endif + + /* + * Attempt to open the backend device and read the MAC address + * if specified. + */ + mac_provided = 0; + if (opts != NULL) { + char *devname; + char *vtopts; + int err; + + devname = vtopts = strdup(opts); + (void) strsep(&vtopts, ","); + + if (vtopts != NULL) { + err = net_parsemac(vtopts, sc->vsc_config.mac); + if (err != 0) { + free(devname); + free(sc); + return (err); + } + mac_provided = 1; + } + + err = netbe_init(&sc->vsc_be, devname, pci_vtnet_rx_callback, + sc); + free(devname); + if (err) { + free(sc); + return (err); + } + sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be); + } + + if (!mac_provided) { + net_genmac(pi, sc->vsc_config.mac); + } + + /* initialize config space */ + vi_devemu_init(pi, VIRTIO_TYPE_NET); + + /* Link is up if we managed to open backend device. */ + sc->vsc_config.status = (opts == NULL || sc->vsc_be); + + vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) { + free(sc); + return (1); + } + + /* use BAR 0 to map config regs in IO space */ + vi_set_io_res(&sc->vsc_vs, 0); + + sc->resetting = 0; + + sc->rx_merge = 0; + pthread_mutex_init(&sc->rx_mtx, NULL); + + /* + * Initialize tx semaphore & spawn TX processing thread. + * As of now, only one thread for TX desc processing is + * spawned. 
+ */ + sc->tx_in_progress = 0; + pthread_mutex_init(&sc->tx_mtx, NULL); + pthread_cond_init(&sc->tx_cond, NULL); + pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); + snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, + pi->di_func); + pthread_set_name_np(sc->tx_tid, tname); + + return (0); +} + +static int +pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + if (offset < (int)sizeof(sc->vsc_config.mac)) { + assert(offset + size <= (int)sizeof(sc->vsc_config.mac)); + /* + * The driver is allowed to change the MAC address + */ + ptr = &sc->vsc_config.mac[offset]; + memcpy(ptr, &value, size); + } else { + /* silently ignore other writes */ + DPRINTF(("vtnet: write to readonly reg %d", offset)); + } + + return (0); +} + +static int +pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtnet_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static void +pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtnet_softc *sc = vsc; + + sc->vsc_features = negotiated_features; + + if (negotiated_features & VIRTIO_NET_F_MRG_RXBUF) { + sc->vhdrlen = sizeof(struct virtio_net_rxhdr); + sc->rx_merge = 1; + } else { + /* + * Without mergeable rx buffers, virtio-net header is 2 + * bytes shorter than sizeof(struct virtio_net_rxhdr). + */ + sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; + sc->rx_merge = 0; + } + + /* Tell the backend to enable some capabilities it has advertised. 
*/ + netbe_set_cap(sc->vsc_be, negotiated_features, sc->vhdrlen); + sc->be_vhdrlen = netbe_get_vnet_hdr_len(sc->vsc_be); +} + +static struct mmio_devemu pci_de_vnet = { + .de_emu = "virtio-net", + .de_init = pci_vtnet_init, + .de_write = vi_mmio_write, + .de_read = vi_mmio_read +}; +MMIO_EMUL_SET(pci_de_vnet); Index: usr.sbin/bhyve/mmio/mmio_virtio_rnd.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio_rnd.c @@ -0,0 +1,208 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * virtio entropy device emulation. + * Randomness is sourced from /dev/random which does not block + * once it has been seeded at bootup. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" + +#define VTRND_RINGSZ 64 + + +static int pci_vtrnd_debug; +#define DPRINTF(params) if (pci_vtrnd_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params + +/* + * Per-device softc + */ +struct pci_vtrnd_softc { + struct virtio_softc vrsc_vs; + struct vqueue_info vrsc_vq; + pthread_mutex_t vrsc_mtx; + uint64_t vrsc_cfg; + int vrsc_fd; +}; + +static void pci_vtrnd_reset(void *); +static void pci_vtrnd_notify(void *, struct vqueue_info *); + +static struct virtio_consts vtrnd_vi_consts = { + "vtrnd", /* our name */ + 1, /* we support 1 virtqueue */ + 0, /* config reg size */ + pci_vtrnd_reset, /* reset */ + pci_vtrnd_notify, /* device-wide qnotify */ + NULL, /* read virtio config */ + NULL, /* write virtio config */ + NULL, /* apply negotiated features */ + 0, /* our capabilities */ +}; + + +static void +pci_vtrnd_reset(void *vsc) +{ + struct pci_vtrnd_softc *sc; + + sc = vsc; + + DPRINTF(("vtrnd: device reset requested !")); + vi_reset_dev(&sc->vrsc_vs); +} + + +static void +pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) +{ + struct iovec iov; + struct pci_vtrnd_softc *sc; + int len; + uint16_t idx; + + sc = vsc; + + if (sc->vrsc_fd < 0) { + vq_endchains(vq, 0); + return; + } + + while (vq_has_descs(vq)) { + vq_getchain(vq, &idx, &iov, 1, NULL); + + len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); + + DPRINTF(("vtrnd: vtrnd_notify(): %d", len)); + + /* Catastrophe if unable to read from /dev/random */ + assert(len > 0); + + /* + * Release 
this chain and handle more + */ + vq_relchain(vq, idx, len); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + + +static int +pci_vtrnd_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + struct pci_vtrnd_softc *sc; + int fd; + int len; + uint8_t v; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + /* + * Should always be able to open /dev/random. + */ + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + + assert(fd >= 0); + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_READ); + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Check that device is seeded and non-blocking. + */ + len = read(fd, &v, sizeof(v)); + if (len <= 0) { + WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); + close(fd); + return (1); + } + + sc = calloc(1, sizeof(struct pci_vtrnd_softc)); + + vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); + sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; + + sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; + + /* keep /dev/random opened while emulating */ + sc->vrsc_fd = fd; + + /* initialize config space */ + vi_devemu_init(pi, VIRTIO_TYPE_ENTROPY); + + if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_res(&sc->vrsc_vs, 0); + + return (0); +} + + +struct mmio_devemu pci_de_vrnd = { + .de_emu = "virtio-rnd", + .de_init = pci_vtrnd_init, + .de_write = vi_mmio_write, + .de_read = vi_mmio_read +}; +MMIO_EMUL_SET(pci_de_vrnd); Index: usr.sbin/bhyve/mmio/mmio_virtio_scsi.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/mmio_virtio_scsi.c @@ -0,0 +1,741 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama . + * Copyright (c) 2018 Marcelo Araujo . + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" +#include "iov.h" + +#include "mmio_emul.h" +#include "mmio_virtio.h" + +#define VTSCSI_RINGSZ 64 +#define VTSCSI_REQUESTQ 1 +#define VTSCSI_THR_PER_Q 16 +#define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) +#define VTSCSI_MAXSEG 64 + +#define VTSCSI_IN_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) + +#define VTSCSI_OUT_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) + +#define VIRTIO_SCSI_MAX_CHANNEL 0 +#define VIRTIO_SCSI_MAX_TARGET 0 +#define VIRTIO_SCSI_MAX_LUN 16383 + +#define VIRTIO_SCSI_F_INOUT (1 << 0) +#define VIRTIO_SCSI_F_HOTPLUG (1 << 1) +#define VIRTIO_SCSI_F_CHANGE (1 << 2) + +static int pci_vtscsi_debug = 0; +#define DPRINTF(params) if (pci_vtscsi_debug) PRINTLN params +#define WPRINTF(params) PRINTLN params + +struct pci_vtscsi_config { + uint32_t num_queues; + uint32_t seg_max; + uint32_t max_sectors; + uint32_t cmd_per_lun; + uint32_t event_info_size; + uint32_t sense_size; + uint32_t cdb_size; + uint16_t max_channel; + uint16_t max_target; + uint32_t max_lun; +} __attribute__((packed)); + +struct pci_vtscsi_queue { + struct pci_vtscsi_softc * vsq_sc; + struct vqueue_info * vsq_vq; + pthread_mutex_t vsq_mtx; + pthread_mutex_t vsq_qmtx; + pthread_cond_t vsq_cv; + STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; + LIST_HEAD(, pci_vtscsi_worker) vsq_workers; +}; + +struct pci_vtscsi_worker { + struct pci_vtscsi_queue * vsw_queue; + pthread_t vsw_thread; + bool vsw_exiting; + LIST_ENTRY(pci_vtscsi_worker) vsw_link; +}; + +struct pci_vtscsi_request { + struct pci_vtscsi_queue * vsr_queue; + struct iovec vsr_iov_in[VTSCSI_MAXSEG]; + int 
vsr_niov_in; + struct iovec vsr_iov_out[VTSCSI_MAXSEG]; + int vsr_niov_out; + uint32_t vsr_idx; + STAILQ_ENTRY(pci_vtscsi_request) vsr_link; +}; + +/* + * Per-device softc + */ +struct pci_vtscsi_softc { + struct virtio_softc vss_vs; + struct vqueue_info vss_vq[VTSCSI_MAXQ]; + struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; + pthread_mutex_t vss_mtx; + int vss_iid; + int vss_ctl_fd; + uint32_t vss_features; + struct pci_vtscsi_config vss_config; +}; + +#define VIRTIO_SCSI_T_TMF 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 +#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 +#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 +#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 +#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 + +/* command-specific response values */ +#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 +#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 +#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 + +struct pci_vtscsi_ctrl_tmf { + uint32_t type; + uint32_t subtype; + uint8_t lun[8]; + uint64_t id; + uint8_t response; +} __attribute__((packed)); + +#define VIRTIO_SCSI_T_AN_QUERY 1 +#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 +#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 +#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 +#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 +#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 +#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 + +struct pci_vtscsi_ctrl_an { + uint32_t type; + uint8_t lun[8]; + uint32_t event_requested; + uint32_t event_actual; + uint8_t response; +} __attribute__((packed)); + +/* command-specific response values */ +#define VIRTIO_SCSI_S_OK 0 +#define VIRTIO_SCSI_S_OVERRUN 1 +#define VIRTIO_SCSI_S_ABORTED 2 +#define VIRTIO_SCSI_S_BAD_TARGET 3 +#define VIRTIO_SCSI_S_RESET 4 +#define VIRTIO_SCSI_S_BUSY 5 +#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 +#define VIRTIO_SCSI_S_TARGET_FAILURE 7 +#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 
+#define VIRTIO_SCSI_S_FAILURE 9 +#define VIRTIO_SCSI_S_INCORRECT_LUN 12 + +/* task_attr */ +#define VIRTIO_SCSI_S_SIMPLE 0 +#define VIRTIO_SCSI_S_ORDERED 1 +#define VIRTIO_SCSI_S_HEAD 2 +#define VIRTIO_SCSI_S_ACA 3 + +struct pci_vtscsi_event { + uint32_t event; + uint8_t lun[8]; + uint32_t reason; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_rd { + uint8_t lun[8]; + uint64_t id; + uint8_t task_attr; + uint8_t prio; + uint8_t crn; + uint8_t cdb[]; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_wr { + uint32_t sense_len; + uint32_t residual; + uint16_t status_qualifier; + uint8_t status; + uint8_t response; + uint8_t sense[]; +} __attribute__((packed)); + +static void *pci_vtscsi_proc(void *); +static void pci_vtscsi_reset(void *); +static void pci_vtscsi_neg_features(void *, uint64_t); +static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); +static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); +static inline int pci_vtscsi_get_lun(uint8_t *); +static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); +static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_tmf *); +static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_an *); +static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, + int, struct iovec *, int); +static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); +static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, + struct pci_vtscsi_queue *, int); +static int pci_vtscsi_init(struct vmctx *, struct mmio_devinst *, char *); + +static struct virtio_consts vtscsi_vi_consts = { + "vtscsi", /* our name */ + VTSCSI_MAXQ, /* we support 2+n virtqueues */ + sizeof(struct pci_vtscsi_config), /* config reg size */ + pci_vtscsi_reset, /* reset */ + NULL, /* device-wide qnotify 
*/ + pci_vtscsi_cfgread, /* read virtio config */ + pci_vtscsi_cfgwrite, /* write virtio config */ + pci_vtscsi_neg_features, /* apply negotiated features */ + 0, /* our capabilities */ +}; + +static void * +pci_vtscsi_proc(void *arg) +{ + struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; + struct pci_vtscsi_queue *q = worker->vsw_queue; + struct pci_vtscsi_request *req; + int iolen; + + for (;;) { + pthread_mutex_lock(&q->vsq_mtx); + + while (STAILQ_EMPTY(&q->vsq_requests) + && !worker->vsw_exiting) + pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); + + if (worker->vsw_exiting) + break; + + req = STAILQ_FIRST(&q->vsq_requests); + STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); + + pthread_mutex_unlock(&q->vsq_mtx); + iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, + req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); + + pthread_mutex_lock(&q->vsq_qmtx); + vq_relchain(q->vsq_vq, req->vsr_idx, iolen); + vq_endchains(q->vsq_vq, 0); + pthread_mutex_unlock(&q->vsq_qmtx); + + DPRINTF(("virtio-scsi: request completed", + req->vsr_idx)); + free(req); + } + + pthread_mutex_unlock(&q->vsq_mtx); + return (NULL); +} + +static void +pci_vtscsi_reset(void *vsc) +{ + struct pci_vtscsi_softc *sc; + + sc = vsc; + + DPRINTF(("vtscsi: device reset requested")); + vi_reset_dev(&sc->vss_vs); + + /* initialize config structure */ + sc->vss_config = (struct pci_vtscsi_config){ + .num_queues = VTSCSI_REQUESTQ, + /* Leave room for the request and the response. 
*/ + .seg_max = VTSCSI_MAXSEG - 2, + .max_sectors = 2, + .cmd_per_lun = 1, + .event_info_size = sizeof(struct pci_vtscsi_event), + .sense_size = 96, + .cdb_size = 32, + .max_channel = VIRTIO_SCSI_MAX_CHANNEL, + .max_target = VIRTIO_SCSI_MAX_TARGET, + .max_lun = VIRTIO_SCSI_MAX_LUN + }; +} + +static void +pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtscsi_softc *sc = vsc; + + sc->vss_features = negotiated_features; +} + +static int +pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtscsi_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vss_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline int +pci_vtscsi_get_lun(uint8_t *lun) +{ + + return (((lun[2] << 8) | lun[3]) & 0x3fff); +} + +static int +pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, + size_t bufsize) +{ + struct pci_vtscsi_ctrl_tmf *tmf; + struct pci_vtscsi_ctrl_an *an; + uint32_t type; + + type = *(uint32_t *)buf; + + if (type == VIRTIO_SCSI_T_TMF) { + tmf = (struct pci_vtscsi_ctrl_tmf *)buf; + return (pci_vtscsi_tmf_handle(sc, tmf)); + } + + if (type == VIRTIO_SCSI_T_AN_QUERY) { + an = (struct pci_vtscsi_ctrl_an *)buf; + return (pci_vtscsi_an_handle(sc, an)); + } + + return (0); +} + +static int +pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_tmf *tmf) +{ + union ctl_io *io; + int err; + + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.io_type = CTL_IO_TASK; + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); + io->taskio.tag_type = CTL_TAG_SIMPLE; + io->taskio.tag_num = (uint32_t)tmf->id; + + switch (tmf->subtype) { + case VIRTIO_SCSI_T_TMF_ABORT_TASK: + io->taskio.task_action = CTL_TASK_ABORT_TASK; + break; + + case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: + 
io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_ACA: + io->taskio.task_action = CTL_TASK_CLEAR_ACA; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: + io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: + io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; + break; + + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + io->taskio.task_action = CTL_TASK_LUN_RESET; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK: + io->taskio.task_action = CTL_TASK_QUERY_TASK; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: + io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; + break; + } + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) + WPRINTF(("CTL_IO: err=%d (%s)", errno, strerror(errno))); + + tmf->response = io->taskio.task_status; + ctl_scsi_free_io(io); + return (1); +} + +static int +pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_an *an) +{ + + return (0); +} + +static int +pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, + int niov_in, struct iovec *iov_out, int niov_out) +{ + struct pci_vtscsi_softc *sc = q->vsq_sc; + struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL; + struct pci_vtscsi_req_cmd_wr *cmd_wr; + struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG]; + union ctl_io *io; + int data_niov_in, data_niov_out; + void *ext_data_ptr = NULL; + uint32_t ext_data_len = 0, ext_sg_entries = 0; + int err, nxferred; + + seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in, + VTSCSI_IN_HEADER_LEN(sc)); + seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out, + VTSCSI_OUT_HEADER_LEN(sc)); + + truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc)); + truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc)); + 
iov_to_buf(iov_in, niov_in, (void **)&cmd_rd); + + cmd_wr = malloc(VTSCSI_OUT_HEADER_LEN(sc)); + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun); + + io->io_hdr.io_type = CTL_IO_SCSI; + + if (data_niov_in > 0) { + ext_data_ptr = (void *)data_iov_in; + ext_sg_entries = data_niov_in; + ext_data_len = count_iov(data_iov_in, data_niov_in); + io->io_hdr.flags |= CTL_FLAG_DATA_OUT; + } else if (data_niov_out > 0) { + ext_data_ptr = (void *)data_iov_out; + ext_sg_entries = data_niov_out; + ext_data_len = count_iov(data_iov_out, data_niov_out); + io->io_hdr.flags |= CTL_FLAG_DATA_IN; + } + + io->scsiio.sense_len = sc->vss_config.sense_size; + io->scsiio.tag_num = (uint32_t)cmd_rd->id; + switch (cmd_rd->task_attr) { + case VIRTIO_SCSI_S_ORDERED: + io->scsiio.tag_type = CTL_TAG_ORDERED; + break; + case VIRTIO_SCSI_S_HEAD: + io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE; + break; + case VIRTIO_SCSI_S_ACA: + io->scsiio.tag_type = CTL_TAG_ACA; + break; + case VIRTIO_SCSI_S_SIMPLE: + default: + io->scsiio.tag_type = CTL_TAG_SIMPLE; + break; + } + io->scsiio.ext_sg_entries = ext_sg_entries; + io->scsiio.ext_data_ptr = ext_data_ptr; + io->scsiio.ext_data_len = ext_data_len; + io->scsiio.ext_data_filled = 0; + io->scsiio.cdb_len = sc->vss_config.cdb_size; + memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size); + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) { + WPRINTF(("CTL_IO: err=%d (%s)", errno, strerror(errno))); + cmd_wr->response = VIRTIO_SCSI_S_FAILURE; + } else { + cmd_wr->sense_len = MIN(io->scsiio.sense_len, + sc->vss_config.sense_size); + cmd_wr->residual = io->scsiio.residual; + cmd_wr->status = io->scsiio.scsi_status; + cmd_wr->response = VIRTIO_SCSI_S_OK; 
+ memcpy(&cmd_wr->sense, &io->scsiio.sense_data, + cmd_wr->sense_len); + } + + buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0); + nxferred = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled; + free(cmd_rd); + free(cmd_wr); + ctl_scsi_free_io(io); + return (nxferred); +} + +static void +pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t idx, n; + void *buf = NULL; + size_t bufsize; + int iolen; + + sc = vsc; + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, NULL); + bufsize = iov_to_buf(iov, n, &buf); + iolen = pci_vtscsi_control_handle(sc, buf, bufsize); + buf_to_iov(buf + bufsize - iolen, iolen, iov, n, + bufsize - iolen); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, iolen); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ + free(buf); +} + +static void +pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq) +{ + + vq_kick_disable(vq); +} + +static void +pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct pci_vtscsi_queue *q; + struct pci_vtscsi_request *req; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t flags[VTSCSI_MAXSEG]; + uint16_t idx, n, i; + int readable; + + sc = vsc; + q = &sc->vss_queues[vq->vq_num - 2]; + + while (vq_has_descs(vq)) { + readable = 0; + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, flags); + + /* Count readable descriptors */ + for (i = 0; i < n; i++) { + if (flags[i] & VRING_DESC_F_WRITE) + break; + + readable++; + } + + req = calloc(1, sizeof(struct pci_vtscsi_request)); + req->vsr_idx = idx; + req->vsr_queue = q; + req->vsr_niov_in = readable; + req->vsr_niov_out = n - readable; + memcpy(req->vsr_iov_in, iov, + req->vsr_niov_in * sizeof(struct iovec)); + memcpy(req->vsr_iov_out, iov + readable, + req->vsr_niov_out * sizeof(struct iovec)); + + pthread_mutex_lock(&q->vsq_mtx); + 
STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link);
+		pthread_cond_signal(&q->vsq_cv);
+		pthread_mutex_unlock(&q->vsq_mtx);
+
+		/* Log which descriptor chain was handed to the workers. */
+		DPRINTF(("virtio-scsi: request <idx=%d> enqueued", idx));
+	}
+}
+
+/*
+ * Prepare request queue `num` of controller `sc`.
+ *
+ * The queue is bound to virtqueue `num + 2` of the controller, its mutexes,
+ * condition variable and lists are initialized, and VTSCSI_THR_PER_Q worker
+ * threads running pci_vtscsi_proc() are started and linked into vsq_workers.
+ *
+ * Returns 0 on success.  Returns ENOMEM when a worker cannot be allocated,
+ * or the pthread_create() error number when a worker thread cannot be
+ * started; workers created before the failure are left running.
+ */
+static int
+pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc,
+    struct pci_vtscsi_queue *queue, int num)
+{
+	struct pci_vtscsi_worker *worker;
+	char tname[MAXCOMLEN + 1];
+	int error, i;
+
+	queue->vsq_sc = sc;
+	queue->vsq_vq = &sc->vss_vq[num + 2];
+
+	pthread_mutex_init(&queue->vsq_mtx, NULL);
+	pthread_mutex_init(&queue->vsq_qmtx, NULL);
+	pthread_cond_init(&queue->vsq_cv, NULL);
+	STAILQ_INIT(&queue->vsq_requests);
+	LIST_INIT(&queue->vsq_workers);
+
+	for (i = 0; i < VTSCSI_THR_PER_Q; i++) {
+		worker = calloc(1, sizeof(struct pci_vtscsi_worker));
+		if (worker == NULL) {
+			WPRINTF(("cannot allocate worker %d-%d", num, i));
+			return (ENOMEM);
+		}
+		worker->vsw_queue = queue;
+
+		/* pthread_create() reports failure through its return value. */
+		error = pthread_create(&worker->vsw_thread, NULL,
+		    &pci_vtscsi_proc, (void *)worker);
+		if (error != 0) {
+			WPRINTF(("cannot start worker %d-%d: %s", num, i,
+			    strerror(error)));
+			free(worker);
+			return (error);
+		}
+
+		snprintf(tname, sizeof(tname), "vtscsi:%d-%d", num, i);
+		pthread_set_name_np(worker->vsw_thread, tname);
+		LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link);
+	}
+
+	return (0);
+}
+
+static int +pci_vtscsi_init(struct vmctx *ctx, struct mmio_devinst *pi, char *opts) +{ + struct pci_vtscsi_softc *sc; + char *opt, *optname; + const char *devname; + int i, optidx = 0; + + sc = calloc(1, sizeof(struct pci_vtscsi_softc)); + devname = "/dev/cam/ctl"; + while ((opt = strsep(&opts, ",")) != NULL) { + optname = strsep(&opt, "="); + if (opt == NULL && optidx == 0) { + if (optname[0] != 0) + devname = optname; + } else if (strcmp(optname, "dev") == 0 && opt != NULL) { + devname = opt; + } else if (strcmp(optname, "iid") == 0 && opt != NULL) { + sc->vss_iid = strtoul(opt, NULL, 10); + } else { + EPRINTLN("Invalid option %s", optname); + free(sc); + return (1); + } + optidx++; + } + + sc->vss_ctl_fd = open(devname, O_RDWR); + if (sc->vss_ctl_fd < 0) { + WPRINTF(("cannot open %s: %s", devname, strerror(errno))); + free(sc); + return (1); + } + + vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq); + 
sc->vss_vs.vs_mtx = &sc->vss_mtx; + + /* controlq */ + sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; + + /* eventq */ + sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; + + /* request queues */ + for (i = 2; i < VTSCSI_MAXQ; i++) { + sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify; + pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); + } + + /* initialize config space */ + mmio_set_cfgreg16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); + mmio_set_cfgreg16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + mmio_set_cfgreg8(pi, PCIR_CLASS, PCIC_STORAGE); + mmio_set_cfgreg16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_SCSI); + mmio_set_cfgreg16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_res(&sc->vss_vs, 0); + + return (0); +} + + +struct mmio_devemu pci_de_vscsi = { + .de_emu = "virtio-scsi", + .de_init = pci_vtscsi_init, + .de_write = vi_mmio_write, + .de_read = vi_mmio_read +}; +MMIO_EMUL_SET(pci_de_vscsi); Index: usr.sbin/bhyve/mmio/net_utils.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/net_utils.h @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2019 Vincenzo Maffione + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_UTILS_H_ +#define _NET_UTILS_H_ + +#include +#include "mmio_emul.h" + +void net_genmac(struct mmio_devinst *pi, uint8_t *macaddr); +int net_parsemac(char *mac_str, uint8_t *mac_addr); + +#endif /* _NET_UTILS_H_ */ Index: usr.sbin/bhyve/mmio/net_utils.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/mmio/net_utils.c @@ -0,0 +1,90 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include + +#include "bhyverun.h" +#include "debug.h" +#include "net_utils.h" + +int +net_parsemac(char *mac_str, uint8_t *mac_addr) +{ + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); + + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + EPRINTLN("Invalid MAC %s", mac_str); + return (EINVAL); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } + + return (0); +} + +void +net_genmac(struct mmio_devinst *pi, uint8_t *macaddr) +{ + /* + * The default MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the PCI slot/func number and dev name + */ + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->di_func, vmname); + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, (unsigned int)strlen(nstr)); + MD5Final(digest, &mdctx); + + macaddr[0] = 0x00; + macaddr[1] = 0xa0; + macaddr[2] = 
0x98; + macaddr[3] = digest[0]; + macaddr[4] = digest[1]; + macaddr[5] = digest[2]; +} Index: usr.sbin/bhyve/pci_ahci.c =================================================================== --- usr.sbin/bhyve/pci_ahci.c +++ usr.sbin/bhyve/pci_ahci.c @@ -57,10 +57,10 @@ #include #include -#include "bhyverun.h" -#include "pci_emul.h" #include "ahci.h" #include "block_if.h" +#include "bhyverun.h" +#include "pci_emul.h" #define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define MAX_PORTS 32 /* AHCI supports 32 ports */ Index: usr.sbin/bhyve/pci_e82545.c =================================================================== --- usr.sbin/bhyve/pci_e82545.c +++ usr.sbin/bhyve/pci_e82545.c @@ -642,7 +642,7 @@ * was an asserted interrupt, clear it */ if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { - DPRINTF("icr deassert: lintr deassert %x", bits); + DPRINTF("icr deassert: lintr deassert %x\r\n", bits); pci_lintr_deassert(sc->esc_pi); sc->esc_irq_asserted = 0; } @@ -2139,7 +2139,7 @@ struct e82545_softc *sc; uint64_t retval; - //DPRINTF("Read bar:%d offset:0x%lx size:%d", baridx, offset, size); + //DPRINTF("Read bar:%d offset:0x%lx size:%d\r\n", baridx, offset, size); sc = pi->pi_arg; retval = 0; Index: usr.sbin/bhyve/pci_emul.h =================================================================== --- usr.sbin/bhyve/pci_emul.h +++ usr.sbin/bhyve/pci_emul.h @@ -28,8 +28,8 @@ * $FreeBSD$ */ -#ifndef _PCI_EMUL_H_ -#define _PCI_EMUL_H_ +#ifndef _EMUL_H_ +#define _EMUL_H_ #include #include @@ -298,4 +298,4 @@ return (*(uint32_t *)(pi->pi_cfgdata + offset)); } -#endif /* _PCI_EMUL_H_ */ +#endif /* _EMUL_H_ */ Index: usr.sbin/bhyve/pci_fbuf.c =================================================================== --- usr.sbin/bhyve/pci_fbuf.c +++ usr.sbin/bhyve/pci_fbuf.c @@ -85,7 +85,7 @@ #define ROWS_MIN 480 struct pci_fbuf_softc { - struct pci_devinst *fsc_pi; + struct pci_devinst *fsc_di; struct { uint32_t fbsize; uint16_t width; @@ -125,7 +125,7 @@ } static void 
-pci_fbuf_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, +pci_fbuf_write(struct vmctx *ctx, int vcpu, struct pci_devinst *di, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_fbuf_softc *sc; @@ -133,7 +133,7 @@ assert(baridx == 0); - sc = pi->pi_arg; + sc = di->pi_arg; DPRINTF(DEBUG_VERBOSE, ("fbuf wr: offset 0x%lx, size: %d, value: 0x%lx", @@ -179,7 +179,7 @@ } uint64_t -pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, +pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *di, int baridx, uint64_t offset, int size) { struct pci_fbuf_softc *sc; @@ -188,7 +188,7 @@ assert(baridx == 0); - sc = pi->pi_arg; + sc = di->pi_arg; if (offset + size > DMEMSZ) { @@ -351,7 +351,7 @@ } static int -pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *di, char *opts) { int error, prot; struct pci_fbuf_softc *sc; @@ -363,24 +363,24 @@ sc = calloc(1, sizeof(struct pci_fbuf_softc)); - pi->pi_arg = sc; + di->pi_arg = sc; /* initialize config space */ - pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB); - pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); - pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY); - pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); + pci_set_cfgdata16(di, PCIR_DEVICE, 0x40FB); + pci_set_cfgdata16(di, PCIR_VENDOR, 0xFB5D); + pci_set_cfgdata8(di, PCIR_CLASS, PCIC_DISPLAY); + pci_set_cfgdata8(di, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); - error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ); + error = pci_emul_alloc_bar(di, 0, PCIBAR_MEM32, DMEMSZ); assert(error == 0); - error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE); + error = pci_emul_alloc_bar(di, 1, PCIBAR_MEM32, FB_SIZE); assert(error == 0); - error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS); + error = pci_emul_add_msicap(di, PCI_FBUF_MSI_MSGS); assert(error == 0); - sc->fbaddr = pi->pi_bar[1].addr; + sc->fbaddr = di->pi_bar[1].addr; sc->memregs.fbsize = FB_SIZE; sc->memregs.width = 
COLS_DEFAULT; sc->memregs.height = ROWS_DEFAULT; @@ -389,7 +389,7 @@ sc->vga_enabled = 1; sc->vga_full = 0; - sc->fsc_pi = pi; + sc->fsc_di = di; error = pci_fbuf_parse_opts(sc, opts); if (error != 0) Index: usr.sbin/bhyve/pci_hostbridge.c =================================================================== --- usr.sbin/bhyve/pci_hostbridge.c +++ usr.sbin/bhyve/pci_hostbridge.c @@ -34,27 +34,27 @@ #include "pci_emul.h" static int -pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *di, char *opts) { /* config space */ - pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */ - pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */ - pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); - pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); - pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + pci_set_cfgdata16(di, PCIR_VENDOR, 0x1275); /* NetApp */ + pci_set_cfgdata16(di, PCIR_DEVICE, 0x1275); /* NetApp */ + pci_set_cfgdata8(di, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(di, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(di, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); - pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); + pci_emul_add_pciecap(di, PCIEM_TYPE_ROOT_PORT); return (0); } static int -pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *di, char *opts) { - (void) pci_hostbridge_init(ctx, pi, opts); - pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */ - pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */ + (void) pci_hostbridge_init(ctx, di, opts); + pci_set_cfgdata16(di, PCIR_VENDOR, 0x1022); /* AMD */ + pci_set_cfgdata16(di, PCIR_DEVICE, 0x7432); /* made up */ return (0); } Index: usr.sbin/bhyve/pci_lpc.c =================================================================== --- usr.sbin/bhyve/pci_lpc.c +++ usr.sbin/bhyve/pci_lpc.c @@ -236,14 +236,14 @@ } static void 
-pci_lpc_write_dsdt(struct pci_devinst *pi) +pci_lpc_write_dsdt(struct pci_devinst *di) { struct lpc_dsdt **ldpp, *ldp; dsdt_line(""); dsdt_line("Device (ISA)"); dsdt_line("{"); - dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); + dsdt_line(" Name (_ADR, 0x%04X%04X)", di->pi_slot, di->pi_func); dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); dsdt_line(" {"); @@ -356,7 +356,7 @@ LPC_DSDT(pci_lpc_uart_dsdt); static int -pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, +pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *di, int coff, int bytes, uint32_t val) { int pirq_pin; @@ -369,7 +369,7 @@ pirq_pin = coff - 0x68 + 5; if (pirq_pin != 0) { pirq_write(ctx, pirq_pin, val); - pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); + pci_set_cfgdata8(di, coff, pirq_read(pirq_pin)); return (0); } } @@ -377,13 +377,13 @@ } static void -pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, +pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *di, int baridx, uint64_t offset, int size, uint64_t value) { } static uint64_t -pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, +pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *di, int baridx, uint64_t offset, int size) { return (0); @@ -393,7 +393,7 @@ #define LPC_VENDOR 0x8086 static int -pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +pci_lpc_init(struct vmctx *ctx, struct pci_devinst *di, char *opts) { /* @@ -409,7 +409,7 @@ * simplifies the ACPI DSDT because it can provide a decode for * all legacy i/o ports behind bus 0. 
*/ - if (pi->pi_bus != 0) { + if (di->pi_bus != 0) { EPRINTLN("LPC bridge can be present only on bus 0."); return (-1); } @@ -418,12 +418,12 @@ return (-1); /* initialize config space */ - pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV); - pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR); - pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); - pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); + pci_set_cfgdata16(di, PCIR_DEVICE, LPC_DEV); + pci_set_cfgdata16(di, PCIR_VENDOR, LPC_VENDOR); + pci_set_cfgdata8(di, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(di, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); - lpc_bridge = pi; + lpc_bridge = di; return (0); } Index: usr.sbin/bhyve/pci_virtio.c =================================================================== --- usr.sbin/bhyve/pci_virtio.c +++ usr.sbin/bhyve/pci_virtio.c @@ -44,7 +44,7 @@ #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" -#include "virtio.h" +#include "pci_virtio.h" /* * Functions for dealing with generalized "virtual devices" as Index: usr.sbin/bhyve/pci_virtio_block.c =================================================================== --- usr.sbin/bhyve/pci_virtio_block.c +++ usr.sbin/bhyve/pci_virtio_block.c @@ -56,7 +56,7 @@ #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" -#include "virtio.h" +#include "pci_virtio.h" #include "block_if.h" #define VTBLK_BSIZE 512 Index: usr.sbin/bhyve/pci_virtio_console.c =================================================================== --- usr.sbin/bhyve/pci_virtio_console.c +++ usr.sbin/bhyve/pci_virtio_console.c @@ -62,7 +62,7 @@ #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" -#include "virtio.h" +#include "pci_virtio.h" #include "mevent.h" #include "sockstream.h" Index: usr.sbin/bhyve/pci_virtio_net.c =================================================================== --- usr.sbin/bhyve/pci_virtio_net.c +++ usr.sbin/bhyve/pci_virtio_net.c @@ -57,7 +57,7 @@ #include "debug.h" #include "pci_emul.h" #include "mevent.h" -#include 
"virtio.h" +#include "pci_virtio.h" #include "net_utils.h" #include "net_backends.h" #include "iov.h" Index: usr.sbin/bhyve/pci_virtio_rnd.c =================================================================== --- usr.sbin/bhyve/pci_virtio_rnd.c +++ usr.sbin/bhyve/pci_virtio_rnd.c @@ -60,7 +60,7 @@ #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" -#include "virtio.h" +#include "pci_virtio.h" #define VTRND_RINGSZ 64 Index: usr.sbin/bhyve/pci_virtio_scsi.c =================================================================== --- usr.sbin/bhyve/pci_virtio_scsi.c +++ usr.sbin/bhyve/pci_virtio_scsi.c @@ -63,7 +63,7 @@ #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" -#include "virtio.h" +#include "pci_virtio.h" #include "iov.h" #define VTSCSI_RINGSZ 64 Index: usr.sbin/bhyve/pm.c =================================================================== --- usr.sbin/bhyve/pm.c +++ usr.sbin/bhyve/pm.c @@ -40,9 +40,9 @@ #include #include "acpi.h" +#include "pci_irq.h" #include "inout.h" #include "mevent.h" -#include "pci_irq.h" #include "pci_lpc.h" static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER; Index: usr.sbin/bhyvectl/Makefile =================================================================== --- usr.sbin/bhyvectl/Makefile +++ usr.sbin/bhyvectl/Makefile @@ -5,19 +5,17 @@ .include PROG= bhyvectl -SRCS= bhyvectl.c PACKAGE= bhyve -MAN= bhyvectl.8 - LIBADD= vmmapi util WARNS?= 3 -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm +CFLAGS+= -I${SRCTOP}/sys/${MACHINE}/vmm .if ${MK_BHYVE_SNAPSHOT} != "no" CFLAGS+= -DBHYVE_SNAPSHOT .endif +.include "${.CURDIR}/${MACHINE}/Makefile.inc" .include Index: usr.sbin/bhyvectl/amd64/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyvectl/amd64/Makefile.inc @@ -0,0 +1,7 @@ +# +# $FreeBSD$ +# +.PATH: ${.CURDIR}/amd64 + +SRCS= bhyvectl.c +MAN= bhyvectl.8 Index: usr.sbin/bhyvectl/arm64/Makefile.inc =================================================================== 
--- /dev/null +++ usr.sbin/bhyvectl/arm64/Makefile.inc @@ -0,0 +1,7 @@ +# +# $FreeBSD$ +# +.PATH: ${.CURDIR}/arm64 + +SRCS= bhyvectl.c +MAN= bhyvectl.8 Index: usr.sbin/bhyvectl/arm64/bhyvectl.8 =================================================================== --- /dev/null +++ usr.sbin/bhyvectl/arm64/bhyvectl.8 @@ -0,0 +1,97 @@ +.\" Copyright (c) 2015 Christian Brueffer +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd November 13, 2016 +.Dt BHYVECTL 8 +.Os +.Sh NAME +.Nm bhyvectl +.Nd "control utility for bhyve instances" +.Sh SYNOPSIS +.Nm +.Fl -vm= Ns Ar +.Op Fl -create +.Op Fl -destroy +.Op Fl -get-stats +.Op Fl -inject-nmi +.Op Fl -force-reset +.Op Fl -force-poweroff +.Sh DESCRIPTION +The +.Nm +command is a control utility for active +.Xr bhyve 8 +virtual machine instances. +.Pp +.Em Note : +Most +.Nm +flags are intended for querying and setting the state of an active instance. +These commands are intended for development purposes, and are not documented here. +A complete list can be obtained by executing +.Nm +without any arguments. +.Pp +The user-facing options are as follows: +.Bl -tag -width ".Fl d Ar argument" +.It Fl -vm= Ns Ar +Operate on the virtual machine +.Ar . +.It Fl -create +Create the specified VM. +.It Fl -destroy +Destroy the specified VM. +.It Fl -get-stats +Retrieve statistics for the specified VM. +.It Fl -inject-nmi +Inject a non-maskable interrupt (NMI) into the VM. +.It Fl -force-reset +Force the VM to reset. +.It Fl -force-poweroff +Force the VM to power off. +.El +.Sh EXIT STATUS +.Ex -std +.Sh EXAMPLES +Destroy the VM called fbsd10: +.Pp +.Dl "bhyvectl --vm=fbsd10 --destroy" +.Sh SEE ALSO +.Xr bhyve 8 , +.Xr bhyveload 8 +.Sh HISTORY +The +.Nm +command first appeared in +.Fx 10.1 . +.Sh AUTHORS +.An -nosplit +The +.Nm +utility was written by +.An Peter Grehan +and +.An Neel Natu . Index: usr.sbin/bhyvectl/arm64/bhyvectl.c =================================================================== --- /dev/null +++ usr.sbin/bhyvectl/arm64/bhyvectl.c @@ -0,0 +1,140 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#define MB (1UL << 20) +#define GB (1UL << 30) + +#define REQ_ARG required_argument +#define NO_ARG no_argument +#define OPT_ARG optional_argument + +#define eprintf(fmt, ...) 
printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__) + +static const char *progname; + +static void +usage() +{ + + (void)fprintf(stderr, + "Usage: %s --vm=\n" + " %*s [--destroy]\n", + progname, (int)strlen(progname), ""); + exit(1); +} + +static int create; +static int destroy; + +enum { + VMNAME = 1000, /* avoid collision with return values from getopt */ +}; + +const struct option opts[] = { + { "vm", REQ_ARG, NULL, VMNAME }, + { "destroy", NO_ARG, &destroy, 1 }, + { NULL, 0, NULL, 1 }, +}; + +int +main(int argc, char *argv[]) +{ + char *vmname; + int error, ch; + struct vmctx *ctx; + + vmname = NULL; + progname = basename(argv[0]); + + while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { + switch (ch) { + case 0: + break; + case VMNAME: + vmname = optarg; + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + + if (vmname == NULL) + usage(); + + error = 0; + if (!error && create) + error = vm_create(vmname); + if (!error) { + ctx = vm_open(vmname); + if (ctx == NULL) { + printf("VM:%s is not created.\n", vmname); + exit(1); + } + } + + + if (error) + printf("errno = %d\n", errno); + + if (!error && destroy) + vm_destroy(ctx); + + exit(error); +} Index: usr.sbin/bhyveload/Makefile =================================================================== --- usr.sbin/bhyveload/Makefile +++ usr.sbin/bhyveload/Makefile @@ -1,14 +1,17 @@ # $FreeBSD$ PROG= bhyveload -SRCS= bhyveload.c -MAN= bhyveload.8 PACKAGE= bhyve +BHYVELOAD_SYSDIR?=${SRCTOP} +BHYVELOAD_SRCTOP?=${.CURDIR} + LIBADD= vmmapi WARNS?= 3 CFLAGS+=-I${SRCTOP}/stand/userboot +.include "${BHYVELOAD_SRCTOP}/${MACHINE}/Makefile.inc" + .include Index: usr.sbin/bhyveload/amd64/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyveload/amd64/Makefile.inc @@ -0,0 +1,7 @@ +# $FreeBSD$ +.PATH: ${BHYVELOAD_SRCTOP}/amd64/ + +SRCS= bhyveload.c +MAN= bhyveload.8 + +CFLAGS+=-I${SRCTOP}/sys/boot/userboot Index: 
usr.sbin/bhyveload/arm64/Makefile.inc =================================================================== --- /dev/null +++ usr.sbin/bhyveload/arm64/Makefile.inc @@ -0,0 +1,13 @@ +# $FreeBSD$ +LIBADD+= util + +.PATH: ${BHYVELOAD_SRCTOP}/arm64/ + +SRCS= bhyveload.c \ + boot.c + +.PATH: ${.CURDIR}/../../sys/arm64/vmm + +CFLAGS += -I${.CURDIR}/../../stand/common + +MK_MAN=no Index: usr.sbin/bhyveload/arm64/bhyveload.c =================================================================== --- /dev/null +++ usr.sbin/bhyveload/arm64/bhyveload.c @@ -0,0 +1,404 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "boot.h"
+
+#define gvatovm(addr)		((uint64_t)(addr) - KERNBASE + \
+				kernel_load_address - memory_base_address)
+/*
+ * True when the half-open ranges [x_start, x_end) and [y_start, y_end)
+ * share at least one address.  Ranges that merely touch do not overlap,
+ * and a range that fully encloses the other one does.
+ */
+#define overlap(x_start, x_end, y_start, y_end)			\
+	((x_start) < (y_end) && (y_start) < (x_end))
+
+#define MB			(1024 * 1024UL)
+#define BSP			0
+#define KERNEL_IMAGE_NAME_LEN	32
+
+#define GIC_V3_DIST_START	0x2f000000UL
+#define GIC_V3_DIST_SIZE	0x10000UL
+#define GIC_V3_REDIST_START	0x2f100000UL
+#define GIC_V3_REDIST_SIZE	0x200000UL
+
+/* One guest boot environment string collected from the command line. */
+struct env {
+	const char *str;
+	SLIST_ENTRY(env) next;
+};
+static SLIST_HEAD(envhead, env) envhead;
+
+static uint64_t memory_base_address, kernel_load_address;
+
+static char *vmname, *progname;
+static struct vmctx *ctx;
+
+/*
+ * Remember `str` as a boot environment entry.  The string is not copied,
+ * so it must stay valid for as long as the list is used.
+ * Returns 0 on success or ENOMEM.
+ */
+static int
+env_add(const char *str)
+{
+	struct env *env;
+
+	env = malloc(sizeof(*env));
+	if (env == NULL)
+		return (ENOMEM);
+	env->str = str;
+	SLIST_INSERT_HEAD(&envhead, env, next);
+
+	return (0);
+}
+
+/*
+ * Flatten the environment list into one malloc()ed buffer made of the
+ * NUL-terminated entries followed by an extra NUL.  An empty list yields
+ * two NUL bytes.  The buffer is stored in *envstrp and its size, including
+ * every terminator, in *envlen.  Returns 0 on success or ENOMEM.
+ */
+static int
+env_tostr(char **envstrp, int *envlen)
+{
+	struct env *env;
+	size_t len;
+	int i;
+
+	*envlen = 0;
+	SLIST_FOREACH(env, &envhead, next)
+		*envlen = *envlen + strlen(env->str) + 1;
+	/* Make room for the two terminating zeroes */
+	if (*envlen == 0)
+		*envlen = 2;
+	else
+		(*envlen)++;
+
+	*envstrp = malloc(*envlen * sizeof(char));
+	if (*envstrp == NULL)
+		return (ENOMEM);
+
+	i = 0;
+	SLIST_FOREACH(env, &envhead, next) {
+		/* Copy the characters only; the terminator is written below. */
+		len = strlen(env->str);
+		memcpy(*envstrp + i, env->str, len);
+		i += len;
+		(*envstrp)[i++] = 0;
+	}
+	(*envstrp)[i] = 0;
+
+	/*
+	 * At this point we have envstr[0] == 0 if the environment is empty.
+	 * Add the second 0 to properly terminate the environment string.
+	 */
+	if (SLIST_EMPTY(&envhead))
+		(*envstrp)[1] = 0;
+
+	/*
+	for (i = 0; i < *envlen; i++)
+		printf("%d ", (int)(*envstrp)[i]);
+	printf("\n");
+	*/
+
+	return (0);
+}
+
+/*
+ * Guest virtual machine memory and register helpers.
+ */
+
+/*
+ * Copy `size` bytes from host buffer `from` to guest location `to`, as
+ * resolved by vm_map_ipa().  Returns 0, or EFAULT when the range cannot be
+ * mapped.
+ */
+static int
+guest_copyin(const void *from, uint64_t to, size_t size)
+{
+	char *ptr;
+	ptr = vm_map_ipa(ctx, to, size);
+	if (ptr == NULL)
+		return (EFAULT);
+
+	memcpy(ptr, from, size);
+	return (0);
+}
+
+/*
+ * Copy `size` bytes from guest location `from`, as resolved by
+ * vm_map_ipa(), to host buffer `to`.  Returns 0, or EFAULT when the range
+ * cannot be mapped.
+ */
+static int
+guest_copyout(uint64_t from, void *to, size_t size)
+{
+	char *ptr;
+
+	ptr = vm_map_ipa(ctx, from, size);
+	if (ptr == NULL)
+		return (EFAULT);
+
+	memcpy(to, ptr, size);
+	return (0);
+}
+
+/* Set register `vmreg` of the boot processor to `v`; failures are only reported. */
+static void
+guest_setreg(enum vm_reg_name vmreg, uint64_t v)
+{
+	int error;
+
+	error = vm_set_register(ctx, BSP, vmreg, v);
+	if (error)
+		perror("vm_set_register");
+}
+
+#if 0 +static int +parse_memsize(const char *optarg, size_t *ret_memsize) +{ + char *endptr; + size_t optval; + int error; + + optval = strtoul(optarg, &endptr, 0); + if (*optarg != '\0' && *endptr == '\0') { + /* Memory size must be at least one megabyte. 
*/ + if (optval < MB) + optval = optval * MB; + *ret_memsize = optval; + error = 0; + } else { + error = expand_number(optarg, ret_memsize); + } + + return (error); +} +#endif + +static void +usage(int code) +{ + fprintf(stderr, + "Usage: %s [-h] [-k ] [-e ] [-b base-address]\n" + " %*s [-m mem-size] [-l load-address] \n" + " -k: path to guest kernel image\n" + " -e: guest boot environment\n" + " -b: memory base address\n" + " -m: memory size\n" + " -l: kernel load address in the guest physical memory\n" + " -h: help\n", + progname, (int)strlen(progname), ""); + exit(code); +} + +int +main(int argc, char** argv) +{ + struct vm_bootparams bootparams; + uint64_t mem_size; + int opt, error; + int kernel_image_fd; + uint64_t periphbase; + char kernel_image_name[KERNEL_IMAGE_NAME_LEN]; + struct stat st; + void *addr; + char *envstr; + int envlen; + + progname = basename(argv[0]); + + mem_size = 128 * MB; + memory_base_address = VM_GUEST_BASE_IPA; + kernel_load_address = memory_base_address; + periphbase = 0x2c000000UL; + strncpy(kernel_image_name, "kernel.bin", KERNEL_IMAGE_NAME_LEN); + memset(&bootparams, 0, sizeof(struct vm_bootparams)); + + while ((opt = getopt(argc, argv, "hk:l:b:m:e:")) != -1) { + switch (opt) { + case 'k': + strncpy(kernel_image_name, optarg, KERNEL_IMAGE_NAME_LEN); + break; + case 'l': + kernel_load_address = strtoul(optarg, NULL, 0); + break; + case 'b': + memory_base_address = strtoul(optarg, NULL, 0); + break; + case 'm': + error = vm_parse_memsize(optarg, &mem_size); + if (error) { + fprintf(stderr, "Invalid memsize '%s'\n", optarg); + exit(1); + } + break; + case 'e': + error = env_add(optarg); + if (error) { + perror("env_add"); + exit(1); + } + break; + case 'h': + usage(0); + default: + fprintf(stderr, "Unknown argument '%c'\n", opt); + usage(1); + } + } + + argc -= optind; + argv += optind; + + if (argc != 1) { + fprintf(stderr, "Missing or unknown arguments\n"); + usage(1); + } + + if (kernel_load_address < memory_base_address) { + 
fprintf(stderr, "Kernel load address is below memory base address\n"); + exit(1); + } + + vmname = argv[0]; + + kernel_image_fd = open(kernel_image_name, O_RDONLY); + if (kernel_image_fd == -1) { + perror("open kernel_image_name"); + exit(1); + } + + error = vm_create(vmname); + if (error) { + perror("vm_create"); + exit(1); + } + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(1); + } + + error = vm_setup_memory(ctx, memory_base_address, mem_size, VM_MMAP_ALL); + if (error) { + perror("vm_setup_memory"); + exit(1); + } + + error = fstat(kernel_image_fd, &st); + if (error) { + perror("fstat"); + exit(1); + } + + if ((uint64_t)st.st_size > mem_size) { + fprintf(stderr, "Kernel image larger than memory size\n"); + exit(1); + } + if (kernel_load_address + st.st_size >= memory_base_address + mem_size) { + fprintf(stderr, "Kernel image out of bounds of guest memory\n"); + exit(1); + } + + addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, kernel_image_fd, 0); + if (addr == MAP_FAILED) { + perror("mmap kernel_image_fd"); + exit(1); + } + + if (guest_copyin(addr, kernel_load_address - memory_base_address, st.st_size) != 0) { + perror("guest_copyin"); + exit(1); + } + + error = env_tostr(&envstr, &envlen); + if (error) { + perror("parse boot environment\n"); + exit(1); + } + + bootparams.envstr = envstr; + bootparams.envlen = envlen; + error = parse_kernel(addr, st.st_size, ctx, &bootparams); + if (error) { + fprintf(stderr, "Error parsing image\n"); + exit(1); + } + + /* + fprintf(stderr, "bootparams.envp_gva = 0x%016lx\n", bootparams.envp_gva); + fprintf(stderr, "gvatom(bootparams.envp_gva) = 0x%016lx\n", gvatovm(bootparams.envp_gva)); + fprintf(stderr, "vm_map_ipa() = 0x%016lx\n", (uint64_t)vm_map_ipa(ctx, gvatovm(bootparams.envp_gva), PAGE_SIZE)); + fprintf(stderr, "\n"); + + fprintf(stderr, "bootparams.mudulep_gva = 0x%016lx\n", bootparams.modulep_gva); + fprintf(stderr, "gvatom(bootparams.modulep_gva) = 0x%016lx\n", 
gvatovm(bootparams.modulep_gva)); + fprintf(stderr, "vm_map_ipa() = 0x%016lx\n", (uint64_t)vm_map_ipa(ctx, gvatovm(bootparams.modulep_gva), PAGE_SIZE)); + fprintf(stderr, "\n"); + */ + + /* Copy the environment string in the guest memory */ + if (guest_copyin((void *)envstr, gvatovm(bootparams.envp_gva), envlen) != 0) { + perror("guest_copyin"); + exit(1); + } + + /* Copy the module data in the guest memory */ + if (guest_copyin(bootparams.modulep, gvatovm(bootparams.modulep_gva), bootparams.module_len) != 0) { + perror("guest_copyin"); + exit(1); + } + + uint64_t mem_end = memory_base_address + mem_size; + uint64_t dist_end = GIC_V3_DIST_START + GIC_V3_DIST_SIZE; + uint64_t redist_end = GIC_V3_REDIST_START + GIC_V3_REDIST_SIZE; + + if (overlap(GIC_V3_DIST_SIZE, dist_end, memory_base_address, mem_end)) { + fprintf(stderr, "Guest memory overlaps with VGIC Distributor\n"); + exit(1); + } + + if (overlap(GIC_V3_REDIST_SIZE, redist_end, memory_base_address, mem_end)) { + fprintf(stderr, "Guest memory overlaps with VGIC Redistributor\n"); + exit(1); + } + + error = vm_attach_vgic(ctx, GIC_V3_DIST_START, GIC_V3_DIST_SIZE, + GIC_V3_REDIST_START, GIC_V3_REDIST_SIZE); + if (error) { + fprintf(stderr, "Error attaching VGIC to the virtual machine\n"); + exit(1); + } + + munmap(addr, st.st_size); + + guest_setreg(VM_REG_ELR_EL2, kernel_load_address + bootparams.entry_off); + guest_setreg(VM_REG_GUEST_X0, bootparams.modulep_gva); + + return 0; +} Index: usr.sbin/bhyveload/arm64/boot.h =================================================================== --- /dev/null +++ usr.sbin/bhyveload/arm64/boot.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2017 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _BOOT_H_ +#define _BOOT_H_ + + + +struct vm_bootparams { + uint64_t entry_off; + uint64_t modulep_gva; /* Guest virtual address of modulep data */ + uint64_t envp_gva; /* Guest virtual address for env */ + char *envstr; + int envlen; + int module_len; + void *modulep; /* Bhyveload address of modulep data */ +}; + +int parse_kernel(void *addr, size_t img_size, struct vmctx *ctx, + struct vm_bootparams *bootparams); + +#endif Index: usr.sbin/bhyveload/arm64/boot.c =================================================================== --- /dev/null +++ usr.sbin/bhyveload/arm64/boot.c @@ -0,0 +1,618 @@ +/*- * Copyright (c) 2017 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "boot.h" + +#define gvatou(gva, addr) ((vm_offset_t)(gva) - KERNBASE + (vm_offset_t)(addr)) + +struct elf_file { + Elf_Phdr *ph; + Elf_Ehdr *ehdr; + Elf_Sym *symtab; + Elf_Hashelt *hashtab; + Elf_Hashelt nbuckets; + Elf_Hashelt nchains; + Elf_Hashelt *buckets; + Elf_Hashelt *chains; + Elf_Rel *rel; + size_t relsz; + Elf_Rela *rela; + size_t relasz; + char *strtab; + size_t strsz; + caddr_t firstpage_u; /* Userspace address of mmap'ed guest kernel */ +}; + +static uint64_t parse_image(struct preloaded_file *img, struct elf_file *ef); +static void image_addmetadata(struct preloaded_file *img, int type, + size_t size, void *addr); +static int image_addmodule(struct preloaded_file *img, char *modname, int version); +static void parse_metadata(struct preloaded_file *img, struct elf_file *ef, + Elf_Addr p_startu, Elf_Addr 
p_endu); +static int lookup_symbol(struct elf_file *ef, const char *name, Elf_Sym *symp); +static struct kernel_module *image_findmodule(struct preloaded_file *img, char *modname, + struct mod_depend *verinfo); +static uint64_t moddata_len(struct preloaded_file *img); +static void moddata_copy(vm_offset_t dest, struct preloaded_file *img); + +static int +load_elf_header(struct elf_file *ef) +{ + Elf_Ehdr *ehdr; + + ehdr = ef->ehdr = (Elf_Ehdr *)ef->firstpage_u; + /* Is it ELF? */ + if (!IS_ELF(*ehdr)) + return (EFTYPE); + + if (ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||/* Layout ? */ + ehdr->e_ident[EI_DATA] != ELF_TARG_DATA || + ehdr->e_ident[EI_VERSION] != EV_CURRENT || /* Version ? */ + ehdr->e_version != EV_CURRENT || + ehdr->e_machine != ELF_TARG_MACH) /* Machine ? */ + return (EFTYPE); + + return (0); +} + +static caddr_t +preload_search_by_type(const char *type, caddr_t preload_metadata) +{ + caddr_t curp, lname; + uint32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + lname = NULL; + for (;;) { + hdr = (uint32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* remember the start of each record */ + if (hdr[0] == MODINFO_NAME) + lname = curp; + + /* Search for a MODINFO_TYPE field */ + if ((hdr[0] == MODINFO_TYPE) && + !strcmp(type, curp + sizeof(uint32_t) * 2)) + return(lname); + + /* skip to next field */ + next = sizeof(uint32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +int +parse_kernel(void *addr, size_t img_size, struct vmctx *ctx, + struct vm_bootparams *bootparams) +{ + struct elf_file ef; + struct preloaded_file img; + Elf_Ehdr *ehdr_u; + int err; + vm_offset_t lastaddr_gva; + uint64_t kernend; + uint64_t size; + uint64_t modlen; + int boothowto; + + //fprintf(stderr, "[PARSE_KERNEL]\n\n"); + + memset(&ef, 0, sizeof(struct elf_file)); + memset(&img, 0, sizeof(struct preloaded_file)); + + ef.firstpage_u = (caddr_t)addr; + err = 
load_elf_header(&ef); + if (err != 0) + return (err); + + ehdr_u = ef.ehdr; + if (ehdr_u->e_type != ET_EXEC) { + fprintf(stderr, "Image not a kernel\n"); + return (EPERM); + } + img.f_name = "elf kernel"; + img.f_type = "elf kernel"; + img.f_size = img_size; + + size = parse_image(&img, &ef); + if (size == 0) + return (ENOEXEC); + bootparams->entry_off = ehdr_u->e_entry - KERNBASE; + + image_addmetadata(&img, MODINFOMD_ELFHDR, sizeof(*ehdr_u), ehdr_u); + + /* XXX: Add boothowto options? */ + boothowto = 0; + image_addmetadata(&img, MODINFOMD_HOWTO, sizeof(boothowto), &boothowto); + + lastaddr_gva = roundup(img.f_addr + img.f_size + 0x3fd000, PAGE_SIZE); + image_addmetadata(&img, MODINFOMD_ENVP, sizeof(lastaddr_gva), &lastaddr_gva); + bootparams->envp_gva = lastaddr_gva; + + lastaddr_gva = roundup(lastaddr_gva + bootparams->envlen, PAGE_SIZE); + /* Module data start in the guest kernel virtual address space */ + bootparams->modulep_gva = lastaddr_gva; + + modlen = moddata_len(&img); + kernend = roundup(bootparams->modulep_gva + modlen, PAGE_SIZE); + image_addmetadata(&img, MODINFOMD_KERNEND, sizeof(kernend), &kernend); + + bootparams->module_len = roundup(modlen, PAGE_SIZE); + bootparams->modulep = calloc(1, bootparams->module_len); + if (bootparams->modulep == NULL) { + perror("calloc"); + return (ENOMEM); + } + + moddata_copy((vm_offset_t)bootparams->modulep, &img); + + return (0); +} + +static uint64_t +parse_image(struct preloaded_file *img, struct elf_file *ef) +{ + Elf_Ehdr *ehdr; + Elf_Phdr *phdr; + Elf_Phdr *php; + Elf_Shdr *shdr; + Elf_Dyn *dp; + Elf_Addr adp; + Elf_Addr ctors; + Elf_Addr ssym, esym; + Elf_Addr p_start, p_end; + Elf_Size size; + Elf_Sym sym; + vm_offset_t firstaddr, lastaddr; + vm_offset_t shstr_addr; + char *shstr; + int symstrindex; + int symtabindex; + size_t chunk_len; + uint64_t ret; + int ndp; + int i; + unsigned int j; + + dp = NULL; + shdr = NULL; + ret = 0; + + ehdr = ef->ehdr; + phdr = (Elf_Phdr *)(ef->firstpage_u + 
ehdr->e_phoff); + + firstaddr = lastaddr = 0; + for (i = 0; i < ehdr->e_phnum; i++) { + if (phdr[i].p_type != PT_LOAD) + continue; + if (firstaddr == 0 || firstaddr > phdr[i].p_vaddr) + firstaddr = phdr[i].p_vaddr; + /* We mmap'ed the kernel, so p_memsz == p_filesz. */ + if (lastaddr == 0 || lastaddr < (phdr[i].p_vaddr + phdr[i].p_filesz)) + lastaddr = phdr[i].p_vaddr + phdr[i].p_filesz; + } + lastaddr = roundup(lastaddr, sizeof(long)); + + /* + * Get the section headers. We need this for finding the .ctors + * section as well as for loading any symbols. Both may be hard + * to do if reading from a .gz file as it involves seeking. I + * think the rule is going to have to be that you must strip a + * file to remove symbols before gzipping it. + */ + chunk_len = ehdr->e_shnum * ehdr->e_shentsize; + if (chunk_len == 0 || ehdr->e_shoff == 0) + goto nosyms; + shdr = (Elf_Shdr *)(ef->firstpage_u + ehdr->e_shoff); + image_addmetadata(img, MODINFOMD_SHDR, chunk_len, shdr); + + /* + * Read the section string table and look for the .ctors section. + * We need to tell the kernel where it is so that it can call the + * ctors. + */ + chunk_len = shdr[ehdr->e_shstrndx].sh_size; + if (chunk_len > 0) { + shstr_addr = (vm_offset_t)(ef->firstpage_u + \ + shdr[ehdr->e_shstrndx].sh_offset); + shstr = malloc(chunk_len); + memcpy(shstr, (void *)shstr_addr, chunk_len); + for (i = 0; i < ehdr->e_shnum; i++) { + if (strcmp(shstr + shdr[i].sh_name, ".ctors") != 0) + continue; + ctors = shdr[i].sh_addr; + image_addmetadata(img, MODINFOMD_CTORS_ADDR, + sizeof(ctors), &ctors); + size = shdr[i].sh_size; + image_addmetadata(img, MODINFOMD_CTORS_SIZE, + sizeof(size), &size); + break; + } + free(shstr); + } + + /* + * Now load any symbols. 
+ */ + symtabindex = -1; + symstrindex = -1; + for (i = 0; i < ehdr->e_shnum; i++) { + if (shdr[i].sh_type != SHT_SYMTAB) + continue; + for (j = 0; j < ehdr->e_phnum; j++) { + if (phdr[j].p_type != PT_LOAD) + continue; + if (shdr[i].sh_offset >= phdr[j].p_offset && + (shdr[i].sh_offset + shdr[i].sh_size <= + phdr[j].p_offset + phdr[j].p_filesz)) { + shdr[i].sh_offset = 0; + shdr[i].sh_size = 0; + break; + } + } + if (shdr[i].sh_offset == 0 || shdr[i].sh_size == 0) + continue; /* alread loaded in a PT_LOAD above */ + /* Save it for loading below */ + symtabindex = i; + symstrindex = shdr[i].sh_link; + } + if (symtabindex < 0 || symstrindex < 0) + goto nosyms; + + ssym = lastaddr; + i = symtabindex; + for (;;) { + size = shdr[i].sh_size; + lastaddr += sizeof(size); + lastaddr += shdr[i].sh_size; + lastaddr = roundup(lastaddr, sizeof(size)); + + if (i == symtabindex) + i = symstrindex; + else if (i == symstrindex) + break; + } + esym = lastaddr; + + image_addmetadata(img, MODINFOMD_SSYM, sizeof(ssym), &ssym); + image_addmetadata(img, MODINFOMD_ESYM, sizeof(esym), &esym); + +nosyms: + ret = lastaddr - firstaddr; + img->f_addr = firstaddr; + + php = NULL; + for (i = 0; i < ehdr->e_phnum; i++) { + if (phdr[i].p_type == PT_DYNAMIC) { + php = &phdr[i]; + adp = php->p_vaddr; + image_addmetadata(img, MODINFOMD_DYNAMIC, + sizeof(adp), &adp); + break; + } + } + if (php == NULL) + goto out; + ndp = php->p_filesz / sizeof(Elf_Dyn); + if (ndp == 0) + goto out; + + ef->strsz = 0; + dp = (Elf_Dyn *)(ef->firstpage_u + php->p_offset); + for (i = 0; i < ndp; i++) { + if (dp[i].d_tag == 0) + break; + switch(dp[i].d_tag) { + case DT_HASH: + ef->hashtab = (Elf_Hashelt *)(uintptr_t)dp[i].d_un.d_ptr; + break; + case DT_STRTAB: + ef->strtab = (char *)(uintptr_t)dp[i].d_un.d_ptr; + case DT_STRSZ: + ef->strsz = dp[i].d_un.d_val; + break; + case DT_SYMTAB: + ef->symtab = (Elf_Sym *)(uintptr_t)dp[i].d_un.d_ptr; + break; + case DT_REL: + ef->rel = (Elf_Rel *)(uintptr_t)dp[i].d_un.d_ptr; + break; 
+ case DT_RELSZ: + ef->relsz = dp[i].d_un.d_val; + break; + case DT_RELA: + ef->rela = (Elf_Rela *)(uintptr_t)dp[i].d_un.d_ptr; + break; + case DT_RELASZ: + ef->relasz = dp[i].d_un.d_val; + break; + } + } + if (ef->hashtab == NULL || ef->symtab == NULL || + ef->strtab == NULL || ef->strsz == 0) + goto out; + + memcpy(&ef->nbuckets, (void *)gvatou(ef->hashtab, ef->firstpage_u), sizeof(ef->nbuckets)); + memcpy(&ef->nchains, (void *)gvatou(ef->hashtab + 1, ef->firstpage_u), sizeof(ef->nchains)); + ef->buckets = (Elf_Hashelt *)gvatou(ef->hashtab + 2, ef->firstpage_u); + ef->chains = ef->buckets + ef->nbuckets; + + if (lookup_symbol(ef, "__start_set_modmetadata_set", &sym) != 0) { + ret = 0; + goto out; + } + p_start = gvatou(sym.st_value, ef->firstpage_u); + if (lookup_symbol(ef, "__stop_set_modmetadata_set", &sym) != 0) { + ret = ENOENT; + goto out; + } + p_end = gvatou(sym.st_value, ef->firstpage_u); + parse_metadata(img, ef, p_start, p_end); + +out: + return ret; +} + +static uint64_t +moddata_len(struct preloaded_file *img) +{ + struct file_metadata *md; + uint64_t len; + + /* Count the kernel image name */ + len = 8 + roundup(strlen(img->f_name) + 1, sizeof(uint64_t)); + /* Count the kernel's type */ + len += 8 + roundup(strlen(img->f_type) + 1, sizeof(uint64_t)); + /* Count the kernel's virtual address */ + len += 8 + roundup(sizeof(img->f_addr), sizeof(uint64_t)); + /* Count the kernel's size */ + len += 8 + roundup(sizeof(img->f_size), sizeof(uint64_t)); + /* Count the metadata size */ + for (md = img->f_metadata; md != NULL; md = md->md_next) + len += 8 + roundup(md->md_size, sizeof(uint64_t)); + + return len; +} + +#define COPY32(dest, what) \ + do { \ + uint32_t w = (what); \ + memcpy((void *)dest, &w, sizeof(w)); \ + dest += sizeof(w); \ + } while (0) + +#define COPY_MODINFO(modinfo, dest, val, len) \ + do { \ + COPY32(dest, modinfo); \ + COPY32(dest, len); \ + memcpy((void *)dest, val, len); \ + dest += roundup(len, sizeof(uint64_t)); \ + } while (0) + 
+#define COPY_MODEND(dest) \ + do { \ + COPY32(dest, MODINFO_END); \ + COPY32(dest, 0); \ + } while (0); + +static void +moddata_copy(vm_offset_t dest, struct preloaded_file *img) +{ + struct file_metadata *md; + + COPY_MODINFO(MODINFO_NAME, dest, img->f_name, strlen(img->f_name) + 1); + COPY_MODINFO(MODINFO_TYPE, dest, img->f_type, strlen(img->f_type) + 1); + COPY_MODINFO(MODINFO_ADDR, dest, &img->f_addr, sizeof(img->f_addr)); + COPY_MODINFO(MODINFO_SIZE, dest, &img->f_size, sizeof(img->f_size)); + + for (md = img->f_metadata; md != NULL; md = md->md_next) + COPY_MODINFO(MODINFO_METADATA | md->md_type, dest, + md->md_data, md->md_size); + + COPY_MODEND(dest); +} + +static void +image_addmetadata(struct preloaded_file *img, int type, + size_t size, void *addr) +{ + struct file_metadata *md; + + md = malloc(sizeof(struct file_metadata) - sizeof(md->md_data) + size); + md->md_size = size; + md->md_type = type; + memcpy(md->md_data, addr, size); + md->md_next = img->f_metadata; + img->f_metadata = md; +} + +static uint64_t +elf_hash(const char *name) +{ + const unsigned char *p = (const unsigned char *)name; + uint64_t h; + uint64_t g; + + h = 0; + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xf0000000) != 0) + h ^= g >> 24; + h &= ~g; + } + + return h; +} + +static int +lookup_symbol(struct elf_file *ef, const char *name, Elf_Sym *symp) +{ + Elf_Hashelt symnum; + Elf_Sym sym; + char *strp; + uint64_t hash; + + hash = elf_hash(name); + memcpy(&symnum, &ef->buckets[hash % ef->nbuckets], sizeof(symnum)); + + while (symnum != STN_UNDEF) { + if (symnum >= ef->nchains) { + fprintf(stderr, "lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + memcpy(&sym, (void *)gvatou(ef->symtab + symnum, ef->firstpage_u), sizeof(sym)); + if (sym.st_name == 0) { + fprintf(stderr, "lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + strp = strdup((char *)gvatou(ef->strtab + sym.st_name, ef->firstpage_u)); + if (strcmp(name, strp) == 0) { + 
free(strp); + if (sym.st_shndx != SHN_UNDEF || + (sym.st_value != 0 && + ELF_ST_TYPE(sym.st_info) == STT_FUNC)) { + *symp = sym; + return 0; + } + return ENOENT; + } + free(strp); + memcpy(&symnum, &ef->chains[symnum], sizeof(symnum)); + } + + return ENOENT; +} + +static void +parse_metadata(struct preloaded_file *img, struct elf_file *ef, + Elf_Addr p_startu, Elf_Addr p_endu) +{ + struct mod_metadata md; + struct mod_version mver; + char *s; + int modcnt; + Elf_Addr v, p; + + modcnt = 0; + for (p = p_startu; p < p_endu; p += sizeof(Elf_Addr)) { + memcpy(&v, (void *)p, sizeof(v)); + memcpy(&md, (void *)gvatou(v, ef->firstpage_u), sizeof(md)); + if (md.md_type == MDT_VERSION) { + s = strdup((char *)gvatou(md.md_cval, ef->firstpage_u)); + memcpy(&mver, + (void *)gvatou(md.md_data, ef->firstpage_u), + sizeof(mver)); + image_addmodule(img, s, mver.mv_version); + free(s); + modcnt++; + } + } + + if (modcnt == 0) { + image_addmodule(img, "kernel", 1); + free(s); + } +} + +static int +image_addmodule(struct preloaded_file *img, char *modname, int version) +{ + struct kernel_module *mp; + struct mod_depend mdepend; + + bzero(&mdepend, sizeof(mdepend)); + mdepend.md_ver_preferred = version; + + mp = image_findmodule(img, modname, &mdepend); + if (mp) + return (EEXIST); + mp = malloc(sizeof(struct kernel_module)); + if (mp == NULL) + return (ENOMEM); + + bzero(mp, sizeof(struct kernel_module)); + mp->m_name = strdup(modname); + mp->m_version = version; + mp->m_fp = img; + mp->m_next = img->f_modules; + img->f_modules = mp; + + return (0); +} + +static struct kernel_module * +image_findmodule(struct preloaded_file *img, char *modname, + struct mod_depend *verinfo) +{ + struct kernel_module *mp, *best; + int bestver, mver; + + best = NULL; + bestver = 0; + for (mp = img->f_modules; mp != NULL; mp = mp->m_next) { + if (strcmp(modname, mp->m_name) == 0) { + if (verinfo == NULL) + return (mp); + mver = mp->m_version; + if (mver == verinfo->md_ver_preferred) + return (mp); + if 
(mver >= verinfo->md_ver_minimum && + mver <= verinfo->md_ver_maximum && + mver > bestver) { + best = mp; + bestver = mver; + } + } + } + + return (best); +}