diff --git a/sys/arm64/vmm/io/vgic_v3.c b/sys/arm64/vmm/io/vgic_v3.c index 67afb3374815..023406c64182 100644 --- a/sys/arm64/vmm/io/vgic_v3.c +++ b/sys/arm64/vmm/io/vgic_v3.c @@ -1,2349 +1,2348 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2018 Alexandru Elisei * Copyright (C) 2020-2022 Andrew Turner * Copyright (C) 2023 Arm Ltd * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vgic.h" #include "vgic_v3.h" #include "vgic_v3_reg.h" #include "vgic_if.h" #define VGIC_SGI_NUM (GIC_LAST_SGI - GIC_FIRST_SGI + 1) #define VGIC_PPI_NUM (GIC_LAST_PPI - GIC_FIRST_PPI + 1) #define VGIC_SPI_NUM (GIC_LAST_SPI - GIC_FIRST_SPI + 1) #define VGIC_PRV_I_NUM (VGIC_SGI_NUM + VGIC_PPI_NUM) #define VGIC_SHR_I_NUM (VGIC_SPI_NUM) MALLOC_DEFINE(M_VGIC_V3, "ARM VMM VGIC V3", "ARM VMM VGIC V3"); /* TODO: Move to softc */ struct vgic_v3_virt_features { uint8_t min_prio; size_t ich_lr_num; size_t ich_apr_num; }; struct vgic_v3_irq { /* List of IRQs that are active or pending */ TAILQ_ENTRY(vgic_v3_irq) act_pend_list; struct mtx irq_spinmtx; uint64_t mpidr; int target_vcpu; uint32_t irq; bool active; bool pending; bool enabled; bool level; bool on_aplist; uint8_t priority; uint8_t config; #define VGIC_CONFIG_MASK 0x2 #define VGIC_CONFIG_LEVEL 0x0 #define VGIC_CONFIG_EDGE 0x2 }; /* Global data not needed by EL2 */ struct vgic_v3 { struct mtx dist_mtx; uint64_t dist_start; size_t dist_end; uint64_t redist_start; size_t redist_end; uint32_t gicd_ctlr; /* Distributor Control Register */ struct vgic_v3_irq *irqs; }; /* Per-CPU data not needed by EL2 */ struct vgic_v3_cpu { /* * We need a mutex for accessing the list registers because they are * modified asynchronously by the virtual timer. * * Note that the mutex *MUST* be a spin mutex because an interrupt can * be injected by a callout callback function, thereby modifying the * list registers from a context where sleeping is forbidden. 
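	 *
	 * A rough sketch of the path that motivates this (all names are in
	 * this file or in vtimer.c): the timer code injects its interrupt
	 * with vgic_inject_irq(), which is dispatched to
	 * vgic_v3_inject_irq() and takes this lock with mtx_lock_spin()
	 * before queueing the IRQ on irq_act_pend.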
*/ struct mtx lr_mtx; struct vgic_v3_irq private_irqs[VGIC_PRV_I_NUM]; TAILQ_HEAD(, vgic_v3_irq) irq_act_pend; u_int ich_lr_used; }; /* How many IRQs we support (SGIs + PPIs + SPIs). Not including LPIs */ #define VGIC_NIRQS 1023 /* Pretend to be an Arm design */ #define VGIC_IIDR 0x43b static vgic_inject_irq_t vgic_v3_inject_irq; static vgic_inject_msi_t vgic_v3_inject_msi; static int vgic_v3_max_cpu_count(device_t dev, struct hyp *hyp); #define INJECT_IRQ(hyp, vcpuid, irqid, level) \ vgic_v3_inject_irq(NULL, (hyp), (vcpuid), (irqid), (level)) typedef void (register_read)(struct hypctx *, u_int, uint64_t *, void *); typedef void (register_write)(struct hypctx *, u_int, u_int, u_int, uint64_t, void *); #define VGIC_8_BIT (1 << 0) /* (1 << 1) is reserved for 16 bit accesses */ #define VGIC_32_BIT (1 << 2) #define VGIC_64_BIT (1 << 3) struct vgic_register { u_int start; /* Start within a memory region */ u_int end; u_int size; u_int flags; register_read *read; register_write *write; }; #define VGIC_REGISTER_RANGE(reg_start, reg_end, reg_size, reg_flags, readf, \ writef) \ { \ .start = (reg_start), \ .end = (reg_end), \ .size = (reg_size), \ .flags = (reg_flags), \ .read = (readf), \ .write = (writef), \ } #define VGIC_REGISTER_RANGE_RAZ_WI(reg_start, reg_end, reg_size, reg_flags) \ VGIC_REGISTER_RANGE(reg_start, reg_end, reg_size, reg_flags, \ gic_zero_read, gic_ignore_write) #define VGIC_REGISTER(start_addr, reg_size, reg_flags, readf, writef) \ VGIC_REGISTER_RANGE(start_addr, (start_addr) + (reg_size), \ reg_size, reg_flags, readf, writef) #define VGIC_REGISTER_RAZ_WI(start_addr, reg_size, reg_flags) \ VGIC_REGISTER_RANGE_RAZ_WI(start_addr, \ (start_addr) + (reg_size), reg_size, reg_flags) static register_read gic_pidr2_read; static register_read gic_zero_read; static register_write gic_ignore_write; /* GICD_CTLR */ static register_read dist_ctlr_read; static register_write dist_ctlr_write; /* GICD_TYPER */ static register_read dist_typer_read; /* GICD_IIDR */ static register_read dist_iidr_read; /* GICD_STATUSR - RAZ/WI as we don't report errors (yet) */ /* GICD_SETSPI_NSR & GICD_CLRSPI_NSR */ static register_write dist_setclrspi_nsr_write; /* GICD_SETSPI_SR - RAZ/WI */ /* GICD_CLRSPI_SR - RAZ/WI */ /* GICD_IGROUPR - RAZ/WI as GICD_CTLR.ARE == 1 */ /* GICD_ISENABLER */ static register_read dist_isenabler_read; static register_write dist_isenabler_write; /* GICD_ICENABLER */ static register_read dist_icenabler_read; static register_write dist_icenabler_write; /* GICD_ISPENDR */ static register_read dist_ispendr_read; static register_write dist_ispendr_write; /* GICD_ICPENDR */ static register_read dist_icpendr_read; static register_write dist_icpendr_write; /* GICD_ISACTIVER */ static register_read dist_isactiver_read; static register_write dist_isactiver_write; /* GICD_ICACTIVER */ static register_read dist_icactiver_read; static register_write dist_icactiver_write; /* GICD_IPRIORITYR */ static register_read dist_ipriorityr_read; static register_write dist_ipriorityr_write; /* GICD_ITARGETSR - RAZ/WI as GICD_CTLR.ARE == 1 */ /* GICD_ICFGR */ static register_read dist_icfgr_read; static register_write dist_icfgr_write; /* GICD_IGRPMODR - RAZ/WI from non-secure mode */ /* GICD_NSACR - RAZ/WI from non-secure mode */ /* GICD_SGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ /* GICD_CPENDSGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ /* GICD_SPENDSGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ /* GICD_IROUTER */ static register_read dist_irouter_read; static register_write dist_irouter_write; static struct vgic_register 
dist_registers[] = { VGIC_REGISTER(GICD_CTLR, 4, VGIC_32_BIT, dist_ctlr_read, dist_ctlr_write), VGIC_REGISTER(GICD_TYPER, 4, VGIC_32_BIT, dist_typer_read, gic_ignore_write), VGIC_REGISTER(GICD_IIDR, 4, VGIC_32_BIT, dist_iidr_read, gic_ignore_write), VGIC_REGISTER_RAZ_WI(GICD_STATUSR, 4, VGIC_32_BIT), VGIC_REGISTER(GICD_SETSPI_NSR, 4, VGIC_32_BIT, gic_zero_read, dist_setclrspi_nsr_write), VGIC_REGISTER(GICD_CLRSPI_NSR, 4, VGIC_32_BIT, gic_zero_read, dist_setclrspi_nsr_write), VGIC_REGISTER_RAZ_WI(GICD_SETSPI_SR, 4, VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICD_CLRSPI_SR, 4, VGIC_32_BIT), VGIC_REGISTER_RANGE_RAZ_WI(GICD_IGROUPR(0), GICD_IGROUPR(1024), 4, VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICD_ISENABLER(0), 4, VGIC_32_BIT), VGIC_REGISTER_RANGE(GICD_ISENABLER(32), GICD_ISENABLER(1024), 4, VGIC_32_BIT, dist_isenabler_read, dist_isenabler_write), VGIC_REGISTER_RAZ_WI(GICD_ICENABLER(0), 4, VGIC_32_BIT), VGIC_REGISTER_RANGE(GICD_ICENABLER(32), GICD_ICENABLER(1024), 4, VGIC_32_BIT, dist_icenabler_read, dist_icenabler_write), VGIC_REGISTER_RAZ_WI(GICD_ISPENDR(0), 4, VGIC_32_BIT), VGIC_REGISTER_RANGE(GICD_ISPENDR(32), GICD_ISPENDR(1024), 4, VGIC_32_BIT, dist_ispendr_read, dist_ispendr_write), VGIC_REGISTER_RAZ_WI(GICD_ICPENDR(0), 4, VGIC_32_BIT), VGIC_REGISTER_RANGE(GICD_ICPENDR(32), GICD_ICPENDR(1024), 4, VGIC_32_BIT, dist_icpendr_read, dist_icpendr_write), VGIC_REGISTER_RAZ_WI(GICD_ISACTIVER(0), 4, VGIC_32_BIT), VGIC_REGISTER_RANGE(GICD_ISACTIVER(32), GICD_ISACTIVER(1024), 4, VGIC_32_BIT, dist_isactiver_read, dist_isactiver_write), VGIC_REGISTER_RAZ_WI(GICD_ICACTIVER(0), 4, VGIC_32_BIT), VGIC_REGISTER_RANGE(GICD_ICACTIVER(32), GICD_ICACTIVER(1024), 4, VGIC_32_BIT, dist_icactiver_read, dist_icactiver_write), VGIC_REGISTER_RANGE_RAZ_WI(GICD_IPRIORITYR(0), GICD_IPRIORITYR(32), 4, VGIC_32_BIT | VGIC_8_BIT), VGIC_REGISTER_RANGE(GICD_IPRIORITYR(32), GICD_IPRIORITYR(1024), 4, VGIC_32_BIT | VGIC_8_BIT, dist_ipriorityr_read, dist_ipriorityr_write), VGIC_REGISTER_RANGE_RAZ_WI(GICD_ITARGETSR(0), GICD_ITARGETSR(1024), 4, VGIC_32_BIT | VGIC_8_BIT), VGIC_REGISTER_RANGE_RAZ_WI(GICD_ICFGR(0), GICD_ICFGR(32), 4, VGIC_32_BIT), VGIC_REGISTER_RANGE(GICD_ICFGR(32), GICD_ICFGR(1024), 4, VGIC_32_BIT, dist_icfgr_read, dist_icfgr_write), /* VGIC_REGISTER_RANGE(GICD_IGRPMODR(0), GICD_IGRPMODR(1024), 4, VGIC_32_BIT, dist_igrpmodr_read, dist_igrpmodr_write), VGIC_REGISTER_RANGE(GICD_NSACR(0), GICD_NSACR(1024), 4, VGIC_32_BIT, dist_nsacr_read, dist_nsacr_write), */ VGIC_REGISTER_RAZ_WI(GICD_SGIR, 4, VGIC_32_BIT), /* VGIC_REGISTER_RANGE(GICD_CPENDSGIR(0), GICD_CPENDSGIR(1024), 4, VGIC_32_BIT | VGIC_8_BIT, dist_cpendsgir_read, dist_cpendsgir_write), VGIC_REGISTER_RANGE(GICD_SPENDSGIR(0), GICD_SPENDSGIR(1024), 4, VGIC_32_BIT | VGIC_8_BIT, dist_spendsgir_read, dist_spendsgir_write), */ VGIC_REGISTER_RANGE(GICD_IROUTER(32), GICD_IROUTER(1024), 8, VGIC_64_BIT | VGIC_32_BIT, dist_irouter_read, dist_irouter_write), VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR4, GICD_PIDR2, 4, VGIC_32_BIT), VGIC_REGISTER(GICD_PIDR2, 4, VGIC_32_BIT, gic_pidr2_read, gic_ignore_write), VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR2 + 4, GICD_SIZE, 4, VGIC_32_BIT), }; /* GICR_CTLR - Ignore writes as no bits can be set */ static register_read redist_ctlr_read; /* GICR_IIDR */ static register_read redist_iidr_read; /* GICR_TYPER */ static register_read redist_typer_read; /* GICR_STATUSR - RAZ/WI as we don't report errors (yet) */ /* GICR_WAKER - RAZ/WI from non-secure mode */ /* GICR_SETLPIR - RAZ/WI as no LPIs are supported */ /* GICR_CLRLPIR - RAZ/WI as no LPIs are supported */ 
/* GICR_PROPBASER - RAZ/WI as no LPIs are supported */ /* GICR_PENDBASER - RAZ/WI as no LPIs are supported */ /* GICR_INVLPIR - RAZ/WI as no LPIs are supported */ /* GICR_INVALLR - RAZ/WI as no LPIs are supported */ /* GICR_SYNCR - RAZ/WI as no LPIs are supported */ static struct vgic_register redist_rd_registers[] = { VGIC_REGISTER(GICR_CTLR, 4, VGIC_32_BIT, redist_ctlr_read, gic_ignore_write), VGIC_REGISTER(GICR_IIDR, 4, VGIC_32_BIT, redist_iidr_read, gic_ignore_write), VGIC_REGISTER(GICR_TYPER, 8, VGIC_64_BIT | VGIC_32_BIT, redist_typer_read, gic_ignore_write), VGIC_REGISTER_RAZ_WI(GICR_STATUSR, 4, VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICR_WAKER, 4, VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICR_SETLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICR_CLRLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICR_PROPBASER, 8, VGIC_64_BIT | VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICR_PENDBASER, 8, VGIC_64_BIT | VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICR_INVLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICR_INVALLR, 8, VGIC_64_BIT | VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICR_SYNCR, 4, VGIC_32_BIT), /* These are identical to the dist registers */ VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR4, GICD_PIDR2, 4, VGIC_32_BIT), VGIC_REGISTER(GICD_PIDR2, 4, VGIC_32_BIT, gic_pidr2_read, gic_ignore_write), VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR2 + 4, GICD_SIZE, 4, VGIC_32_BIT), }; /* GICR_IGROUPR0 - RAZ/WI from non-secure mode */ /* GICR_ISENABLER0 */ static register_read redist_ienabler0_read; static register_write redist_isenabler0_write; /* GICR_ICENABLER0 */ static register_write redist_icenabler0_write; /* GICR_ISPENDR0 */ static register_read redist_ipendr0_read; static register_write redist_ispendr0_write; /* GICR_ICPENDR0 */ static register_write redist_icpendr0_write; /* GICR_ISACTIVER0 */ static register_read redist_iactiver0_read; static register_write redist_isactiver0_write; /* GICR_ICACTIVER0 */ static register_write redist_icactiver0_write; /* GICR_IPRIORITYR */ static register_read redist_ipriorityr_read; static register_write redist_ipriorityr_write; /* GICR_ICFGR0 - RAZ/WI from non-secure mode */ /* GICR_ICFGR1 */ static register_read redist_icfgr1_read; static register_write redist_icfgr1_write; /* GICR_IGRPMODR0 - RAZ/WI from non-secure mode */ /* GICR_NSCAR - RAZ/WI from non-secure mode */ static struct vgic_register redist_sgi_registers[] = { VGIC_REGISTER_RAZ_WI(GICR_IGROUPR0, 4, VGIC_32_BIT), VGIC_REGISTER(GICR_ISENABLER0, 4, VGIC_32_BIT, redist_ienabler0_read, redist_isenabler0_write), VGIC_REGISTER(GICR_ICENABLER0, 4, VGIC_32_BIT, redist_ienabler0_read, redist_icenabler0_write), VGIC_REGISTER(GICR_ISPENDR0, 4, VGIC_32_BIT, redist_ipendr0_read, redist_ispendr0_write), VGIC_REGISTER(GICR_ICPENDR0, 4, VGIC_32_BIT, redist_ipendr0_read, redist_icpendr0_write), VGIC_REGISTER(GICR_ISACTIVER0, 4, VGIC_32_BIT, redist_iactiver0_read, redist_isactiver0_write), VGIC_REGISTER(GICR_ICACTIVER0, 4, VGIC_32_BIT, redist_iactiver0_read, redist_icactiver0_write), VGIC_REGISTER_RANGE(GICR_IPRIORITYR(0), GICR_IPRIORITYR(32), 4, VGIC_32_BIT | VGIC_8_BIT, redist_ipriorityr_read, redist_ipriorityr_write), VGIC_REGISTER_RAZ_WI(GICR_ICFGR0, 4, VGIC_32_BIT), VGIC_REGISTER(GICR_ICFGR1, 4, VGIC_32_BIT, redist_icfgr1_read, redist_icfgr1_write), VGIC_REGISTER_RAZ_WI(GICR_IGRPMODR0, 4, VGIC_32_BIT), VGIC_REGISTER_RAZ_WI(GICR_NSACR, 4, VGIC_32_BIT), }; static struct vgic_v3_virt_features virt_features; static struct vgic_v3_irq *vgic_v3_get_irq(struct hyp *, int, uint32_t); static void 
vgic_v3_release_irq(struct vgic_v3_irq *); /* TODO: Move to a common file */ static int mpidr_to_vcpu(struct hyp *hyp, uint64_t mpidr) { struct vm *vm; struct hypctx *hypctx; vm = hyp->vm; for (int i = 0; i < vm_get_maxcpus(vm); i++) { hypctx = hyp->ctx[i]; if (hypctx != NULL && (hypctx->vmpidr_el2 & GICD_AFF) == mpidr) return (i); } return (-1); } static void vgic_v3_vminit(device_t dev, struct hyp *hyp) { struct vgic_v3 *vgic; hyp->vgic = malloc(sizeof(*hyp->vgic), M_VGIC_V3, M_WAITOK | M_ZERO); vgic = hyp->vgic; /* * Configure the Distributor control register. The register resets to an * architecturally UNKNOWN value, so we reset to 0 to disable all * functionality controlled by the register. * * The exception is GICD_CTLR.DS, which is RA0/WI when the Distributor * supports one security state (ARM GIC Architecture Specification for * GICv3 and GICv4, p. 4-464) */ vgic->gicd_ctlr = 0; mtx_init(&vgic->dist_mtx, "VGICv3 Distributor lock", NULL, MTX_SPIN); } static void vgic_v3_cpuinit(device_t dev, struct hypctx *hypctx) { struct vgic_v3_cpu *vgic_cpu; struct vgic_v3_irq *irq; int i, irqid; hypctx->vgic_cpu = malloc(sizeof(*hypctx->vgic_cpu), M_VGIC_V3, M_WAITOK | M_ZERO); vgic_cpu = hypctx->vgic_cpu; mtx_init(&vgic_cpu->lr_mtx, "VGICv3 ICH_LR_EL2 lock", NULL, MTX_SPIN); /* Set the SGI and PPI state */ for (irqid = 0; irqid < VGIC_PRV_I_NUM; irqid++) { irq = &vgic_cpu->private_irqs[irqid]; mtx_init(&irq->irq_spinmtx, "VGIC IRQ spinlock", NULL, MTX_SPIN); irq->irq = irqid; irq->mpidr = hypctx->vmpidr_el2 & GICD_AFF; irq->target_vcpu = vcpu_vcpuid(hypctx->vcpu); MPASS(irq->target_vcpu >= 0); if (irqid < VGIC_SGI_NUM) { /* SGIs */ irq->enabled = true; irq->config = VGIC_CONFIG_EDGE; } else { /* PPIs */ irq->config = VGIC_CONFIG_LEVEL; } irq->priority = 0; } /* * Configure the Interrupt Controller Hyp Control Register. * * ICH_HCR_EL2_En: enable virtual CPU interface. * * Maintenance interrupts are disabled. */ hypctx->vgic_v3_regs.ich_hcr_el2 = ICH_HCR_EL2_En; /* * Configure the Interrupt Controller Virtual Machine Control Register. * * ICH_VMCR_EL2_VPMR: lowest priority mask for the VCPU interface * ICH_VMCR_EL2_VBPR1_NO_PREEMPTION: disable interrupt preemption for * Group 1 interrupts * ICH_VMCR_EL2_VBPR0_NO_PREEMPTION: disable interrupt preemption for * Group 0 interrupts * ~ICH_VMCR_EL2_VEOIM: writes to EOI registers perform priority drop * and interrupt deactivation. * ICH_VMCR_EL2_VENG0: virtual Group 0 interrupts enabled. * ICH_VMCR_EL2_VENG1: virtual Group 1 interrupts enabled. 
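	 *
	 * The priority mask is seeded from virt_features.min_prio, the
	 * lowest priority the hardware supports as probed from
	 * ICH_VTR_EL2.PRIbits in vgic_v3_init().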
*/ hypctx->vgic_v3_regs.ich_vmcr_el2 = (virt_features.min_prio << ICH_VMCR_EL2_VPMR_SHIFT) | ICH_VMCR_EL2_VBPR1_NO_PREEMPTION | ICH_VMCR_EL2_VBPR0_NO_PREEMPTION; hypctx->vgic_v3_regs.ich_vmcr_el2 &= ~ICH_VMCR_EL2_VEOIM; hypctx->vgic_v3_regs.ich_vmcr_el2 |= ICH_VMCR_EL2_VENG0 | ICH_VMCR_EL2_VENG1; hypctx->vgic_v3_regs.ich_lr_num = virt_features.ich_lr_num; for (i = 0; i < hypctx->vgic_v3_regs.ich_lr_num; i++) hypctx->vgic_v3_regs.ich_lr_el2[i] = 0UL; vgic_cpu->ich_lr_used = 0; TAILQ_INIT(&vgic_cpu->irq_act_pend); hypctx->vgic_v3_regs.ich_apr_num = virt_features.ich_apr_num; } static void vgic_v3_cpucleanup(device_t dev, struct hypctx *hypctx) { struct vgic_v3_cpu *vgic_cpu; struct vgic_v3_irq *irq; int irqid; vgic_cpu = hypctx->vgic_cpu; for (irqid = 0; irqid < VGIC_PRV_I_NUM; irqid++) { irq = &vgic_cpu->private_irqs[irqid]; mtx_destroy(&irq->irq_spinmtx); } mtx_destroy(&vgic_cpu->lr_mtx); free(hypctx->vgic_cpu, M_VGIC_V3); } static void vgic_v3_vmcleanup(device_t dev, struct hyp *hyp) { mtx_destroy(&hyp->vgic->dist_mtx); free(hyp->vgic, M_VGIC_V3); } static int vgic_v3_max_cpu_count(device_t dev, struct hyp *hyp) { struct vgic_v3 *vgic; size_t count; int16_t max_count; vgic = hyp->vgic; max_count = vm_get_maxcpus(hyp->vm); /* No registers, assume the maximum CPUs */ if (vgic->redist_start == 0 && vgic->redist_end == 0) return (max_count); count = (vgic->redist_end - vgic->redist_start) / (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); /* * max_count is smaller than INT_MAX so will also limit count * to a positive integer value. */ if (count > max_count) return (max_count); return (count); } static bool vgic_v3_irq_pending(struct vgic_v3_irq *irq) { if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_LEVEL) { return (irq->pending || irq->level); } else { return (irq->pending); } } static bool vgic_v3_queue_irq(struct hyp *hyp, struct vgic_v3_cpu *vgic_cpu, int vcpuid, struct vgic_v3_irq *irq) { MPASS(vcpuid >= 0); MPASS(vcpuid < vm_get_maxcpus(hyp->vm)); mtx_assert(&vgic_cpu->lr_mtx, MA_OWNED); mtx_assert(&irq->irq_spinmtx, MA_OWNED); /* No need to queue the IRQ */ if (!irq->level && !irq->pending) return (false); if (!irq->on_aplist) { irq->on_aplist = true; TAILQ_INSERT_TAIL(&vgic_cpu->irq_act_pend, irq, act_pend_list); } return (true); } static uint64_t gic_reg_value_64(uint64_t field, uint64_t val, u_int offset, u_int size) { uint32_t mask; if (offset != 0 || size != 8) { mask = ((1ul << (size * 8)) - 1) << (offset * 8); /* Shift the new bits to the correct place */ val <<= (offset * 8); /* Keep only the interesting bits */ val &= mask; /* Add the bits we are keeping from the old value */ val |= field & ~mask; } return (val); } static void gic_pidr2_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { *rval = GICR_PIDR2_ARCH_GICv3 << GICR_PIDR2_ARCH_SHIFT; } /* Common read-only/write-ignored helpers */ static void gic_zero_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { *rval = 0; } static void gic_ignore_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { /* Nothing to do */ } static uint64_t read_enabler(struct hypctx *hypctx, int n) { struct vgic_v3_irq *irq; uint64_t ret; uint32_t irq_base; int i; ret = 0; irq_base = n * 32; for (i = 0; i < 32; i++) { irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irq_base + i); if (irq == NULL) continue; if (!irq->enabled) ret |= 1u << i; vgic_v3_release_irq(irq); } return (ret); } static void write_enabler(struct hypctx *hypctx,int n, bool set, uint64_t val) { 
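	/*
	 * Set/clear-enable semantics: bit i of register n maps to
	 * INTID n * 32 + i, so, for example, writing 0x4 to the n = 1
	 * register enables (or disables) INTID 34 only; zero bits are
	 * left untouched.
	 */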
struct vgic_v3_irq *irq; uint32_t irq_base; int i; irq_base = n * 32; for (i = 0; i < 32; i++) { /* We only change interrupts when the appropriate bit is set */ if ((val & (1u << i)) == 0) continue; /* Find the interrupt this bit represents */ irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irq_base + i); if (irq == NULL) continue; irq->enabled = set; vgic_v3_release_irq(irq); } } static uint64_t read_pendr(struct hypctx *hypctx, int n) { struct vgic_v3_irq *irq; uint64_t ret; uint32_t irq_base; int i; ret = 0; irq_base = n * 32; for (i = 0; i < 32; i++) { irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irq_base + i); if (irq == NULL) continue; if (vgic_v3_irq_pending(irq)) ret |= 1u << i; vgic_v3_release_irq(irq); } return (ret); } static uint64_t write_pendr(struct hypctx *hypctx, int n, bool set, uint64_t val) { struct vgic_v3_cpu *vgic_cpu; struct vgic_v3_irq *irq; struct hyp *hyp; struct hypctx *target_hypctx; uint64_t ret; uint32_t irq_base; int target_vcpu, i; bool notify; hyp = hypctx->hyp; ret = 0; irq_base = n * 32; for (i = 0; i < 32; i++) { /* We only change interrupts when the appropriate bit is set */ if ((val & (1u << i)) == 0) continue; irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irq_base + i); if (irq == NULL) continue; notify = false; target_vcpu = irq->target_vcpu; if (target_vcpu < 0) goto next_irq; target_hypctx = hyp->ctx[target_vcpu]; if (target_hypctx == NULL) goto next_irq; vgic_cpu = target_hypctx->vgic_cpu; if (!set) { /* pending -> not pending */ irq->pending = false; } else { irq->pending = true; mtx_lock_spin(&vgic_cpu->lr_mtx); notify = vgic_v3_queue_irq(hyp, vgic_cpu, target_vcpu, irq); mtx_unlock_spin(&vgic_cpu->lr_mtx); } next_irq: vgic_v3_release_irq(irq); if (notify) vcpu_notify_event(vm_vcpu(hyp->vm, target_vcpu)); } return (ret); } static uint64_t read_activer(struct hypctx *hypctx, int n) { struct vgic_v3_irq *irq; uint64_t ret; uint32_t irq_base; int i; ret = 0; irq_base = n * 32; for (i = 0; i < 32; i++) { irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irq_base + i); if (irq == NULL) continue; if (irq->active) ret |= 1u << i; vgic_v3_release_irq(irq); } return (ret); } static void write_activer(struct hypctx *hypctx, u_int n, bool set, uint64_t val) { struct vgic_v3_cpu *vgic_cpu; struct vgic_v3_irq *irq; struct hyp *hyp; struct hypctx *target_hypctx; uint32_t irq_base; int target_vcpu, i; bool notify; hyp = hypctx->hyp; irq_base = n * 32; for (i = 0; i < 32; i++) { /* We only change interrupts when the appropriate bit is set */ if ((val & (1u << i)) == 0) continue; irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irq_base + i); if (irq == NULL) continue; notify = false; target_vcpu = irq->target_vcpu; if (target_vcpu < 0) goto next_irq; target_hypctx = hyp->ctx[target_vcpu]; if (target_hypctx == NULL) goto next_irq; vgic_cpu = target_hypctx->vgic_cpu; if (!set) { /* active -> not active */ irq->active = false; } else { /* not active -> active */ irq->active = true; mtx_lock_spin(&vgic_cpu->lr_mtx); notify = vgic_v3_queue_irq(hyp, vgic_cpu, target_vcpu, irq); mtx_unlock_spin(&vgic_cpu->lr_mtx); } next_irq: vgic_v3_release_irq(irq); if (notify) vcpu_notify_event(vm_vcpu(hyp->vm, target_vcpu)); } } static uint64_t read_priorityr(struct hypctx *hypctx, int n) { struct vgic_v3_irq *irq; uint64_t ret; uint32_t irq_base; int i; ret = 0; irq_base = n * 4; for (i = 0; i < 4; i++) { irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irq_base + i); if (irq == NULL) continue; 
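		/*
		 * One byte per interrupt: byte i of IPRIORITYR register n
		 * holds the priority of INTID n * 4 + i.
		 */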
ret |= ((uint64_t)irq->priority) << (i * 8); vgic_v3_release_irq(irq); } return (ret); } static void write_priorityr(struct hypctx *hypctx, u_int irq_base, u_int size, uint64_t val) { struct vgic_v3_irq *irq; int i; for (i = 0; i < size; i++) { irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irq_base + i); if (irq == NULL) continue; /* Set the priority. We support 32 priority steps (5 bits) */ irq->priority = (val >> (i * 8)) & 0xf8; vgic_v3_release_irq(irq); } } static uint64_t read_config(struct hypctx *hypctx, int n) { struct vgic_v3_irq *irq; uint64_t ret; uint32_t irq_base; int i; ret = 0; irq_base = n * 16; for (i = 0; i < 16; i++) { irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irq_base + i); if (irq == NULL) continue; ret |= ((uint64_t)irq->config) << (i * 2); vgic_v3_release_irq(irq); } return (ret); } static void write_config(struct hypctx *hypctx, int n, uint64_t val) { struct vgic_v3_irq *irq; uint32_t irq_base; int i; irq_base = n * 16; for (i = 0; i < 16; i++) { /* * The config can't be changed for SGIs and PPIs. SGIs have * an edge-triggered behaviour, and the register is * implementation defined to be read-only for PPIs. */ if (irq_base + i < VGIC_PRV_I_NUM) continue; irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irq_base + i); if (irq == NULL) continue; /* Bit 0 is RES0 */ irq->config = (val >> (i * 2)) & VGIC_CONFIG_MASK; vgic_v3_release_irq(irq); } } static uint64_t read_route(struct hypctx *hypctx, int n) { struct vgic_v3_irq *irq; uint64_t mpidr; irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), n); if (irq == NULL) return (0); mpidr = irq->mpidr; vgic_v3_release_irq(irq); return (mpidr); } static void write_route(struct hypctx *hypctx, int n, uint64_t val, u_int offset, u_int size) { struct vgic_v3_irq *irq; irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), n); if (irq == NULL) return; irq->mpidr = gic_reg_value_64(irq->mpidr, val, offset, size) & GICD_AFF; irq->target_vcpu = mpidr_to_vcpu(hypctx->hyp, irq->mpidr); /* * If the interrupt is pending we can either use the old mpidr, or * the new mpidr. To simplify this code we use the old value so we * don't need to move the interrupt until the next time it is * moved to the pending state. */ vgic_v3_release_irq(irq); } /* * Distributor register handlers. */ /* GICD_CTLR */ static void dist_ctlr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { struct hyp *hyp; struct vgic_v3 *vgic; hyp = hypctx->hyp; vgic = hyp->vgic; mtx_lock_spin(&vgic->dist_mtx); *rval = vgic->gicd_ctlr; mtx_unlock_spin(&vgic->dist_mtx); /* Writes are never pending */ *rval &= ~GICD_CTLR_RWP; } static void dist_ctlr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { struct vgic_v3 *vgic; MPASS(offset == 0); MPASS(size == 4); vgic = hypctx->hyp->vgic; /* * GICv2 backwards compatibility is not implemented so * ARE_NS is RAO/WI. This means EnableGrp1 is RES0. * * EnableGrp1A is supported, and RWP is read-only. * * All other bits are RES0 from non-secure mode as we * implement as if we are in a system with two security * states. 
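	 *
	 * In practice the masking below keeps only the guest's EnableGrp1A
	 * bit and forces ARE_NS to read back as one.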
*/ wval &= GICD_CTLR_G1A; wval |= GICD_CTLR_ARE_NS; mtx_lock_spin(&vgic->dist_mtx); vgic->gicd_ctlr = wval; /* TODO: Wake any vcpus that have interrupts pending */ mtx_unlock_spin(&vgic->dist_mtx); } /* GICD_TYPER */ static void dist_typer_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { uint32_t typer; typer = (10 - 1) << GICD_TYPER_IDBITS_SHIFT; typer |= GICD_TYPER_MBIS; /* ITLinesNumber: */ typer |= howmany(VGIC_NIRQS + 1, 32) - 1; *rval = typer; } /* GICD_IIDR */ static void dist_iidr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { *rval = VGIC_IIDR; } /* GICD_SETSPI_NSR & GICD_CLRSPI_NSR */ static void dist_setclrspi_nsr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { uint32_t irqid; MPASS(offset == 0); MPASS(size == 4); irqid = wval & GICD_SPI_INTID_MASK; INJECT_IRQ(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irqid, reg == GICD_SETSPI_NSR); } /* GICD_ISENABLER */ static void dist_isenabler_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { int n; n = (reg - GICD_ISENABLER(0)) / 4; /* GICD_ISENABLER0 is RAZ/WI so handled separately */ MPASS(n > 0); *rval = read_enabler(hypctx, n); } static void dist_isenabler_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { int n; MPASS(offset == 0); MPASS(size == 4); n = (reg - GICD_ISENABLER(0)) / 4; /* GICD_ISENABLER0 is RAZ/WI so handled separately */ MPASS(n > 0); write_enabler(hypctx, n, true, wval); } /* GICD_ICENABLER */ static void dist_icenabler_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { int n; n = (reg - GICD_ICENABLER(0)) / 4; /* GICD_ICENABLER0 is RAZ/WI so handled separately */ MPASS(n > 0); *rval = read_enabler(hypctx, n); } static void dist_icenabler_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { int n; MPASS(offset == 0); MPASS(size == 4); n = (reg - GICD_ISENABLER(0)) / 4; /* GICD_ICENABLER0 is RAZ/WI so handled separately */ MPASS(n > 0); write_enabler(hypctx, n, false, wval); } /* GICD_ISPENDR */ static void dist_ispendr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { int n; n = (reg - GICD_ISPENDR(0)) / 4; /* GICD_ISPENDR0 is RAZ/WI so handled separately */ MPASS(n > 0); *rval = read_pendr(hypctx, n); } static void dist_ispendr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { int n; MPASS(offset == 0); MPASS(size == 4); n = (reg - GICD_ISPENDR(0)) / 4; /* GICD_ISPENDR0 is RAZ/WI so handled separately */ MPASS(n > 0); write_pendr(hypctx, n, true, wval); } /* GICD_ICPENDR */ static void dist_icpendr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { int n; n = (reg - GICD_ICPENDR(0)) / 4; /* GICD_ICPENDR0 is RAZ/WI so handled separately */ MPASS(n > 0); *rval = read_pendr(hypctx, n); } static void dist_icpendr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { int n; MPASS(offset == 0); MPASS(size == 4); n = (reg - GICD_ICPENDR(0)) / 4; /* GICD_ICPENDR0 is RAZ/WI so handled separately */ MPASS(n > 0); write_pendr(hypctx, n, false, wval); } /* GICD_ISACTIVER */ /* Affinity routing is enabled so isactiver0 is RAZ/WI */ static void dist_isactiver_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { int n; n = (reg - GICD_ISACTIVER(0)) / 4; /* GICD_ISACTIVER0 is RAZ/WI so handled separately */ MPASS(n > 0); *rval = read_activer(hypctx, n); } static void dist_isactiver_write(struct hypctx 
*hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { int n; MPASS(offset == 0); MPASS(size == 4); n = (reg - GICD_ISACTIVER(0)) / 4; /* GICD_ISACTIVE0 is RAZ/WI so handled separately */ MPASS(n > 0); write_activer(hypctx, n, true, wval); } /* GICD_ICACTIVER */ static void dist_icactiver_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { int n; n = (reg - GICD_ICACTIVER(0)) / 4; /* GICD_ICACTIVE0 is RAZ/WI so handled separately */ MPASS(n > 0); *rval = read_activer(hypctx, n); } static void dist_icactiver_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { int n; MPASS(offset == 0); MPASS(size == 4); n = (reg - GICD_ICACTIVER(0)) / 4; /* GICD_ICACTIVE0 is RAZ/WI so handled separately */ MPASS(n > 0); write_activer(hypctx, n, false, wval); } /* GICD_IPRIORITYR */ /* Affinity routing is enabled so ipriorityr0-7 is RAZ/WI */ static void dist_ipriorityr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { int n; n = (reg - GICD_IPRIORITYR(0)) / 4; /* GICD_IPRIORITY0-7 is RAZ/WI so handled separately */ MPASS(n > 7); *rval = read_priorityr(hypctx, n); } static void dist_ipriorityr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { u_int irq_base; irq_base = (reg - GICD_IPRIORITYR(0)) + offset; /* GICD_IPRIORITY0-7 is RAZ/WI so handled separately */ MPASS(irq_base > 31); write_priorityr(hypctx, irq_base, size, wval); } /* GICD_ICFGR */ static void dist_icfgr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { int n; n = (reg - GICD_ICFGR(0)) / 4; /* GICD_ICFGR0-1 are RAZ/WI so handled separately */ MPASS(n > 1); *rval = read_config(hypctx, n); } static void dist_icfgr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { int n; MPASS(offset == 0); MPASS(size == 4); n = (reg - GICD_ICFGR(0)) / 4; /* GICD_ICFGR0-1 are RAZ/WI so handled separately */ MPASS(n > 1); write_config(hypctx, n, wval); } /* GICD_IROUTER */ static void dist_irouter_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { int n; n = (reg - GICD_IROUTER(0)) / 8; /* GICD_IROUTER0-31 don't exist */ MPASS(n > 31); *rval = read_route(hypctx, n); } static void dist_irouter_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { int n; n = (reg - GICD_IROUTER(0)) / 8; /* GICD_IROUTER0-31 don't exist */ MPASS(n > 31); write_route(hypctx, n, wval, offset, size); } static bool vgic_register_read(struct hypctx *hypctx, struct vgic_register *reg_list, u_int reg_list_size, u_int reg, u_int size, uint64_t *rval, void *arg) { u_int i, offset; for (i = 0; i < reg_list_size; i++) { if (reg_list[i].start <= reg && reg_list[i].end >= reg + size) { offset = reg & (reg_list[i].size - 1); reg -= offset; if ((reg_list[i].flags & size) != 0) { reg_list[i].read(hypctx, reg, rval, NULL); /* Move the bits into the correct place */ *rval >>= (offset * 8); if (size < 8) { *rval &= (1ul << (size * 8)) - 1; } } else { /* * The access is an invalid size. Section * 12.1.3 "GIC memory-mapped register access" * of the GICv3 and GICv4 spec issue H * (IHI0069) lists the options. For a read * the controller returns unknown data, in * this case it is zero. 
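				 *
				 * Note that the (flags & size) test above works
				 * because the access size in bytes doubles as
				 * the flag bit: 1, 4 and 8 byte accesses test
				 * VGIC_8_BIT, VGIC_32_BIT and VGIC_64_BIT
				 * respectively, while a 2 byte access never
				 * matches as (1 << 1) is reserved and unused.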
*/ *rval = 0; } return (true); } } return (false); } static bool vgic_register_write(struct hypctx *hypctx, struct vgic_register *reg_list, u_int reg_list_size, u_int reg, u_int size, uint64_t wval, void *arg) { u_int i, offset; for (i = 0; i < reg_list_size; i++) { if (reg_list[i].start <= reg && reg_list[i].end >= reg + size) { offset = reg & (reg_list[i].size - 1); reg -= offset; if ((reg_list[i].flags & size) != 0) { reg_list[i].write(hypctx, reg, offset, size, wval, NULL); } else { /* * See the comment in vgic_register_read. * For writes the controller ignores the * operation. */ } return (true); } } return (false); } static int dist_read(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t *rval, int size, void *arg) { struct hyp *hyp; struct hypctx *hypctx; struct vgic_v3 *vgic; uint64_t reg; hypctx = vcpu_get_cookie(vcpu); hyp = hypctx->hyp; vgic = hyp->vgic; /* Check the register is one of ours and is the correct size */ if (fault_ipa < vgic->dist_start || fault_ipa + size > vgic->dist_end) { return (EINVAL); } reg = fault_ipa - vgic->dist_start; /* * As described in vgic_register_read an access with an invalid * alignment is read with an unknown value */ if ((reg & (size - 1)) != 0) { *rval = 0; return (0); } if (vgic_register_read(hypctx, dist_registers, nitems(dist_registers), reg, size, rval, NULL)) return (0); /* Reserved register addresses are RES0 so we can hardware it to 0 */ *rval = 0; return (0); } static int dist_write(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t wval, int size, void *arg) { struct hyp *hyp; struct hypctx *hypctx; struct vgic_v3 *vgic; uint64_t reg; hypctx = vcpu_get_cookie(vcpu); hyp = hypctx->hyp; vgic = hyp->vgic; /* Check the register is one of ours and is the correct size */ if (fault_ipa < vgic->dist_start || fault_ipa + size > vgic->dist_end) { return (EINVAL); } reg = fault_ipa - vgic->dist_start; /* * As described in vgic_register_read an access with an invalid * alignment is write ignored. */ if ((reg & (size - 1)) != 0) return (0); if (vgic_register_write(hypctx, dist_registers, nitems(dist_registers), reg, size, wval, NULL)) return (0); /* Reserved register addresses are RES0 so we can ignore the write */ return (0); } /* * Redistributor register handlers. * * RD_base: */ /* GICR_CTLR */ static void redist_ctlr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { /* LPIs not supported */ *rval = 0; } /* GICR_IIDR */ static void redist_iidr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { *rval = VGIC_IIDR; } /* GICR_TYPER */ static void redist_typer_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { uint64_t aff, gicr_typer, vmpidr_el2; bool last_vcpu; last_vcpu = false; if (vcpu_vcpuid(hypctx->vcpu) == (vgic_max_cpu_count(hypctx->hyp) - 1)) last_vcpu = true; vmpidr_el2 = hypctx->vmpidr_el2; MPASS(vmpidr_el2 != 0); /* * Get affinity for the current CPU. The guest CPU affinity is taken * from VMPIDR_EL2. The Redistributor corresponding to this CPU is * the Redistributor with the same affinity from GICR_TYPER. */ aff = (CPU_AFF3(vmpidr_el2) << 24) | (CPU_AFF2(vmpidr_el2) << 16) | (CPU_AFF1(vmpidr_el2) << 8) | CPU_AFF0(vmpidr_el2); /* Set up GICR_TYPER. 
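	 *
	 * The affinity reported here comes from VMPIDR_EL2, the same value
	 * mpidr_to_vcpu() compares against when the guest later routes
	 * SPIs with GICD_IROUTER.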
*/ gicr_typer = aff << GICR_TYPER_AFF_SHIFT; /* Set the vcpu as the processsor ID */ gicr_typer |= (uint64_t)vcpu_vcpuid(hypctx->vcpu) << GICR_TYPER_CPUNUM_SHIFT; if (last_vcpu) /* Mark the last Redistributor */ gicr_typer |= GICR_TYPER_LAST; *rval = gicr_typer; } /* * SGI_base: */ /* GICR_ISENABLER0 */ static void redist_ienabler0_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { *rval = read_enabler(hypctx, 0); } static void redist_isenabler0_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { MPASS(offset == 0); MPASS(size == 4); write_enabler(hypctx, 0, true, wval); } /* GICR_ICENABLER0 */ static void redist_icenabler0_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { MPASS(offset == 0); MPASS(size == 4); write_enabler(hypctx, 0, false, wval); } /* GICR_ISPENDR0 */ static void redist_ipendr0_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { *rval = read_pendr(hypctx, 0); } static void redist_ispendr0_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { MPASS(offset == 0); MPASS(size == 4); write_pendr(hypctx, 0, true, wval); } /* GICR_ICPENDR0 */ static void redist_icpendr0_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { MPASS(offset == 0); MPASS(size == 4); write_pendr(hypctx, 0, false, wval); } /* GICR_ISACTIVER0 */ static void redist_iactiver0_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { *rval = read_activer(hypctx, 0); } static void redist_isactiver0_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { write_activer(hypctx, 0, true, wval); } /* GICR_ICACTIVER0 */ static void redist_icactiver0_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { write_activer(hypctx, 0, false, wval); } /* GICR_IPRIORITYR */ static void redist_ipriorityr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { int n; n = (reg - GICR_IPRIORITYR(0)) / 4; *rval = read_priorityr(hypctx, n); } static void redist_ipriorityr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { u_int irq_base; irq_base = (reg - GICR_IPRIORITYR(0)) + offset; write_priorityr(hypctx, irq_base, size, wval); } /* GICR_ICFGR1 */ static void redist_icfgr1_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) { *rval = read_config(hypctx, 1); } static void redist_icfgr1_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, uint64_t wval, void *arg) { MPASS(offset == 0); MPASS(size == 4); write_config(hypctx, 1, wval); } static int redist_read(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t *rval, int size, void *arg) { struct hyp *hyp; struct hypctx *hypctx, *target_hypctx; struct vgic_v3 *vgic; uint64_t reg; int vcpuid; /* Find the current vcpu ctx to get the vgic struct */ hypctx = vcpu_get_cookie(vcpu); hyp = hypctx->hyp; vgic = hyp->vgic; /* Check the register is one of ours and is the correct size */ if (fault_ipa < vgic->redist_start || fault_ipa + size > vgic->redist_end) { return (EINVAL); } vcpuid = (fault_ipa - vgic->redist_start) / (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); if (vcpuid >= vm_get_maxcpus(hyp->vm)) { /* * This should never happen, but lets be defensive so if it * does we don't panic a non-INVARIANTS kernel. 
*/ #ifdef INVARIANTS panic("%s: Invalid vcpuid %d", __func__, vcpuid); #else *rval = 0; return (0); #endif } /* Find the target vcpu ctx for the access */ target_hypctx = hyp->ctx[vcpuid]; if (target_hypctx == NULL) { /* * The CPU has not yet started. The redistributor and CPU are * in the same power domain. As such the redistributor will * also be powered down so any access will raise an external * abort. */ raise_data_insn_abort(hypctx, fault_ipa, true, ISS_DATA_DFSC_EXT); return (0); } reg = (fault_ipa - vgic->redist_start) % (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); /* * As described in vgic_register_read an access with an invalid * alignment is read with an unknown value */ if ((reg & (size - 1)) != 0) { *rval = 0; return (0); } if (reg < GICR_RD_BASE_SIZE) { if (vgic_register_read(target_hypctx, redist_rd_registers, nitems(redist_rd_registers), reg, size, rval, NULL)) return (0); } else if (reg < (GICR_SGI_BASE + GICR_SGI_BASE_SIZE)) { if (vgic_register_read(target_hypctx, redist_sgi_registers, nitems(redist_sgi_registers), reg - GICR_SGI_BASE, size, rval, NULL)) return (0); } /* Reserved register addresses are RES0 so we can hardware it to 0 */ *rval = 0; return (0); } static int redist_write(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t wval, int size, void *arg) { struct hyp *hyp; struct hypctx *hypctx, *target_hypctx; struct vgic_v3 *vgic; uint64_t reg; int vcpuid; /* Find the current vcpu ctx to get the vgic struct */ hypctx = vcpu_get_cookie(vcpu); hyp = hypctx->hyp; vgic = hyp->vgic; /* Check the register is one of ours and is the correct size */ if (fault_ipa < vgic->redist_start || fault_ipa + size > vgic->redist_end) { return (EINVAL); } vcpuid = (fault_ipa - vgic->redist_start) / (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); if (vcpuid >= vm_get_maxcpus(hyp->vm)) { /* * This should never happen, but lets be defensive so if it * does we don't panic a non-INVARIANTS kernel. */ #ifdef INVARIANTS panic("%s: Invalid vcpuid %d", __func__, vcpuid); #else return (0); #endif } /* Find the target vcpu ctx for the access */ target_hypctx = hyp->ctx[vcpuid]; if (target_hypctx == NULL) { /* * The CPU has not yet started. The redistributor and CPU are * in the same power domain. As such the redistributor will * also be powered down so any access will raise an external * abort. */ raise_data_insn_abort(hypctx, fault_ipa, true, ISS_DATA_DFSC_EXT); return (0); } reg = (fault_ipa - vgic->redist_start) % (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); /* * As described in vgic_register_read an access with an invalid * alignment is write ignored. */ if ((reg & (size - 1)) != 0) return (0); if (reg < GICR_RD_BASE_SIZE) { if (vgic_register_write(target_hypctx, redist_rd_registers, nitems(redist_rd_registers), reg, size, wval, NULL)) return (0); } else if (reg < (GICR_SGI_BASE + GICR_SGI_BASE_SIZE)) { if (vgic_register_write(target_hypctx, redist_sgi_registers, nitems(redist_sgi_registers), reg - GICR_SGI_BASE, size, wval, NULL)) return (0); } /* Reserved register addresses are RES0 so we can ignore the write */ return (0); } static int vgic_v3_icc_sgi1r_read(struct vcpu *vcpu, uint64_t *rval, void *arg) { /* * TODO: Inject an unknown exception. 
*/ *rval = 0; return (0); } static int vgic_v3_icc_sgi1r_write(struct vcpu *vcpu, uint64_t rval, void *arg) { struct vm *vm; struct hyp *hyp; cpuset_t active_cpus; uint64_t mpidr, aff1, aff2, aff3; uint32_t irqid; int cpus, cpu_off, target_vcpuid, vcpuid; vm = vcpu_vm(vcpu); hyp = vm_get_cookie(vm); active_cpus = vm_active_cpus(vm); vcpuid = vcpu_vcpuid(vcpu); irqid = ICC_SGI1R_EL1_SGIID_VAL(rval) >> ICC_SGI1R_EL1_SGIID_SHIFT; if ((rval & ICC_SGI1R_EL1_IRM) == 0) { /* Non-zero points at no vcpus */ if (ICC_SGI1R_EL1_RS_VAL(rval) != 0) return (0); aff1 = ICC_SGI1R_EL1_AFF1_VAL(rval) >> ICC_SGI1R_EL1_AFF1_SHIFT; aff2 = ICC_SGI1R_EL1_AFF2_VAL(rval) >> ICC_SGI1R_EL1_AFF2_SHIFT; aff3 = ICC_SGI1R_EL1_AFF3_VAL(rval) >> ICC_SGI1R_EL1_AFF3_SHIFT; mpidr = aff3 << MPIDR_AFF3_SHIFT | aff2 << MPIDR_AFF2_SHIFT | aff1 << MPIDR_AFF1_SHIFT; cpus = ICC_SGI1R_EL1_TL_VAL(rval) >> ICC_SGI1R_EL1_TL_SHIFT; cpu_off = 0; while (cpus > 0) { if (cpus & 1) { target_vcpuid = mpidr_to_vcpu(hyp, mpidr | (cpu_off << MPIDR_AFF0_SHIFT)); if (target_vcpuid >= 0 && CPU_ISSET(target_vcpuid, &active_cpus)) { INJECT_IRQ(hyp, target_vcpuid, irqid, true); } } cpu_off++; cpus >>= 1; } } else { /* Send an IPI to all CPUs other than the current CPU */ for (target_vcpuid = 0; target_vcpuid < vm_get_maxcpus(vm); target_vcpuid++) { if (CPU_ISSET(target_vcpuid, &active_cpus) && target_vcpuid != vcpuid) { INJECT_IRQ(hyp, target_vcpuid, irqid, true); } } } return (0); } static void vgic_v3_mmio_init(struct hyp *hyp) { struct vgic_v3 *vgic; struct vgic_v3_irq *irq; int i; /* Allocate memory for the SPIs */ vgic = hyp->vgic; vgic->irqs = malloc((VGIC_NIRQS - VGIC_PRV_I_NUM) * sizeof(*vgic->irqs), M_VGIC_V3, M_WAITOK | M_ZERO); for (i = 0; i < VGIC_NIRQS - VGIC_PRV_I_NUM; i++) { irq = &vgic->irqs[i]; mtx_init(&irq->irq_spinmtx, "VGIC IRQ spinlock", NULL, MTX_SPIN); irq->irq = i + VGIC_PRV_I_NUM; } } static void vgic_v3_mmio_destroy(struct hyp *hyp) { struct vgic_v3 *vgic; struct vgic_v3_irq *irq; int i; vgic = hyp->vgic; for (i = 0; i < VGIC_NIRQS - VGIC_PRV_I_NUM; i++) { irq = &vgic->irqs[i]; mtx_destroy(&irq->irq_spinmtx); } free(vgic->irqs, M_VGIC_V3); } static int vgic_v3_attach_to_vm(device_t dev, struct hyp *hyp, struct vm_vgic_descr *descr) { struct vm *vm; struct vgic_v3 *vgic; size_t cpu_count; if (descr->ver.version != 3) return (EINVAL); /* * The register bases need to be 64k aligned * The redist register space is the RD + SGI size */ if (!__is_aligned(descr->v3_regs.dist_start, PAGE_SIZE_64K) || !__is_aligned(descr->v3_regs.redist_start, PAGE_SIZE_64K) || !__is_aligned(descr->v3_regs.redist_size, GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE)) return (EINVAL); /* The dist register space is 1 64k block */ if (descr->v3_regs.dist_size != PAGE_SIZE_64K) return (EINVAL); vm = hyp->vm; /* * Return an error if the redist space is too large for the maximum * number of CPUs we support. */ cpu_count = descr->v3_regs.redist_size / (GICR_RD_BASE_SIZE + GICR_SGI_BASE_SIZE); if (cpu_count > vm_get_maxcpus(vm)) return (EINVAL); vgic = hyp->vgic; /* Set the distributor address and size for trapping guest access. 
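	 *
	 * Guest accesses that fall inside these windows are steered to
	 * dist_read()/dist_write() and redist_read()/redist_write() by the
	 * instruction handlers registered below; ICC_SGI1R_EL1 writes are
	 * trapped separately via the register handler.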
*/ vgic->dist_start = descr->v3_regs.dist_start; vgic->dist_end = descr->v3_regs.dist_start + descr->v3_regs.dist_size; vgic->redist_start = descr->v3_regs.redist_start; vgic->redist_end = descr->v3_regs.redist_start + descr->v3_regs.redist_size; vm_register_inst_handler(vm, descr->v3_regs.dist_start, descr->v3_regs.dist_size, dist_read, dist_write); vm_register_inst_handler(vm, descr->v3_regs.redist_start, descr->v3_regs.redist_size, redist_read, redist_write); vm_register_reg_handler(vm, ISS_MSR_REG(ICC_SGI1R_EL1), ISS_MSR_REG_MASK, vgic_v3_icc_sgi1r_read, vgic_v3_icc_sgi1r_write, NULL); vgic_v3_mmio_init(hyp); hyp->vgic_attached = true; return (0); } static void vgic_v3_detach_from_vm(device_t dev, struct hyp *hyp) { if (hyp->vgic_attached) { hyp->vgic_attached = false; vgic_v3_mmio_destroy(hyp); } } static struct vgic_v3_irq * vgic_v3_get_irq(struct hyp *hyp, int vcpuid, uint32_t irqid) { struct vgic_v3_cpu *vgic_cpu; struct vgic_v3_irq *irq; struct hypctx *hypctx; if (irqid < VGIC_PRV_I_NUM) { if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(hyp->vm)) return (NULL); hypctx = hyp->ctx[vcpuid]; if (hypctx == NULL) return (NULL); vgic_cpu = hypctx->vgic_cpu; irq = &vgic_cpu->private_irqs[irqid]; } else if (irqid <= GIC_LAST_SPI) { irqid -= VGIC_PRV_I_NUM; if (irqid >= VGIC_NIRQS) return (NULL); irq = &hyp->vgic->irqs[irqid]; } else if (irqid < GIC_FIRST_LPI) { return (NULL); } else { /* No support for LPIs */ return (NULL); } mtx_lock_spin(&irq->irq_spinmtx); return (irq); } static void vgic_v3_release_irq(struct vgic_v3_irq *irq) { mtx_unlock_spin(&irq->irq_spinmtx); } static bool vgic_v3_has_pending_irq(device_t dev, struct hypctx *hypctx) { struct vgic_v3_cpu *vgic_cpu; bool empty; vgic_cpu = hypctx->vgic_cpu; mtx_lock_spin(&vgic_cpu->lr_mtx); empty = TAILQ_EMPTY(&vgic_cpu->irq_act_pend); mtx_unlock_spin(&vgic_cpu->lr_mtx); return (!empty); } static bool vgic_v3_check_irq(struct vgic_v3_irq *irq, bool level) { /* * Only inject if: * - Level-triggered IRQ: level changes low -> high * - Edge-triggered IRQ: level is high */ switch (irq->config & VGIC_CONFIG_MASK) { case VGIC_CONFIG_LEVEL: return (level != irq->level); case VGIC_CONFIG_EDGE: return (level); default: break; } return (false); } static int vgic_v3_inject_irq(device_t dev, struct hyp *hyp, int vcpuid, uint32_t irqid, bool level) { struct vgic_v3_cpu *vgic_cpu; struct vgic_v3_irq *irq; struct hypctx *hypctx; int target_vcpu; bool notify; if (!hyp->vgic_attached) return (ENODEV); KASSERT(vcpuid == -1 || irqid < VGIC_PRV_I_NUM, ("%s: SPI/LPI with vcpuid set: irq %u vcpuid %u", __func__, irqid, vcpuid)); irq = vgic_v3_get_irq(hyp, vcpuid, irqid); if (irq == NULL) { eprintf("Malformed IRQ %u.\n", irqid); return (EINVAL); } target_vcpu = irq->target_vcpu; KASSERT(vcpuid == -1 || vcpuid == target_vcpu, ("%s: Interrupt %u has bad cpu affinity: vcpu %d target vcpu %d", __func__, irqid, vcpuid, target_vcpu)); KASSERT(target_vcpu >= 0 && target_vcpu < vm_get_maxcpus(hyp->vm), ("%s: Interrupt %u sent to invalid vcpu %d", __func__, irqid, target_vcpu)); if (vcpuid == -1) vcpuid = target_vcpu; /* TODO: Check from 0 to vm->maxcpus */ if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(hyp->vm)) { vgic_v3_release_irq(irq); return (EINVAL); } hypctx = hyp->ctx[vcpuid]; if (hypctx == NULL) { vgic_v3_release_irq(irq); return (EINVAL); } notify = false; vgic_cpu = hypctx->vgic_cpu; mtx_lock_spin(&vgic_cpu->lr_mtx); if (!vgic_v3_check_irq(irq, level)) { goto out; } if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_LEVEL) irq->level = level; else /* 
VGIC_CONFIG_EDGE */ irq->pending = true; notify = vgic_v3_queue_irq(hyp, vgic_cpu, vcpuid, irq); out: mtx_unlock_spin(&vgic_cpu->lr_mtx); vgic_v3_release_irq(irq); if (notify) vcpu_notify_event(vm_vcpu(hyp->vm, vcpuid)); return (0); } static int vgic_v3_inject_msi(device_t dev, struct hyp *hyp, uint64_t msg, uint64_t addr) { struct vgic_v3 *vgic; uint64_t reg; vgic = hyp->vgic; /* This is a 4 byte register */ if (addr < vgic->dist_start || addr + 4 > vgic->dist_end) { return (EINVAL); } reg = addr - vgic->dist_start; if (reg != GICD_SETSPI_NSR) return (EINVAL); return (INJECT_IRQ(hyp, -1, msg, true)); } static void vgic_v3_flush_hwstate(device_t dev, struct hypctx *hypctx) { struct vgic_v3_cpu *vgic_cpu; struct vgic_v3_irq *irq; int i; vgic_cpu = hypctx->vgic_cpu; /* * All Distributor writes have been executed at this point, do not * protect Distributor reads with a mutex. * * This is callled with all interrupts disabled, so there is no need for * a List Register spinlock either. */ mtx_lock_spin(&vgic_cpu->lr_mtx); hypctx->vgic_v3_regs.ich_hcr_el2 &= ~ICH_HCR_EL2_UIE; /* Exit early if there are no buffered interrupts */ if (TAILQ_EMPTY(&vgic_cpu->irq_act_pend)) goto out; KASSERT(vgic_cpu->ich_lr_used == 0, ("%s: Used LR count not zero %u", __func__, vgic_cpu->ich_lr_used)); i = 0; hypctx->vgic_v3_regs.ich_elrsr_el2 = (1u << hypctx->vgic_v3_regs.ich_lr_num) - 1; TAILQ_FOREACH(irq, &vgic_cpu->irq_act_pend, act_pend_list) { /* No free list register, stop searching for IRQs */ if (i == hypctx->vgic_v3_regs.ich_lr_num) break; if (!irq->enabled) continue; hypctx->vgic_v3_regs.ich_lr_el2[i] = ICH_LR_EL2_GROUP1 | ((uint64_t)irq->priority << ICH_LR_EL2_PRIO_SHIFT) | irq->irq; if (irq->active) { hypctx->vgic_v3_regs.ich_lr_el2[i] |= ICH_LR_EL2_STATE_ACTIVE; } #ifdef notyet /* TODO: Check why this is needed */ if ((irq->config & _MASK) == LEVEL) hypctx->vgic_v3_regs.ich_lr_el2[i] |= ICH_LR_EL2_EOI; #endif if (!irq->active && vgic_v3_irq_pending(irq)) { hypctx->vgic_v3_regs.ich_lr_el2[i] |= ICH_LR_EL2_STATE_PENDING; /* * This IRQ is now pending on the guest. Allow for * another edge that could cause the interrupt to * be raised again. */ if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_EDGE) { irq->pending = false; } } i++; } vgic_cpu->ich_lr_used = i; out: mtx_unlock_spin(&vgic_cpu->lr_mtx); } static void vgic_v3_sync_hwstate(device_t dev, struct hypctx *hypctx) { struct vgic_v3_cpu *vgic_cpu; struct vgic_v3_irq *irq; uint64_t lr; int i; vgic_cpu = hypctx->vgic_cpu; /* Exit early if there are no buffered interrupts */ if (vgic_cpu->ich_lr_used == 0) return; /* * Check on the IRQ state after running the guest. ich_lr_used and * ich_lr_el2 are only ever used within this thread so is safe to * access unlocked. */ for (i = 0; i < vgic_cpu->ich_lr_used; i++) { lr = hypctx->vgic_v3_regs.ich_lr_el2[i]; hypctx->vgic_v3_regs.ich_lr_el2[i] = 0; irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), ICH_LR_EL2_VINTID(lr)); if (irq == NULL) continue; irq->active = (lr & ICH_LR_EL2_STATE_ACTIVE) != 0; if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_EDGE) { /* * If we have an edge triggered IRQ preserve the * pending bit until the IRQ has been handled. */ if ((lr & ICH_LR_EL2_STATE_PENDING) != 0) { irq->pending = true; } } else { /* * If we have a level triggerend IRQ remove the * pending bit if the IRQ has been handled. * The level is separate, so may still be high * triggering another IRQ. 
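			 * If the line is still asserted the IRQ stays on
			 * irq_act_pend, and vgic_v3_flush_hwstate() will load
			 * it into a List Register again on the next entry.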
*/ if ((lr & ICH_LR_EL2_STATE_PENDING) == 0) { irq->pending = false; } } /* Lock to update irq_act_pend */ mtx_lock_spin(&vgic_cpu->lr_mtx); if (irq->active) { /* Ensure the active IRQ is at the head of the list */ TAILQ_REMOVE(&vgic_cpu->irq_act_pend, irq, act_pend_list); TAILQ_INSERT_HEAD(&vgic_cpu->irq_act_pend, irq, act_pend_list); } else if (!vgic_v3_irq_pending(irq)) { /* If pending or active remove from the list */ TAILQ_REMOVE(&vgic_cpu->irq_act_pend, irq, act_pend_list); irq->on_aplist = false; } mtx_unlock_spin(&vgic_cpu->lr_mtx); vgic_v3_release_irq(irq); } hypctx->vgic_v3_regs.ich_hcr_el2 &= ~ICH_HCR_EL2_EOICOUNT_MASK; vgic_cpu->ich_lr_used = 0; } static void vgic_v3_init(device_t dev) { uint64_t ich_vtr_el2; uint32_t pribits, prebits; ich_vtr_el2 = vmm_read_reg(HYP_REG_ICH_VTR); /* TODO: These fields are common with the vgicv2 driver */ pribits = ICH_VTR_EL2_PRIBITS(ich_vtr_el2); switch (pribits) { default: case 5: virt_features.min_prio = 0xf8; break; case 6: virt_features.min_prio = 0xfc; break; case 7: virt_features.min_prio = 0xfe; break; case 8: virt_features.min_prio = 0xff; break; } prebits = ICH_VTR_EL2_PREBITS(ich_vtr_el2); switch (prebits) { default: case 5: virt_features.ich_apr_num = 1; break; case 6: virt_features.ich_apr_num = 2; break; case 7: virt_features.ich_apr_num = 4; break; } virt_features.ich_lr_num = ICH_VTR_EL2_LISTREGS(ich_vtr_el2); } static int vgic_v3_probe(device_t dev) { if (!gic_get_vgic(dev)) return (EINVAL); /* We currently only support the GICv3 */ if (gic_get_hw_rev(dev) < 3) return (EINVAL); device_set_desc(dev, "Virtual GIC v3"); return (BUS_PROBE_DEFAULT); } static int vgic_v3_attach(device_t dev) { vgic_dev = dev; return (0); } static int vgic_v3_detach(device_t dev) { vgic_dev = NULL; return (0); } static device_method_t vgic_v3_methods[] = { /* Device interface */ DEVMETHOD(device_probe, vgic_v3_probe), DEVMETHOD(device_attach, vgic_v3_attach), DEVMETHOD(device_detach, vgic_v3_detach), /* VGIC interface */ DEVMETHOD(vgic_init, vgic_v3_init), DEVMETHOD(vgic_attach_to_vm, vgic_v3_attach_to_vm), DEVMETHOD(vgic_detach_from_vm, vgic_v3_detach_from_vm), DEVMETHOD(vgic_vminit, vgic_v3_vminit), DEVMETHOD(vgic_cpuinit, vgic_v3_cpuinit), DEVMETHOD(vgic_cpucleanup, vgic_v3_cpucleanup), DEVMETHOD(vgic_vmcleanup, vgic_v3_vmcleanup), DEVMETHOD(vgic_max_cpu_count, vgic_v3_max_cpu_count), DEVMETHOD(vgic_has_pending_irq, vgic_v3_has_pending_irq), DEVMETHOD(vgic_inject_irq, vgic_v3_inject_irq), DEVMETHOD(vgic_inject_msi, vgic_v3_inject_msi), DEVMETHOD(vgic_flush_hwstate, vgic_v3_flush_hwstate), DEVMETHOD(vgic_sync_hwstate, vgic_v3_sync_hwstate), /* End */ DEVMETHOD_END }; /* TODO: Create a vgic base class? */ DEFINE_CLASS_0(vgic, vgic_v3_driver, vgic_v3_methods, 0); DRIVER_MODULE(vgic_v3, gic, vgic_v3_driver, 0, 0); diff --git a/sys/arm64/vmm/io/vtimer.c b/sys/arm64/vmm/io/vtimer.c index da0f0d96c431..7c7fbb49e691 100644 --- a/sys/arm64/vmm/io/vtimer.c +++ b/sys/arm64/vmm/io/vtimer.c @@ -1,575 +1,574 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2017 The FreeBSD Foundation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include "vgic.h" #include "vtimer.h" #define RES1 0xffffffffffffffffUL #define timer_enabled(ctl) \ (!((ctl) & CNTP_CTL_IMASK) && ((ctl) & CNTP_CTL_ENABLE)) static uint32_t tmr_frq; #define timer_condition_met(ctl) ((ctl) & CNTP_CTL_ISTATUS) SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vtimer, CTLFLAG_RW, NULL, NULL); static bool allow_ecv_phys = false; SYSCTL_BOOL(_hw_vmm_vtimer, OID_AUTO, allow_ecv_phys, CTLFLAG_RW, &allow_ecv_phys, 0, "Enable hardware access to the physical timer if FEAT_ECV_POFF is supported"); static void vtimer_schedule_irq(struct hypctx *hypctx, bool phys); static int vtimer_virtual_timer_intr(void *arg) { struct hypctx *hypctx; uint64_t cntpct_el0; uint32_t cntv_ctl; hypctx = arm64_get_active_vcpu(); cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); if (!hypctx) { /* vm_destroy() was called. */ eprintf("No active vcpu\n"); cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); goto out; } if (!timer_enabled(cntv_ctl)) { eprintf("Timer not enabled\n"); goto out; } if (!timer_condition_met(cntv_ctl)) { eprintf("Timer condition not met\n"); goto out; } cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hypctx->hyp->vtimer.cntvoff_el2; if (hypctx->vtimer_cpu.virt_timer.cntx_cval_el0 < cntpct_el0) vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), GT_VIRT_IRQ, true); cntv_ctl = hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0; out: /* * Disable the timer interrupt. This will prevent the interrupt from * being reasserted as soon as we exit the handler and getting stuck * in an infinite loop. * * This is safe to do because the guest disabled the timer, and then * enables it as part of the interrupt handling routine. */ cntv_ctl &= ~CNTP_CTL_ENABLE; WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl); return (FILTER_HANDLED); } int vtimer_init(void) { /* * The guest *MUST* use the same timer frequency as the host. The * register CNTFRQ_EL0 is accessible to the guest and a different value * in the guest dts file might have unforeseen consequences. */ tmr_frq = READ_SPECIALREG(cntfrq_el0); return (0); } void vtimer_vminit(struct hyp *hyp) { uint64_t now; bool ecv_poff; ecv_poff = false; if (allow_ecv_phys && (hyp->feats & HYP_FEAT_ECV_POFF) != 0) ecv_poff = true; /* * Configure the Counter-timer Hypervisor Control Register for the VM.
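 * As a rough summary of the trap configuration set up below (the exact
 * bit names are listed per branch): guest accesses to the virtual
 * timer and counter are never trapped, while accesses to the physical
 * timer and counter trap to EL2 for emulation unless ecv_poff is set,
 * i.e. unless the host supports FEAT_ECV_POFF and the administrator
 * has enabled it, e.g.:
 *
 *	sysctl hw.vmm.vtimer.allow_ecv_phys=1
 *
 * When a physical timer or counter access does trap, it is emulated by
 * the vtimer_phys_*_read and vtimer_phys_*_write handlers registered
 * through the special register table in vmm.c.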
*/ if (in_vhe()) { /* * CNTHCTL_E2H_EL0PCTEN: trap EL0 access to CNTP{CT,CTSS}_EL0 * CNTHCTL_E2H_EL0VCTEN: don't trap EL0 access to * CNTV{CT,CTXX}_EL0 * CNTHCTL_E2H_EL0VTEN: don't trap EL0 access to * CNTV_{CTL,CVAL,TVAL}_EL0 * CNTHCTL_E2H_EL0PTEN: trap EL0 access to * CNTP_{CTL,CVAL,TVAL}_EL0 * CNTHCTL_E2H_EL1PCTEN: trap access to CNTPCT_EL0 * CNTHCTL_E2H_EL1PTEN: trap access to * CNTP_{CTL,CVAL,TVAL}_EL0 * CNTHCTL_E2H_EL1VCTEN: don't trap EL0 access to * CNTV{CT,CTSS}_EL0 * CNTHCTL_E2H_EL1PCEN: trap EL1 access to * CNTP_{CTL,CVAL,TVAL}_EL0 * * TODO: Don't trap when FEAT_ECV is present */ hyp->vtimer.cnthctl_el2 = CNTHCTL_E2H_EL0VCTEN_NOTRAP | CNTHCTL_E2H_EL0VTEN_NOTRAP; if (ecv_poff) { hyp->vtimer.cnthctl_el2 |= CNTHCTL_E2H_EL0PCTEN_NOTRAP | CNTHCTL_E2H_EL0PTEN_NOTRAP | CNTHCTL_E2H_EL1PCTEN_NOTRAP | CNTHCTL_E2H_EL1PTEN_NOTRAP; } else { hyp->vtimer.cnthctl_el2 |= CNTHCTL_E2H_EL0PCTEN_TRAP | CNTHCTL_E2H_EL0PTEN_TRAP | CNTHCTL_E2H_EL1PCTEN_TRAP | CNTHCTL_E2H_EL1PTEN_TRAP; } } else { /* * CNTHCTL_EL1PCEN: trap access to CNTP_{CTL, CVAL, TVAL}_EL0 * from EL1 * CNTHCTL_EL1PCTEN: trap access to CNTPCT_EL0 */ if (ecv_poff) { hyp->vtimer.cnthctl_el2 = CNTHCTL_EL1PCTEN_NOTRAP | CNTHCTL_EL1PCEN_NOTRAP; } else { hyp->vtimer.cnthctl_el2 = CNTHCTL_EL1PCTEN_TRAP | CNTHCTL_EL1PCEN_TRAP; } } if (ecv_poff) hyp->vtimer.cnthctl_el2 |= CNTHCTL_ECV_EN; now = READ_SPECIALREG(cntpct_el0); hyp->vtimer.cntvoff_el2 = now; return; } void vtimer_cpuinit(struct hypctx *hypctx) { struct vtimer_cpu *vtimer_cpu; vtimer_cpu = &hypctx->vtimer_cpu; /* * Configure physical timer interrupts for the VCPU. * * CNTP_CTL_IMASK: mask interrupts * ~CNTP_CTL_ENABLE: disable the timer */ vtimer_cpu->phys_timer.cntx_ctl_el0 = CNTP_CTL_IMASK & ~CNTP_CTL_ENABLE; mtx_init(&vtimer_cpu->phys_timer.mtx, "vtimer phys callout mutex", NULL, MTX_DEF); callout_init_mtx(&vtimer_cpu->phys_timer.callout, &vtimer_cpu->phys_timer.mtx, 0); vtimer_cpu->phys_timer.irqid = GT_PHYS_NS_IRQ; mtx_init(&vtimer_cpu->virt_timer.mtx, "vtimer virt callout mutex", NULL, MTX_DEF); callout_init_mtx(&vtimer_cpu->virt_timer.callout, &vtimer_cpu->virt_timer.mtx, 0); vtimer_cpu->virt_timer.irqid = GT_VIRT_IRQ; } void vtimer_cpucleanup(struct hypctx *hypctx) { struct vtimer_cpu *vtimer_cpu; vtimer_cpu = &hypctx->vtimer_cpu; callout_drain(&vtimer_cpu->phys_timer.callout); callout_drain(&vtimer_cpu->virt_timer.callout); mtx_destroy(&vtimer_cpu->phys_timer.mtx); mtx_destroy(&vtimer_cpu->virt_timer.mtx); } void vtimer_vmcleanup(struct hyp *hyp) { struct hypctx *hypctx; uint32_t cntv_ctl; hypctx = arm64_get_active_vcpu(); if (!hypctx) { /* The active VM was destroyed, stop the timer. 
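 * "Stop" here simply means clearing the ENABLE bit.  Throughout this
 * file the CNTx_CTL_EL0 layout is the architectural one: ENABLE is bit
 * 0, IMASK is bit 1 and the read-only ISTATUS is bit 2.  As a small
 * worked example, the reset value programmed in vtimer_cpuinit() above
 * evaluates as
 *
 *	timer_enabled(CNTP_CTL_IMASK)
 *	    == (!(CNTP_CTL_IMASK & CNTP_CTL_IMASK) &&
 *	        (CNTP_CTL_IMASK & CNTP_CTL_ENABLE))
 *	    == false
 *
 * so a freshly initialised vCPU timer neither fires nor schedules a
 * callout until the guest both enables and unmasks it.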
*/ cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); cntv_ctl &= ~CNTP_CTL_ENABLE; WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl); } } void vtimer_cleanup(void) { } static void vtime_sync_timer(struct hypctx *hypctx, struct vtimer_timer *timer, uint64_t cntpct_el0) { if (!timer_enabled(timer->cntx_ctl_el0)) { vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), timer->irqid, false); } else if (timer->cntx_cval_el0 < cntpct_el0) { vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), timer->irqid, true); } else { vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), timer->irqid, false); vtimer_schedule_irq(hypctx, false); } } void vtimer_sync_hwstate(struct hypctx *hypctx) { uint64_t cntpct_el0; cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hypctx->hyp->vtimer.cntvoff_el2; vtime_sync_timer(hypctx, &hypctx->vtimer_cpu.virt_timer, cntpct_el0); /* If FEAT_ECV_POFF is in use then we need to sync the physical timer */ if ((hypctx->hyp->vtimer.cnthctl_el2 & CNTHCTL_ECV_EN) != 0) { vtime_sync_timer(hypctx, &hypctx->vtimer_cpu.phys_timer, cntpct_el0); } } static void vtimer_inject_irq_callout_phys(void *context) { struct hypctx *hypctx; hypctx = context; vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), hypctx->vtimer_cpu.phys_timer.irqid, true); } static void vtimer_inject_irq_callout_virt(void *context) { struct hypctx *hypctx; hypctx = context; vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), hypctx->vtimer_cpu.virt_timer.irqid, true); } static void vtimer_schedule_irq(struct hypctx *hypctx, bool phys) { sbintime_t time; struct vtimer_timer *timer; uint64_t cntpct_el0; uint64_t diff; if (phys) timer = &hypctx->vtimer_cpu.phys_timer; else timer = &hypctx->vtimer_cpu.virt_timer; cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hypctx->hyp->vtimer.cntvoff_el2; if (timer->cntx_cval_el0 < cntpct_el0) { /* Timer set in the past, trigger interrupt */ vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), timer->irqid, true); } else { diff = timer->cntx_cval_el0 - cntpct_el0; time = diff * SBT_1S / tmr_frq; if (phys) callout_reset_sbt(&timer->callout, time, 0, vtimer_inject_irq_callout_phys, hypctx, 0); else callout_reset_sbt(&timer->callout, time, 0, vtimer_inject_irq_callout_virt, hypctx, 0); } } static void vtimer_remove_irq(struct hypctx *hypctx, struct vcpu *vcpu) { struct vtimer_cpu *vtimer_cpu; struct vtimer_timer *timer; vtimer_cpu = &hypctx->vtimer_cpu; timer = &vtimer_cpu->phys_timer; callout_drain(&timer->callout); /* * The interrupt needs to be deactivated here regardless of the callout * function having been executed. The timer interrupt can be masked with * the CNTP_CTL_EL0.IMASK bit instead of reading the IAR register. * Masking the interrupt doesn't remove it from the list registers. */ vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(vcpu), timer->irqid, false); } /* * Timer emulation functions. * * The guest should use the virtual timer, however some software, e.g. u-boot, * used the physical timer. Emulate this in software for the guest to use. * * Adjust for cntvoff_el2 so the physical and virtual timers are at similar * times. This simplifies interrupt handling in the virtual timer as the * adjustment will have already happened. 
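 * A worked example with made-up numbers: assume cntvoff_el2 = 1000 and
 * the host counter currently reads CNTPCT_EL0 = 5000, so the guest's
 * adjusted view of the counter is 5000 - 1000 = 4000.  If the guest
 * writes CNTP_TVAL_EL0 = 300, the write handler below stores
 *
 *	cntx_cval_el0 = 300 + 4000 = 4300
 *
 * and a later guest read of CNTP_TVAL_EL0 returns
 *
 *	cntx_cval_el0 - (CNTPCT_EL0 - cntvoff_el2)
 *
 * i.e. the number of ticks left until the emulated compare value is
 * reached.  The "cval < now" comparison is what both the ISTATUS bit
 * in the CTL read handler and the callout scheduling are based on.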
*/ int vtimer_phys_ctl_read(struct vcpu *vcpu, uint64_t *rval, void *arg) { struct hyp *hyp; struct hypctx *hypctx; struct vtimer_cpu *vtimer_cpu; uint64_t cntpct_el0; hypctx = vcpu_get_cookie(vcpu); hyp = hypctx->hyp; vtimer_cpu = &hypctx->vtimer_cpu; cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; if (vtimer_cpu->phys_timer.cntx_cval_el0 < cntpct_el0) /* Timer condition met */ *rval = vtimer_cpu->phys_timer.cntx_ctl_el0 | CNTP_CTL_ISTATUS; else *rval = vtimer_cpu->phys_timer.cntx_ctl_el0 & ~CNTP_CTL_ISTATUS; return (0); } int vtimer_phys_ctl_write(struct vcpu *vcpu, uint64_t wval, void *arg) { struct hypctx *hypctx; struct vtimer_cpu *vtimer_cpu; uint64_t ctl_el0; bool timer_toggled_on; hypctx = vcpu_get_cookie(vcpu); vtimer_cpu = &hypctx->vtimer_cpu; timer_toggled_on = false; ctl_el0 = vtimer_cpu->phys_timer.cntx_ctl_el0; if (!timer_enabled(ctl_el0) && timer_enabled(wval)) timer_toggled_on = true; else if (timer_enabled(ctl_el0) && !timer_enabled(wval)) vtimer_remove_irq(hypctx, vcpu); vtimer_cpu->phys_timer.cntx_ctl_el0 = wval; if (timer_toggled_on) vtimer_schedule_irq(hypctx, true); return (0); } int vtimer_phys_cnt_read(struct vcpu *vcpu, uint64_t *rval, void *arg) { struct vm *vm; struct hyp *hyp; vm = vcpu_vm(vcpu); hyp = vm_get_cookie(vm); *rval = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; return (0); } int vtimer_phys_cnt_write(struct vcpu *vcpu, uint64_t wval, void *arg) { return (0); } int vtimer_phys_cval_read(struct vcpu *vcpu, uint64_t *rval, void *arg) { struct hypctx *hypctx; struct vtimer_cpu *vtimer_cpu; hypctx = vcpu_get_cookie(vcpu); vtimer_cpu = &hypctx->vtimer_cpu; *rval = vtimer_cpu->phys_timer.cntx_cval_el0; return (0); } int vtimer_phys_cval_write(struct vcpu *vcpu, uint64_t wval, void *arg) { struct hypctx *hypctx; struct vtimer_cpu *vtimer_cpu; hypctx = vcpu_get_cookie(vcpu); vtimer_cpu = &hypctx->vtimer_cpu; vtimer_cpu->phys_timer.cntx_cval_el0 = wval; vtimer_remove_irq(hypctx, vcpu); if (timer_enabled(vtimer_cpu->phys_timer.cntx_ctl_el0)) { vtimer_schedule_irq(hypctx, true); } return (0); } int vtimer_phys_tval_read(struct vcpu *vcpu, uint64_t *rval, void *arg) { struct hyp *hyp; struct hypctx *hypctx; struct vtimer_cpu *vtimer_cpu; uint32_t cntpct_el0; hypctx = vcpu_get_cookie(vcpu); hyp = hypctx->hyp; vtimer_cpu = &hypctx->vtimer_cpu; if (!(vtimer_cpu->phys_timer.cntx_ctl_el0 & CNTP_CTL_ENABLE)) { /* * ARMv8 Architecture Manual, p. D7-2702: the result of reading * TVAL when the timer is disabled is UNKNOWN. I have chosen to * return the maximum value possible on 32 bits which means the * timer will fire very far into the future. 
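 * Concretely, the value handed back in that case is
 *
 *	(uint32_t)RES1 == (uint32_t)0xffffffffffffffffUL == 0xffffffff
 *
 * i.e. the all-ones 32-bit pattern derived from the RES1 definition at
 * the top of this file.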
*/ *rval = (uint32_t)RES1; } else { cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; *rval = vtimer_cpu->phys_timer.cntx_cval_el0 - cntpct_el0; } return (0); } int vtimer_phys_tval_write(struct vcpu *vcpu, uint64_t wval, void *arg) { struct hyp *hyp; struct hypctx *hypctx; struct vtimer_cpu *vtimer_cpu; uint64_t cntpct_el0; hypctx = vcpu_get_cookie(vcpu); hyp = hypctx->hyp; vtimer_cpu = &hypctx->vtimer_cpu; cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; vtimer_cpu->phys_timer.cntx_cval_el0 = (int32_t)wval + cntpct_el0; vtimer_remove_irq(hypctx, vcpu); if (timer_enabled(vtimer_cpu->phys_timer.cntx_ctl_el0)) { vtimer_schedule_irq(hypctx, true); } return (0); } struct vtimer_softc { struct resource *res; void *ihl; int rid; }; static int vtimer_probe(device_t dev) { device_set_desc(dev, "Virtual timer"); return (BUS_PROBE_DEFAULT); } static int vtimer_attach(device_t dev) { struct vtimer_softc *sc; sc = device_get_softc(dev); sc->rid = 0; sc->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &sc->rid, RF_ACTIVE); if (sc->res == NULL) return (ENXIO); bus_setup_intr(dev, sc->res, INTR_TYPE_CLK, vtimer_virtual_timer_intr, NULL, NULL, &sc->ihl); return (0); } static device_method_t vtimer_methods[] = { /* Device interface */ DEVMETHOD(device_probe, vtimer_probe), DEVMETHOD(device_attach, vtimer_attach), /* End */ DEVMETHOD_END }; DEFINE_CLASS_0(vtimer, vtimer_driver, vtimer_methods, sizeof(struct vtimer_softc)); DRIVER_MODULE(vtimer, generic_timer, vtimer_driver, 0, 0); diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c index 14ea26c3668c..e7b2b5d8c360 100644 --- a/sys/arm64/vmm/vmm.c +++ b/sys/arm64/vmm/vmm.c @@ -1,1567 +1,1566 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2015 Mihai Carabas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "arm64.h" #include "mmu.h" #include "io/vgic.h" #include "io/vtimer.h" struct vcpu { int flags; enum vcpu_state state; struct mtx mtx; int hostcpu; /* host cpuid this vcpu last ran on */ int vcpuid; void *stats; struct vm_exit exitinfo; uint64_t nextpc; /* (x) next instruction to execute */ struct vm *vm; /* (o) */ void *cookie; /* (i) cpu-specific data */ struct vfpstate *guestfpu; /* (a,i) guest fpu state */ }; #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct vmm_mmio_region { uint64_t start; uint64_t end; mem_region_read_t read; mem_region_write_t write; }; #define VM_MAX_MMIO_REGIONS 4 struct vmm_special_reg { uint32_t esr_iss; uint32_t esr_mask; reg_read_t reg_read; reg_write_t reg_write; void *arg; }; #define VM_MAX_SPECIAL_REGS 16 /* * Initialization: * (o) initialized the first time the VM is created * (i) initialized when VM is created and when it is reinitialized * (x) initialized before use */ struct vm { void *cookie; /* (i) cpu-specific data */ volatile cpuset_t active_cpus; /* (i) active vcpus */ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ int suspend; /* (i) stop VM execution */ bool dying; /* (o) is dying */ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ struct vm_mem mem; /* (i) guest memory */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu **vcpu; /* (i) guest vcpus */ struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS]; /* (o) guest MMIO regions */ struct vmm_special_reg special_reg[VM_MAX_SPECIAL_REGS]; /* The following describe the vm cpu topology */ uint16_t sockets; /* (o) num of sockets */ uint16_t cores; /* (o) num of cores/socket */ uint16_t threads; /* (o) num of threads/core */ uint16_t maxcpus; /* (o) max pluggable cpus */ struct sx vcpus_init_lock; /* (o) */ }; static bool vmm_initialized = false; static int vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu); static MALLOC_DEFINE(M_VMM, "vmm", "vmm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); struct vmm_regs { uint64_t id_aa64afr0; uint64_t id_aa64afr1; uint64_t id_aa64dfr0; uint64_t id_aa64dfr1; uint64_t id_aa64isar0; uint64_t id_aa64isar1; uint64_t id_aa64isar2; uint64_t id_aa64mmfr0; uint64_t id_aa64mmfr1; uint64_t id_aa64mmfr2; uint64_t id_aa64pfr0; uint64_t id_aa64pfr1; }; static const struct vmm_regs vmm_arch_regs_masks = { .id_aa64dfr0 = ID_AA64DFR0_CTX_CMPs_MASK | ID_AA64DFR0_WRPs_MASK | ID_AA64DFR0_BRPs_MASK | ID_AA64DFR0_PMUVer_3 | ID_AA64DFR0_DebugVer_8, .id_aa64isar0 = ID_AA64ISAR0_TLB_TLBIOSR | ID_AA64ISAR0_SHA3_IMPL | ID_AA64ISAR0_RDM_IMPL | ID_AA64ISAR0_Atomic_IMPL | ID_AA64ISAR0_CRC32_BASE | ID_AA64ISAR0_SHA2_512 | ID_AA64ISAR0_SHA1_BASE | 
ID_AA64ISAR0_AES_PMULL, .id_aa64mmfr0 = ID_AA64MMFR0_TGran4_IMPL | ID_AA64MMFR0_TGran64_IMPL | ID_AA64MMFR0_TGran16_IMPL | ID_AA64MMFR0_ASIDBits_16 | ID_AA64MMFR0_PARange_4P, .id_aa64mmfr1 = ID_AA64MMFR1_SpecSEI_IMPL | ID_AA64MMFR1_PAN_ATS1E1 | ID_AA64MMFR1_HAFDBS_AF, .id_aa64pfr0 = ID_AA64PFR0_GIC_CPUIF_NONE | ID_AA64PFR0_AdvSIMD_HP | ID_AA64PFR0_FP_HP | ID_AA64PFR0_EL3_64 | ID_AA64PFR0_EL2_64 | ID_AA64PFR0_EL1_64 | ID_AA64PFR0_EL0_64, }; /* Host registers masked by vmm_arch_regs_masks. */ static struct vmm_regs vmm_arch_regs; u_int vm_maxcpu; SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &vm_maxcpu, 0, "Maximum number of vCPUs"); static void vcpu_notify_event_locked(struct vcpu *vcpu); /* global statistics */ VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception"); VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted"); VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted"); VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted"); VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted"); VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a data abort"); VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort"); VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception"); VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq"); VMM_STAT(VMEXIT_FIQ, "number of vmexits for an interrupt"); VMM_STAT(VMEXIT_BRK, "number of vmexits for a breakpoint exception"); VMM_STAT(VMEXIT_SS, "number of vmexits for a single-step exception"); VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception"); VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception"); /* * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this * is a safe value for now. 
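 * As a worked example (the CPU_SETSIZE value is only an assumption for
 * illustration; it is fixed per kernel build): with CPU_SETSIZE == 256
 * the definition below gives
 *
 *	VM_MAXCPU = MIN(0xffff - 1, 256) = 256
 *
 * and a loader tunable such as hw.vmm.maxcpu=1024 would be clamped
 * back to 256 by vmm_init().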
*/ #define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) static int vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks) { #define _FETCH_KERN_REG(reg, field) do { \ regs->field = vmm_arch_regs_masks.field; \ if (!get_kernel_reg_iss_masked(reg ## _ISS, ®s->field, \ masks->field)) \ regs->field = 0; \ } while (0) _FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0); _FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1); _FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0); _FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1); _FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0); _FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1); _FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2); _FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0); _FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1); _FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2); _FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0); _FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1); #undef _FETCH_KERN_REG return (0); } static void vcpu_cleanup(struct vcpu *vcpu, bool destroy) { vmmops_vcpu_cleanup(vcpu->cookie); vcpu->cookie = NULL; if (destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); vcpu_lock_destroy(vcpu); free(vcpu, M_VMM); } } static struct vcpu * vcpu_alloc(struct vm *vm, int vcpu_id) { struct vcpu *vcpu; KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, ("vcpu_alloc: invalid vcpu %d", vcpu_id)); vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO); vcpu_lock_init(vcpu); vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; vcpu->vm = vm; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); return (vcpu); } static void vcpu_init(struct vcpu *vcpu) { vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid); MPASS(vcpu->cookie != NULL); fpu_save_area_reset(vcpu->guestfpu); vmm_stat_init(vcpu->stats); } struct vm_exit * vm_exitinfo(struct vcpu *vcpu) { return (&vcpu->exitinfo); } static int vmm_unsupported_quirk(void) { /* * Known to not load on Ampere eMAG * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=285051 */ if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_APM, CPU_PART_EMAG8180, 0, 0)) return (ENXIO); return (0); } static int vmm_init(void) { int error; vm_maxcpu = mp_ncpus; TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); if (vm_maxcpu > VM_MAXCPU) { printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); vm_maxcpu = VM_MAXCPU; } if (vm_maxcpu == 0) vm_maxcpu = 1; error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks); if (error != 0) return (error); return (vmmops_modinit(0)); } static int vmm_handler(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: error = vmm_unsupported_quirk(); if (error != 0) break; error = vmmdev_init(); if (error != 0) break; error = vmm_init(); if (error == 0) vmm_initialized = true; else (void)vmmdev_cleanup(); break; case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0 && vmm_initialized) { error = vmmops_modcleanup(); if (error) { /* * Something bad happened - prevent new * VMs from being created */ vmm_initialized = false; } } break; default: error = 0; break; } return (error); } static moduledata_t vmm_kmod = { "vmm", vmm_handler, NULL }; /* * vmm initialization has the following dependencies: * * - HYP initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). * - vmm device initialization requires an initialized devfs. 
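 * Both constraints are encoded in the single SYSINIT ordering value
 * used by the DECLARE_MODULE() below:
 *
 *	MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1
 *
 * evaluates to a subsystem id strictly after both SI_SUB_SMP and
 * SI_SUB_DEVFS, so MOD_LOAD is only delivered once SMP and devfs are
 * both up, whichever of the two ids happens to be numerically larger.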
*/ DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); static void vm_init(struct vm *vm, bool create) { int i; vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm))); MPASS(vm->cookie != NULL); CPU_ZERO(&vm->active_cpus); CPU_ZERO(&vm->debug_cpus); vm->suspend = 0; CPU_ZERO(&vm->suspended_cpus); memset(vm->mmio_region, 0, sizeof(vm->mmio_region)); memset(vm->special_reg, 0, sizeof(vm->special_reg)); if (!create) { for (i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_init(vm->vcpu[i]); } } } void vm_disable_vcpu_creation(struct vm *vm) { sx_xlock(&vm->vcpus_init_lock); vm->dying = true; sx_xunlock(&vm->vcpus_init_lock); } struct vcpu * vm_alloc_vcpu(struct vm *vm, int vcpuid) { struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) return (NULL); /* Some interrupt controllers may have a CPU limit */ if (vcpuid >= vgic_max_cpu_count(vm->cookie)) return (NULL); vcpu = (struct vcpu *) atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]); if (__predict_true(vcpu != NULL)) return (vcpu); sx_xlock(&vm->vcpus_init_lock); vcpu = vm->vcpu[vcpuid]; if (vcpu == NULL && !vm->dying) { vcpu = vcpu_alloc(vm, vcpuid); vcpu_init(vcpu); /* * Ensure vCPU is fully created before updating pointer * to permit unlocked reads above. */ atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid], (uintptr_t)vcpu); } sx_xunlock(&vm->vcpus_init_lock); return (vcpu); } void vm_lock_vcpus(struct vm *vm) { sx_xlock(&vm->vcpus_init_lock); } void vm_unlock_vcpus(struct vm *vm) { sx_unlock(&vm->vcpus_init_lock); } int vm_create(const char *name, struct vm **retvm) { struct vm *vm; int error; /* * If vmm.ko could not be successfully initialized then don't attempt * to create the virtual machine. */ if (!vmm_initialized) return (ENXIO); if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); error = vm_mem_init(&vm->mem, 0, 1ul << 39); if (error != 0) { free(vm, M_VMM); return (error); } strcpy(vm->name, name); sx_init(&vm->vcpus_init_lock, "vm vcpus"); vm->sockets = 1; vm->cores = 1; /* XXX backwards compatibility */ vm->threads = 1; /* XXX backwards compatibility */ vm->maxcpus = vm_maxcpu; vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM, M_WAITOK | M_ZERO); vm_init(vm, true); *retvm = vm; return (0); } void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) { *sockets = vm->sockets; *cores = vm->cores; *threads = vm->threads; *maxcpus = vm->maxcpus; } uint16_t vm_get_maxcpus(struct vm *vm) { return (vm->maxcpus); } int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) { /* Ignore maxcpus. 
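 * For example (hypothetical values), a request for 2 sockets x 4 cores
 * x 1 thread,
 *
 *	error = vm_set_topology(vm, 2, 4, 1, 0);
 *
 * is accepted as long as 2 * 4 * 1 <= vm->maxcpus; the maxcpus
 * argument itself is not used, and EINVAL is returned only when the
 * product exceeds the limit fixed at vm_create() time.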
*/ if ((sockets * cores * threads) > vm->maxcpus) return (EINVAL); vm->sockets = sockets; vm->cores = cores; vm->threads = threads; return(0); } static void vm_cleanup(struct vm *vm, bool destroy) { pmap_t pmap __diagused; int i; if (destroy) { vm_xlock_memsegs(vm); pmap = vmspace_pmap(vm_vmspace(vm)); sched_pin(); PCPU_SET(curvmpmap, NULL); sched_unpin(); CPU_FOREACH(i) { MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap); } } else vm_assert_memseg_xlocked(vm); vgic_detach_from_vm(vm->cookie); for (i = 0; i < vm->maxcpus; i++) { if (vm->vcpu[i] != NULL) vcpu_cleanup(vm->vcpu[i], destroy); } vmmops_cleanup(vm->cookie); vm_mem_cleanup(vm); if (destroy) { vm_mem_destroy(vm); free(vm->vcpu, M_VMM); sx_destroy(&vm->vcpus_init_lock); } } void vm_destroy(struct vm *vm) { vm_cleanup(vm, true); free(vm, M_VMM); } int vm_reinit(struct vm *vm) { int error; /* * A virtual machine can be reset only if all vcpus are suspended. */ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { vm_cleanup(vm, false); vm_init(vm, false); error = 0; } else { error = EBUSY; } return (error); } const char * vm_name(struct vm *vm) { return (vm->name); } int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *is_fault) { return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault)); } static int vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg) { *rval = 0; return (0); } static int vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg) { *rval = *(uint64_t *)arg; return (0); } static int vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg) { return (0); } static int vmm_write_oslar_el1(struct vcpu *vcpu, uint64_t wval, void *arg) { struct hypctx *hypctx; hypctx = vcpu_get_cookie(vcpu); /* All other fields are RES0 & we don't do anything with this */ /* TODO: Disable access to other debug state when locked */ hypctx->dbg_oslock = (wval & OSLAR_OSLK) == OSLAR_OSLK; return (0); } static int vmm_read_oslsr_el1(struct vcpu *vcpu, uint64_t *rval, void *arg) { struct hypctx *hypctx; uint64_t val; hypctx = vcpu_get_cookie(vcpu); val = OSLSR_OSLM_1; if (hypctx->dbg_oslock) val |= OSLSR_OSLK; *rval = val; return (0); } static const struct vmm_special_reg vmm_special_regs[] = { #define SPECIAL_REG(_reg, _read, _write) \ { \ .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \ ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \ .esr_mask = ISS_MSR_REG_MASK, \ .reg_read = (_read), \ .reg_write = (_write), \ .arg = NULL, \ } #define ID_SPECIAL_REG(_reg, _name) \ { \ .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \ ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \ .esr_mask = ISS_MSR_REG_MASK, \ .reg_read = vmm_reg_read_arg, \ .reg_write = vmm_reg_wi, \ .arg = &(vmm_arch_regs._name), \ } /* ID registers */ ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0), ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0), ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0), ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0), ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1), /* * All other ID registers are read as zero. * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space. 
*/ { .esr_iss = (3 << ISS_MSR_OP0_SHIFT) | (0 << ISS_MSR_OP1_SHIFT) | (0 << ISS_MSR_CRn_SHIFT) | (0 << ISS_MSR_CRm_SHIFT), .esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK | ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT), .reg_read = vmm_reg_raz, .reg_write = vmm_reg_wi, .arg = NULL, }, /* Counter physical registers */ SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write), SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read, vtimer_phys_cval_write), SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read, vtimer_phys_tval_write), SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write), /* Debug registers */ SPECIAL_REG(DBGPRCR_EL1, vmm_reg_raz, vmm_reg_wi), SPECIAL_REG(OSDLR_EL1, vmm_reg_raz, vmm_reg_wi), /* TODO: Exceptions on invalid access */ SPECIAL_REG(OSLAR_EL1, vmm_reg_raz, vmm_write_oslar_el1), SPECIAL_REG(OSLSR_EL1, vmm_read_oslsr_el1, vmm_reg_wi), #undef SPECIAL_REG }; void vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask, reg_read_t reg_read, reg_write_t reg_write, void *arg) { int i; for (i = 0; i < nitems(vm->special_reg); i++) { if (vm->special_reg[i].esr_iss == 0 && vm->special_reg[i].esr_mask == 0) { vm->special_reg[i].esr_iss = iss; vm->special_reg[i].esr_mask = mask; vm->special_reg[i].reg_read = reg_read; vm->special_reg[i].reg_write = reg_write; vm->special_reg[i].arg = arg; return; } } panic("%s: No free special register slot", __func__); } void vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask) { int i; for (i = 0; i < nitems(vm->special_reg); i++) { if (vm->special_reg[i].esr_iss == iss && vm->special_reg[i].esr_mask == mask) { memset(&vm->special_reg[i], 0, sizeof(vm->special_reg[i])); return; } } panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss, mask); } static int vm_handle_reg_emul(struct vcpu *vcpu, bool *retu) { struct vm *vm; struct vm_exit *vme; struct vre *vre; int i, rv; vm = vcpu->vm; vme = &vcpu->exitinfo; vre = &vme->u.reg_emul.vre; for (i = 0; i < nitems(vm->special_reg); i++) { if (vm->special_reg[i].esr_iss == 0 && vm->special_reg[i].esr_mask == 0) continue; if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) == vm->special_reg[i].esr_iss) { rv = vmm_emulate_register(vcpu, vre, vm->special_reg[i].reg_read, vm->special_reg[i].reg_write, vm->special_reg[i].arg); if (rv == 0) { *retu = false; } return (rv); } } for (i = 0; i < nitems(vmm_special_regs); i++) { if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) == vmm_special_regs[i].esr_iss) { rv = vmm_emulate_register(vcpu, vre, vmm_special_regs[i].reg_read, vmm_special_regs[i].reg_write, vmm_special_regs[i].arg); if (rv == 0) { *retu = false; } return (rv); } } *retu = true; return (0); } void vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size, mem_region_read_t mmio_read, mem_region_write_t mmio_write) { int i; for (i = 0; i < nitems(vm->mmio_region); i++) { if (vm->mmio_region[i].start == 0 && vm->mmio_region[i].end == 0) { vm->mmio_region[i].start = start; vm->mmio_region[i].end = start + size; vm->mmio_region[i].read = mmio_read; vm->mmio_region[i].write = mmio_write; return; } } panic("%s: No free MMIO region", __func__); } void vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size) { int i; for (i = 0; i < nitems(vm->mmio_region); i++) { if (vm->mmio_region[i].start == start && vm->mmio_region[i].end == start + size) { memset(&vm->mmio_region[i], 0, sizeof(vm->mmio_region[i])); return; } } panic("%s: Invalid MMIO region: %lx - %lx", __func__, start, start + size); } static int 
vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) { struct vm *vm; struct vm_exit *vme; struct vie *vie; struct hyp *hyp; uint64_t fault_ipa; struct vm_guest_paging *paging; struct vmm_mmio_region *vmr; int error, i; vm = vcpu->vm; hyp = vm->cookie; if (!hyp->vgic_attached) goto out_user; vme = &vcpu->exitinfo; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; fault_ipa = vme->u.inst_emul.gpa; vmr = NULL; for (i = 0; i < nitems(vm->mmio_region); i++) { if (vm->mmio_region[i].start <= fault_ipa && vm->mmio_region[i].end > fault_ipa) { vmr = &vm->mmio_region[i]; break; } } if (vmr == NULL) goto out_user; error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging, vmr->read, vmr->write, retu); return (error); out_user: *retu = true; return (0); } int vm_suspend(struct vm *vm, enum vm_suspend_how how) { int i; if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) return (EINVAL); if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { VM_CTR2(vm, "virtual machine already suspended %d/%d", vm->suspend, how); return (EALREADY); } VM_CTR1(vm, "virtual machine successfully suspended %d", how); /* * Notify all active vcpus that they are now suspended. */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i)); } return (0); } void vm_exit_suspended(struct vcpu *vcpu, uint64_t pc) { struct vm *vm = vcpu->vm; struct vm_exit *vmexit; KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); vmexit = vm_exitinfo(vcpu); vmexit->pc = pc; vmexit->inst_length = 4; vmexit->exitcode = VM_EXITCODE_SUSPENDED; vmexit->u.suspended.how = vm->suspend; } void vm_exit_debug(struct vcpu *vcpu, uint64_t pc) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vcpu); vmexit->pc = pc; vmexit->inst_length = 4; vmexit->exitcode = VM_EXITCODE_DEBUG; } int vm_activate_cpu(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EBUSY); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); return (0); } int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { vm->debug_cpus = vm->active_cpus; for (int i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm_vcpu(vm, i)); } } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); vcpu_notify_event(vcpu); } return (0); } int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) { if (vcpu == NULL) { CPU_ZERO(&vm->debug_cpus); } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) return (EINVAL); CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); } return (0); } int vcpu_debugged(struct vcpu *vcpu) { return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); } cpuset_t vm_active_cpus(struct vm *vm) { return (vm->active_cpus); } cpuset_t vm_debug_cpus(struct vm *vm) { return (vm->debug_cpus); } cpuset_t vm_suspended_cpus(struct vm *vm) { return (vm->suspended_cpus); } void * vcpu_stats(struct vcpu *vcpu) { return (vcpu->stats); } /* * This function is called to ensure that a vcpu "sees" a pending event * as soon as possible: * - If the vcpu thread is sleeping then it is woken up. * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. 
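 * A typical caller is the interrupt injection path; after queueing an
 * IRQ for a target vCPU the vgic code does, roughly,
 *
 *	if (notify)
 *		vcpu_notify_event(vm_vcpu(hyp->vm, vcpuid));
 *
 * so a vCPU sleeping in vm_handle_wfi() is woken up (the wakeup_one()
 * below pairs with its msleep_spin()), while a vCPU currently running
 * in the guest is kicked with an IPI and picks the interrupt up on its
 * next entry.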
*/ static void vcpu_notify_event_locked(struct vcpu *vcpu) { int hostcpu; hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { ipi_cpu(hostcpu, vmm_ipinum); } else { /* * If the 'vcpu' is running on 'curcpu' then it must * be sending a notification to itself (e.g. SELF_IPI). * The pending event will be picked up when the vcpu * transitions back to guest context. */ } } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); if (vcpu->state == VCPU_SLEEPING) wakeup_one(vcpu); } } void vcpu_notify_event(struct vcpu *vcpu) { vcpu_lock(vcpu); vcpu_notify_event_locked(vcpu); vcpu_unlock(vcpu); } struct vm_mem * vm_mem(struct vm *vm) { return (&vm->mem); } static void restore_guest_fpustate(struct vcpu *vcpu) { /* flush host state to the pcb */ vfp_save_state(curthread, curthread->td_pcb); /* Ensure the VFP state will be re-loaded when exiting the guest */ PCPU_SET(fpcurthread, NULL); /* restore guest FPU state */ vfp_enable(); vfp_restore(vcpu->guestfpu); /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. */ vfp_disable(); } static void save_guest_fpustate(struct vcpu *vcpu) { if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) != CPACR_FPEN_TRAP_ALL1) panic("VFP not enabled in host!"); /* save guest FPU state */ vfp_enable(); vfp_store(vcpu->guestfpu); vfp_disable(); KASSERT(PCPU_GET(fpcurthread) == NULL, ("%s: fpcurthread set with guest registers", __func__)); } static int vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_assert_locked(vcpu); /* * State transitions from the vmmdev_ioctl() must always begin from * the VCPU_IDLE state. This guarantees that there is only a single * ioctl() operating on a vcpu at any point. 
*/ if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu_notify_event_locked(vcpu); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); } if (vcpu->state == VCPU_RUNNING) { KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " "mismatch for running vcpu", curcpu, vcpu->hostcpu)); } else { KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " "vcpu that is not running", vcpu->hostcpu)); } /* * The following state transitions are allowed: * IDLE -> FROZEN -> IDLE * FROZEN -> RUNNING -> FROZEN * FROZEN -> SLEEPING -> FROZEN */ switch (vcpu->state) { case VCPU_IDLE: case VCPU_RUNNING: case VCPU_SLEEPING: error = (newstate != VCPU_FROZEN); break; case VCPU_FROZEN: error = (newstate == VCPU_FROZEN); break; default: error = 1; break; } if (error) return (EBUSY); vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; if (newstate == VCPU_IDLE) wakeup(&vcpu->state); return (0); } static void vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d\n", error, newstate); } static void vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) { int error; if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } int vm_get_capability(struct vcpu *vcpu, int type, int *retval) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_getcap(vcpu->cookie, type, retval)); } int vm_set_capability(struct vcpu *vcpu, int type, int val) { if (type < 0 || type >= VM_CAP_MAX) return (EINVAL); return (vmmops_setcap(vcpu->cookie, type, val)); } struct vm * vcpu_vm(struct vcpu *vcpu) { return (vcpu->vm); } int vcpu_vcpuid(struct vcpu *vcpu) { return (vcpu->vcpuid); } void * vcpu_get_cookie(struct vcpu *vcpu) { return (vcpu->cookie); } struct vcpu * vm_vcpu(struct vm *vm, int vcpuid) { return (vm->vcpu[vcpuid]); } int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) { int error; vcpu_lock(vcpu); error = vcpu_set_state_locked(vcpu, newstate, from_idle); vcpu_unlock(vcpu); return (error); } enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu) { enum vcpu_state state; vcpu_lock(vcpu); state = vcpu->state; if (hostcpu != NULL) *hostcpu = vcpu->hostcpu; vcpu_unlock(vcpu); return (state); } int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) { if (reg < 0 || reg >= VM_REG_LAST) return (EINVAL); return (vmmops_getreg(vcpu->cookie, reg, retval)); } int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; if (reg < 0 || reg >= VM_REG_LAST) return (EINVAL); error = vmmops_setreg(vcpu->cookie, reg, val); if (error || reg != VM_REG_GUEST_PC) return (error); vcpu->nextpc = val; return (0); } void * vm_get_cookie(struct vm *vm) { return (vm->cookie); } int vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far) { return (vmmops_exception(vcpu->cookie, esr, far)); } int vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr) { return (vgic_attach_to_vm(vm->cookie, descr)); } int vm_assert_irq(struct vm *vm, uint32_t irq) { return (vgic_inject_irq(vm->cookie, -1, irq, true)); } int vm_deassert_irq(struct vm *vm, uint32_t irq) { return (vgic_inject_irq(vm->cookie, -1, irq, false)); } int vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, int func) { /* TODO: 
Should we raise an SError? */ return (vgic_inject_msi(vm->cookie, msg, addr)); } static int vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) { struct hypctx *hypctx; int i; hypctx = vcpu_get_cookie(vcpu); if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0) return (1); vme->exitcode = VM_EXITCODE_SMCCC; vme->u.smccc_call.func_id = hypctx->tf.tf_x[0]; for (i = 0; i < nitems(vme->u.smccc_call.args); i++) vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1]; *retu = true; return (0); } static int vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) { struct vm *vm; vm = vcpu->vm; vcpu_lock(vcpu); while (1) { if (vm->suspend) break; if (vgic_has_pending_irq(vcpu->cookie)) break; if (vcpu_should_yield(vcpu)) break; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); /* * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); } vcpu_unlock(vcpu); *retu = false; return (0); } static int vm_handle_paging(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; struct vm_exit *vme; struct vm_map *map; uint64_t addr, esr; pmap_t pmap; int ftype, rv; vme = &vcpu->exitinfo; pmap = vmspace_pmap(vm_vmspace(vcpu->vm)); addr = vme->u.paging.gpa; esr = vme->u.paging.esr; /* The page exists, but the page table needs to be updated. */ if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS) return (0); switch (ESR_ELx_EXCEPTION(esr)) { case EXCP_INSN_ABORT_L: case EXCP_DATA_ABORT_L: ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE; break; default: panic("%s: Invalid exception (esr = %lx)", __func__, esr); } map = &vm_vmspace(vm)->vm_map; rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); if (rv != KERN_SUCCESS) return (EFAULT); return (0); } static int vm_handle_suspend(struct vcpu *vcpu, bool *retu) { struct vm *vm = vcpu->vm; int error, i; struct thread *td; error = 0; td = curthread; CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus); /* * Wait until all 'active_cpus' have suspended themselves. * * Since a VM may be suspended at any time including when one or * more vcpus are doing a rendezvous we need to call the rendezvous * handler while we are waiting to prevent a deadlock. */ vcpu_lock(vcpu); while (error == 0) { if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) break; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); if (td_ast_pending(td, TDA_SUSPEND)) { vcpu_unlock(vcpu); error = thread_check_susp(td, false); vcpu_lock(vcpu); } } vcpu_unlock(vcpu); /* * Wakeup the other sleeping vcpus and return to userspace. 
*/ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { vcpu_notify_event(vm_vcpu(vm, i)); } } *retu = true; return (error); } int vm_run(struct vcpu *vcpu) { struct vm *vm = vcpu->vm; struct vm_eventinfo evinfo; int error, vcpuid; struct vm_exit *vme; bool retu; pmap_t pmap; vcpuid = vcpu->vcpuid; if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); pmap = vmspace_pmap(vm_vmspace(vm)); vme = &vcpu->exitinfo; evinfo.rptr = NULL; evinfo.sptr = &vm->suspend; evinfo.iptr = NULL; restart: critical_enter(); restore_guest_fpustate(vcpu); vcpu_require_state(vcpu, VCPU_RUNNING); error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo); vcpu_require_state(vcpu, VCPU_FROZEN); save_guest_fpustate(vcpu); critical_exit(); if (error == 0) { retu = false; switch (vme->exitcode) { case VM_EXITCODE_INST_EMUL: vcpu->nextpc = vme->pc + vme->inst_length; error = vm_handle_inst_emul(vcpu, &retu); break; case VM_EXITCODE_REG_EMUL: vcpu->nextpc = vme->pc + vme->inst_length; error = vm_handle_reg_emul(vcpu, &retu); break; case VM_EXITCODE_HVC: /* * The HVC instruction saves the address for the * next instruction as the return address. */ vcpu->nextpc = vme->pc; /* * The PSCI call can change the exit information in the * case of suspend/reset/poweroff/cpu off/cpu on. */ error = vm_handle_smccc_call(vcpu, vme, &retu); break; case VM_EXITCODE_WFI: vcpu->nextpc = vme->pc + vme->inst_length; error = vm_handle_wfi(vcpu, vme, &retu); break; case VM_EXITCODE_PAGING: vcpu->nextpc = vme->pc; error = vm_handle_paging(vcpu, &retu); break; case VM_EXITCODE_SUSPENDED: vcpu->nextpc = vme->pc; error = vm_handle_suspend(vcpu, &retu); break; default: /* Handle in userland */ vcpu->nextpc = vme->pc; retu = true; break; } } if (error == 0 && retu == false) goto restart; return (error); } diff --git a/sys/arm64/vmm/vmm_arm64.c b/sys/arm64/vmm/vmm_arm64.c index 618f4afaf8ee..006239431f29 100644 --- a/sys/arm64/vmm/vmm_arm64.c +++ b/sys/arm64/vmm/vmm_arm64.c @@ -1,1421 +1,1420 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2015 Mihai Carabas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include "mmu.h" #include "arm64.h" #include "hyp.h" #include "reset.h" #include "io/vgic.h" #include "io/vgic_v3.h" #include "io/vtimer.h" #include "vmm_handlers.h" #include "vmm_stat.h" #define HANDLED 1 #define UNHANDLED 0 /* Number of bits in an EL2 virtual address */ #define EL2_VIRT_BITS 48 CTASSERT((1ul << EL2_VIRT_BITS) >= HYP_VM_MAX_ADDRESS); /* TODO: Move the host hypctx off the stack */ #define VMM_STACK_PAGES 4 #define VMM_STACK_SIZE (VMM_STACK_PAGES * PAGE_SIZE) static int vmm_pmap_levels, vmm_virt_bits, vmm_max_ipa_bits; /* Register values passed to arm_setup_vectors to set in the hypervisor */ struct vmm_init_regs { uint64_t tcr_el2; uint64_t vtcr_el2; }; MALLOC_DEFINE(M_HYP, "ARM VMM HYP", "ARM VMM HYP"); extern char hyp_init_vectors[]; extern char hyp_vectors[]; extern char hyp_stub_vectors[]; static vm_paddr_t hyp_code_base; static size_t hyp_code_len; static char *stack[MAXCPU]; static vm_offset_t stack_hyp_va[MAXCPU]; static vmem_t *el2_mem_alloc; static void arm_setup_vectors(void *arg); DPCPU_DEFINE_STATIC(struct hypctx *, vcpu); static inline void arm64_set_active_vcpu(struct hypctx *hypctx) { DPCPU_SET(vcpu, hypctx); } struct hypctx * arm64_get_active_vcpu(void) { return (DPCPU_GET(vcpu)); } static void arm_setup_vectors(void *arg) { struct vmm_init_regs *el2_regs; uintptr_t stack_top; uint32_t sctlr_el2; register_t daif; el2_regs = arg; arm64_set_active_vcpu(NULL); /* * Configure the system control register for EL2: * * SCTLR_EL2_M: MMU on * SCTLR_EL2_C: Data cacheability not affected * SCTLR_EL2_I: Instruction cacheability not affected * SCTLR_EL2_A: Instruction alignment check * SCTLR_EL2_SA: Stack pointer alignment check * SCTLR_EL2_WXN: Treat writable memory as execute never * ~SCTLR_EL2_EE: Data accesses are little-endian */ sctlr_el2 = SCTLR_EL2_RES1; sctlr_el2 |= SCTLR_EL2_M | SCTLR_EL2_C | SCTLR_EL2_I; sctlr_el2 |= SCTLR_EL2_A | SCTLR_EL2_SA; sctlr_el2 |= SCTLR_EL2_WXN; sctlr_el2 &= ~SCTLR_EL2_EE; daif = intr_disable(); if (in_vhe()) { WRITE_SPECIALREG(vtcr_el2, el2_regs->vtcr_el2); } else { /* * Install the temporary vectors which will be responsible for * initializing the VMM when we next trap into EL2. * * x0: the exception vector table responsible for hypervisor * initialization on the next call. */ vmm_call_hyp(vtophys(&vmm_hyp_code)); /* Create and map the hypervisor stack */ stack_top = stack_hyp_va[PCPU_GET(cpuid)] + VMM_STACK_SIZE; /* Special call to initialize EL2 */ vmm_call_hyp(vmmpmap_to_ttbr0(), stack_top, el2_regs->tcr_el2, sctlr_el2, el2_regs->vtcr_el2); } intr_restore(daif); } static void arm_teardown_vectors(void *arg) { register_t daif; /* * vmm_cleanup() will disable the MMU. For the next few instructions, * before the hardware disables the MMU, one of the following is * possible: * * a. The instruction addresses are fetched with the MMU disabled, * and they must represent the actual physical addresses. This will work * because we call the vmm_cleanup() function by its physical address. * * b. The instruction addresses are fetched using the old translation * tables. This will work because we have an identity mapping in place * in the translation tables and vmm_cleanup() is called by its physical * address. 
*/ daif = intr_disable(); /* TODO: Invalidate the cache */ vmm_call_hyp(HYP_CLEANUP, vtophys(hyp_stub_vectors)); intr_restore(daif); arm64_set_active_vcpu(NULL); } static uint64_t vmm_vtcr_el2_sl(u_int levels) { #if PAGE_SIZE == PAGE_SIZE_4K switch (levels) { case 2: return (VTCR_EL2_SL0_4K_LVL2); case 3: return (VTCR_EL2_SL0_4K_LVL1); case 4: return (VTCR_EL2_SL0_4K_LVL0); default: panic("%s: Invalid number of page table levels %u", __func__, levels); } #elif PAGE_SIZE == PAGE_SIZE_16K switch (levels) { case 2: return (VTCR_EL2_SL0_16K_LVL2); case 3: return (VTCR_EL2_SL0_16K_LVL1); case 4: return (VTCR_EL2_SL0_16K_LVL0); default: panic("%s: Invalid number of page table levels %u", __func__, levels); } #else #error Unsupported page size #endif } int vmmops_modinit(int ipinum) { struct vmm_init_regs el2_regs; vm_offset_t next_hyp_va; vm_paddr_t vmm_base; uint64_t id_aa64mmfr0_el1, pa_range_bits, pa_range_field; int cpu, i; bool rv __diagused; if (!has_hyp()) { printf( "vmm: Processor doesn't have support for virtualization\n"); return (ENXIO); } if (!vgic_present()) { printf("vmm: No vgic found\n"); return (ENODEV); } if (!get_kernel_reg(ID_AA64MMFR0_EL1, &id_aa64mmfr0_el1)) { printf("vmm: Unable to read ID_AA64MMFR0_EL1\n"); return (ENXIO); } pa_range_field = ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1); /* * Use 3 levels to give us up to 39 bits with 4k pages, or * 47 bits with 16k pages. */ /* TODO: Check the number of levels for 64k pages */ vmm_pmap_levels = 3; switch (pa_range_field) { case ID_AA64MMFR0_PARange_4G: printf("vmm: Not enough physical address bits\n"); return (ENXIO); case ID_AA64MMFR0_PARange_64G: vmm_virt_bits = 36; #if PAGE_SIZE == PAGE_SIZE_16K vmm_pmap_levels = 2; #endif break; default: vmm_virt_bits = 39; break; } pa_range_bits = pa_range_field >> ID_AA64MMFR0_PARange_SHIFT; if (!in_vhe()) { /* Initialise the EL2 MMU */ if (!vmmpmap_init()) { printf("vmm: Failed to init the EL2 MMU\n"); return (ENOMEM); } } /* Set up the stage 2 pmap callbacks */ MPASS(pmap_clean_stage2_tlbi == NULL); pmap_clean_stage2_tlbi = vmm_clean_s2_tlbi; pmap_stage2_invalidate_range = vmm_s2_tlbi_range; pmap_stage2_invalidate_all = vmm_s2_tlbi_all; if (!in_vhe()) { /* * Create an allocator for the virtual address space used by * EL2. EL2 code is identity-mapped; the allocator is used to * find space for VM structures. */ el2_mem_alloc = vmem_create("VMM EL2", 0, 0, PAGE_SIZE, 0, M_WAITOK); /* Create the mappings for the hypervisor translation table. 
*/ hyp_code_len = round_page(&vmm_hyp_code_end - &vmm_hyp_code); /* We need a physical identity mapping for when we activate the MMU */ hyp_code_base = vmm_base = vtophys(&vmm_hyp_code); rv = vmmpmap_enter(vmm_base, hyp_code_len, vmm_base, VM_PROT_READ | VM_PROT_EXECUTE); MPASS(rv); next_hyp_va = roundup2(vmm_base + hyp_code_len, L2_SIZE); /* Create a per-CPU hypervisor stack */ CPU_FOREACH(cpu) { stack[cpu] = malloc(VMM_STACK_SIZE, M_HYP, M_WAITOK | M_ZERO); stack_hyp_va[cpu] = next_hyp_va; for (i = 0; i < VMM_STACK_PAGES; i++) { rv = vmmpmap_enter(stack_hyp_va[cpu] + ptoa(i), PAGE_SIZE, vtophys(stack[cpu] + ptoa(i)), VM_PROT_READ | VM_PROT_WRITE); MPASS(rv); } next_hyp_va += L2_SIZE; } el2_regs.tcr_el2 = TCR_EL2_RES1; el2_regs.tcr_el2 |= min(pa_range_bits << TCR_EL2_PS_SHIFT, TCR_EL2_PS_52BITS); el2_regs.tcr_el2 |= TCR_EL2_T0SZ(64 - EL2_VIRT_BITS); el2_regs.tcr_el2 |= TCR_EL2_IRGN0_WBWA | TCR_EL2_ORGN0_WBWA; #if PAGE_SIZE == PAGE_SIZE_4K el2_regs.tcr_el2 |= TCR_EL2_TG0_4K; #elif PAGE_SIZE == PAGE_SIZE_16K el2_regs.tcr_el2 |= TCR_EL2_TG0_16K; #else #error Unsupported page size #endif #ifdef SMP el2_regs.tcr_el2 |= TCR_EL2_SH0_IS; #endif } switch (pa_range_bits << TCR_EL2_PS_SHIFT) { case TCR_EL2_PS_32BITS: vmm_max_ipa_bits = 32; break; case TCR_EL2_PS_36BITS: vmm_max_ipa_bits = 36; break; case TCR_EL2_PS_40BITS: vmm_max_ipa_bits = 40; break; case TCR_EL2_PS_42BITS: vmm_max_ipa_bits = 42; break; case TCR_EL2_PS_44BITS: vmm_max_ipa_bits = 44; break; case TCR_EL2_PS_48BITS: vmm_max_ipa_bits = 48; break; case TCR_EL2_PS_52BITS: default: vmm_max_ipa_bits = 52; break; } /* * Configure the Stage 2 translation control register: * * VTCR_IRGN0_WBWA: Translation table walks access inner cacheable * normal memory * VTCR_ORGN0_WBWA: Translation table walks access outer cacheable * normal memory * VTCR_EL2_TG0_4K/16K: Stage 2 uses the same page size as the kernel * VTCR_EL2_SL0_4K_LVL1: Stage 2 uses concatenated level 1 tables * VTCR_EL2_SH0_IS: Memory associated with Stage 2 walks is inner * shareable */ el2_regs.vtcr_el2 = VTCR_EL2_RES1; el2_regs.vtcr_el2 |= VTCR_EL2_IRGN0_WBWA | VTCR_EL2_ORGN0_WBWA; el2_regs.vtcr_el2 |= VTCR_EL2_T0SZ(64 - vmm_virt_bits); el2_regs.vtcr_el2 |= vmm_vtcr_el2_sl(vmm_pmap_levels); #if PAGE_SIZE == PAGE_SIZE_4K el2_regs.vtcr_el2 |= VTCR_EL2_TG0_4K; #elif PAGE_SIZE == PAGE_SIZE_16K el2_regs.vtcr_el2 |= VTCR_EL2_TG0_16K; #else #error Unsupported page size #endif #ifdef SMP el2_regs.vtcr_el2 |= VTCR_EL2_SH0_IS; #endif /* * If FEAT_LPA2 is enabled in the host then we need to enable it here * so the page tables created by pmap.c are correct. The meaning of * the shareability field changes to become address bits when this * is set. */ if ((READ_SPECIALREG(tcr_el1) & TCR_DS) != 0) { el2_regs.vtcr_el2 |= VTCR_EL2_DS; el2_regs.vtcr_el2 |= min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_52BIT); } else { el2_regs.vtcr_el2 |= min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_48BIT); } smp_rendezvous(NULL, arm_setup_vectors, NULL, &el2_regs); if (!in_vhe()) { /* Add memory to the vmem allocator (checking there is space) */ if (vmm_base > (L2_SIZE + PAGE_SIZE)) { /* * Ensure there is an L2 block before the vmm code to check * for buffer overflows on earlier data. Include the PAGE_SIZE * of the minimum we can allocate. */ vmm_base -= L2_SIZE + PAGE_SIZE; vmm_base = rounddown2(vmm_base, L2_SIZE); /* * Check there is memory before the vmm code to add. * * Reserve the L2 block at address 0 so NULL dereference will * raise an exception.
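 * Putting the pieces above together, the EL2 virtual address space on
 * a non-VHE host ends up roughly as follows (a sketch only; with 4k
 * pages an L2 block is 2MiB, and the actual addresses depend on where
 * the hyp code was loaded):
 *
 *	[0, L2_SIZE)                      never added: NULL guard
 *	[L2_SIZE, vmm_base)               vmem arena, low part
 *	[vmm_base, hyp code)              guard block: catches overflows
 *	hyp code + per-CPU stacks         identity and fixed mappings
 *	[next_hyp_va, HYP_VM_MAX_ADDRESS) vmem arena, high part
 *
 * el2_map_enter() later allocates from the two vmem ranges to map each
 * struct hyp and struct hypctx into EL2.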
*/ if (vmm_base > L2_SIZE) vmem_add(el2_mem_alloc, L2_SIZE, vmm_base - L2_SIZE, M_WAITOK); } /* * Add the memory after the stacks. There is most of an L2 block * between the last stack and the first allocation so this should * be safe without adding more padding. */ if (next_hyp_va < HYP_VM_MAX_ADDRESS - PAGE_SIZE) vmem_add(el2_mem_alloc, next_hyp_va, HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK); } vgic_init(); vtimer_init(); return (0); } int vmmops_modcleanup(void) { int cpu; if (!in_vhe()) { smp_rendezvous(NULL, arm_teardown_vectors, NULL, NULL); CPU_FOREACH(cpu) { vmmpmap_remove(stack_hyp_va[cpu], VMM_STACK_PAGES * PAGE_SIZE, false); } vmmpmap_remove(hyp_code_base, hyp_code_len, false); } vtimer_cleanup(); if (!in_vhe()) { vmmpmap_fini(); CPU_FOREACH(cpu) free(stack[cpu], M_HYP); } pmap_clean_stage2_tlbi = NULL; pmap_stage2_invalidate_range = NULL; pmap_stage2_invalidate_all = NULL; return (0); } static vm_size_t el2_hyp_size(struct vm *vm) { return (round_page(sizeof(struct hyp) + sizeof(struct hypctx *) * vm_get_maxcpus(vm))); } static vm_size_t el2_hypctx_size(void) { return (round_page(sizeof(struct hypctx))); } static vm_offset_t el2_map_enter(vm_offset_t data, vm_size_t size, vm_prot_t prot) { vmem_addr_t addr; int err __diagused; bool rv __diagused; err = vmem_alloc(el2_mem_alloc, size, M_NEXTFIT | M_WAITOK, &addr); MPASS(err == 0); rv = vmmpmap_enter(addr, size, vtophys(data), prot); MPASS(rv); return (addr); } void * vmmops_init(struct vm *vm, pmap_t pmap) { struct hyp *hyp; vm_size_t size; uint64_t idreg; size = el2_hyp_size(vm); hyp = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO); hyp->vm = vm; hyp->vgic_attached = false; if (get_kernel_reg(ID_AA64MMFR0_EL1, &idreg)) { if (ID_AA64MMFR0_ECV_VAL(idreg) >= ID_AA64MMFR0_ECV_POFF) hyp->feats |= HYP_FEAT_ECV_POFF; } if (get_kernel_reg(ID_AA64MMFR1_EL1, &idreg)) { if (ID_AA64MMFR1_HCX_VAL(idreg) >= ID_AA64MMFR1_HCX_IMPL) hyp->feats |= HYP_FEAT_HCX; } vtimer_vminit(hyp); vgic_vminit(hyp); if (!in_vhe()) hyp->el2_addr = el2_map_enter((vm_offset_t)hyp, size, VM_PROT_READ | VM_PROT_WRITE); return (hyp); } void * vmmops_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid) { struct hyp *hyp = vmi; struct hypctx *hypctx; vm_size_t size; size = el2_hypctx_size(); hypctx = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO); KASSERT(vcpuid >= 0 && vcpuid < vm_get_maxcpus(hyp->vm), ("%s: Invalid vcpuid %d", __func__, vcpuid)); hyp->ctx[vcpuid] = hypctx; hypctx->hyp = hyp; hypctx->vcpu = vcpu1; reset_vm_el01_regs(hypctx); reset_vm_el2_regs(hypctx); vtimer_cpuinit(hypctx); vgic_cpuinit(hypctx); if (!in_vhe()) hypctx->el2_addr = el2_map_enter((vm_offset_t)hypctx, size, VM_PROT_READ | VM_PROT_WRITE); return (hypctx); } static int arm_vmm_pinit(pmap_t pmap) { pmap_pinit_stage(pmap, PM_STAGE2, vmm_pmap_levels); return (1); } struct vmspace * vmmops_vmspace_alloc(vm_offset_t min, vm_offset_t max) { return (vmspace_alloc(min, max, arm_vmm_pinit)); } void vmmops_vmspace_free(struct vmspace *vmspace) { pmap_remove_pages(vmspace_pmap(vmspace)); vmspace_free(vmspace); } static inline void arm64_print_hyp_regs(struct vm_exit *vme) { printf("esr_el2: 0x%016lx\n", vme->u.hyp.esr_el2); printf("far_el2: 0x%016lx\n", vme->u.hyp.far_el2); printf("hpfar_el2: 0x%016lx\n", vme->u.hyp.hpfar_el2); printf("elr_el2: 0x%016lx\n", vme->pc); } static void arm64_gen_inst_emul_data(struct hypctx *hypctx, uint32_t esr_iss, struct vm_exit *vme_ret) { struct vm_guest_paging *paging; struct vie *vie; uint32_t esr_sas, reg_num; /* * Get the page address from 
HPFAR_EL2. */ vme_ret->u.inst_emul.gpa = HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2); /* Bits [11:0] are the same as bits [11:0] from the virtual address. */ vme_ret->u.inst_emul.gpa += hypctx->exit_info.far_el2 & FAR_EL2_HPFAR_PAGE_MASK; esr_sas = (esr_iss & ISS_DATA_SAS_MASK) >> ISS_DATA_SAS_SHIFT; reg_num = (esr_iss & ISS_DATA_SRT_MASK) >> ISS_DATA_SRT_SHIFT; vie = &vme_ret->u.inst_emul.vie; vie->access_size = 1 << esr_sas; vie->sign_extend = (esr_iss & ISS_DATA_SSE) ? 1 : 0; vie->dir = (esr_iss & ISS_DATA_WnR) ? VM_DIR_WRITE : VM_DIR_READ; vie->reg = reg_num; paging = &vme_ret->u.inst_emul.paging; paging->ttbr0_addr = hypctx->ttbr0_el1 & ~(TTBR_ASID_MASK | TTBR_CnP); paging->ttbr1_addr = hypctx->ttbr1_el1 & ~(TTBR_ASID_MASK | TTBR_CnP); paging->tcr_el1 = hypctx->tcr_el1; paging->tcr2_el1 = hypctx->tcr2_el1; paging->flags = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32); if ((hypctx->sctlr_el1 & SCTLR_M) != 0) paging->flags |= VM_GP_MMU_ENABLED; } static void arm64_gen_reg_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret) { uint32_t reg_num; struct vre *vre; /* u.hyp member will be replaced by u.reg_emul */ vre = &vme_ret->u.reg_emul.vre; vre->inst_syndrome = esr_iss; /* ARMv8 Architecture Manual, p. D7-2273: 1 means read */ vre->dir = (esr_iss & ISS_MSR_DIR) ? VM_DIR_READ : VM_DIR_WRITE; reg_num = ISS_MSR_Rt(esr_iss); vre->reg = reg_num; } void raise_data_insn_abort(struct hypctx *hypctx, uint64_t far, bool dabort, int fsc) { uint64_t esr; if ((hypctx->tf.tf_spsr & PSR_M_MASK) == PSR_M_EL0t) esr = EXCP_INSN_ABORT_L << ESR_ELx_EC_SHIFT; else esr = EXCP_INSN_ABORT << ESR_ELx_EC_SHIFT; /* Set the bit that changes from insn -> data abort */ if (dabort) esr |= EXCP_DATA_ABORT_L << ESR_ELx_EC_SHIFT; /* Set the IL bit if set by hardware */ esr |= hypctx->tf.tf_esr & ESR_ELx_IL; vmmops_exception(hypctx, esr | fsc, far); } static int handle_el1_sync_excp(struct hypctx *hypctx, struct vm_exit *vme_ret, pmap_t pmap) { uint64_t gpa; uint32_t esr_ec, esr_iss; esr_ec = ESR_ELx_EXCEPTION(hypctx->tf.tf_esr); esr_iss = hypctx->tf.tf_esr & ESR_ELx_ISS_MASK; switch (esr_ec) { case EXCP_UNKNOWN: vmm_stat_incr(hypctx->vcpu, VMEXIT_UNKNOWN, 1); arm64_print_hyp_regs(vme_ret); vme_ret->exitcode = VM_EXITCODE_HYP; break; case EXCP_TRAP_WFI_WFE: if ((hypctx->tf.tf_esr & 0x3) == 0) { /* WFI */ vmm_stat_incr(hypctx->vcpu, VMEXIT_WFI, 1); vme_ret->exitcode = VM_EXITCODE_WFI; } else { vmm_stat_incr(hypctx->vcpu, VMEXIT_WFE, 1); vme_ret->exitcode = VM_EXITCODE_HYP; } break; case EXCP_HVC: vmm_stat_incr(hypctx->vcpu, VMEXIT_HVC, 1); vme_ret->exitcode = VM_EXITCODE_HVC; break; case EXCP_MSR: vmm_stat_incr(hypctx->vcpu, VMEXIT_MSR, 1); arm64_gen_reg_emul_data(esr_iss, vme_ret); vme_ret->exitcode = VM_EXITCODE_REG_EMUL; break; case EXCP_BRK: vmm_stat_incr(hypctx->vcpu, VMEXIT_BRK, 1); vme_ret->exitcode = VM_EXITCODE_BRK; break; case EXCP_SOFTSTP_EL0: vmm_stat_incr(hypctx->vcpu, VMEXIT_SS, 1); vme_ret->exitcode = VM_EXITCODE_SS; break; case EXCP_INSN_ABORT_L: case EXCP_DATA_ABORT_L: vmm_stat_incr(hypctx->vcpu, esr_ec == EXCP_DATA_ABORT_L ? 
VMEXIT_DATA_ABORT : VMEXIT_INSN_ABORT, 1); switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: case ISS_DATA_DFSC_AFF_L1: case ISS_DATA_DFSC_AFF_L2: case ISS_DATA_DFSC_AFF_L3: case ISS_DATA_DFSC_PF_L1: case ISS_DATA_DFSC_PF_L2: case ISS_DATA_DFSC_PF_L3: gpa = HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2); /* Check the IPA is valid */ if (gpa >= (1ul << vmm_max_ipa_bits)) { raise_data_insn_abort(hypctx, hypctx->exit_info.far_el2, esr_ec == EXCP_DATA_ABORT_L, ISS_DATA_DFSC_ASF_L0); vme_ret->inst_length = 0; return (HANDLED); } if (vm_mem_allocated(hypctx->vcpu, gpa)) { vme_ret->exitcode = VM_EXITCODE_PAGING; vme_ret->inst_length = 0; vme_ret->u.paging.esr = hypctx->tf.tf_esr; vme_ret->u.paging.gpa = gpa; } else if (esr_ec == EXCP_INSN_ABORT_L) { /* * Raise an external abort. Device memory is * not executable */ raise_data_insn_abort(hypctx, hypctx->exit_info.far_el2, false, ISS_DATA_DFSC_EXT); vme_ret->inst_length = 0; return (HANDLED); } else { arm64_gen_inst_emul_data(hypctx, esr_iss, vme_ret); vme_ret->exitcode = VM_EXITCODE_INST_EMUL; } break; default: arm64_print_hyp_regs(vme_ret); vme_ret->exitcode = VM_EXITCODE_HYP; break; } break; default: vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_SYNC, 1); arm64_print_hyp_regs(vme_ret); vme_ret->exitcode = VM_EXITCODE_HYP; break; } /* We don't do any instruction emulation here */ return (UNHANDLED); } static int arm64_handle_world_switch(struct hypctx *hypctx, int excp_type, struct vm_exit *vme, pmap_t pmap) { int handled; switch (excp_type) { case EXCP_TYPE_EL1_SYNC: /* The exit code will be set by handle_el1_sync_excp(). */ handled = handle_el1_sync_excp(hypctx, vme, pmap); break; case EXCP_TYPE_EL1_IRQ: case EXCP_TYPE_EL1_FIQ: /* The host kernel will handle IRQs and FIQs. */ vmm_stat_incr(hypctx->vcpu, excp_type == EXCP_TYPE_EL1_IRQ ?
VMEXIT_IRQ : VMEXIT_FIQ,1); vme->exitcode = VM_EXITCODE_BOGUS; handled = UNHANDLED; break; case EXCP_TYPE_EL1_ERROR: case EXCP_TYPE_EL2_SYNC: case EXCP_TYPE_EL2_IRQ: case EXCP_TYPE_EL2_FIQ: case EXCP_TYPE_EL2_ERROR: vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_EL2, 1); vme->exitcode = VM_EXITCODE_BOGUS; handled = UNHANDLED; break; default: vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED, 1); vme->exitcode = VM_EXITCODE_BOGUS; handled = UNHANDLED; break; } return (handled); } static void ptp_release(void **cookie) { if (*cookie != NULL) { vm_gpa_release(*cookie); *cookie = NULL; } } static void * ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) { void *ptr; ptp_release(cookie); ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie); return (ptr); } /* log2 of the number of bytes in a page table entry */ #define PTE_SHIFT 3 int vmmops_gla2gpa(void *vcpui, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *is_fault) { struct hypctx *hypctx; void *cookie; uint64_t mask, *ptep, pte, pte_addr; int address_bits, granule_shift, ia_bits, levels, pte_shift, tsz; bool is_el0; /* Check if the MMU is off */ if ((paging->flags & VM_GP_MMU_ENABLED) == 0) { *is_fault = 0; *gpa = gla; return (0); } is_el0 = (paging->flags & PSR_M_MASK) == PSR_M_EL0t; if (ADDR_IS_KERNEL(gla)) { /* If address translation is disabled raise an exception */ if ((paging->tcr_el1 & TCR_EPD1) != 0) { *is_fault = 1; return (0); } if (is_el0 && (paging->tcr_el1 & TCR_E0PD1) != 0) { *is_fault = 1; return (0); } pte_addr = paging->ttbr1_addr; tsz = (paging->tcr_el1 & TCR_T1SZ_MASK) >> TCR_T1SZ_SHIFT; /* Clear the top byte if TBI is on */ if ((paging->tcr_el1 & TCR_TBI1) != 0) gla |= (0xfful << 56); switch (paging->tcr_el1 & TCR_TG1_MASK) { case TCR_TG1_4K: granule_shift = PAGE_SHIFT_4K; break; case TCR_TG1_16K: granule_shift = PAGE_SHIFT_16K; break; case TCR_TG1_64K: granule_shift = PAGE_SHIFT_64K; break; default: *is_fault = 1; return (EINVAL); } } else { /* If address translation is disabled raise an exception */ if ((paging->tcr_el1 & TCR_EPD0) != 0) { *is_fault = 1; return (0); } if (is_el0 && (paging->tcr_el1 & TCR_E0PD0) != 0) { *is_fault = 1; return (0); } pte_addr = paging->ttbr0_addr; tsz = (paging->tcr_el1 & TCR_T0SZ_MASK) >> TCR_T0SZ_SHIFT; /* Clear the top byte if TBI is on */ if ((paging->tcr_el1 & TCR_TBI0) != 0) gla &= ~(0xfful << 56); switch (paging->tcr_el1 & TCR_TG0_MASK) { case TCR_TG0_4K: granule_shift = PAGE_SHIFT_4K; break; case TCR_TG0_16K: granule_shift = PAGE_SHIFT_16K; break; case TCR_TG0_64K: granule_shift = PAGE_SHIFT_64K; break; default: *is_fault = 1; return (EINVAL); } } /* * TODO: Support FEAT_TTST for smaller tsz values and FEAT_LPA2 * for larger values. */ switch (granule_shift) { case PAGE_SHIFT_4K: case PAGE_SHIFT_16K: /* * See "Table D8-11 4KB granule, determining stage 1 initial * lookup level" and "Table D8-21 16KB granule, determining * stage 1 initial lookup level" from the "Arm Architecture * Reference Manual for A-Profile architecture" revision I.a * for the minimum and maximum values. * * TODO: Support less than 16 when FEAT_LPA2 is implemented * and TCR_EL1.DS == 1 * TODO: Support more than 39 when FEAT_TTST is implemented */ if (tsz < 16 || tsz > 39) { *is_fault = 1; return (EINVAL); } break; case PAGE_SHIFT_64K: /* TODO: Support 64k granule. It will probably work, but is untested */ default: *is_fault = 1; return (EINVAL); } /* * Calculate the input address bits. These are 64 bit in an address * with the top tsz bits being all 0 or all 1. 
*/ ia_bits = 64 - tsz; /* * Calculate the number of address bits used in the page table * calculation. This is ia_bits minus the bottom granule_shift * bits that are passed to the output address. */ address_bits = ia_bits - granule_shift; /* * Calculate the number of levels. Each level uses * granule_shift - PTE_SHIFT bits of the input address. * This is because the table is 1 << granule_shift and each * entry is 1 << PTE_SHIFT bytes. */ levels = howmany(address_bits, granule_shift - PTE_SHIFT); /* Mask of the upper unused bits in the virtual address */ gla &= (1ul << ia_bits) - 1; hypctx = (struct hypctx *)vcpui; cookie = NULL; /* TODO: Check if the level supports block descriptors */ for (;levels > 0; levels--) { int idx; pte_shift = (levels - 1) * (granule_shift - PTE_SHIFT) + granule_shift; idx = (gla >> pte_shift) & ((1ul << (granule_shift - PTE_SHIFT)) - 1); while (idx > PAGE_SIZE / sizeof(pte)) { idx -= PAGE_SIZE / sizeof(pte); pte_addr += PAGE_SIZE; } ptep = ptp_hold(hypctx->vcpu, pte_addr, PAGE_SIZE, &cookie); if (ptep == NULL) goto error; pte = ptep[idx]; /* Calculate the level we are looking at */ switch (levels) { default: goto fault; /* TODO: Level -1 when FEAT_LPA2 is implemented */ case 4: /* Level 0 */ if ((pte & ATTR_DESCR_MASK) != L0_TABLE) goto fault; /* FALLTHROUGH */ case 3: /* Level 1 */ case 2: /* Level 2 */ switch (pte & ATTR_DESCR_MASK) { /* Use L1 macro as all levels are the same */ case L1_TABLE: /* Check if EL0 can access this address space */ if (is_el0 && (pte & TATTR_AP_TABLE_NO_EL0) != 0) goto fault; /* Check if the address space is writable */ if ((prot & PROT_WRITE) != 0 && (pte & TATTR_AP_TABLE_RO) != 0) goto fault; if ((prot & PROT_EXEC) != 0) { /* Check the table exec attribute */ if ((is_el0 && (pte & TATTR_UXN_TABLE) != 0) || (!is_el0 && (pte & TATTR_PXN_TABLE) != 0)) goto fault; } pte_addr = pte & ~ATTR_MASK; break; case L1_BLOCK: goto done; default: goto fault; } break; case 1: /* Level 3 */ if ((pte & ATTR_DESCR_MASK) == L3_PAGE) goto done; goto fault; } } done: /* Check if EL0 has access to the block/page */ if (is_el0 && (pte & ATTR_S1_AP(ATTR_S1_AP_USER)) == 0) goto fault; if ((prot & PROT_WRITE) != 0 && (pte & ATTR_S1_AP_RW_BIT) != 0) goto fault; if ((prot & PROT_EXEC) != 0) { if ((is_el0 && (pte & ATTR_S1_UXN) != 0) || (!is_el0 && (pte & ATTR_S1_PXN) != 0)) goto fault; } mask = (1ul << pte_shift) - 1; *gpa = (pte & ~ATTR_MASK) | (gla & mask); *is_fault = 0; ptp_release(&cookie); return (0); error: ptp_release(&cookie); return (EFAULT); fault: *is_fault = 1; ptp_release(&cookie); return (0); } int vmmops_run(void *vcpui, register_t pc, pmap_t pmap, struct vm_eventinfo *evinfo) { uint64_t excp_type; int handled; register_t daif; struct hyp *hyp; struct hypctx *hypctx; struct vcpu *vcpu; struct vm_exit *vme; int mode; hypctx = (struct hypctx *)vcpui; hyp = hypctx->hyp; vcpu = hypctx->vcpu; vme = vm_exitinfo(vcpu); hypctx->tf.tf_elr = (uint64_t)pc; for (;;) { if (hypctx->has_exception) { hypctx->has_exception = false; hypctx->elr_el1 = hypctx->tf.tf_elr; mode = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32); if (mode == PSR_M_EL1t) { hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x0; } else if (mode == PSR_M_EL1h) { hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x200; } else if ((mode & PSR_M_32) == PSR_M_64) { /* 64-bit EL0 */ hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x400; } else { /* 32-bit EL0 */ hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x600; } /* Set the new spsr */ hypctx->spsr_el1 = hypctx->tf.tf_spsr; /* Set the new cpsr */ hypctx->tf.tf_spsr = 
hypctx->spsr_el1 & PSR_FLAGS; hypctx->tf.tf_spsr |= PSR_DAIF | PSR_M_EL1h; /* * Update fields that may change on exception entry * based on how sctlr_el1 is configured. */ if ((hypctx->sctlr_el1 & SCTLR_SPAN) == 0) hypctx->tf.tf_spsr |= PSR_PAN; if ((hypctx->sctlr_el1 & SCTLR_DSSBS) == 0) hypctx->tf.tf_spsr &= ~PSR_SSBS; else hypctx->tf.tf_spsr |= PSR_SSBS; } daif = intr_disable(); /* Check if the vcpu is suspended */ if (vcpu_suspended(evinfo)) { intr_restore(daif); vm_exit_suspended(vcpu, pc); break; } if (vcpu_debugged(vcpu)) { intr_restore(daif); vm_exit_debug(vcpu, pc); break; } /* Activate the stage2 pmap so the vmid is valid */ pmap_activate_vm(pmap); hyp->vttbr_el2 = pmap_to_ttbr0(pmap); /* * TODO: What happens if a timer interrupt is asserted exactly * here, but for the previous VM? */ arm64_set_active_vcpu(hypctx); vgic_flush_hwstate(hypctx); /* Call into EL2 to switch to the guest */ excp_type = vmm_enter_guest(hyp, hypctx); vgic_sync_hwstate(hypctx); vtimer_sync_hwstate(hypctx); /* * Deactivate the stage2 pmap. */ PCPU_SET(curvmpmap, NULL); intr_restore(daif); vmm_stat_incr(vcpu, VMEXIT_COUNT, 1); if (excp_type == EXCP_TYPE_MAINT_IRQ) continue; vme->pc = hypctx->tf.tf_elr; vme->inst_length = INSN_SIZE; vme->u.hyp.exception_nr = excp_type; vme->u.hyp.esr_el2 = hypctx->tf.tf_esr; vme->u.hyp.far_el2 = hypctx->exit_info.far_el2; vme->u.hyp.hpfar_el2 = hypctx->exit_info.hpfar_el2; handled = arm64_handle_world_switch(hypctx, excp_type, vme, pmap); if (handled == UNHANDLED) /* Exit loop to emulate instruction. */ break; else /* Resume guest execution from the next instruction. */ hypctx->tf.tf_elr += vme->inst_length; } return (0); } static void arm_pcpu_vmcleanup(void *arg) { struct hyp *hyp; int i, maxcpus; hyp = arg; maxcpus = vm_get_maxcpus(hyp->vm); for (i = 0; i < maxcpus; i++) { if (arm64_get_active_vcpu() == hyp->ctx[i]) { arm64_set_active_vcpu(NULL); break; } } } void vmmops_vcpu_cleanup(void *vcpui) { struct hypctx *hypctx = vcpui; vtimer_cpucleanup(hypctx); vgic_cpucleanup(hypctx); if (!in_vhe()) vmmpmap_remove(hypctx->el2_addr, el2_hypctx_size(), true); free(hypctx, M_HYP); } void vmmops_cleanup(void *vmi) { struct hyp *hyp = vmi; vtimer_vmcleanup(hyp); vgic_vmcleanup(hyp); smp_rendezvous(NULL, arm_pcpu_vmcleanup, NULL, hyp); if (!in_vhe()) vmmpmap_remove(hyp->el2_addr, el2_hyp_size(hyp->vm), true); free(hyp, M_HYP); } /* * Return register value. Registers have different sizes and an explicit cast * must be made to ensure proper conversion. */ static uint64_t * hypctx_regptr(struct hypctx *hypctx, int reg) { switch (reg) { case VM_REG_GUEST_X0 ...
VM_REG_GUEST_X29: return (&hypctx->tf.tf_x[reg]); case VM_REG_GUEST_LR: return (&hypctx->tf.tf_lr); case VM_REG_GUEST_SP: return (&hypctx->tf.tf_sp); case VM_REG_GUEST_CPSR: return (&hypctx->tf.tf_spsr); case VM_REG_GUEST_PC: return (&hypctx->tf.tf_elr); case VM_REG_GUEST_SCTLR_EL1: return (&hypctx->sctlr_el1); case VM_REG_GUEST_TTBR0_EL1: return (&hypctx->ttbr0_el1); case VM_REG_GUEST_TTBR1_EL1: return (&hypctx->ttbr1_el1); case VM_REG_GUEST_TCR_EL1: return (&hypctx->tcr_el1); case VM_REG_GUEST_TCR2_EL1: return (&hypctx->tcr2_el1); case VM_REG_GUEST_MPIDR_EL1: return (&hypctx->vmpidr_el2); default: break; } return (NULL); } int vmmops_getreg(void *vcpui, int reg, uint64_t *retval) { uint64_t *regp; int running, hostcpu; struct hypctx *hypctx = vcpui; running = vcpu_is_running(hypctx->vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("arm_getreg: %s%d is running", vm_name(hypctx->hyp->vm), vcpu_vcpuid(hypctx->vcpu)); regp = hypctx_regptr(hypctx, reg); if (regp == NULL) return (EINVAL); *retval = *regp; return (0); } int vmmops_setreg(void *vcpui, int reg, uint64_t val) { uint64_t *regp; struct hypctx *hypctx = vcpui; int running, hostcpu; running = vcpu_is_running(hypctx->vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("arm_setreg: %s%d is running", vm_name(hypctx->hyp->vm), vcpu_vcpuid(hypctx->vcpu)); regp = hypctx_regptr(hypctx, reg); if (regp == NULL) return (EINVAL); *regp = val; return (0); } int vmmops_exception(void *vcpui, uint64_t esr, uint64_t far) { struct hypctx *hypctx = vcpui; int running, hostcpu; running = vcpu_is_running(hypctx->vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("%s: %s%d is running", __func__, vm_name(hypctx->hyp->vm), vcpu_vcpuid(hypctx->vcpu)); hypctx->far_el1 = far; hypctx->esr_el1 = esr; hypctx->has_exception = true; return (0); } int vmmops_getcap(void *vcpui, int num, int *retval) { struct hypctx *hypctx = vcpui; int ret; ret = ENOENT; switch (num) { case VM_CAP_UNRESTRICTED_GUEST: *retval = 1; ret = 0; break; case VM_CAP_BRK_EXIT: case VM_CAP_SS_EXIT: case VM_CAP_MASK_HWINTR: *retval = (hypctx->setcaps & (1ul << num)) != 0; break; default: break; } return (ret); } int vmmops_setcap(void *vcpui, int num, int val) { struct hypctx *hypctx = vcpui; int ret; ret = 0; switch (num) { case VM_CAP_BRK_EXIT: if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0)) break; if (val != 0) hypctx->mdcr_el2 |= MDCR_EL2_TDE; else hypctx->mdcr_el2 &= ~MDCR_EL2_TDE; break; case VM_CAP_SS_EXIT: if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0)) break; if (val != 0) { hypctx->debug_spsr |= (hypctx->tf.tf_spsr & PSR_SS); hypctx->debug_mdscr |= hypctx->mdscr_el1 & (MDSCR_SS | MDSCR_KDE); hypctx->tf.tf_spsr |= PSR_SS; hypctx->mdscr_el1 |= MDSCR_SS | MDSCR_KDE; hypctx->mdcr_el2 |= MDCR_EL2_TDE; } else { hypctx->tf.tf_spsr &= ~PSR_SS; hypctx->tf.tf_spsr |= hypctx->debug_spsr; hypctx->debug_spsr &= ~PSR_SS; hypctx->mdscr_el1 &= ~(MDSCR_SS | MDSCR_KDE); hypctx->mdscr_el1 |= hypctx->debug_mdscr; hypctx->debug_mdscr &= ~(MDSCR_SS | MDSCR_KDE); hypctx->mdcr_el2 &= ~MDCR_EL2_TDE; } break; case VM_CAP_MASK_HWINTR: if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0)) break; if (val != 0) { hypctx->debug_spsr |= (hypctx->tf.tf_spsr & (PSR_I | PSR_F)); hypctx->tf.tf_spsr |= PSR_I | PSR_F; } else { hypctx->tf.tf_spsr &= ~(PSR_I | PSR_F); hypctx->tf.tf_spsr |= (hypctx->debug_spsr & (PSR_I | PSR_F)); hypctx->debug_spsr &= ~(PSR_I | PSR_F); } break; default: ret = ENOENT; break; } if (ret == 0) { if (val == 0) hypctx->setcaps &= ~(1ul << num); else 
hypctx->setcaps |= (1ul << num); } return (ret); } diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c index b8c6d2ab7a9a..0ad7930e9a87 100644 --- a/sys/arm64/vmm/vmm_hyp.c +++ b/sys/arm64/vmm/vmm_hyp.c @@ -1,767 +1,766 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Andrew Turner * * This work was supported by Innovate UK project 105694, "Digital Security * by Design (DSbD) Technology Platform Prototype". * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include -#include #include "arm64.h" #include "hyp.h" struct hypctx; uint64_t VMM_HYP_FUNC(do_call_guest)(struct hypctx *); static void vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest, bool ecv_poff) { uint64_t dfr0; if (guest) { /* Store the timer registers */ hypctx->vtimer_cpu.cntkctl_el1 = READ_SPECIALREG(EL1_REG(CNTKCTL)); hypctx->vtimer_cpu.virt_timer.cntx_cval_el0 = READ_SPECIALREG(EL0_REG(CNTV_CVAL)); hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0 = READ_SPECIALREG(EL0_REG(CNTV_CTL)); } if (guest_or_nonvhe(guest) && ecv_poff) { /* * If we have ECV then the guest could modify these registers. * If VHE is enabled then the kernel will see a different view * of the registers, so doesn't need to handle them. 
*/ hypctx->vtimer_cpu.phys_timer.cntx_cval_el0 = READ_SPECIALREG(EL0_REG(CNTP_CVAL)); hypctx->vtimer_cpu.phys_timer.cntx_ctl_el0 = READ_SPECIALREG(EL0_REG(CNTP_CTL)); } if (guest) { /* Store the GICv3 registers */ hypctx->vgic_v3_regs.ich_eisr_el2 = READ_SPECIALREG(ich_eisr_el2); hypctx->vgic_v3_regs.ich_elrsr_el2 = READ_SPECIALREG(ich_elrsr_el2); hypctx->vgic_v3_regs.ich_hcr_el2 = READ_SPECIALREG(ich_hcr_el2); hypctx->vgic_v3_regs.ich_misr_el2 = READ_SPECIALREG(ich_misr_el2); hypctx->vgic_v3_regs.ich_vmcr_el2 = READ_SPECIALREG(ich_vmcr_el2); switch (hypctx->vgic_v3_regs.ich_lr_num - 1) { #define STORE_LR(x) \ case x: \ hypctx->vgic_v3_regs.ich_lr_el2[x] = \ READ_SPECIALREG(ich_lr ## x ##_el2) STORE_LR(15); STORE_LR(14); STORE_LR(13); STORE_LR(12); STORE_LR(11); STORE_LR(10); STORE_LR(9); STORE_LR(8); STORE_LR(7); STORE_LR(6); STORE_LR(5); STORE_LR(4); STORE_LR(3); STORE_LR(2); STORE_LR(1); default: STORE_LR(0); #undef STORE_LR } switch (hypctx->vgic_v3_regs.ich_apr_num - 1) { #define STORE_APR(x) \ case x: \ hypctx->vgic_v3_regs.ich_ap0r_el2[x] = \ READ_SPECIALREG(ich_ap0r ## x ##_el2); \ hypctx->vgic_v3_regs.ich_ap1r_el2[x] = \ READ_SPECIALREG(ich_ap1r ## x ##_el2) STORE_APR(3); STORE_APR(2); STORE_APR(1); default: STORE_APR(0); #undef STORE_APR } } hypctx->dbgclaimset_el1 = READ_SPECIALREG(dbgclaimset_el1); dfr0 = READ_SPECIALREG(id_aa64dfr0_el1); switch (ID_AA64DFR0_BRPs_VAL(dfr0) - 1) { #define STORE_DBG_BRP(x) \ case x: \ hypctx->dbgbcr_el1[x] = \ READ_SPECIALREG(dbgbcr ## x ## _el1); \ hypctx->dbgbvr_el1[x] = \ READ_SPECIALREG(dbgbvr ## x ## _el1) STORE_DBG_BRP(15); STORE_DBG_BRP(14); STORE_DBG_BRP(13); STORE_DBG_BRP(12); STORE_DBG_BRP(11); STORE_DBG_BRP(10); STORE_DBG_BRP(9); STORE_DBG_BRP(8); STORE_DBG_BRP(7); STORE_DBG_BRP(6); STORE_DBG_BRP(5); STORE_DBG_BRP(4); STORE_DBG_BRP(3); STORE_DBG_BRP(2); STORE_DBG_BRP(1); default: STORE_DBG_BRP(0); #undef STORE_DBG_BRP } switch (ID_AA64DFR0_WRPs_VAL(dfr0) - 1) { #define STORE_DBG_WRP(x) \ case x: \ hypctx->dbgwcr_el1[x] = \ READ_SPECIALREG(dbgwcr ## x ## _el1); \ hypctx->dbgwvr_el1[x] = \ READ_SPECIALREG(dbgwvr ## x ## _el1) STORE_DBG_WRP(15); STORE_DBG_WRP(14); STORE_DBG_WRP(13); STORE_DBG_WRP(12); STORE_DBG_WRP(11); STORE_DBG_WRP(10); STORE_DBG_WRP(9); STORE_DBG_WRP(8); STORE_DBG_WRP(7); STORE_DBG_WRP(6); STORE_DBG_WRP(5); STORE_DBG_WRP(4); STORE_DBG_WRP(3); STORE_DBG_WRP(2); STORE_DBG_WRP(1); default: STORE_DBG_WRP(0); #undef STORE_DBG_WRP } /* Store the PMU registers */ hypctx->pmcr_el0 = READ_SPECIALREG(pmcr_el0); hypctx->pmccntr_el0 = READ_SPECIALREG(pmccntr_el0); hypctx->pmccfiltr_el0 = READ_SPECIALREG(pmccfiltr_el0); hypctx->pmuserenr_el0 = READ_SPECIALREG(pmuserenr_el0); hypctx->pmselr_el0 = READ_SPECIALREG(pmselr_el0); hypctx->pmxevcntr_el0 = READ_SPECIALREG(pmxevcntr_el0); hypctx->pmcntenset_el0 = READ_SPECIALREG(pmcntenset_el0); hypctx->pmintenset_el1 = READ_SPECIALREG(pmintenset_el1); hypctx->pmovsset_el0 = READ_SPECIALREG(pmovsset_el0); switch ((hypctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT) { #define STORE_PMU(x) \ case (x + 1): \ hypctx->pmevcntr_el0[x] = \ READ_SPECIALREG(pmevcntr ## x ## _el0); \ hypctx->pmevtyper_el0[x] = \ READ_SPECIALREG(pmevtyper ## x ## _el0) STORE_PMU(30); STORE_PMU(29); STORE_PMU(28); STORE_PMU(27); STORE_PMU(26); STORE_PMU(25); STORE_PMU(24); STORE_PMU(23); STORE_PMU(22); STORE_PMU(21); STORE_PMU(20); STORE_PMU(19); STORE_PMU(18); STORE_PMU(17); STORE_PMU(16); STORE_PMU(15); STORE_PMU(14); STORE_PMU(13); STORE_PMU(12); STORE_PMU(11); STORE_PMU(10); STORE_PMU(9); STORE_PMU(8); STORE_PMU(7); 
STORE_PMU(6); STORE_PMU(5); STORE_PMU(4); STORE_PMU(3); STORE_PMU(2); STORE_PMU(1); STORE_PMU(0); default: /* N == 0 when only PMCCNTR_EL0 is available */ break; #undef STORE_PMU } /* Store the special registers to the trapframe */ hypctx->tf.tf_sp = READ_SPECIALREG(sp_el1); hypctx->tf.tf_elr = READ_SPECIALREG(elr_el2); hypctx->tf.tf_spsr = READ_SPECIALREG(spsr_el2); if (guest) { hypctx->tf.tf_esr = READ_SPECIALREG(esr_el2); hypctx->par_el1 = READ_SPECIALREG(par_el1); } /* Store the guest special registers */ hypctx->sp_el0 = READ_SPECIALREG(sp_el0); hypctx->tpidr_el0 = READ_SPECIALREG(tpidr_el0); hypctx->tpidrro_el0 = READ_SPECIALREG(tpidrro_el0); hypctx->tpidr_el1 = READ_SPECIALREG(tpidr_el1); hypctx->actlr_el1 = READ_SPECIALREG(actlr_el1); hypctx->csselr_el1 = READ_SPECIALREG(csselr_el1); hypctx->mdccint_el1 = READ_SPECIALREG(mdccint_el1); hypctx->mdscr_el1 = READ_SPECIALREG(mdscr_el1); if (guest_or_nonvhe(guest)) { hypctx->elr_el1 = READ_SPECIALREG(EL1_REG(ELR)); hypctx->vbar_el1 = READ_SPECIALREG(EL1_REG(VBAR)); hypctx->afsr0_el1 = READ_SPECIALREG(EL1_REG(AFSR0)); hypctx->afsr1_el1 = READ_SPECIALREG(EL1_REG(AFSR1)); hypctx->amair_el1 = READ_SPECIALREG(EL1_REG(AMAIR)); hypctx->contextidr_el1 = READ_SPECIALREG(EL1_REG(CONTEXTIDR)); hypctx->cpacr_el1 = READ_SPECIALREG(EL1_REG(CPACR)); hypctx->esr_el1 = READ_SPECIALREG(EL1_REG(ESR)); hypctx->far_el1 = READ_SPECIALREG(EL1_REG(FAR)); hypctx->mair_el1 = READ_SPECIALREG(EL1_REG(MAIR)); hypctx->sctlr_el1 = READ_SPECIALREG(EL1_REG(SCTLR)); hypctx->spsr_el1 = READ_SPECIALREG(EL1_REG(SPSR)); hypctx->tcr_el1 = READ_SPECIALREG(EL1_REG(TCR)); /* TODO: Support when this is not res0 */ hypctx->tcr2_el1 = 0; hypctx->ttbr0_el1 = READ_SPECIALREG(EL1_REG(TTBR0)); hypctx->ttbr1_el1 = READ_SPECIALREG(EL1_REG(TTBR1)); } hypctx->cptr_el2 = READ_SPECIALREG(cptr_el2); hypctx->hcr_el2 = READ_SPECIALREG(hcr_el2); hypctx->vpidr_el2 = READ_SPECIALREG(vpidr_el2); hypctx->vmpidr_el2 = READ_SPECIALREG(vmpidr_el2); } static void vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest, bool ecv_poff) { uint64_t dfr0; /* Restore the special registers */ WRITE_SPECIALREG(hcr_el2, hypctx->hcr_el2); if (guest) { if ((hyp->feats & HYP_FEAT_HCX) != 0) WRITE_SPECIALREG(HCRX_EL2_REG, hypctx->hcrx_el2); } isb(); WRITE_SPECIALREG(sp_el0, hypctx->sp_el0); WRITE_SPECIALREG(tpidr_el0, hypctx->tpidr_el0); WRITE_SPECIALREG(tpidrro_el0, hypctx->tpidrro_el0); WRITE_SPECIALREG(tpidr_el1, hypctx->tpidr_el1); WRITE_SPECIALREG(actlr_el1, hypctx->actlr_el1); WRITE_SPECIALREG(csselr_el1, hypctx->csselr_el1); WRITE_SPECIALREG(mdccint_el1, hypctx->mdccint_el1); WRITE_SPECIALREG(mdscr_el1, hypctx->mdscr_el1); if (guest_or_nonvhe(guest)) { WRITE_SPECIALREG(EL1_REG(ELR), hypctx->elr_el1); WRITE_SPECIALREG(EL1_REG(VBAR), hypctx->vbar_el1); WRITE_SPECIALREG(EL1_REG(AFSR0), hypctx->afsr0_el1); WRITE_SPECIALREG(EL1_REG(AFSR1), hypctx->afsr1_el1); WRITE_SPECIALREG(EL1_REG(AMAIR), hypctx->amair_el1); WRITE_SPECIALREG(EL1_REG(CONTEXTIDR), hypctx->contextidr_el1); WRITE_SPECIALREG(EL1_REG(CPACR), hypctx->cpacr_el1); WRITE_SPECIALREG(EL1_REG(ESR), hypctx->esr_el1); WRITE_SPECIALREG(EL1_REG(FAR), hypctx->far_el1); WRITE_SPECIALREG(EL1_REG(MAIR), hypctx->mair_el1); // WRITE_SPECIALREG(EL1_REG(SCTLR), hypctx->sctlr_el1); WRITE_SPECIALREG(EL1_REG(SPSR), hypctx->spsr_el1); WRITE_SPECIALREG(EL1_REG(TCR), hypctx->tcr_el1); /* TODO: tcr2_el1 */ WRITE_SPECIALREG(EL1_REG(TTBR0), hypctx->ttbr0_el1); WRITE_SPECIALREG(EL1_REG(TTBR1), hypctx->ttbr1_el1); } if (guest) { WRITE_SPECIALREG(par_el1,
hypctx->par_el1); } WRITE_SPECIALREG(cptr_el2, hypctx->cptr_el2); WRITE_SPECIALREG(vpidr_el2, hypctx->vpidr_el2); WRITE_SPECIALREG(vmpidr_el2, hypctx->vmpidr_el2); /* Load the special regs from the trapframe */ WRITE_SPECIALREG(sp_el1, hypctx->tf.tf_sp); WRITE_SPECIALREG(elr_el2, hypctx->tf.tf_elr); WRITE_SPECIALREG(spsr_el2, hypctx->tf.tf_spsr); /* Restore the PMU registers */ WRITE_SPECIALREG(pmcr_el0, hypctx->pmcr_el0); WRITE_SPECIALREG(pmccntr_el0, hypctx->pmccntr_el0); WRITE_SPECIALREG(pmccfiltr_el0, hypctx->pmccfiltr_el0); WRITE_SPECIALREG(pmuserenr_el0, hypctx->pmuserenr_el0); WRITE_SPECIALREG(pmselr_el0, hypctx->pmselr_el0); WRITE_SPECIALREG(pmxevcntr_el0, hypctx->pmxevcntr_el0); /* Clear all events/interrupts then enable them */ WRITE_SPECIALREG(pmcntenclr_el0, ~0ul); WRITE_SPECIALREG(pmcntenset_el0, hypctx->pmcntenset_el0); WRITE_SPECIALREG(pmintenclr_el1, ~0ul); WRITE_SPECIALREG(pmintenset_el1, hypctx->pmintenset_el1); WRITE_SPECIALREG(pmovsclr_el0, ~0ul); WRITE_SPECIALREG(pmovsset_el0, hypctx->pmovsset_el0); switch ((hypctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT) { #define LOAD_PMU(x) \ case (x + 1): \ WRITE_SPECIALREG(pmevcntr ## x ## _el0, \ hypctx->pmevcntr_el0[x]); \ WRITE_SPECIALREG(pmevtyper ## x ## _el0, \ hypctx->pmevtyper_el0[x]) LOAD_PMU(30); LOAD_PMU(29); LOAD_PMU(28); LOAD_PMU(27); LOAD_PMU(26); LOAD_PMU(25); LOAD_PMU(24); LOAD_PMU(23); LOAD_PMU(22); LOAD_PMU(21); LOAD_PMU(20); LOAD_PMU(19); LOAD_PMU(18); LOAD_PMU(17); LOAD_PMU(16); LOAD_PMU(15); LOAD_PMU(14); LOAD_PMU(13); LOAD_PMU(12); LOAD_PMU(11); LOAD_PMU(10); LOAD_PMU(9); LOAD_PMU(8); LOAD_PMU(7); LOAD_PMU(6); LOAD_PMU(5); LOAD_PMU(4); LOAD_PMU(3); LOAD_PMU(2); LOAD_PMU(1); LOAD_PMU(0); default: /* N == 0 when only PMCCNTR_EL0 is available */ break; #undef LOAD_PMU } WRITE_SPECIALREG(dbgclaimclr_el1, ~0ul); WRITE_SPECIALREG(dbgclaimclr_el1, hypctx->dbgclaimset_el1); dfr0 = READ_SPECIALREG(id_aa64dfr0_el1); switch (ID_AA64DFR0_BRPs_VAL(dfr0) - 1) { #define LOAD_DBG_BRP(x) \ case x: \ WRITE_SPECIALREG(dbgbcr ## x ## _el1, \ hypctx->dbgbcr_el1[x]); \ WRITE_SPECIALREG(dbgbvr ## x ## _el1, \ hypctx->dbgbvr_el1[x]) LOAD_DBG_BRP(15); LOAD_DBG_BRP(14); LOAD_DBG_BRP(13); LOAD_DBG_BRP(12); LOAD_DBG_BRP(11); LOAD_DBG_BRP(10); LOAD_DBG_BRP(9); LOAD_DBG_BRP(8); LOAD_DBG_BRP(7); LOAD_DBG_BRP(6); LOAD_DBG_BRP(5); LOAD_DBG_BRP(4); LOAD_DBG_BRP(3); LOAD_DBG_BRP(2); LOAD_DBG_BRP(1); default: LOAD_DBG_BRP(0); #undef LOAD_DBG_BRP } switch (ID_AA64DFR0_WRPs_VAL(dfr0) - 1) { #define LOAD_DBG_WRP(x) \ case x: \ WRITE_SPECIALREG(dbgwcr ## x ## _el1, \ hypctx->dbgwcr_el1[x]); \ WRITE_SPECIALREG(dbgwvr ## x ## _el1, \ hypctx->dbgwvr_el1[x]) LOAD_DBG_WRP(15); LOAD_DBG_WRP(14); LOAD_DBG_WRP(13); LOAD_DBG_WRP(12); LOAD_DBG_WRP(11); LOAD_DBG_WRP(10); LOAD_DBG_WRP(9); LOAD_DBG_WRP(8); LOAD_DBG_WRP(7); LOAD_DBG_WRP(6); LOAD_DBG_WRP(5); LOAD_DBG_WRP(4); LOAD_DBG_WRP(3); LOAD_DBG_WRP(2); LOAD_DBG_WRP(1); default: LOAD_DBG_WRP(0); #undef LOAD_DBG_WRP } if (guest) { /* Load the timer registers */ WRITE_SPECIALREG(EL1_REG(CNTKCTL), hypctx->vtimer_cpu.cntkctl_el1); WRITE_SPECIALREG(EL0_REG(CNTV_CVAL), hypctx->vtimer_cpu.virt_timer.cntx_cval_el0); WRITE_SPECIALREG(EL0_REG(CNTV_CTL), hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0); WRITE_SPECIALREG(cnthctl_el2, hyp->vtimer.cnthctl_el2); WRITE_SPECIALREG(cntvoff_el2, hyp->vtimer.cntvoff_el2); if (ecv_poff) { /* * Load the same offset as the virtual timer * to keep in sync. 
*/ WRITE_SPECIALREG(CNTPOFF_EL2_REG, hyp->vtimer.cntvoff_el2); isb(); } } if (guest_or_nonvhe(guest) && ecv_poff) { /* * If we have ECV then the guest could modify these registers. * If VHE is enabled then the kernel will see a different view * of the registers, so doesn't need to handle them. */ WRITE_SPECIALREG(EL0_REG(CNTP_CVAL), hypctx->vtimer_cpu.phys_timer.cntx_cval_el0); WRITE_SPECIALREG(EL0_REG(CNTP_CTL), hypctx->vtimer_cpu.phys_timer.cntx_ctl_el0); } if (guest) { /* Load the GICv3 registers */ WRITE_SPECIALREG(ich_hcr_el2, hypctx->vgic_v3_regs.ich_hcr_el2); WRITE_SPECIALREG(ich_vmcr_el2, hypctx->vgic_v3_regs.ich_vmcr_el2); switch (hypctx->vgic_v3_regs.ich_lr_num - 1) { #define LOAD_LR(x) \ case x: \ WRITE_SPECIALREG(ich_lr ## x ##_el2, \ hypctx->vgic_v3_regs.ich_lr_el2[x]) LOAD_LR(15); LOAD_LR(14); LOAD_LR(13); LOAD_LR(12); LOAD_LR(11); LOAD_LR(10); LOAD_LR(9); LOAD_LR(8); LOAD_LR(7); LOAD_LR(6); LOAD_LR(5); LOAD_LR(4); LOAD_LR(3); LOAD_LR(2); LOAD_LR(1); default: LOAD_LR(0); #undef LOAD_LR } switch (hypctx->vgic_v3_regs.ich_apr_num - 1) { #define LOAD_APR(x) \ case x: \ WRITE_SPECIALREG(ich_ap0r ## x ##_el2, \ hypctx->vgic_v3_regs.ich_ap0r_el2[x]); \ WRITE_SPECIALREG(ich_ap1r ## x ##_el2, \ hypctx->vgic_v3_regs.ich_ap1r_el2[x]) LOAD_APR(3); LOAD_APR(2); LOAD_APR(1); default: LOAD_APR(0); #undef LOAD_APR } } } static uint64_t vmm_hyp_call_guest(struct hyp *hyp, struct hypctx *hypctx) { struct hypctx host_hypctx; uint64_t cntvoff_el2; uint64_t ich_hcr_el2, ich_vmcr_el2, cnthctl_el2, cntkctl_el1; #ifndef VMM_VHE uint64_t hcrx_el2; #endif uint64_t ret; uint64_t s1e1r, hpfar_el2; bool ecv_poff, hpfar_valid; ecv_poff = (hyp->vtimer.cnthctl_el2 & CNTHCTL_ECV_EN) != 0; vmm_hyp_reg_store(&host_hypctx, NULL, false, ecv_poff); #ifndef VMM_VHE if ((hyp->feats & HYP_FEAT_HCX) != 0) hcrx_el2 = READ_SPECIALREG(MRS_REG_ALT_NAME(HCRX_EL2)); #endif /* Save the host special registers */ cnthctl_el2 = READ_SPECIALREG(cnthctl_el2); cntkctl_el1 = READ_SPECIALREG(cntkctl_el1); cntvoff_el2 = READ_SPECIALREG(cntvoff_el2); ich_hcr_el2 = READ_SPECIALREG(ich_hcr_el2); ich_vmcr_el2 = READ_SPECIALREG(ich_vmcr_el2); vmm_hyp_reg_restore(hypctx, hyp, true, ecv_poff); /* Load the common hypervisor registers */ WRITE_SPECIALREG(vttbr_el2, hyp->vttbr_el2); host_hypctx.mdcr_el2 = READ_SPECIALREG(mdcr_el2); WRITE_SPECIALREG(mdcr_el2, hypctx->mdcr_el2); /* Call into the guest */ ret = VMM_HYP_FUNC(do_call_guest)(hypctx); WRITE_SPECIALREG(mdcr_el2, host_hypctx.mdcr_el2); isb(); /* Store the exit info */ hypctx->exit_info.far_el2 = READ_SPECIALREG(far_el2); vmm_hyp_reg_store(hypctx, hyp, true, ecv_poff); hpfar_valid = true; if (ret == EXCP_TYPE_EL1_SYNC) { switch (ESR_ELx_EXCEPTION(hypctx->tf.tf_esr)) { case EXCP_INSN_ABORT_L: case EXCP_DATA_ABORT_L: /* * The hpfar_el2 register is valid for: * - Translation and Access faults. * - Translation, Access, and permission faults on * the translation table walk on the stage 1 tables. * - A stage 2 Address size fault. * * As we only need it in the first 2 cases we can just * exclude it on permission faults that are not from * the stage 1 table walk. * * TODO: Add a case for Arm erratum 834220. */ if ((hypctx->tf.tf_esr & ISS_DATA_S1PTW) != 0) break; switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_PF_L1: case ISS_DATA_DFSC_PF_L2: case ISS_DATA_DFSC_PF_L3: hpfar_valid = false; break; } break; } } if (hpfar_valid) { hypctx->exit_info.hpfar_el2 = READ_SPECIALREG(hpfar_el2); } else { /* * TODO: There is a risk the at instruction could cause an * exception here. 
We should handle it & return a failure. */ s1e1r = arm64_address_translate_s1e1r(hypctx->exit_info.far_el2); if (PAR_SUCCESS(s1e1r)) { hpfar_el2 = (s1e1r & PAR_PA_MASK) >> PAR_PA_SHIFT; hpfar_el2 <<= HPFAR_EL2_FIPA_SHIFT; hypctx->exit_info.hpfar_el2 = hpfar_el2; } else { ret = EXCP_TYPE_REENTER; } } vmm_hyp_reg_restore(&host_hypctx, NULL, false, ecv_poff); #ifndef VMM_VHE if ((hyp->feats & HYP_FEAT_HCX) != 0) WRITE_SPECIALREG(MRS_REG_ALT_NAME(HCRX_EL2), hcrx_el2); #endif /* Restore the host special registers */ WRITE_SPECIALREG(ich_hcr_el2, ich_hcr_el2); WRITE_SPECIALREG(ich_vmcr_el2, ich_vmcr_el2); WRITE_SPECIALREG(cnthctl_el2, cnthctl_el2); WRITE_SPECIALREG(cntkctl_el1, cntkctl_el1); WRITE_SPECIALREG(cntvoff_el2, cntvoff_el2); return (ret); } VMM_STATIC uint64_t VMM_HYP_FUNC(enter_guest)(struct hyp *hyp, struct hypctx *hypctx) { uint64_t ret; do { ret = vmm_hyp_call_guest(hyp, hypctx); } while (ret == EXCP_TYPE_REENTER); return (ret); } VMM_STATIC uint64_t VMM_HYP_FUNC(read_reg)(uint64_t reg) { switch (reg) { case HYP_REG_ICH_VTR: return (READ_SPECIALREG(ich_vtr_el2)); } return (0); } VMM_STATIC void VMM_HYP_FUNC(clean_s2_tlbi)(void) { dsb(ishst); __asm __volatile("tlbi alle1is"); dsb(ish); } VMM_STATIC void VMM_HYP_FUNC(s2_tlbi_range)(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, bool final_only) { uint64_t end, r, start; uint64_t host_vttbr; #ifdef VMM_VHE uint64_t host_tcr; #endif #ifdef VMM_VHE dsb(ishst); #endif #define TLBI_VA_SHIFT 12 #define TLBI_VA_MASK ((1ul << 44) - 1) #define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK) #define TLBI_VA_L3_INCR (L3_SIZE >> TLBI_VA_SHIFT) /* Switch to the guest vttbr */ /* TODO: Handle Cortex-A57/A72 erratum 131936 */ host_vttbr = READ_SPECIALREG(vttbr_el2); WRITE_SPECIALREG(vttbr_el2, vttbr); isb(); #ifdef VMM_VHE host_tcr = READ_SPECIALREG(tcr_el2); WRITE_SPECIALREG(tcr_el2, host_tcr & ~HCR_TGE); isb(); #endif /* * The CPU can cache the stage 1 + 2 combination so we need to ensure * the stage 2 is invalidated first, then when this has completed we * invalidate the stage 1 TLB. As we don't know which stage 1 virtual * addresses point at the stage 2 IPA we need to invalidate the entire * stage 1 TLB. */ start = TLBI_VA(sva); end = TLBI_VA(eva); for (r = start; r < end; r += TLBI_VA_L3_INCR) { /* Invalidate the stage 2 TLB entry */ if (final_only) __asm __volatile("tlbi ipas2le1is, %0" : : "r"(r)); else __asm __volatile("tlbi ipas2e1is, %0" : : "r"(r)); } /* Ensure the entry has been invalidated */ dsb(ish); /* Invalidate the stage 1 TLB. 
*/ __asm __volatile("tlbi vmalle1is"); dsb(ish); isb(); #ifdef VMM_VHE WRITE_SPECIALREG(tcr_el2, host_tcr); isb(); #endif /* Switch back to the host vttbr */ WRITE_SPECIALREG(vttbr_el2, host_vttbr); isb(); } VMM_STATIC void VMM_HYP_FUNC(s2_tlbi_all)(uint64_t vttbr) { uint64_t host_vttbr; #ifdef VMM_VHE dsb(ishst); #endif /* Switch to the guest vttbr */ /* TODO: Handle Cortex-A57/A72 erratum 131936 */ host_vttbr = READ_SPECIALREG(vttbr_el2); WRITE_SPECIALREG(vttbr_el2, vttbr); isb(); __asm __volatile("tlbi vmalls12e1is"); dsb(ish); isb(); /* Switch back to the host vttbr */ WRITE_SPECIALREG(vttbr_el2, host_vttbr); isb(); } diff --git a/sys/arm64/vmm/vmm_reset.c b/sys/arm64/vmm/vmm_reset.c index 1240c3ed16ec..0e4910ea87b4 100644 --- a/sys/arm64/vmm/vmm_reset.c +++ b/sys/arm64/vmm/vmm_reset.c @@ -1,189 +1,188 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2018 Alexandru Elisei * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include -#include #include #include #include "arm64.h" #include "reset.h" /* * Make the architecturally UNKNOWN value 0. As a bonus, we don't have to * manually set all those RES0 fields.
*/ #define ARCH_UNKNOWN 0 #define set_arch_unknown(reg) (memset(&(reg), ARCH_UNKNOWN, sizeof(reg))) void reset_vm_el01_regs(void *vcpu) { struct hypctx *el2ctx; el2ctx = vcpu; set_arch_unknown(el2ctx->tf); set_arch_unknown(el2ctx->actlr_el1); set_arch_unknown(el2ctx->afsr0_el1); set_arch_unknown(el2ctx->afsr1_el1); set_arch_unknown(el2ctx->amair_el1); set_arch_unknown(el2ctx->contextidr_el1); set_arch_unknown(el2ctx->cpacr_el1); set_arch_unknown(el2ctx->csselr_el1); set_arch_unknown(el2ctx->elr_el1); set_arch_unknown(el2ctx->esr_el1); set_arch_unknown(el2ctx->far_el1); set_arch_unknown(el2ctx->mair_el1); set_arch_unknown(el2ctx->mdccint_el1); set_arch_unknown(el2ctx->mdscr_el1); set_arch_unknown(el2ctx->par_el1); /* * Guest starts with: * ~SCTLR_M: MMU off * ~SCTLR_C: data cache off * SCTLR_CP15BEN: memory barrier instruction enable from EL0; RAO/WI * ~SCTLR_I: instruction cache off */ el2ctx->sctlr_el1 = SCTLR_RES1; el2ctx->sctlr_el1 &= ~SCTLR_M & ~SCTLR_C & ~SCTLR_I; el2ctx->sctlr_el1 |= SCTLR_CP15BEN; set_arch_unknown(el2ctx->sp_el0); set_arch_unknown(el2ctx->tcr_el1); set_arch_unknown(el2ctx->tpidr_el0); set_arch_unknown(el2ctx->tpidr_el1); set_arch_unknown(el2ctx->tpidrro_el0); set_arch_unknown(el2ctx->ttbr0_el1); set_arch_unknown(el2ctx->ttbr1_el1); set_arch_unknown(el2ctx->vbar_el1); set_arch_unknown(el2ctx->spsr_el1); set_arch_unknown(el2ctx->dbgbcr_el1); set_arch_unknown(el2ctx->dbgbvr_el1); set_arch_unknown(el2ctx->dbgwcr_el1); set_arch_unknown(el2ctx->dbgwvr_el1); el2ctx->pmcr_el0 = READ_SPECIALREG(pmcr_el0) & PMCR_N_MASK; /* PMCR_LC is unknown when AArch32 is supported or RES1 otherwise */ el2ctx->pmcr_el0 |= PMCR_LC; set_arch_unknown(el2ctx->pmccntr_el0); set_arch_unknown(el2ctx->pmccfiltr_el0); set_arch_unknown(el2ctx->pmuserenr_el0); set_arch_unknown(el2ctx->pmselr_el0); set_arch_unknown(el2ctx->pmxevcntr_el0); set_arch_unknown(el2ctx->pmcntenset_el0); set_arch_unknown(el2ctx->pmintenset_el1); set_arch_unknown(el2ctx->pmovsset_el0); memset(el2ctx->pmevcntr_el0, 0, sizeof(el2ctx->pmevcntr_el0)); memset(el2ctx->pmevtyper_el0, 0, sizeof(el2ctx->pmevtyper_el0)); } void reset_vm_el2_regs(void *vcpu) { struct hypctx *el2ctx; uint64_t cpu_aff, vcpuid; el2ctx = vcpu; vcpuid = vcpu_vcpuid(el2ctx->vcpu); /* * Set the Hypervisor Configuration Register: * * HCR_RW: use AArch64 for EL1 * HCR_TID3: handle ID registers in the vmm to provide a common * set of features on all vcpus * HCR_TWI: Trap WFI to the hypervisor * HCR_BSU_IS: barrier instructions apply to the inner shareable * domain * HCR_FB: broadcast maintenance operations * HCR_AMO: route physical SError interrupts to EL2 * HCR_IMO: route physical IRQ interrupts to EL2 * HCR_FMO: route physical FIQ interrupts to EL2 * HCR_SWIO: turn set/way invalidate into set/way clean and * invalidate * HCR_VM: use stage 2 translation */ el2ctx->hcr_el2 = HCR_RW | HCR_TID3 | HCR_TWI | HCR_BSU_IS | HCR_FB | HCR_AMO | HCR_IMO | HCR_FMO | HCR_SWIO | HCR_VM; if (in_vhe()) { el2ctx->hcr_el2 |= HCR_E2H; } /* Set the Extended Hypervisor Configuration Register */ el2ctx->hcrx_el2 = 0; /* TODO: Trap all extensions we don't support */ el2ctx->mdcr_el2 = MDCR_EL2_TDOSA | MDCR_EL2_TDRA | MDCR_EL2_TPMS | MDCR_EL2_TTRF; /* PMCR_EL0.N is read from MDCR_EL2.HPMN */ el2ctx->mdcr_el2 |= (el2ctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT; el2ctx->vmpidr_el2 = VMPIDR_EL2_RES1; /* The guest will detect a multi-core, single-threaded CPU */ el2ctx->vmpidr_el2 &= ~VMPIDR_EL2_U & ~VMPIDR_EL2_MT; /* * Generate the guest MPIDR value.
We only support 16 CPUs at affinity * level 0 to simplify the vgicv3 driver (see writing sgi1r_el1). */ cpu_aff = (vcpuid & 0xf) << MPIDR_AFF0_SHIFT | ((vcpuid >> 4) & 0xff) << MPIDR_AFF1_SHIFT | ((vcpuid >> 12) & 0xff) << MPIDR_AFF2_SHIFT | ((vcpuid >> 20) & 0xff) << MPIDR_AFF3_SHIFT; el2ctx->vmpidr_el2 |= cpu_aff; /* Use the same CPU identification information as the host */ el2ctx->vpidr_el2 = CPU_IMPL_TO_MIDR(CPU_IMPL_ARM); el2ctx->vpidr_el2 |= CPU_VAR_TO_MIDR(0); el2ctx->vpidr_el2 |= CPU_ARCH_TO_MIDR(0xf); el2ctx->vpidr_el2 |= CPU_PART_TO_MIDR(CPU_PART_FOUNDATION); el2ctx->vpidr_el2 |= CPU_REV_TO_MIDR(0); /* * Don't trap accesses to CPACR_EL1, trace, SVE, Advanced SIMD * and floating point functionality to EL2. */ if (in_vhe()) el2ctx->cptr_el2 = CPTR_E2H_TRAP_ALL | CPTR_E2H_FPEN; else el2ctx->cptr_el2 = CPTR_TRAP_ALL & ~CPTR_TFP; el2ctx->cptr_el2 &= ~CPTR_TCPAC; /* * Disable interrupts in the guest. The guest OS will re-enable * them. */ el2ctx->tf.tf_spsr = PSR_D | PSR_A | PSR_I | PSR_F; /* Use the EL1 stack when taking exceptions to EL1 */ el2ctx->tf.tf_spsr |= PSR_M_EL1h; }
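The MPIDR packing in reset_vm_el2_regs() above folds a flat vcpuid into the architectural affinity fields, with affinity level 0 limited to 16 CPUs. A minimal user-space sketch of that packing, for illustration only (the AFFn_SHIFT constants below are local stand-ins for the architectural field offsets rather than the MPIDR_AFF*_SHIFT macros used in the kernel code):

#include <stdint.h>
#include <stdio.h>

/* Architectural MPIDR affinity field offsets: Aff0[7:0], Aff1[15:8], Aff2[23:16], Aff3[39:32]. */
#define AFF0_SHIFT	0
#define AFF1_SHIFT	8
#define AFF2_SHIFT	16
#define AFF3_SHIFT	32

/* Mirror the vcpuid -> affinity split used in reset_vm_el2_regs(). */
static uint64_t
vcpuid_to_affinity(uint64_t vcpuid)
{
	return ((vcpuid & 0xf) << AFF0_SHIFT |
	    ((vcpuid >> 4) & 0xff) << AFF1_SHIFT |
	    ((vcpuid >> 12) & 0xff) << AFF2_SHIFT |
	    ((vcpuid >> 20) & 0xff) << AFF3_SHIFT);
}

int
main(void)
{
	/* vcpuid 21 -> Aff0 = 5, Aff1 = 1, so the affinity fields read back as 0x105. */
	printf("0x%jx\n", (uintmax_t)vcpuid_to_affinity(21));
	return (0);
}

The guest's MPIDR_EL1 is this affinity value combined with VMPIDR_EL2_RES1 and with the U and MT bits cleared, which is why the guest observes a multi-core, single-threaded topology.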