diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -304,7 +304,6 @@ int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); int vm_restore_time(struct vm *vm); -#ifdef _SYS__CPUSET_H_ /* * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. * The rendezvous 'func(arg)' is not allowed to do anything that will @@ -325,7 +324,6 @@ cpuset_t vm_suspended_cpus(struct vm *vm); cpuset_t vm_start_cpus(struct vm *vm, const cpuset_t *tostart); void vm_await_start(struct vm *vm, const cpuset_t *waiting); -#endif /* _SYS__CPUSET_H_ */ static __inline int vcpu_rendezvous_pending(struct vcpu *vcpu, struct vm_eventinfo *info) diff --git a/sys/arm64/arm64/genassym.c b/sys/arm64/arm64/genassym.c --- a/sys/arm64/arm64/genassym.c +++ b/sys/arm64/arm64/genassym.c @@ -73,6 +73,7 @@ ASSYM(TF_SIZE, sizeof(struct trapframe)); ASSYM(TF_SP, offsetof(struct trapframe, tf_sp)); +ASSYM(TF_LR, offsetof(struct trapframe, tf_lr)); ASSYM(TF_ELR, offsetof(struct trapframe, tf_elr)); ASSYM(TF_SPSR, offsetof(struct trapframe, tf_spsr)); ASSYM(TF_ESR, offsetof(struct trapframe, tf_esr)); diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h --- a/sys/arm64/include/armreg.h +++ b/sys/arm64/include/armreg.h @@ -357,6 +357,12 @@ #define ISS_MSR_REG_MASK \ (ISS_MSR_OP0_MASK | ISS_MSR_OP2_MASK | ISS_MSR_OP1_MASK | \ ISS_MSR_CRn_MASK | ISS_MSR_CRm_MASK) +#define ISS_MSR_REG(reg) \ + (((reg ## _op0) << ISS_MSR_OP0_SHIFT) | \ + ((reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ + ((reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ + ((reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ + ((reg ## _op2) << ISS_MSR_OP2_SHIFT)) #define ISS_DATA_ISV_SHIFT 24 #define ISS_DATA_ISV (0x01 << ISS_DATA_ISV_SHIFT) diff --git a/sys/arm64/include/hypervisor.h b/sys/arm64/include/hypervisor.h --- a/sys/arm64/include/hypervisor.h +++ b/sys/arm64/include/hypervisor.h @@ -126,6 +126,8 @@ /* HPFAR_EL2_FIPA holds the 4k page address */ #define HPFAR_EL2_FIPA_ADDR(x) \ (HPFAR_EL2_FIPA_GET(x) << 12) +/* The bits from FAR_EL2 we need to add to HPFAR_EL2_FIPA_ADDR */ +#define FAR_EL2_HPFAR_PAGE_MASK (0xffful) /* ICC_SRE_EL2 */ #define ICC_SRE_EL2_SRE (1UL << 0) @@ -169,6 +171,7 @@ #define TCR_EL2_TG0_64K (0x1UL << TCR_EL2_TG0_SHIFT) #define TCR_EL2_TG0_16K (0x2UL << TCR_EL2_TG0_SHIFT) #define TCR_EL2_PS_SHIFT 16 +#define TCR_EL2_PS_MASK (0xfUL << TCR_EL2_PS_SHIFT) #define TCR_EL2_PS_32BITS (0UL << TCR_EL2_PS_SHIFT) #define TCR_EL2_PS_36BITS (1UL << TCR_EL2_PS_SHIFT) #define TCR_EL2_PS_40BITS (2UL << TCR_EL2_PS_SHIFT) diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h new file mode 100644 --- /dev/null +++ b/sys/arm64/include/vmm.h @@ -0,0 +1,362 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#include +#include +#include +#include + +#include "pte.h" +#include "pmap.h" + +struct vcpu; + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_X0 = 0, + VM_REG_GUEST_X1, + VM_REG_GUEST_X2, + VM_REG_GUEST_X3, + VM_REG_GUEST_X4, + VM_REG_GUEST_X5, + VM_REG_GUEST_X6, + VM_REG_GUEST_X7, + VM_REG_GUEST_X8, + VM_REG_GUEST_X9, + VM_REG_GUEST_X10, + VM_REG_GUEST_X11, + VM_REG_GUEST_X12, + VM_REG_GUEST_X13, + VM_REG_GUEST_X14, + VM_REG_GUEST_X15, + VM_REG_GUEST_X16, + VM_REG_GUEST_X17, + VM_REG_GUEST_X18, + VM_REG_GUEST_X19, + VM_REG_GUEST_X20, + VM_REG_GUEST_X21, + VM_REG_GUEST_X22, + VM_REG_GUEST_X23, + VM_REG_GUEST_X24, + VM_REG_GUEST_X25, + VM_REG_GUEST_X26, + VM_REG_GUEST_X27, + VM_REG_GUEST_X28, + VM_REG_GUEST_X29, + VM_REG_GUEST_LR, + VM_REG_GUEST_SP, + VM_REG_GUEST_PC, + VM_REG_GUEST_CPSR, + + VM_REG_GUEST_SCTLR_EL1, + VM_REG_GUEST_TTBR0_EL1, + VM_REG_GUEST_TTBR1_EL1, + VM_REG_GUEST_TCR_EL1, + VM_REG_GUEST_TCR2_EL1, + VM_REG_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + +#define VM_MAX_SUFFIXLEN 15 + +#define VM_GUEST_BASE_IPA 0x80000000UL /* Guest kernel start ipa */ + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_exception; +struct vm_exit; +struct vm_run; +struct vm_object; +struct vm_guest_paging; +struct vm_vgic_descr; +struct pmap; + +struct vm_eventinfo { + void *rptr; /* rendezvous cookie */ + int *sptr; /* suspend cookie */ + int *iptr; /* reqidle cookie */ +}; + +int vm_create(const char *name, struct vm **retvm); +struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); +void vm_slock_vcpus(struct vm *vm); +void vm_unlock_vcpus(struct vm *vm); +void vm_destroy(struct vm *vm); +int vm_reinit(struct vm *vm); +const char *vm_name(struct vm *vm); + +/* + * APIs that modify the guest memory map require all vcpus to be frozen. + */ +void vm_slock_memsegs(struct vm *vm); +void vm_xlock_memsegs(struct vm *vm); +void vm_unlock_memsegs(struct vm *vm); +int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot, int flags); +int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +void vm_free_memseg(struct vm *vm, int ident); + +/* + * APIs that inspect the guest memory map require only a *single* vcpu to + * be frozen. This acts like a read lock on the guest memory map since any + * modification requires *all* vcpus to be frozen. 
+ */ +int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + struct vm_object **objptr); +vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); +void *vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, + int prot, void **cookie); +void *vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, + int prot, void **cookie); +void vm_gpa_release(void *cookie); +bool vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa); + +int vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +uint16_t vm_get_maxcpus(struct vm *vm); +void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); +int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); +int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vcpu *vcpu, int reg, uint64_t val); +int vm_run(struct vcpu *vcpu); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); +void* vm_get_cookie(struct vm *vm); +int vcpu_vcpuid(struct vcpu *vcpu); +void *vcpu_get_cookie(struct vcpu *vcpu); +struct vm *vcpu_vm(struct vcpu *vcpu); +struct vcpu *vm_vcpu(struct vm *vm, int cpu); +int vm_get_capability(struct vcpu *vcpu, int type, int *val); +int vm_set_capability(struct vcpu *vcpu, int type, int val); +int vm_activate_cpu(struct vcpu *vcpu); +int vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu); +int vm_resume_cpu(struct vm *vm, struct vcpu *vcpu); +int vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far); +int vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr); +int vm_assert_irq(struct vm *vm, uint32_t irq); +int vm_deassert_irq(struct vm *vm, uint32_t irq); +int vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, + int func); +struct vm_exit *vm_exitinfo(struct vcpu *vcpu); +void vm_exit_suspended(struct vcpu *vcpu, uint64_t pc); +void vm_exit_debug(struct vcpu *vcpu, uint64_t pc); +void vm_exit_rendezvous(struct vcpu *vcpu, uint64_t pc); +void vm_exit_astpending(struct vcpu *vcpu, uint64_t pc); + +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_debug_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); + +static __inline bool +virt_enabled(void) +{ + + return (has_hyp()); +} + +static __inline int +vcpu_rendezvous_pending(struct vm_eventinfo *info) +{ + + return (*((uintptr_t *)(info->rptr)) != 0); +} + +static __inline int +vcpu_suspended(struct vm_eventinfo *info) +{ + + return (*info->sptr); +} + +int vcpu_debugged(struct vcpu *vcpu); + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle); +enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu); + +static int __inline +vcpu_is_running(struct vcpu *vcpu, int *hostcpu) +{ + return (vcpu_get_state(vcpu, hostcpu) == VCPU_RUNNING); +} + +#ifdef _SYS_PROC_H_ +static int __inline +vcpu_should_yield(struct vcpu *vcpu) +{ + struct thread *td; + + td = curthread; + return (td->td_ast != 0 || td->td_owepreempt != 0); +} +#endif + +void *vcpu_stats(struct vcpu *vcpu); +void vcpu_notify_event(struct vcpu *vcpu); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +#endif /* _KERNEL */ + 
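
For context, here is a hedged, illustrative sketch of how the per-vcpu API declared in the _KERNEL block above is meant to fit together. It is not part of the patch: the helper name run_vcpu_until_suspend() and the simplistic "advance the PC" treatment of every exit are assumptions made purely to show the calls; the real exit dispatch lives in the vmm device code. It relies only on functions and identifiers declared in this header (vm_alloc_vcpu(), vm_activate_cpu(), vm_run(), vm_exitinfo(), vm_get_register()/vm_set_register(), VM_REG_GUEST_PC, VM_EXITCODE_SUSPENDED) and assumes the usual kernel headers are already in scope.

/*
 * Illustrative sketch only (not part of the patch): drive one vcpu with
 * the API declared above.  The helper name and the blanket "step over the
 * instruction" handling are assumptions for this example.
 */
static int
run_vcpu_until_suspend(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t pc;
	int error;

	/* Look up (or lazily create) the vcpu and mark it active. */
	vcpu = vm_alloc_vcpu(vm, vcpuid);
	if (vcpu == NULL)
		return (EINVAL);
	error = vm_activate_cpu(vcpu);
	if (error != 0)
		return (error);

	for (;;) {
		/* Enter the guest; returns on the next VM exit. */
		error = vm_run(vcpu);
		if (error != 0)
			break;

		/* Per-vcpu exit description filled in by vm_run(). */
		vme = vm_exitinfo(vcpu);
		if (vme->exitcode == VM_EXITCODE_SUSPENDED)
			break;

		/*
		 * Register accessors: read the guest PC and step over the
		 * exiting instruction (purely to show the get/set calls).
		 */
		error = vm_get_register(vcpu, VM_REG_GUEST_PC, &pc);
		if (error == 0)
			error = vm_set_register(vcpu, VM_REG_GUEST_PC,
			    pc + vme->inst_length);
		if (error != 0)
			break;
	}
	return (error);
}
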
+#define VM_DIR_READ 0 +#define VM_DIR_WRITE 1 + +#define VM_GP_M_MASK 0x1f +#define VM_GP_MMU_ENABLED (1 << 5) + +struct vm_guest_paging { + uint64_t ttbr0_addr; + uint64_t ttbr1_addr; + uint64_t tcr_el1; + uint64_t tcr2_el1; + int flags; + int padding; +}; + +struct vie { + uint8_t access_size:4, sign_extend:1, dir:1, unused:2; + enum vm_reg_name reg; +}; + +struct vre { + uint32_t inst_syndrome; + uint8_t dir:1, unused:7; + enum vm_reg_name reg; +}; + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_MAX +}; + +enum vm_exitcode { + VM_EXITCODE_BOGUS, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_REG_EMUL, + VM_EXITCODE_HVC, + VM_EXITCODE_SUSPENDED, + VM_EXITCODE_HYP, + VM_EXITCODE_WFI, + VM_EXITCODE_PAGING, + VM_EXITCODE_SMCCC, + VM_EXITCODE_DEBUG, + VM_EXITCODE_MAX +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; + uint64_t pc; + union { + /* + * ARM specific payload. + */ + struct { + uint32_t exception_nr; + uint32_t pad; + uint64_t esr_el2; /* Exception Syndrome Register */ + uint64_t far_el2; /* Fault Address Register */ + uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ + } hyp; + struct { + struct vre vre; + } reg_emul; + struct { + uint64_t gpa; + uint64_t esr; + } paging; + struct { + uint64_t gpa; + struct vm_guest_paging paging; + struct vie vie; + } inst_emul; + + /* + * A SMCCC call, e.g. starting a core via PSCI. + * Further arguments can be read by asking the kernel for + * all register values. + */ + struct { + uint64_t func_id; + uint64_t args[7]; + } smccc_call; + + struct { + enum vm_suspend_how how; + } suspended; + } u; +}; + +#endif /* _VMM_H_ */ diff --git a/sys/arm64/include/vmm_dev.h b/sys/arm64/include/vmm_dev.h new file mode 100644 --- /dev/null +++ b/sys/arm64/include/vmm_dev.h @@ -0,0 +1,272 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +int vmmdev_cleanup(void); +#endif + +struct vm_memmap { + vm_paddr_t gpa; + int segid; /* memory segment */ + vm_ooffset_t segoff; /* offset into memory segment */ + size_t len; /* mmap length */ + int prot; /* RWX */ + int flags; +}; +#define VM_MEMMAP_F_WIRED 0x01 + +struct vm_munmap { + vm_paddr_t gpa; + size_t len; +}; + +#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL) +struct vm_memseg { + int segid; + size_t len; + char name[VM_MAX_SUFFIXLEN + 1]; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_register_set { + int cpuid; + unsigned int count; + const int *regnums; /* enum vm_reg_name */ + uint64_t *regvals; +}; + +struct vm_run { + int cpuid; + cpuset_t *cpuset; /* CPU set storage */ + size_t cpusetsize; + struct vm_exit *vm_exit; +}; + +struct vm_exception { + int cpuid; + uint64_t esr; + uint64_t far; +}; + +struct vm_msi { + uint64_t msg; + uint64_t addr; + int bus; + int slot; + int func; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int index; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +struct vm_suspend { + enum vm_suspend_how how; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + struct vm_guest_paging paging; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_cpuset { + int which; + int cpusetsize; + cpuset_t *cpus; +}; +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 +#define VM_DEBUG_CPUS 2 + +struct vm_vgic_version { + u_int version; + u_int flags; +}; + +struct vm_vgic_descr { + struct vm_vgic_version ver; + union { + struct { + uint64_t dist_start; + uint64_t dist_size; + uint64_t redist_start; + uint64_t redist_size; + } v3_regs; + }; +}; + +struct vm_irq { + uint32_t irq; +}; + +struct vm_cpu_topology { + uint16_t sockets; + uint16_t cores; + uint16_t threads; + uint16_t maxcpus; +}; + +enum { + /* general routines */ + IOCNUM_ABIVERS = 0, + IOCNUM_RUN = 1, + IOCNUM_SET_CAPABILITY = 2, + IOCNUM_GET_CAPABILITY = 3, + IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, + + /* memory apis */ + IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA_NOFAULT = 13, + IOCNUM_ALLOC_MEMSEG = 14, + IOCNUM_GET_MEMSEG = 15, + IOCNUM_MMAP_MEMSEG = 16, + IOCNUM_MMAP_GETNEXT = 17, + IOCNUM_MUNMAP_MEMSEG = 18, + + /* register/state accessors */ + IOCNUM_SET_REGISTER = 20, + IOCNUM_GET_REGISTER = 21, + IOCNUM_SET_REGISTER_SET = 24, + IOCNUM_GET_REGISTER_SET = 25, + + /* statistics */ + IOCNUM_VM_STATS = 50, + IOCNUM_VM_STAT_DESC = 51, + + /* CPU Topology */ + IOCNUM_SET_TOPOLOGY = 63, + IOCNUM_GET_TOPOLOGY = 64, + + /* interrupt injection */ + IOCNUM_ASSERT_IRQ = 80, + IOCNUM_DEASSERT_IRQ = 81, + IOCNUM_RAISE_MSI = 82, + IOCNUM_INJECT_EXCEPTION = 83, + + /* vm_cpuset */ + IOCNUM_ACTIVATE_CPU = 90, + IOCNUM_GET_CPUSET = 91, + IOCNUM_SUSPEND_CPU = 92, + IOCNUM_RESUME_CPU = 93, + + /* vm_attach_vgic */ + IOCNUM_GET_VGIC_VERSION = 110, + IOCNUM_ATTACH_VGIC = 111, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define 
VM_ALLOC_MEMSEG \ + _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) +#define VM_GET_MEMSEG \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) +#define VM_MMAP_MEMSEG \ + _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) +#define VM_MMAP_GETNEXT \ + _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) +#define VM_MUNMAP_MEMSEG \ + _IOW('v', IOCNUM_MUNMAP_MEMSEG, struct vm_munmap) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_REGISTER_SET \ + _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) +#define VM_GET_REGISTER_SET \ + _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_ASSERT_IRQ \ + _IOW('v', IOCNUM_ASSERT_IRQ, struct vm_irq) +#define VM_DEASSERT_IRQ \ + _IOW('v', IOCNUM_DEASSERT_IRQ, struct vm_irq) +#define VM_RAISE_MSI \ + _IOW('v', IOCNUM_RAISE_MSI, struct vm_msi) +#define VM_INJECT_EXCEPTION \ + _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) +#define VM_SET_TOPOLOGY \ + _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_TOPOLOGY \ + _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GLA2GPA_NOFAULT \ + _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) +#define VM_ACTIVATE_CPU \ + _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) +#define VM_GET_CPUS \ + _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_SUSPEND_CPU \ + _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) +#define VM_RESUME_CPU \ + _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) +#define VM_GET_VGIC_VERSION \ + _IOR('v', IOCNUM_GET_VGIC_VERSION, struct vm_vgic_version) +#define VM_ATTACH_VGIC \ + _IOW('v', IOCNUM_ATTACH_VGIC, struct vm_vgic_descr) +#endif diff --git a/sys/arm64/include/vmm_instruction_emul.h b/sys/arm64/include/vmm_instruction_emul.h new file mode 100644 --- /dev/null +++ b/sys/arm64/include/vmm_instruction_emul.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(struct vcpu *vcpu, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); +typedef int (*mem_region_write_t)(struct vcpu *vcpu, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Callback functions to read and write registers. + */ +typedef int (*reg_read_t)(struct vcpu *vcpu, uint64_t *rval, void *arg); +typedef int (*reg_write_t)(struct vcpu *vcpu, uint64_t wval, void *arg); + +/* + * Emulate the decoded 'vie' instruction when it contains a memory operation. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t mrr, + mem_region_write_t mrw, void *mrarg); + +/* + * Emulate the decoded 'vre' instruction when it contains a register access. + * + * The callbacks 'regread' and 'regwrite' emulate reads and writes to the + * register from 'vie'. 'regarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_register(struct vcpu *vcpu, struct vre *vre, reg_read_t regread, + reg_write_t regwrite, void *regarg); + +#ifdef _KERNEL +void vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask, + reg_read_t reg_read, reg_write_t reg_write, void *arg); +void vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask); + +void vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size, + mem_region_read_t mmio_read, mem_region_write_t mmio_write); +void vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size); +#endif + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/sys/arm64/include/vmm_snapshot.h b/sys/arm64/include/vmm_snapshot.h new file mode 100644 --- /dev/null +++ b/sys/arm64/include/vmm_snapshot.h @@ -0,0 +1 @@ +/* $FreeBSD$ */ diff --git a/sys/arm64/vmm/arm64.h b/sys/arm64/vmm/arm64.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/arm64.h @@ -0,0 +1,164 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _VMM_ARM64_H_ +#define _VMM_ARM64_H_ + +#include +#include +#include + +#include "mmu.h" +#include "io/vgic_v3.h" +#include "io/vtimer.h" + +struct vgic_v3; +struct vgic_v3_cpu; + +struct hypctx { + struct trapframe tf; + + /* + * EL1 control registers. + */ + uint64_t elr_el1; /* Exception Link Register */ + uint64_t sp_el0; /* Stack pointer */ + uint64_t tpidr_el0; /* EL0 Software ID Register */ + uint64_t tpidrro_el0; /* Read-only Thread ID Register */ + uint64_t tpidr_el1; /* EL1 Software ID Register */ + uint64_t vbar_el1; /* Vector Base Address Register */ + + uint64_t actlr_el1; /* Auxiliary Control Register */ + uint64_t afsr0_el1; /* Auxiliary Fault Status Register 0 */ + uint64_t afsr1_el1; /* Auxiliary Fault Status Register 1 */ + uint64_t amair_el1; /* Auxiliary Memory Attribute Indirection Register */ + uint64_t contextidr_el1; /* Current Process Identifier */ + uint64_t cpacr_el1; /* Architectural Feature Access Control Register */ + uint64_t csselr_el1; /* Cache Size Selection Register */ + uint64_t esr_el1; /* Exception Syndrome Register */ + uint64_t far_el1; /* Fault Address Register */ + uint64_t mair_el1; /* Memory Attribute Indirection Register */ + uint64_t mdccint_el1; /* Monitor DCC Interrupt Enable Register */ + uint64_t mdscr_el1; /* Monitor Debug System Control Register */ + uint64_t par_el1; /* Physical Address Register */ + uint64_t sctlr_el1; /* System Control Register */ + uint64_t tcr_el1; /* Translation Control Register */ + uint64_t tcr2_el1; /* Translation Control Register 2 */ + uint64_t ttbr0_el1; /* Translation Table Base Register 0 */ + uint64_t ttbr1_el1; /* Translation Table Base Register 1 */ + uint64_t spsr_el1; /* Saved Program Status Register */ + + uint64_t pmcr_el0; /* Performance Monitors Control Register */ + uint64_t pmccntr_el0; + uint64_t pmccfiltr_el0; + uint64_t pmcntenset_el0; + uint64_t pmintenset_el1; + uint64_t pmovsset_el0; + uint64_t pmselr_el0; + uint64_t pmuserenr_el0; + uint64_t pmevcntr_el0[31]; + uint64_t pmevtyper_el0[31]; + + uint64_t dbgbcr_el1[16]; /* Debug Breakpoint Control Registers */ + uint64_t dbgbvr_el1[16]; /* Debug Breakpoint Value Registers */ + uint64_t dbgwcr_el1[16]; /* Debug Watchpoint Control Registers */ + uint64_t dbgwvr_el1[16]; /* Debug Watchpoint Value Registers */ + + /* EL2 control registers */ + uint64_t cptr_el2; /* Architectural Feature Trap Register */ + uint64_t hcr_el2; /* Hypervisor Configuration Register */ + uint64_t mdcr_el2; /* Monitor Debug Configuration Register */ + uint64_t vpidr_el2; /* Virtualization 
Processor ID Register */ + uint64_t vmpidr_el2; /* Virtualization Multiprocessor ID Register */ + uint64_t el2_addr; /* The address of this in el2 space */ + struct hyp *hyp; + struct vcpu *vcpu; + struct { + uint64_t far_el2; /* Fault Address Register */ + uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ + } exit_info; + + struct vtimer_cpu vtimer_cpu; + + struct vgic_v3_regs vgic_v3_regs; + struct vgic_v3_cpu *vgic_cpu; + bool has_exception; +}; + +struct hyp { + struct vm *vm; + struct vtimer vtimer; + uint64_t vmid_generation; + uint64_t vttbr_el2; + uint64_t el2_addr; /* The address of this in el2 space */ + bool vgic_attached; + struct vgic_v3 *vgic; + struct hypctx *ctx[]; +}; + +#define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ + ret_type vmmops_##opname args; + +DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) +DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) +DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) +DEFINE_VMMOPS_IFUNC(int, gla2gpa, (void *vcpui, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault)) +DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t pc, struct pmap *pmap, + struct vm_eventinfo *info)) +DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) +DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)) +DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) +DEFINE_VMMOPS_IFUNC(int, exception, (void *vcpui, uint64_t esr, uint64_t far)) +DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) +DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) +DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) +DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) +DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, + vm_offset_t max)) +DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) +#ifdef notyet +#ifdef BHYVE_SNAPSHOT +DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta)) +DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, + struct vm_snapshot_meta *meta)) +DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) +#endif +#endif + +uint64_t vmm_call_hyp(uint64_t, ...); + +#if 0 +#define eprintf(fmt, ...) printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__) +#else +#define eprintf(fmt, ...) do {} while(0) +#endif + +struct hypctx *arm64_get_active_vcpu(void); + +#endif /* !_VMM_ARM64_H_ */ diff --git a/sys/arm64/vmm/hyp.h b/sys/arm64/vmm/hyp.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/hyp.h @@ -0,0 +1,114 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2017 Alexandru Elisei + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_HYP_H_ +#define _VMM_HYP_H_ + +/* + * The translation tables for the hypervisor mode will hold mappings for kernel + * virtual addresses and an identity mapping (VA == PA) necessary when + * enabling/disabling the MMU. + * + * When in EL2 exception level the translation table base register is TTBR0_EL2 + * and the virtual addresses generated by the CPU must be at the bottom of the + * memory, with the first 16 bits all set to zero: + * + * 0x0000ffffffffffff End hyp address space + * 0x0000000000000000 Start of hyp address space + * + * To run code in hyp mode we need to convert kernel virtual addresses to + * addreses that fit into this address space. + * + * The kernel virtual address range is: + * + * 0xffff007fffffffff End of KVA + * 0xffff000000000000 Kernel base address & start of KVA + * + * (see /sys/arm64/include/vmparam.h). + * + * We could convert the kernel virtual addresses to valid EL2 addresses by + * setting the first 16 bits to zero and thus mapping the kernel addresses in + * the bottom half of the EL2 address space, but then they might clash with the + * identity mapping addresses. Instead we map the kernel addresses in the upper + * half of the EL2 address space. + * + * The hypervisor address space will look like this: + * + * 0x0000807fffffffff End of KVA mapping + * 0x0000800000000000 Start of KVA mapping + * + * 0x00007fffffffffff End of identity mapping + * 0x0000000000000000 Start of identity mapping + * + * With the scheme we have 47 bits at our disposable for the identity map and + * another 47 bits for the kernel virtual addresses. For a maximum physical + * memory size of 128TB we are guaranteed to not have any clashes between + * addresses. + */ +#define HYP_VM_MIN_ADDRESS 0x0000000000000000 +#define HYP_VM_MAX_ADDRESS 0x0001000000000000 + +/* + * When the vmm code is installed the following handles can be used by + * the host to call into EL2. + */ +#define HYP_CLEANUP 0x00000001 +#define HYP_ENTER_GUEST 0x00000002 +#define HYP_READ_REGISTER 0x00000003 +#define HYP_REG_ICH_VTR 0x1 +#define HYP_REG_CNTHCTL 0x2 +#define HYP_CLEAN_S2_TLBI 0x00000004 +#define HYP_DC_CIVAC 0x00000005 +#define HYP_EL2_TLBI 0x00000006 +#define HYP_EL2_TLBI_ALL 0x1 +#define HYP_EL2_TLBI_VA 0x2 +#define HYP_S2_TLBI_RANGE 0x00000010 +#define HYP_S2_TLBI_ALL 0x00000011 + +/* + * When taking asynchronous exceptions, or interrupts, with the exception of the + * SError interrupt, the exception syndrome register is not updated with the + * exception code. We need to differentiate between the different exception + * types taken to EL2. 
+ */ +#define EXCP_TYPE_EL1_SYNC 0 +#define EXCP_TYPE_EL1_IRQ 1 +#define EXCP_TYPE_EL1_FIQ 2 +#define EXCP_TYPE_EL1_ERROR 3 + +#define EXCP_TYPE_EL2_SYNC 4 +#define EXCP_TYPE_EL2_IRQ 5 +#define EXCP_TYPE_EL2_FIQ 6 +#define EXCP_TYPE_EL2_ERROR 7 + +#define EXCP_TYPE_MAINT_IRQ 8 +/* Used internally in vmm_hyp.c */ +#define EXCP_TYPE_REENTER 9 + +#define HYP_GET_VECTOR_TABLE -1 + +#endif /* !_VMM_HYP_H_ */ diff --git a/sys/arm64/vmm/io/vgic.h b/sys/arm64/vmm/io/vgic.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vgic.h @@ -0,0 +1,51 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VGIC_H_ +#define _VGIC_H_ + +struct hyp; +struct hypctx; +struct vm_vgic_descr; + +extern device_t vgic_dev; + +bool vgic_present(void); +void vgic_init(void); +int vgic_attach_to_vm(struct hyp *hyp, struct vm_vgic_descr *descr); +void vgic_detach_from_vm(struct hyp *hyp); +void vgic_vminit(struct hyp *hyp); +void vgic_cpuinit(struct hypctx *hypctx); +void vgic_cpucleanup(struct hypctx *hypctx); +void vgic_vmcleanup(struct hyp *hyp); +bool vgic_has_pending_irq(struct hypctx *hypctx); +int vgic_inject_irq(struct hyp *hyp, int vcpuid, uint32_t irqid, bool level); +int vgic_inject_msi(struct hyp *hyp, uint64_t msg, uint64_t addr); +void vgic_flush_hwstate(struct hypctx *hypctx); +void vgic_sync_hwstate(struct hypctx *hypctx); + +#endif /* _VGIC_H_ */ diff --git a/sys/arm64/vmm/io/vgic.c b/sys/arm64/vmm/io/vgic.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vgic.c @@ -0,0 +1,116 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include "vgic.h" +#include "vgic_if.h" + +device_t vgic_dev; + +bool +vgic_present(void) +{ + return (vgic_dev != NULL); +} + +void +vgic_init(void) +{ + VGIC_INIT(vgic_dev); +} + +int +vgic_attach_to_vm(struct hyp *hyp, struct vm_vgic_descr *descr) +{ + return (VGIC_ATTACH_TO_VM(vgic_dev, hyp, descr)); +} + +void +vgic_detach_from_vm(struct hyp *hyp) +{ + VGIC_DETACH_FROM_VM(vgic_dev, hyp); +} + +void +vgic_vminit(struct hyp *hyp) +{ + VGIC_VMINIT(vgic_dev, hyp); +} + +void +vgic_cpuinit(struct hypctx *hypctx) +{ + VGIC_CPUINIT(vgic_dev, hypctx); +} + +void +vgic_cpucleanup(struct hypctx *hypctx) +{ + VGIC_CPUCLEANUP(vgic_dev, hypctx); +} + +void +vgic_vmcleanup(struct hyp *hyp) +{ + VGIC_VMCLEANUP(vgic_dev, hyp); +} + +bool +vgic_has_pending_irq(struct hypctx *hypctx) +{ + return (VGIC_HAS_PENDING_IRQ(vgic_dev, hypctx)); +} + +/* TODO: vcpuid -> hypctx ? */ +/* TODO: Add a vgic interface */ +int +vgic_inject_irq(struct hyp *hyp, int vcpuid, uint32_t irqid, bool level) +{ + return (VGIC_INJECT_IRQ(vgic_dev, hyp, vcpuid, irqid, level)); +} + +int +vgic_inject_msi(struct hyp *hyp, uint64_t msg, uint64_t addr) +{ + return (VGIC_INJECT_MSI(vgic_dev, hyp, msg, addr)); +} + +void +vgic_flush_hwstate(struct hypctx *hypctx) +{ + VGIC_FLUSH_HWSTATE(vgic_dev, hypctx); +} + +void +vgic_sync_hwstate(struct hypctx *hypctx) +{ + VGIC_SYNC_HWSTATE(vgic_dev, hypctx); +} diff --git a/sys/arm64/vmm/io/vgic_if.m b/sys/arm64/vmm/io/vgic_if.m new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vgic_if.m @@ -0,0 +1,99 @@ +#- +# SPDX-License-Identifier: BSD-2-Clause +# +# Copyright (c) 2023 Arm Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +INTERFACE vgic; + +HEADER { + struct hyp; + struct hypctx; + struct vm_vgic_descr; +}; + +METHOD void init { + device_t dev; +} + +METHOD int attach_to_vm { + device_t dev; + struct hyp *hyp; + struct vm_vgic_descr *descr; +}; + +METHOD void detach_from_vm { + device_t dev; + struct hyp *hyp; +} + +METHOD void vminit { + device_t dev; + struct hyp *hyp; +} + +METHOD void cpuinit { + device_t dev; + struct hypctx *hypctx; +} + +METHOD void cpucleanup { + device_t dev; + struct hypctx *hypctx; +} + +METHOD void vmcleanup { + device_t dev; + struct hyp *hyp; +} + +METHOD bool has_pending_irq { + device_t dev; + struct hypctx *hypctx; +} + +METHOD int inject_irq { + device_t dev; + struct hyp *hyp; + int vcpuid; + uint32_t irqid; + bool level; +} + +METHOD int inject_msi { + device_t dev; + struct hyp *hyp; + uint64_t msg; + uint64_t addr; +} + +METHOD void flush_hwstate { + device_t dev; + struct hypctx *hypctx; +} + +METHOD void sync_hwstate { + device_t dev; + struct hypctx *hypctx; +} diff --git a/sys/arm64/vmm/io/vgic_v3.h b/sys/arm64/vmm/io/vgic_v3.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vgic_v3.h @@ -0,0 +1,57 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_VGIC_V3_H_ +#define _VMM_VGIC_V3_H_ + +#define VGIC_ICH_LR_NUM_MAX 16 +#define VGIC_ICH_APR_NUM_MAX 4 + +/* Registers accessed by EL2 */ +struct vgic_v3_regs { + uint32_t ich_eisr_el2; /* End of Interrupt Status Register */ + uint32_t ich_elrsr_el2; /* Empty List register Status Register (ICH_ELRSR_EL2) */ + uint32_t ich_hcr_el2; /* Hyp Control Register */ + uint32_t ich_misr_el2; /* Maintenance Interrupt State Register */ + uint32_t ich_vmcr_el2; /* Virtual Machine Control Register */ + + /* + * The List Registers are part of the VM context and are modified on a + * world switch. They need to be allocated statically so they are + * mapped in the EL2 translation tables when struct hypctx is mapped. + */ + uint64_t ich_lr_el2[VGIC_ICH_LR_NUM_MAX]; + uint16_t ich_lr_num; + + /* Active Priorities Registers for Group 0 and 1 interrupts */ + uint16_t ich_apr_num; + uint32_t ich_ap0r_el2[VGIC_ICH_APR_NUM_MAX]; + uint32_t ich_ap1r_el2[VGIC_ICH_APR_NUM_MAX]; +}; + +#endif /* !_VMM_VGIC_V3_H_ */ diff --git a/sys/arm64/vmm/io/vgic_v3.c b/sys/arm64/vmm/io/vgic_v3.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vgic_v3.c @@ -0,0 +1,2227 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2018 Alexandru Elisei + * Copyright (C) 2020-2022 Andrew Turner + * Copyright (C) 2023 Arm Ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "vgic.h" +#include "vgic_v3.h" +#include "vgic_v3_reg.h" + +#include "vgic_if.h" + +#define VGIC_SGI_NUM (GIC_LAST_SGI - GIC_FIRST_SGI + 1) +#define VGIC_PPI_NUM (GIC_LAST_PPI - GIC_FIRST_PPI + 1) +#define VGIC_SPI_NUM (GIC_LAST_SPI - GIC_FIRST_SPI + 1) +#define VGIC_PRV_I_NUM (VGIC_SGI_NUM + VGIC_PPI_NUM) +#define VGIC_SHR_I_NUM (VGIC_SPI_NUM) + +MALLOC_DEFINE(M_VGIC_V3, "ARM VMM VGIC V3", "ARM VMM VGIC V3"); + +/* TODO: Move to softc */ +struct vgic_v3_virt_features { + uint8_t min_prio; + size_t ich_lr_num; + size_t ich_apr_num; +}; + +struct vgic_v3_irq { + /* List of IRQs that are active or pending */ + TAILQ_ENTRY(vgic_v3_irq) act_pend_list; + struct mtx irq_spinmtx; + uint64_t mpidr; + int target_vcpu; + uint32_t irq; + bool active; + bool pending; + bool enabled; + bool level; + bool on_aplist; + uint8_t priority; + uint8_t config; +#define VGIC_CONFIG_MASK 0x2 +#define VGIC_CONFIG_LEVEL 0x0 +#define VGIC_CONFIG_EDGE 0x2 +}; + +/* Global data not needed by EL2 */ +struct vgic_v3 { + struct mtx dist_mtx; + uint64_t dist_start; + size_t dist_end; + + uint64_t redist_start; + size_t redist_end; + + uint32_t gicd_ctlr; /* Distributor Control Register */ + + struct vgic_v3_irq *irqs; +}; + +/* Per-CPU data not needed by EL2 */ +struct vgic_v3_cpu { + /* + * We need a mutex for accessing the list registers because they are + * modified asynchronously by the virtual timer. + * + * Note that the mutex *MUST* be a spin mutex because an interrupt can + * be injected by a callout callback function, thereby modifying the + * list registers from a context where sleeping is forbidden. + */ + struct mtx lr_mtx; + + struct vgic_v3_irq private_irqs[VGIC_PRV_I_NUM]; + TAILQ_HEAD(, vgic_v3_irq) irq_act_pend; + u_int ich_lr_used; + + uint64_t gicr_typer; /* Redistributor Type Register */ +}; + +/* How many IRQs we support (SGIs + PPIs + SPIs). 
Not including LPIs */ +#define VGIC_NIRQS 1023 +/* Pretend to be an Arm design */ +#define VGIC_IIDR 0x43b + +static vgic_inject_irq_t vgic_v3_inject_irq; +static vgic_inject_msi_t vgic_v3_inject_msi; + +#define INJECT_IRQ(hyp, vcpuid, irqid, level) \ + vgic_v3_inject_irq(NULL, (hyp), (vcpuid), (irqid), (level)) + +typedef void (register_read)(struct hypctx *, u_int, uint64_t *, void *); +typedef void (register_write)(struct hypctx *, u_int, u_int, u_int, + uint64_t, void *); + +#define VGIC_8_BIT (1 << 0) +/* (1 << 1) is reserved for 16 bit accesses */ +#define VGIC_32_BIT (1 << 2) +#define VGIC_64_BIT (1 << 3) + +struct vgic_register { + u_int start; /* Start within a memory region */ + u_int end; + u_int size; + u_int flags; + register_read *read; + register_write *write; +}; + +#define VGIC_REGISTER_RANGE(reg_start, reg_end, reg_size, reg_flags, readf, \ + writef) \ +{ \ + .start = (reg_start), \ + .end = (reg_end), \ + .size = (reg_size), \ + .flags = (reg_flags), \ + .read = (readf), \ + .write = (writef), \ +} + +#define VGIC_REGISTER_RANGE_RAZ_WI(reg_start, reg_end, reg_size, reg_flags) \ + VGIC_REGISTER_RANGE(reg_start, reg_end, reg_size, reg_flags, \ + gic_zero_read, gic_ignore_write) + +#define VGIC_REGISTER(start_addr, reg_size, reg_flags, readf, writef) \ + VGIC_REGISTER_RANGE(start_addr, (start_addr) + (reg_size), \ + reg_size, reg_flags, readf, writef) + +#define VGIC_REGISTER_RAZ_WI(start_addr, reg_size, reg_flags) \ + VGIC_REGISTER_RANGE_RAZ_WI(start_addr, \ + (start_addr) + (reg_size), reg_size, reg_flags) + +static register_read gic_pidr2_read; +static register_read gic_zero_read; +static register_write gic_ignore_write; + +/* GICD_CTLR */ +static register_read dist_ctlr_read; +static register_write dist_ctlr_write; +/* GICD_TYPER */ +static register_read dist_typer_read; +/* GICD_IIDR */ +static register_read dist_iidr_read; +/* GICD_STATUSR - RAZ/WI as we don't report errors (yet) */ +/* GICD_SETSPI_NSR & GICD_CLRSPI_NSR */ +static register_write dist_setclrspi_nsr_write; +/* GICD_SETSPI_SR - RAZ/WI */ +/* GICD_CLRSPI_SR - RAZ/WI */ +/* GICD_IGROUPR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_ISENABLER */ +static register_read dist_isenabler_read; +static register_write dist_isenabler_write; +/* GICD_ICENABLER */ +static register_read dist_icenabler_read; +static register_write dist_icenabler_write; +/* GICD_ISPENDR */ +static register_read dist_ispendr_read; +static register_write dist_ispendr_write; +/* GICD_ICPENDR */ +static register_read dist_icpendr_read; +static register_write dist_icpendr_write; +/* GICD_ISACTIVER */ +static register_read dist_isactiver_read; +static register_write dist_isactiver_write; +/* GICD_ICACTIVER */ +static register_read dist_icactiver_read; +static register_write dist_icactiver_write; +/* GICD_IPRIORITYR */ +static register_read dist_ipriorityr_read; +static register_write dist_ipriorityr_write; +/* GICD_ITARGETSR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_ICFGR */ +static register_read dist_icfgr_read; +static register_write dist_icfgr_write; +/* GICD_IGRPMODR - RAZ/WI from non-secure mode */ +/* GICD_NSACR - RAZ/WI from non-secure mode */ +/* GICD_SGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_CPENDSGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_SPENDSGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_IROUTER */ +static register_read dist_irouter_read; +static register_write dist_irouter_write; + +static struct vgic_register dist_registers[] = { + VGIC_REGISTER(GICD_CTLR, 4, VGIC_32_BIT, dist_ctlr_read, + dist_ctlr_write), + 
VGIC_REGISTER(GICD_TYPER, 4, VGIC_32_BIT, dist_typer_read, + gic_ignore_write), + VGIC_REGISTER(GICD_IIDR, 4, VGIC_32_BIT, dist_iidr_read, + gic_ignore_write), + VGIC_REGISTER_RAZ_WI(GICD_STATUSR, 4, VGIC_32_BIT), + VGIC_REGISTER(GICD_SETSPI_NSR, 4, VGIC_32_BIT, gic_zero_read, + dist_setclrspi_nsr_write), + VGIC_REGISTER(GICD_CLRSPI_NSR, 4, VGIC_32_BIT, gic_zero_read, + dist_setclrspi_nsr_write), + VGIC_REGISTER_RAZ_WI(GICD_SETSPI_SR, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICD_CLRSPI_SR, 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE_RAZ_WI(GICD_IGROUPR(0), GICD_IGROUPR(1024), 4, + VGIC_32_BIT), + + VGIC_REGISTER_RAZ_WI(GICD_ISENABLER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ISENABLER(32), GICD_ISENABLER(1024), 4, + VGIC_32_BIT, dist_isenabler_read, dist_isenabler_write), + + VGIC_REGISTER_RAZ_WI(GICD_ICENABLER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICENABLER(32), GICD_ICENABLER(1024), 4, + VGIC_32_BIT, dist_icenabler_read, dist_icenabler_write), + + VGIC_REGISTER_RAZ_WI(GICD_ISPENDR(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ISPENDR(32), GICD_ISPENDR(1024), 4, + VGIC_32_BIT, dist_ispendr_read, dist_ispendr_write), + + VGIC_REGISTER_RAZ_WI(GICD_ICPENDR(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICPENDR(32), GICD_ICPENDR(1024), 4, + VGIC_32_BIT, dist_icpendr_read, dist_icpendr_write), + + VGIC_REGISTER_RAZ_WI(GICD_ISACTIVER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ISACTIVER(32), GICD_ISACTIVER(1024), 4, + VGIC_32_BIT, dist_isactiver_read, dist_isactiver_write), + + VGIC_REGISTER_RAZ_WI(GICD_ICACTIVER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICACTIVER(32), GICD_ICACTIVER(1024), 4, + VGIC_32_BIT, dist_icactiver_read, dist_icactiver_write), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_IPRIORITYR(0), GICD_IPRIORITYR(32), 4, + VGIC_32_BIT | VGIC_8_BIT), + VGIC_REGISTER_RANGE(GICD_IPRIORITYR(32), GICD_IPRIORITYR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT, dist_ipriorityr_read, + dist_ipriorityr_write), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_ITARGETSR(0), GICD_ITARGETSR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_ICFGR(0), GICD_ICFGR(32), 4, + VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICFGR(32), GICD_ICFGR(1024), 4, + VGIC_32_BIT, dist_icfgr_read, dist_icfgr_write), +/* + VGIC_REGISTER_RANGE(GICD_IGRPMODR(0), GICD_IGRPMODR(1024), 4, + VGIC_32_BIT, dist_igrpmodr_read, dist_igrpmodr_write), + VGIC_REGISTER_RANGE(GICD_NSACR(0), GICD_NSACR(1024), 4, + VGIC_32_BIT, dist_nsacr_read, dist_nsacr_write), +*/ + VGIC_REGISTER_RAZ_WI(GICD_SGIR, 4, VGIC_32_BIT), +/* + VGIC_REGISTER_RANGE(GICD_CPENDSGIR(0), GICD_CPENDSGIR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT, dist_cpendsgir_read, + dist_cpendsgir_write), + VGIC_REGISTER_RANGE(GICD_SPENDSGIR(0), GICD_SPENDSGIR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT, dist_spendsgir_read, + dist_spendsgir_write), +*/ + VGIC_REGISTER_RANGE(GICD_IROUTER(32), GICD_IROUTER(1024), 8, + VGIC_64_BIT | VGIC_32_BIT, dist_irouter_read, dist_irouter_write), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR4, GICD_PIDR2, 4, VGIC_32_BIT), + VGIC_REGISTER(GICD_PIDR2, 4, VGIC_32_BIT, gic_pidr2_read, + gic_ignore_write), + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR2 + 4, GICD_SIZE, 4, VGIC_32_BIT), +}; + +/* GICR_CTLR - Ignore writes as no bits can be set */ +static register_read redist_ctlr_read; +/* GICR_IIDR */ +static register_read redist_iidr_read; +/* GICR_TYPER */ +static register_read redist_typer_read; +/* GICR_STATUSR - RAZ/WI as we don't report errors (yet) */ +/* GICR_WAKER - RAZ/WI from non-secure mode */ +/* GICR_SETLPIR - RAZ/WI as no LPIs are supported 
*/ +/* GICR_CLRLPIR - RAZ/WI as no LPIs are supported */ +/* GICR_PROPBASER - RAZ/WI as no LPIs are supported */ +/* GICR_PENDBASER - RAZ/WI as no LPIs are supported */ +/* GICR_INVLPIR - RAZ/WI as no LPIs are supported */ +/* GICR_INVALLR - RAZ/WI as no LPIs are supported */ +/* GICR_SYNCR - RAZ/WI as no LPIs are supported */ + +static struct vgic_register redist_rd_registers[] = { + VGIC_REGISTER(GICR_CTLR, 4, VGIC_32_BIT, redist_ctlr_read, + gic_ignore_write), + VGIC_REGISTER(GICR_IIDR, 4, VGIC_32_BIT, redist_iidr_read, + gic_ignore_write), + VGIC_REGISTER(GICR_TYPER, 8, VGIC_64_BIT | VGIC_32_BIT, + redist_typer_read, gic_ignore_write), + VGIC_REGISTER_RAZ_WI(GICR_STATUSR, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_WAKER, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_SETLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_CLRLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_PROPBASER, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_PENDBASER, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_INVLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_INVALLR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_SYNCR, 4, VGIC_32_BIT), + + /* These are identical to the dist registers */ + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR4, GICD_PIDR2, 4, VGIC_32_BIT), + VGIC_REGISTER(GICD_PIDR2, 4, VGIC_32_BIT, gic_pidr2_read, + gic_ignore_write), + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR2 + 4, GICD_SIZE, 4, + VGIC_32_BIT), +}; + +/* GICR_IGROUPR0 - RAZ/WI from non-secure mode */ +/* GICR_ISENABLER0 */ +static register_read redist_ienabler0_read; +static register_write redist_isenabler0_write; +/* GICR_ICENABLER0 */ +static register_write redist_icenabler0_write; +/* GICR_ISPENDR0 */ +static register_read redist_ipendr0_read; +static register_write redist_ispendr0_write; +/* GICR_ICPENDR0 */ +static register_write redist_icpendr0_write; +/* GICR_ISACTIVER0 */ +static register_read redist_iactiver0_read; +static register_write redist_isactiver0_write; +/* GICR_ICACTIVER0 */ +static register_write redist_icactiver0_write; +/* GICR_IPRIORITYR */ +static register_read redist_ipriorityr_read; +static register_write redist_ipriorityr_write; +/* GICR_ICFGR0 - RAZ/WI from non-secure mode */ +/* GICR_ICFGR1 */ +static register_read redist_icfgr1_read; +static register_write redist_icfgr1_write; +/* GICR_IGRPMODR0 - RAZ/WI from non-secure mode */ +/* GICR_NSCAR - RAZ/WI from non-secure mode */ + +static struct vgic_register redist_sgi_registers[] = { + VGIC_REGISTER_RAZ_WI(GICR_IGROUPR0, 4, VGIC_32_BIT), + VGIC_REGISTER(GICR_ISENABLER0, 4, VGIC_32_BIT, redist_ienabler0_read, + redist_isenabler0_write), + VGIC_REGISTER(GICR_ICENABLER0, 4, VGIC_32_BIT, redist_ienabler0_read, + redist_icenabler0_write), + VGIC_REGISTER(GICR_ISPENDR0, 4, VGIC_32_BIT, redist_ipendr0_read, + redist_ispendr0_write), + VGIC_REGISTER(GICR_ICPENDR0, 4, VGIC_32_BIT, redist_ipendr0_read, + redist_icpendr0_write), + VGIC_REGISTER(GICR_ISACTIVER0, 4, VGIC_32_BIT, redist_iactiver0_read, + redist_isactiver0_write), + VGIC_REGISTER(GICR_ICACTIVER0, 4, VGIC_32_BIT, redist_iactiver0_read, + redist_icactiver0_write), + VGIC_REGISTER_RANGE(GICR_IPRIORITYR(0), GICR_IPRIORITYR(32), 4, + VGIC_32_BIT | VGIC_8_BIT, redist_ipriorityr_read, + redist_ipriorityr_write), + VGIC_REGISTER_RAZ_WI(GICR_ICFGR0, 4, VGIC_32_BIT), + VGIC_REGISTER(GICR_ICFGR1, 4, VGIC_32_BIT, redist_icfgr1_read, + redist_icfgr1_write), + VGIC_REGISTER_RAZ_WI(GICR_IGRPMODR0, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_NSACR, 
4, VGIC_32_BIT), +}; + +static struct vgic_v3_virt_features virt_features; + +static struct vgic_v3_irq *vgic_v3_get_irq(struct hyp *, int, uint32_t); +static void vgic_v3_release_irq(struct vgic_v3_irq *); + +/* TODO: Move to a common file */ +static int +mpidr_to_vcpu(struct hyp *hyp, uint64_t mpidr) +{ + struct vm *vm; + struct hypctx *hypctx; + + vm = hyp->vm; + for (int i = 0; i < vm_get_maxcpus(vm); i++) { + hypctx = hyp->ctx[i]; + if (hypctx != NULL && (hypctx->vmpidr_el2 & GICD_AFF) == mpidr) + return (i); + } + return (-1); +} + +static void +vgic_v3_vminit(device_t dev, struct hyp *hyp) +{ + struct vgic_v3 *vgic; + + hyp->vgic = malloc(sizeof(*hyp->vgic), M_VGIC_V3, + M_WAITOK | M_ZERO); + vgic = hyp->vgic; + + /* + * Configure the Distributor control register. The register resets to an + * architecturally UNKNOWN value, so we reset to 0 to disable all + * functionality controlled by the register. + * + * The exception is GICD_CTLR.DS, which is RA0/WI when the Distributor + * supports one security state (ARM GIC Architecture Specification for + * GICv3 and GICv4, p. 4-464) + */ + vgic->gicd_ctlr = 0; + + mtx_init(&vgic->dist_mtx, "VGICv3 Distributor lock", NULL, + MTX_SPIN); +} + +static void +vgic_v3_cpuinit(device_t dev, struct hypctx *hypctx) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + uint64_t aff, vmpidr_el2; + int i, irqid; + bool last_vcpu; + + hypctx->vgic_cpu = malloc(sizeof(*hypctx->vgic_cpu), + M_VGIC_V3, M_WAITOK | M_ZERO); + vgic_cpu = hypctx->vgic_cpu; + + last_vcpu = + vcpu_vcpuid(hypctx->vcpu) == (vm_get_maxcpus(hypctx->hyp->vm) - 1); + + vmpidr_el2 = hypctx->vmpidr_el2; + KASSERT(vmpidr_el2 != 0, + ("Trying to init this CPU's vGIC before the vCPU")); + /* + * Get affinity for the current CPU. The guest CPU affinity is taken + * from VMPIDR_EL2. The Redistributor corresponding to this CPU is + * the Redistributor with the same affinity from GICR_TYPER. + */ + aff = (CPU_AFF3(vmpidr_el2) << 24) | (CPU_AFF2(vmpidr_el2) << 16) | + (CPU_AFF1(vmpidr_el2) << 8) | CPU_AFF0(vmpidr_el2); + + /* Set up GICR_TYPER. */ + vgic_cpu->gicr_typer = aff << GICR_TYPER_AFF_SHIFT; + /* Set the vcpu as the processsor ID */ + vgic_cpu->gicr_typer |= + (uint64_t)vcpu_vcpuid(hypctx->vcpu) << GICR_TYPER_CPUNUM_SHIFT; + + if (last_vcpu) + /* Mark the last Redistributor */ + vgic_cpu->gicr_typer |= GICR_TYPER_LAST; + + mtx_init(&vgic_cpu->lr_mtx, "VGICv3 ICH_LR_EL2 lock", NULL, MTX_SPIN); + + /* Set the SGI and PPI state */ + for (irqid = 0; irqid < VGIC_PRV_I_NUM; irqid++) { + irq = &vgic_cpu->private_irqs[irqid]; + + mtx_init(&irq->irq_spinmtx, "VGIC IRQ spinlock", NULL, + MTX_SPIN); + irq->irq = irqid; + irq->mpidr = hypctx->vmpidr_el2 & GICD_AFF; + irq->target_vcpu = vcpu_vcpuid(hypctx->vcpu); + MPASS(irq->target_vcpu >= 0); + + if (irqid < VGIC_SGI_NUM) { + /* SGIs */ + irq->enabled = true; + irq->config = VGIC_CONFIG_EDGE; + } else { + /* PPIs */ + irq->config = VGIC_CONFIG_LEVEL; + } + irq->priority = 0; + } + + /* + * Configure the Interrupt Controller Hyp Control Register. + * + * ICH_HCR_EL2_En: enable virtual CPU interface. + * + * Maintenance interrupts are disabled. + */ + hypctx->vgic_v3_regs.ich_hcr_el2 = ICH_HCR_EL2_En; + + /* + * Configure the Interrupt Controller Virtual Machine Control Register. 
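+ * As a worked example (an assumption about a typical implementation, not
+ * part of the patch): with 5 priority bits, virt_features.min_prio is
+ * 0xf8 (see vgic_v3_init() below), so the value built here is
+ * (0xf8 << ICH_VMCR_EL2_VPMR_SHIFT) | ICH_VMCR_EL2_VBPR1_NO_PREEMPTION |
+ * ICH_VMCR_EL2_VBPR0_NO_PREEMPTION | ICH_VMCR_EL2_VENG0 |
+ * ICH_VMCR_EL2_VENG1, with ICH_VMCR_EL2_VEOIM clear.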
+ * + * ICH_VMCR_EL2_VPMR: lowest priority mask for the VCPU interface + * ICH_VMCR_EL2_VBPR1_NO_PREEMPTION: disable interrupt preemption for + * Group 1 interrupts + * ICH_VMCR_EL2_VBPR0_NO_PREEMPTION: disable interrupt preemption for + * Group 0 interrupts + * ~ICH_VMCR_EL2_VEOIM: writes to EOI registers perform priority drop + * and interrupt deactivation. + * ICH_VMCR_EL2_VENG0: virtual Group 0 interrupts enabled. + * ICH_VMCR_EL2_VENG1: virtual Group 1 interrupts enabled. + */ + hypctx->vgic_v3_regs.ich_vmcr_el2 = + (virt_features.min_prio << ICH_VMCR_EL2_VPMR_SHIFT) | + ICH_VMCR_EL2_VBPR1_NO_PREEMPTION | ICH_VMCR_EL2_VBPR0_NO_PREEMPTION; + hypctx->vgic_v3_regs.ich_vmcr_el2 &= ~ICH_VMCR_EL2_VEOIM; + hypctx->vgic_v3_regs.ich_vmcr_el2 |= ICH_VMCR_EL2_VENG0 | + ICH_VMCR_EL2_VENG1; + + hypctx->vgic_v3_regs.ich_lr_num = virt_features.ich_lr_num; + for (i = 0; i < hypctx->vgic_v3_regs.ich_lr_num; i++) + hypctx->vgic_v3_regs.ich_lr_el2[i] = 0UL; + vgic_cpu->ich_lr_used = 0; + TAILQ_INIT(&vgic_cpu->irq_act_pend); + + hypctx->vgic_v3_regs.ich_apr_num = virt_features.ich_apr_num; +} + +static void +vgic_v3_cpucleanup(device_t dev, struct hypctx *hypctx) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + int irqid; + + vgic_cpu = hypctx->vgic_cpu; + for (irqid = 0; irqid < VGIC_PRV_I_NUM; irqid++) { + irq = &vgic_cpu->private_irqs[irqid]; + mtx_destroy(&irq->irq_spinmtx); + } + + mtx_destroy(&vgic_cpu->lr_mtx); + free(hypctx->vgic_cpu, M_VGIC_V3); +} + +static void +vgic_v3_vmcleanup(device_t dev, struct hyp *hyp) +{ + mtx_destroy(&hyp->vgic->dist_mtx); + free(hyp->vgic, M_VGIC_V3); +} + +static bool +vgic_v3_irq_pending(struct vgic_v3_irq *irq) +{ + if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_LEVEL) { + return (irq->pending || irq->level); + } else { + return (irq->pending); + } +} + +static bool +vgic_v3_queue_irq(struct hyp *hyp, struct vgic_v3_cpu *vgic_cpu, + int vcpuid, struct vgic_v3_irq *irq) +{ + MPASS(vcpuid >= 0); + MPASS(vcpuid < vm_get_maxcpus(hyp->vm)); + + mtx_assert(&vgic_cpu->lr_mtx, MA_OWNED); + mtx_assert(&irq->irq_spinmtx, MA_OWNED); + + /* No need to queue the IRQ */ + if (!irq->level && !irq->pending) + return (false); + + if (!irq->on_aplist) { + irq->on_aplist = true; + TAILQ_INSERT_TAIL(&vgic_cpu->irq_act_pend, irq, act_pend_list); + } + return (true); +} + +static uint64_t +gic_reg_value_64(uint64_t field, uint64_t val, u_int offset, u_int size) +{ + uint32_t mask; + + if (offset != 0 || size != 8) { + mask = ((1ul << (size * 8)) - 1) << (offset * 8); + /* Shift the new bits to the correct place */ + val <<= (offset * 8); + /* Keep only the interesting bits */ + val &= mask; + /* Add the bits we are keeping from the old value */ + val |= field & ~mask; + } + + return (val); +} + +static void +gic_pidr2_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = GICR_PIDR2_ARCH_GICv3 << GICR_PIDR2_ARCH_SHIFT; +} + +/* Common read-only/write-ignored helpers */ +static void +gic_zero_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = 0; +} + +static void +gic_ignore_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + /* Nothing to do */ +} + +static uint64_t +read_enabler(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + if 
(!irq->enabled) + ret |= 1u << i; + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_enabler(struct hypctx *hypctx,int n, bool set, uint64_t val) +{ + struct vgic_v3_irq *irq; + uint32_t irq_base; + int i; + + irq_base = n * 32; + for (i = 0; i < 32; i++) { + /* We only change interrupts when the appropriate bit is set */ + if ((val & (1u << i)) == 0) + continue; + + /* Find the interrupt this bit represents */ + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + irq->enabled = set; + vgic_v3_release_irq(irq); + } +} + +static uint64_t +read_pendr(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + if (vgic_v3_irq_pending(irq)) + ret |= 1u << i; + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static uint64_t +write_pendr(struct hypctx *hypctx, int n, bool set, uint64_t val) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + struct hyp *hyp; + struct hypctx *target_hypctx; + uint64_t ret; + uint32_t irq_base; + int target_vcpu, i; + bool notify; + + hyp = hypctx->hyp; + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + /* We only change interrupts when the appropriate bit is set */ + if ((val & (1u << i)) == 0) + continue; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + notify = false; + target_vcpu = irq->target_vcpu; + if (target_vcpu < 0) + goto next_irq; + target_hypctx = hyp->ctx[target_vcpu]; + if (target_hypctx == NULL) + goto next_irq; + vgic_cpu = target_hypctx->vgic_cpu; + + if (!set) { + /* pending -> not pending */ + irq->pending = false; + } else { + irq->pending = true; + mtx_lock_spin(&vgic_cpu->lr_mtx); + notify = vgic_v3_queue_irq(hyp, vgic_cpu, target_vcpu, + irq); + mtx_unlock_spin(&vgic_cpu->lr_mtx); + } +next_irq: + vgic_v3_release_irq(irq); + + if (notify) + vcpu_notify_event(vm_vcpu(hyp->vm, target_vcpu)); + } + + return (ret); +} + +static uint64_t +read_activer(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + if (irq->active) + ret |= 1u << i; + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_activer(struct hypctx *hypctx, u_int n, bool set, uint64_t val) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + struct hyp *hyp; + struct hypctx *target_hypctx; + uint32_t irq_base; + int target_vcpu, i; + bool notify; + + hyp = hypctx->hyp; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + /* We only change interrupts when the appropriate bit is set */ + if ((val & (1u << i)) == 0) + continue; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + notify = false; + target_vcpu = irq->target_vcpu; + if (target_vcpu < 0) + goto next_irq; + target_hypctx = hyp->ctx[target_vcpu]; + if (target_hypctx == NULL) + goto next_irq; + vgic_cpu = target_hypctx->vgic_cpu; + + if (!set) { + /* active -> not active */ + irq->active = false; + } else { + /* not active -> active */ + irq->active = true; + mtx_lock_spin(&vgic_cpu->lr_mtx); + 
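/*
+ * Link the IRQ on the target vCPU's active/pending list while holding
+ * its List Register lock; the wakeup via vcpu_notify_event() happens
+ * only after both this lock and the IRQ spinlock have been dropped.
+ */
+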
notify = vgic_v3_queue_irq(hyp, vgic_cpu, target_vcpu, + irq); + mtx_unlock_spin(&vgic_cpu->lr_mtx); + } +next_irq: + vgic_v3_release_irq(irq); + + if (notify) + vcpu_notify_event(vm_vcpu(hyp->vm, target_vcpu)); + } +} + +static uint64_t +read_priorityr(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 4; + for (i = 0; i < 4; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + ret |= ((uint64_t)irq->priority) << (i * 8); + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_priorityr(struct hypctx *hypctx, u_int irq_base, u_int size, uint64_t val) +{ + struct vgic_v3_irq *irq; + int i; + + for (i = 0; i < size; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + /* Set the priority. We support 32 priority steps (5 bits) */ + irq->priority = (val >> (i * 8)) & 0xf8; + vgic_v3_release_irq(irq); + } +} + +static uint64_t +read_config(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 16; + for (i = 0; i < 16; i++) { + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + ret |= ((uint64_t)irq->config) << (i * 2); + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_config(struct hypctx *hypctx, int n, uint64_t val) +{ + struct vgic_v3_irq *irq; + uint32_t irq_base; + int i; + + irq_base = n * 16; + for (i = 0; i < 16; i++) { + /* + * The config can't be changed for SGIs and PPIs. SGIs have + * an edge-triggered behaviour, and the register is + * implementation defined to be read-only for PPIs. + */ + if (irq_base + i < VGIC_PRV_I_NUM) + continue; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + irq_base + i); + if (irq == NULL) + continue; + + /* Bit 0 is RES0 */ + irq->config = (val >> (i * 2)) & VGIC_CONFIG_MASK; + vgic_v3_release_irq(irq); + } +} + +static uint64_t +read_route(struct hypctx *hypctx, int n) +{ + struct vgic_v3_irq *irq; + uint64_t mpidr; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), n); + if (irq == NULL) + return (0); + + mpidr = irq->mpidr; + vgic_v3_release_irq(irq); + + return (mpidr); +} + +static void +write_route(struct hypctx *hypctx, int n, uint64_t val, u_int offset, + u_int size) +{ + struct vgic_v3_irq *irq; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), n); + if (irq == NULL) + return; + + irq->mpidr = gic_reg_value_64(irq->mpidr, val, offset, size) & GICD_AFF; + irq->target_vcpu = mpidr_to_vcpu(hypctx->hyp, irq->mpidr); + /* + * If the interrupt is pending we can either use the old mpidr, or + * the new mpidr. To simplify this code we use the old value so we + * don't need to move the interrupt until the next time it is + * moved to the pending state. + */ + vgic_v3_release_irq(irq); +} + +/* + * Distributor register handlers. 
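+ *
+ * The handlers below convert the faulting register offset into a block
+ * of interrupt IDs: the enable/pending/active registers carry one bit
+ * per IRQ (32 IRQs per 32-bit register), GICD_IPRIORITYRn one byte per
+ * IRQ (4 per register), GICD_ICFGRn two bits per IRQ (16 per register),
+ * and each SPI has its own 64-bit GICD_IROUTERn.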
+ */ +/* GICD_CTLR */ +static void +dist_ctlr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + struct hyp *hyp; + struct vgic_v3 *vgic; + + hyp = hypctx->hyp; + vgic = hyp->vgic; + + mtx_lock_spin(&vgic->dist_mtx); + *rval = vgic->gicd_ctlr; + mtx_unlock_spin(&vgic->dist_mtx); + + /* Writes are never pending */ + *rval &= ~GICD_CTLR_RWP; +} + +static void +dist_ctlr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + struct vgic_v3 *vgic; + + MPASS(offset == 0); + MPASS(size == 4); + vgic = hypctx->hyp->vgic; + + /* + * GICv2 backwards compatibility is not implemented so + * ARE_NS is RAO/WI. This means EnableGrp1 is RES0. + * + * EnableGrp1A is supported, and RWP is read-only. + * + * All other bits are RES0 from non-secure mode as we + * implement as if we are in a system with two security + * states. + */ + wval &= GICD_CTLR_G1A; + wval |= GICD_CTLR_ARE_NS; + mtx_lock_spin(&vgic->dist_mtx); + vgic->gicd_ctlr = wval; + /* TODO: Wake any vcpus that have interrupts pending */ + mtx_unlock_spin(&vgic->dist_mtx); +} + +/* GICD_TYPER */ +static void +dist_typer_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + uint32_t typer; + + typer = (10 - 1) << GICD_TYPER_IDBITS_SHIFT; + typer |= GICD_TYPER_MBIS; + /* ITLinesNumber: */ + typer |= howmany(VGIC_NIRQS + 1, 32) - 1; + + *rval = typer; +} + +/* GICD_IIDR */ +static void +dist_iidr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + *rval = VGIC_IIDR; +} + +/* GICD_SETSPI_NSR & GICD_CLRSPI_NSR */ +static void +dist_setclrspi_nsr_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + uint32_t irqid; + + MPASS(offset == 0); + MPASS(size == 4); + irqid = wval & GICD_SPI_INTID_MASK; + INJECT_IRQ(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), irqid, + reg == GICD_SETSPI_NSR); +} + +/* GICD_ISENABLER */ +static void +dist_isenabler_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ISENABLER(0)) / 4; + /* GICD_ISENABLER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_enabler(hypctx, n); +} + +static void +dist_isenabler_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ISENABLER(0)) / 4; + /* GICD_ISENABLER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_enabler(hypctx, n, true, wval); +} + +/* GICD_ICENABLER */ +static void +dist_icenabler_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ICENABLER(0)) / 4; + /* GICD_ICENABLER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_enabler(hypctx, n); +} + +static void +dist_icenabler_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ISENABLER(0)) / 4; + /* GICD_ICENABLER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_enabler(hypctx, n, false, wval); +} + +/* GICD_ISPENDR */ +static void +dist_ispendr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ISPENDR(0)) / 4; + /* GICD_ISPENDR0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_pendr(hypctx, n); +} + +static void +dist_ispendr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 
0); + MPASS(size == 4); + n = (reg - GICD_ISPENDR(0)) / 4; + /* GICD_ISPENDR0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_pendr(hypctx, n, true, wval); +} + +/* GICD_ICPENDR */ +static void +dist_icpendr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ICPENDR(0)) / 4; + /* GICD_ICPENDR0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_pendr(hypctx, n); +} + +static void +dist_icpendr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ICPENDR(0)) / 4; + /* GICD_ICPENDR0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_pendr(hypctx, n, false, wval); +} + +/* GICD_ISACTIVER */ +/* Affinity routing is enabled so isactiver0 is RAZ/WI */ +static void +dist_isactiver_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ISACTIVER(0)) / 4; + /* GICD_ISACTIVER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_activer(hypctx, n); +} + +static void +dist_isactiver_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ISACTIVER(0)) / 4; + /* GICD_ISACTIVE0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_activer(hypctx, n, true, wval); +} + +/* GICD_ICACTIVER */ +static void +dist_icactiver_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICD_ICACTIVER(0)) / 4; + /* GICD_ICACTIVE0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_activer(hypctx, n); +} + +static void +dist_icactiver_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ICACTIVER(0)) / 4; + /* GICD_ICACTIVE0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_activer(hypctx, n, false, wval); +} + +/* GICD_IPRIORITYR */ +/* Affinity routing is enabled so ipriorityr0-7 is RAZ/WI */ +static void +dist_ipriorityr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICD_IPRIORITYR(0)) / 4; + /* GICD_IPRIORITY0-7 is RAZ/WI so handled separately */ + MPASS(n > 7); + *rval = read_priorityr(hypctx, n); +} + +static void +dist_ipriorityr_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + u_int irq_base; + + irq_base = (reg - GICD_IPRIORITYR(0)) + offset; + /* GICD_IPRIORITY0-7 is RAZ/WI so handled separately */ + MPASS(irq_base > 31); + write_priorityr(hypctx, irq_base, size, wval); +} + +/* GICD_ICFGR */ +static void +dist_icfgr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_ICFGR(0)) / 4; + /* GICD_ICFGR0-1 are RAZ/WI so handled separately */ + MPASS(n > 1); + *rval = read_config(hypctx, n); +} + +static void +dist_icfgr_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ICFGR(0)) / 4; + /* GICD_ICFGR0-1 are RAZ/WI so handled separately */ + MPASS(n > 1); + write_config(hypctx, n, wval); +} + +/* GICD_IROUTER */ +static void +dist_irouter_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + int n; + + n = (reg - GICD_IROUTER(0)) / 8; + /* GICD_IROUTER0-31 don't exist */ + MPASS(n > 31); + *rval = 
read_route(hypctx, n); +} + +static void +dist_irouter_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + int n; + + n = (reg - GICD_IROUTER(0)) / 8; + /* GICD_IROUTER0-31 don't exist */ + MPASS(n > 31); + write_route(hypctx, n, wval, offset, size); +} + +static bool +vgic_register_read(struct hypctx *hypctx, struct vgic_register *reg_list, + u_int reg_list_size, u_int reg, u_int size, uint64_t *rval, void *arg) +{ + u_int i, offset; + + for (i = 0; i < reg_list_size; i++) { + if (reg_list[i].start <= reg && reg_list[i].end >= reg + size) { + offset = reg & reg_list[i].size - 1; + reg -= offset; + if ((reg_list[i].flags & size) != 0) { + reg_list[i].read(hypctx, reg, rval, NULL); + + /* Move the bits into the correct place */ + *rval >>= (offset * 8); + if (size < 8) { + *rval &= (1ul << (size * 8)) - 1; + } + } else { + /* + * The access is an invalid size. Section + * 12.1.3 "GIC memory-mapped register access" + * of the GICv3 and GICv4 spec issue H + * (IHI0069) lists the options. For a read + * the controller returns unknown data, in + * this case it is zero. + */ + *rval = 0; + } + return (true); + } + } + return (false); +} + +static bool +vgic_register_write(struct hypctx *hypctx, struct vgic_register *reg_list, + u_int reg_list_size, u_int reg, u_int size, uint64_t wval, void *arg) +{ + u_int i, offset; + + for (i = 0; i < reg_list_size; i++) { + if (reg_list[i].start <= reg && reg_list[i].end >= reg + size) { + offset = reg & reg_list[i].size - 1; + reg -= offset; + if ((reg_list[i].flags & size) != 0) { + reg_list[i].write(hypctx, reg, offset, + size, wval, NULL); + } else { + /* + * See the comment in vgic_register_read. + * For writes the controller ignores the + * operation. + */ + } + return (true); + } + } + return (false); +} + +static int +dist_read(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vgic_v3 *vgic; + uint64_t reg; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vgic = hyp->vgic; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < vgic->dist_start || fault_ipa + size > vgic->dist_end) { + return (EINVAL); + } + + reg = fault_ipa - vgic->dist_start; + /* Check the register is correctly aligned */ + if ((reg & (size - 1)) != 0) + return (EINVAL); + + if (vgic_register_read(hypctx, dist_registers, nitems(dist_registers), + reg, size, rval, NULL)) + return (0); + + /* Reserved register addresses are RES0 so we can hardware it to 0 */ + *rval = 0; + + return (0); +} + +static int +dist_write(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vgic_v3 *vgic; + uint64_t reg; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vgic = hyp->vgic; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < vgic->dist_start || fault_ipa + size > vgic->dist_end) { + return (EINVAL); + } + + reg = fault_ipa - vgic->dist_start; + /* Check the register is correctly aligned */ + if ((reg & (size - 1)) != 0) + return (EINVAL); + + if (vgic_register_write(hypctx, dist_registers, nitems(dist_registers), + reg, size, wval, NULL)) + return (0); + + /* Reserved register addresses are RES0 so we can ignore the write */ + return (0); +} + +/* + * Redistributor register handlers. 
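+ *
+ * The Redistributor is presented as two 64KiB frames: RD_base for the
+ * control registers and SGI_base for the SGI/PPI state. redist_read()
+ * and redist_write() split the faulting offset accordingly: offsets
+ * below GICR_RD_BASE_SIZE are looked up in redist_rd_registers, the
+ * rest in redist_sgi_registers after subtracting GICR_SGI_BASE.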
+ * + * RD_base: + */ +/* GICR_CTLR */ +static void +redist_ctlr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + /* LPIs not supported */ + *rval = 0; +} + +/* GICR_IIDR */ +static void +redist_iidr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + *rval = VGIC_IIDR; +} + +/* GICR_TYPER */ +static void +redist_typer_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + *rval = hypctx->vgic_cpu->gicr_typer; +} + +/* + * SGI_base: + */ +/* GICR_ISENABLER0 */ +static void +redist_ienabler0_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = read_enabler(hypctx, 0); +} + +static void +redist_isenabler0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_enabler(hypctx, 0, true, wval); +} + +/* GICR_ICENABLER0 */ +static void +redist_icenabler0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_enabler(hypctx, 0, false, wval); +} + +/* GICR_ISPENDR0 */ +static void +redist_ipendr0_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = read_pendr(hypctx, 0); +} + +static void +redist_ispendr0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_pendr(hypctx, 0, true, wval); +} + +/* GICR_ICPENDR0 */ +static void +redist_icpendr0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_pendr(hypctx, 0, false, wval); +} + +/* GICR_ISACTIVER0 */ +static void +redist_iactiver0_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = read_activer(hypctx, 0); +} + +static void +redist_isactiver0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + write_activer(hypctx, 0, true, wval); +} + +/* GICR_ICACTIVER0 */ +static void +redist_icactiver0_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + write_activer(hypctx, 0, false, wval); +} + +/* GICR_IPRIORITYR */ +static void +redist_ipriorityr_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICR_IPRIORITYR(0)) / 4; + *rval = read_priorityr(hypctx, n); +} + +static void +redist_ipriorityr_write(struct hypctx *hypctx, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + u_int irq_base; + + irq_base = (reg - GICR_IPRIORITYR(0)) + offset; + write_priorityr(hypctx, irq_base, size, wval); +} + +/* GICR_ICFGR1 */ +static void +redist_icfgr1_read(struct hypctx *hypctx, u_int reg, uint64_t *rval, void *arg) +{ + *rval = read_config(hypctx, 0); +} + +static void +redist_icfgr1_write(struct hypctx *hypctx, u_int reg, u_int offset, u_int size, + uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_config(hypctx, 0, wval); +} + +static int +redist_read(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vgic_v3 *vgic; + uint64_t reg; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vgic = hyp->vgic; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < vgic->redist_start || + fault_ipa + size > vgic->redist_end) { + return (EINVAL); + } + + reg = fault_ipa - 
vgic->redist_start; + /* Check the register is correctly aligned */ + if ((reg & (size - 1)) != 0) + return (EINVAL); + + if (reg < GICR_RD_BASE_SIZE) { + if (vgic_register_read(hypctx, redist_rd_registers, + nitems(redist_rd_registers), reg, size, rval, NULL)) + return (0); + } else if (reg < (GICR_SGI_BASE + GICR_SGI_BASE_SIZE)) { + if (vgic_register_read(hypctx, redist_sgi_registers, + nitems(redist_sgi_registers), reg - GICR_SGI_BASE, size, + rval, NULL)) + return (0); + } + + /* Reserved register addresses are RES0 so we can hardware it to 0 */ + *rval = 0; + return (0); +} + +static int +redist_write(struct vcpu *vcpu, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vgic_v3 *vgic; + uint64_t reg; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vgic = hyp->vgic; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < vgic->redist_start || + fault_ipa + size > vgic->redist_end) { + return (EINVAL); + } + + reg = fault_ipa - vgic->redist_start; + /* Check the register is correctly aligned */ + if ((reg & (size - 1)) != 0) + return (EINVAL); + + if (reg < GICR_RD_BASE_SIZE) { + if (vgic_register_write(hypctx, redist_rd_registers, + nitems(redist_rd_registers), reg, size, wval, NULL)) + return (0); + } else if (reg < (GICR_SGI_BASE + GICR_SGI_BASE_SIZE)) { + if (vgic_register_write(hypctx, redist_sgi_registers, + nitems(redist_sgi_registers), reg - GICR_SGI_BASE, size, + wval, NULL)) + return (0); + } + + /* Reserved register addresses are RES0 so we can ignore the write */ + return (0); +} + +static int +vgic_v3_icc_sgi1r_read(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + /* + * TODO: Inject an unknown exception. + */ + *rval = 0; + return (0); +} + +static int +vgic_v3_icc_sgi1r_write(struct vcpu *vcpu, uint64_t rval, void *arg) +{ + struct vm *vm; + struct hyp *hyp; + cpuset_t active_cpus; + uint64_t mpidr, aff1, aff2, aff3; + uint32_t irqid; + int cpus, cpu_off, target_vcpuid, vcpuid; + + vm = vcpu_vm(vcpu); + hyp = vm_get_cookie(vm); + active_cpus = vm_active_cpus(vm); + vcpuid = vcpu_vcpuid(vcpu); + + irqid = ICC_SGI1R_EL1_SGIID_VAL(rval) >> ICC_SGI1R_EL1_SGIID_SHIFT; + if ((rval & ICC_SGI1R_EL1_IRM) == 0) { + /* Non-zero points at no vcpus */ + if (ICC_SGI1R_EL1_RS_VAL(rval) != 0) + return (0); + + aff1 = ICC_SGI1R_EL1_AFF1_VAL(rval) >> ICC_SGI1R_EL1_AFF1_SHIFT; + aff2 = ICC_SGI1R_EL1_AFF2_VAL(rval) >> ICC_SGI1R_EL1_AFF2_SHIFT; + aff3 = ICC_SGI1R_EL1_AFF3_VAL(rval) >> ICC_SGI1R_EL1_AFF3_SHIFT; + mpidr = aff3 << MPIDR_AFF3_SHIFT | + aff2 << MPIDR_AFF2_SHIFT | aff1 << MPIDR_AFF1_SHIFT; + + cpus = ICC_SGI1R_EL1_TL_VAL(rval) >> ICC_SGI1R_EL1_TL_SHIFT; + cpu_off = 0; + while (cpus > 0) { + if (cpus & 1) { + target_vcpuid = mpidr_to_vcpu(hyp, + mpidr | (cpu_off << MPIDR_AFF0_SHIFT)); + if (target_vcpuid >= 0 && + CPU_ISSET(target_vcpuid, &active_cpus)) { + INJECT_IRQ(hyp, target_vcpuid, irqid, + true); + } + } + cpu_off++; + cpus >>= 1; + } + } else { + /* Send an IPI to all CPUs other than the current CPU */ + for (target_vcpuid = 0; target_vcpuid < vm_get_maxcpus(vm); + target_vcpuid++) { + if (CPU_ISSET(target_vcpuid, &active_cpus) && + target_vcpuid != vcpuid) { + INJECT_IRQ(hyp, target_vcpuid, irqid, true); + } + } + } + + return (0); +} + +static void +vgic_v3_mmio_init(struct hyp *hyp) +{ + struct vgic_v3 *vgic; + struct vgic_v3_irq *irq; + int i; + + /* Allocate memory for the SPIs */ + vgic = hyp->vgic; + vgic->irqs = malloc((VGIC_NIRQS - VGIC_PRV_I_NUM) * + 
sizeof(*vgic->irqs), M_VGIC_V3, M_WAITOK | M_ZERO); + + for (i = 0; i < VGIC_NIRQS - VGIC_PRV_I_NUM; i++) { + irq = &vgic->irqs[i]; + + mtx_init(&irq->irq_spinmtx, "VGIC IRQ spinlock", NULL, + MTX_SPIN); + + irq->irq = i + VGIC_PRV_I_NUM; + } +} + +static void +vgic_v3_mmio_destroy(struct hyp *hyp) +{ + struct vgic_v3 *vgic; + struct vgic_v3_irq *irq; + int i; + + vgic = hyp->vgic; + for (i = 0; i < VGIC_NIRQS - VGIC_PRV_I_NUM; i++) { + irq = &vgic->irqs[i]; + + mtx_destroy(&irq->irq_spinmtx); + } + + free(vgic->irqs, M_VGIC_V3); +} + +static int +vgic_v3_attach_to_vm(device_t dev, struct hyp *hyp, struct vm_vgic_descr *descr) +{ + struct vm *vm; + struct vgic_v3 *vgic; + + if (descr->ver.version != 3) + return (EINVAL); + + /* The register bases need to be 64k aligned */ + if (!__is_aligned(descr->v3_regs.dist_start, PAGE_SIZE_64K) || + !__is_aligned(descr->v3_regs.redist_start, PAGE_SIZE_64K)) + return (EINVAL); + + /* The dist register space is 1 64k block */ + if (descr->v3_regs.dist_size != PAGE_SIZE_64K) + return (EINVAL); + + /* The redist register space is 2 64k blocks */ + if (descr->v3_regs.redist_size != PAGE_SIZE_64K * 2) + return (EINVAL); + + vm = hyp->vm; + vgic = hyp->vgic; + + /* Set the distributor address and size for trapping guest access. */ + vgic->dist_start = descr->v3_regs.dist_start; + vgic->dist_end = descr->v3_regs.dist_start + descr->v3_regs.dist_size; + + vgic->redist_start = descr->v3_regs.redist_start; + vgic->redist_end = descr->v3_regs.redist_start + + descr->v3_regs.redist_size; + + vm_register_inst_handler(vm, descr->v3_regs.dist_start, + descr->v3_regs.dist_size, dist_read, dist_write); + vm_register_inst_handler(vm, descr->v3_regs.redist_start, + descr->v3_regs.redist_size, redist_read, redist_write); + + vm_register_reg_handler(vm, ISS_MSR_REG(ICC_SGI1R_EL1), + ISS_MSR_REG_MASK, vgic_v3_icc_sgi1r_read, vgic_v3_icc_sgi1r_write, + NULL); + + vgic_v3_mmio_init(hyp); + + hyp->vgic_attached = true; + + return (0); +} + +static void +vgic_v3_detach_from_vm(device_t dev, struct hyp *hyp) +{ + if (hyp->vgic_attached) { + hyp->vgic_attached = false; + vgic_v3_mmio_destroy(hyp); + } +} + +static struct vgic_v3_irq * +vgic_v3_get_irq(struct hyp *hyp, int vcpuid, uint32_t irqid) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + struct hypctx *hypctx; + + if (irqid < VGIC_PRV_I_NUM) { + if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(hyp->vm)) + return (NULL); + hypctx = hyp->ctx[vcpuid]; + if (hypctx == NULL) + return (NULL); + vgic_cpu = hypctx->vgic_cpu; + irq = &vgic_cpu->private_irqs[irqid]; + } else if (irqid <= GIC_LAST_SPI) { + irqid -= VGIC_PRV_I_NUM; + if (irqid >= VGIC_NIRQS) + return (NULL); + irq = &hyp->vgic->irqs[irqid]; + } else if (irqid < GIC_FIRST_LPI) { + return (NULL); + } else { + /* No support for LPIs */ + return (NULL); + } + + mtx_lock_spin(&irq->irq_spinmtx); + return (irq); +} + +static void +vgic_v3_release_irq(struct vgic_v3_irq *irq) +{ + + mtx_unlock_spin(&irq->irq_spinmtx); +} + +static bool +vgic_v3_has_pending_irq(device_t dev, struct hypctx *hypctx) +{ + struct vgic_v3_cpu *vgic_cpu; + bool empty; + + vgic_cpu = hypctx->vgic_cpu; + mtx_lock_spin(&vgic_cpu->lr_mtx); + empty = TAILQ_EMPTY(&vgic_cpu->irq_act_pend); + mtx_unlock_spin(&vgic_cpu->lr_mtx); + + return (!empty); +} + +static bool +vgic_v3_check_irq(struct vgic_v3_irq *irq, bool level) +{ + /* + * Only inject if: + * - Level-triggered IRQ: level changes low -> high + * - Edge-triggered IRQ: level is high + */ + switch (irq->config & VGIC_CONFIG_MASK) { + case 
VGIC_CONFIG_LEVEL: + return (level != irq->level); + case VGIC_CONFIG_EDGE: + return (level); + default: + break; + } + + return (false); +} + +static int +vgic_v3_inject_irq(device_t dev, struct hyp *hyp, int vcpuid, uint32_t irqid, + bool level) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + struct hypctx *hypctx; + int target_vcpu; + bool notify; + + if (!hyp->vgic_attached) + return (ENODEV); + + KASSERT(vcpuid == -1 || irqid < VGIC_PRV_I_NUM, + ("%s: SPI/LPI with vcpuid set: irq %u vcpuid %u", __func__, irqid, + vcpuid)); + + irq = vgic_v3_get_irq(hyp, vcpuid, irqid); + if (irq == NULL) { + eprintf("Malformed IRQ %u.\n", irqid); + return (EINVAL); + } + + target_vcpu = irq->target_vcpu; + KASSERT(vcpuid == -1 || vcpuid == target_vcpu, + ("%s: Interrupt %u has bad cpu affinity: vcpu %d target vcpu %d", + __func__, irqid, vcpuid, target_vcpu)); + KASSERT(target_vcpu >= 0 && target_vcpu < vm_get_maxcpus(hyp->vm), + ("%s: Interrupt %u sent to invalid vcpu %d", __func__, irqid, + target_vcpu)); + + if (vcpuid == -1) + vcpuid = target_vcpu; + /* TODO: Check from 0 to vm->maxcpus */ + if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(hyp->vm)) { + vgic_v3_release_irq(irq); + return (EINVAL); + } + + hypctx = hyp->ctx[vcpuid]; + if (hypctx == NULL) { + vgic_v3_release_irq(irq); + return (EINVAL); + } + + notify = false; + vgic_cpu = hypctx->vgic_cpu; + + mtx_lock_spin(&vgic_cpu->lr_mtx); + + if (!vgic_v3_check_irq(irq, level)) { + goto out; + } + + if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_LEVEL) + irq->level = level; + else /* VGIC_CONFIG_EDGE */ + irq->pending = true; + + notify = vgic_v3_queue_irq(hyp, vgic_cpu, vcpuid, irq); + +out: + mtx_unlock_spin(&vgic_cpu->lr_mtx); + vgic_v3_release_irq(irq); + + if (notify) + vcpu_notify_event(vm_vcpu(hyp->vm, vcpuid)); + + return (0); +} + +static int +vgic_v3_inject_msi(device_t dev, struct hyp *hyp, uint64_t msg, uint64_t addr) +{ + struct vgic_v3 *vgic; + uint64_t reg; + + vgic = hyp->vgic; + + /* This is a 4 byte register */ + if (addr < vgic->dist_start || addr + 4 > vgic->dist_end) { + return (EINVAL); + } + + reg = addr - vgic->dist_start; + if (reg != GICD_SETSPI_NSR) + return (EINVAL); + + return (INJECT_IRQ(hyp, -1, msg, true)); +} + +static void +vgic_v3_flush_hwstate(device_t dev, struct hypctx *hypctx) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + int i; + + vgic_cpu = hypctx->vgic_cpu; + + /* + * All Distributor writes have been executed at this point, do not + * protect Distributor reads with a mutex. + * + * This is callled with all interrupts disabled, so there is no need for + * a List Register spinlock either. 
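+ *
+ * Each buffered IRQ is copied into a free ICH_LR_EL2 register below as
+ * ICH_LR_EL2_GROUP1 | (priority << ICH_LR_EL2_PRIO_SHIFT) | vINTID,
+ * with the PENDING and/or ACTIVE state bits set from the software
+ * state.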
+ */ + mtx_lock_spin(&vgic_cpu->lr_mtx); + + hypctx->vgic_v3_regs.ich_hcr_el2 &= ~ICH_HCR_EL2_UIE; + + /* Exit early if there are no buffered interrupts */ + if (TAILQ_EMPTY(&vgic_cpu->irq_act_pend)) + goto out; + + KASSERT(vgic_cpu->ich_lr_used == 0, ("%s: Used LR count not zero %u", + __func__, vgic_cpu->ich_lr_used)); + + i = 0; + hypctx->vgic_v3_regs.ich_elrsr_el2 = + (1u << hypctx->vgic_v3_regs.ich_lr_num) - 1; + TAILQ_FOREACH(irq, &vgic_cpu->irq_act_pend, act_pend_list) { + /* No free list register, stop searching for IRQs */ + if (i == hypctx->vgic_v3_regs.ich_lr_num) + break; + + if (!irq->enabled) + continue; + + hypctx->vgic_v3_regs.ich_lr_el2[i] = ICH_LR_EL2_GROUP1 | + ((uint64_t)irq->priority << ICH_LR_EL2_PRIO_SHIFT) | + irq->irq; + + if (irq->active) { + hypctx->vgic_v3_regs.ich_lr_el2[i] |= + ICH_LR_EL2_STATE_ACTIVE; + } + +#ifdef notyet + /* TODO: Check why this is needed */ + if ((irq->config & _MASK) == LEVEL) + hypctx->vgic_v3_regs.ich_lr_el2[i] |= ICH_LR_EL2_EOI; +#endif + + if (!irq->active && vgic_v3_irq_pending(irq)) { + hypctx->vgic_v3_regs.ich_lr_el2[i] |= + ICH_LR_EL2_STATE_PENDING; + + /* + * This IRQ is now pending on the guest. Allow for + * another edge that could cause the interrupt to + * be raised again. + */ + if ((irq->config & VGIC_CONFIG_MASK) == + VGIC_CONFIG_EDGE) { + irq->pending = false; + } + } + + i++; + } + vgic_cpu->ich_lr_used = i; + +out: + mtx_unlock_spin(&vgic_cpu->lr_mtx); +} + +static void +vgic_v3_sync_hwstate(device_t dev, struct hypctx *hypctx) +{ + struct vgic_v3_cpu *vgic_cpu; + struct vgic_v3_irq *irq; + uint64_t lr; + int i; + + vgic_cpu = hypctx->vgic_cpu; + + /* Exit early if there are no buffered interrupts */ + if (vgic_cpu->ich_lr_used == 0) + return; + + /* + * Check on the IRQ state after running the guest. ich_lr_used and + * ich_lr_el2 are only ever used within this thread so is safe to + * access unlocked. + */ + for (i = 0; i < vgic_cpu->ich_lr_used; i++) { + lr = hypctx->vgic_v3_regs.ich_lr_el2[i]; + hypctx->vgic_v3_regs.ich_lr_el2[i] = 0; + + irq = vgic_v3_get_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + ICH_LR_EL2_VINTID(lr)); + if (irq == NULL) + continue; + + irq->active = (lr & ICH_LR_EL2_STATE_ACTIVE) != 0; + + if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_EDGE) { + /* + * If we have an edge triggered IRQ preserve the + * pending bit until the IRQ has been handled. + */ + if ((lr & ICH_LR_EL2_STATE_PENDING) != 0) { + irq->pending = true; + } + } else { + /* + * If we have a level triggerend IRQ remove the + * pending bit if the IRQ has been handled. + * The level is separate, so may still be high + * triggering another IRQ. 
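+ *
+ * Below, an IRQ that is still active is moved to the head of
+ * irq_act_pend so it is written back to a List Register first on the
+ * next guest entry; an IRQ that is neither active nor pending is
+ * dropped from the list.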
+ */ + if ((lr & ICH_LR_EL2_STATE_PENDING) == 0) { + irq->pending = false; + } + } + + /* Lock to update irq_act_pend */ + mtx_lock_spin(&vgic_cpu->lr_mtx); + if (irq->active) { + /* Ensure the active IRQ is at the head of the list */ + TAILQ_REMOVE(&vgic_cpu->irq_act_pend, irq, + act_pend_list); + TAILQ_INSERT_HEAD(&vgic_cpu->irq_act_pend, irq, + act_pend_list); + } else if (!vgic_v3_irq_pending(irq)) { + /* If pending or active remove from the list */ + TAILQ_REMOVE(&vgic_cpu->irq_act_pend, irq, + act_pend_list); + irq->on_aplist = false; + } + mtx_unlock_spin(&vgic_cpu->lr_mtx); + vgic_v3_release_irq(irq); + } + + hypctx->vgic_v3_regs.ich_hcr_el2 &= ~ICH_HCR_EL2_EOICOUNT_MASK; + vgic_cpu->ich_lr_used = 0; +} + +static void +vgic_v3_init(device_t dev) +{ + uint64_t ich_vtr_el2; + uint32_t pribits, prebits; + + ich_vtr_el2 = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_ICH_VTR); + + /* TODO: These fields are common with the vgicv2 driver */ + pribits = ICH_VTR_EL2_PRIBITS(ich_vtr_el2); + switch (pribits) { + default: + case 5: + virt_features.min_prio = 0xf8; + break; + case 6: + virt_features.min_prio = 0xfc; + break; + case 7: + virt_features.min_prio = 0xfe; + break; + case 8: + virt_features.min_prio = 0xff; + break; + } + + prebits = ICH_VTR_EL2_PREBITS(ich_vtr_el2); + switch (prebits) { + default: + case 5: + virt_features.ich_apr_num = 1; + break; + case 6: + virt_features.ich_apr_num = 2; + break; + case 7: + virt_features.ich_apr_num = 4; + break; + } + + virt_features.ich_lr_num = ICH_VTR_EL2_LISTREGS(ich_vtr_el2); +} + +static int +vgic_v3_probe(device_t dev) +{ + if (!gic_get_vgic(dev)) + return (EINVAL); + + /* We currently only support the GICv3 */ + if (gic_get_hw_rev(dev) < 3) + return (EINVAL); + + device_set_desc(dev, "Virtual GIC v3"); + return (BUS_PROBE_DEFAULT); +} + +static int +vgic_v3_attach(device_t dev) +{ + vgic_dev = dev; + return (0); +} + +static int +vgic_v3_detach(device_t dev) +{ + vgic_dev = NULL; + return (0); +} + +static device_method_t vgic_v3_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vgic_v3_probe), + DEVMETHOD(device_attach, vgic_v3_attach), + DEVMETHOD(device_detach, vgic_v3_detach), + + /* VGIC interface */ + DEVMETHOD(vgic_init, vgic_v3_init), + DEVMETHOD(vgic_attach_to_vm, vgic_v3_attach_to_vm), + DEVMETHOD(vgic_detach_from_vm, vgic_v3_detach_from_vm), + DEVMETHOD(vgic_vminit, vgic_v3_vminit), + DEVMETHOD(vgic_cpuinit, vgic_v3_cpuinit), + DEVMETHOD(vgic_cpucleanup, vgic_v3_cpucleanup), + DEVMETHOD(vgic_vmcleanup, vgic_v3_vmcleanup), + + DEVMETHOD(vgic_has_pending_irq, vgic_v3_has_pending_irq), + DEVMETHOD(vgic_inject_irq, vgic_v3_inject_irq), + DEVMETHOD(vgic_inject_msi, vgic_v3_inject_msi), + DEVMETHOD(vgic_flush_hwstate, vgic_v3_flush_hwstate), + DEVMETHOD(vgic_sync_hwstate, vgic_v3_sync_hwstate), + + /* End */ + DEVMETHOD_END +}; + +/* TODO: Create a vgic base class? */ +DEFINE_CLASS_0(vgic, vgic_v3_driver, vgic_v3_methods, 0); + +DRIVER_MODULE(vgic_v3, gic, vgic_v3_driver, 0, 0); diff --git a/sys/arm64/vmm/io/vgic_v3_reg.h b/sys/arm64/vmm/io/vgic_v3_reg.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vgic_v3_reg.h @@ -0,0 +1,129 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2018 The FreeBSD Foundation + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VGIC_V3_REG_H_ +#define _VGIC_V3_REG_H_ + +/* Interrupt Controller End of Interrupt Status Register */ +#define ICH_EISR_EL2_STATUS_MASK 0xffff +#define ICH_EISR_EL2_EOI_NOT_HANDLED(lr) ((1 << lr) & ICH_EISR_EL2_STATUS_MASK) + +/* Interrupt Controller Empty List Register Status Register */ +#define ICH_ELSR_EL2_STATUS_MASK 0xffff +#define ICH_ELSR_EL2_LR_EMPTY(x) ((1 << x) & ICH_ELSR_EL2_STATUS_MASK) + +/* Interrupt Controller Hyp Control Register */ +#define ICH_HCR_EL2_EOICOUNT_SHIFT 27 +#define ICH_HCR_EL2_EOICOUNT_MASK (0x1f << ICH_HCR_EL2_EOICOUNT_SHIFT) +#define ICH_HCR_EL2_TDIR (1 << 14) /* Trap non-secure EL1 writes to IC{C, V}_DIR_EL1 */ +#define ICH_HCR_EL2_TSEI (1 << 14) /* Trap System Error Interupts (SEI) to EL2 */ +#define ICH_HCR_EL2_TALL1 (1 << 12) /* Trap non-secure EL1 accesses to IC{C, V}_* for Group 1 interrupts */ +#define ICH_HCR_EL2_TALL0 (1 << 11) /* Trap non-secure EL1 accesses to IC{C, V}_* for Group 0 interrupts */ +#define ICH_HCR_EL2_TC (1 << 10) /* Trap non-secure EL1 accesses to common IC{C, V}_* registers */ +#define ICH_HCR_EL2_VGRP1DIE (1 << 7) /* VM Group 1 Disabled Interrupt Enable */ +#define ICH_HCR_EL2_VGRP1EIE (1 << 6) /* VM Group 1 Enabled Interrupt Enable */ +#define ICH_HCR_EL2_VGRP0DIE (1 << 5) /* VM Group 0 Disabled Interrupt Enable */ +#define ICH_HCR_EL2_VGRP0EIE (1 << 4) /* VM Group 0 Enabled Interrupt Enable */ +#define ICH_HCR_EL2_NPIE (1 << 3) /* No Pending Interrupt Enable */ +#define ICH_HCR_EL2_LRENPIE (1 << 2) /* List Register Entry Not Present Interrupt Enable */ +#define ICH_HCR_EL2_UIE (1 << 1) /* Underflow Interrupt Enable */ +#define ICH_HCR_EL2_En (1 << 0) /* Global enable for the virtual CPU interface */ + +/* Interrupt Controller List Registers */ +#define ICH_LR_EL2_VINTID_MASK 0xffffffff +#define ICH_LR_EL2_VINTID(x) ((x) & ICH_LR_EL2_VINTID_MASK) +#define ICH_LR_EL2_PINTID_SHIFT 32 +#define ICH_LR_EL2_PINTID_MASK (0x3fUL << ICH_LR_EL2_PINTID_SHIFT) +/* Raise a maintanance IRQ when deactivated (only non-HW virqs) */ +#define ICH_LR_EL2_EOI (1UL << 41) +#define ICH_LR_EL2_PRIO_SHIFT 48 +#define ICH_LR_EL2_PRIO_MASK (0xffUL << ICH_LR_EL2_PRIO_SHIFT) +#define ICH_LR_EL2_GROUP_SHIFT 60 +#define ICH_LR_EL2_GROUP1 (1UL << ICH_LR_EL2_GROUP_SHIFT) +#define ICH_LR_EL2_HW (1UL << 61) +#define ICH_LR_EL2_STATE_SHIFT 62 +#define ICH_LR_EL2_STATE_MASK (0x3UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE(x) ((x) & 
ICH_LR_EL2_STATE_MASK) +#define ICH_LR_EL2_STATE_INACTIVE (0x0UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE_PENDING (0x1UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE_ACTIVE (0x2UL << ICH_LR_EL2_STATE_SHIFT) +#define ICH_LR_EL2_STATE_PENDING_ACTIVE (0x3UL << ICH_LR_EL2_STATE_SHIFT) + +/* Interrupt Controller Maintenance Interrupt State Register */ +#define ICH_MISR_EL2_VGRP1D (1 << 7) /* vPE Group 1 Disabled */ +#define ICH_MISR_EL2_VGRP1E (1 << 6) /* vPE Group 1 Enabled */ +#define ICH_MISR_EL2_VGRP0D (1 << 5) /* vPE Group 0 Disabled */ +#define ICH_MISR_EL2_VGRP0E (1 << 4) /* vPE Group 0 Enabled */ +#define ICH_MISR_EL2_NP (1 << 3) /* No Pending */ +#define ICH_MISR_EL2_LRENP (1 << 2) /* List Register Entry Not Present */ +#define ICH_MISR_EL2_U (1 << 1) /* Underflow */ +#define ICH_MISR_EL2_EOI (1 << 0) /* End Of Interrupt */ + +/* Interrupt Controller Virtual Machine Control Register */ +#define ICH_VMCR_EL2_VPMR_SHIFT 24 +#define ICH_VMCR_EL2_VPMR_MASK (0xff << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VPMR_PRIO_LOWEST (0xff << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VPMR_PRIO_HIGHEST (0x00 << ICH_VMCR_EL2_VPMR_SHIFT) +#define ICH_VMCR_EL2_VBPR0_SHIFT 21 +#define ICH_VMCR_EL2_VBPR0_MASK (0x7 << ICH_VMCR_EL2_VBPR0_SHIFT) +#define ICH_VMCR_EL2_VBPR0_NO_PREEMPTION \ + (0x7 << ICH_VMCR_EL2_VBPR0_SHIFT) +#define ICH_VMCR_EL2_VBPR1_SHIFT 18 +#define ICH_VMCR_EL2_VBPR1_MASK (0x7 << ICH_VMCR_EL2_VBPR1_SHIFT) +#define ICH_VMCR_EL2_VBPR1_NO_PREEMPTION \ + (0x7 << ICH_VMCR_EL2_VBPR1_SHIFT) +#define ICH_VMCR_EL2_VEOIM (1 << 9) /* Virtual EOI mode */ +#define ICH_VMCR_EL2_VCBPR (1 << 4) /* Virtual Common binary Point Register */ +#define ICH_VMCR_EL2_VFIQEN (1 << 3) /* Virtual FIQ enable */ +#define ICH_VMCR_EL2_VACKCTL (1 << 2) /* Virtual AckCtl */ +#define ICH_VMCR_EL2_VENG1 (1 << 1) /* Virtual Group 1 Interrupt Enable */ +#define ICH_VMCR_EL2_VENG0 (1 << 0) /* Virtual Group 0 Interrupt Enable */ + +/* Interrupt Controller VGIC Type Register */ +#define ICH_VTR_EL2_PRIBITS_SHIFT 29 +#define ICH_VTR_EL2_PRIBITS_MASK (0x7 << ICH_VTR_EL2_PRIBITS_SHIFT) +#define ICH_VTR_EL2_PRIBITS(x) \ + ((((x) & ICH_VTR_EL2_PRIBITS_MASK) >> ICH_VTR_EL2_PRIBITS_SHIFT) + 1) +#define ICH_VTR_EL2_PREBITS_SHIFT 26 +#define ICH_VTR_EL2_PREBITS_MASK (0x7 << ICH_VTR_EL2_PREBITS_SHIFT) +#define ICH_VTR_EL2_PREBITS(x) \ + (((x) & ICH_VTR_EL2_PREBITS_MASK) >> ICH_VTR_EL2_PREBITS_SHIFT) +#define ICH_VTR_EL2_SEIS (1 << 22) /* System Error Interrupt (SEI) Support */ +#define ICH_VTR_EL2_A3V (1 << 21) /* Affinity 3 Valid */ +#define ICH_VTR_EL2_NV4 (1 << 20) /* Direct injection of virtual interrupts. RES1 for GICv3 */ +#define ICH_VTR_EL2_TDS (1 << 19) /* Implementation supports ICH_HCR_EL2.TDIR */ +#define ICH_VTR_EL2_LISTREGS_MASK 0x1f +/* + * ICH_VTR_EL2.ListRegs holds the number of list registers, minus one. Add one + * to get the actual number of list registers. + */ +#define ICH_VTR_EL2_LISTREGS(x) (((x) & ICH_VTR_EL2_LISTREGS_MASK) + 1) + +#endif /* !_VGIC_V3_REG_H_ */ diff --git a/sys/arm64/vmm/io/vtimer.h b/sys/arm64/vmm/io/vtimer.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vtimer.h @@ -0,0 +1,85 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2017 The FreeBSD Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_VTIMER_H_ +#define _VMM_VTIMER_H_ + +#define GT_PHYS_NS_IRQ 30 +#define GT_VIRT_IRQ 27 + +struct hyp; +struct hypctx; + +struct vtimer { + uint64_t cnthctl_el2; + uint64_t cntvoff_el2; +}; + +struct vtimer_timer { + struct callout callout; + struct mtx mtx; + + uint32_t irqid; + + /* + * These registers are either emulated for the physical timer, or + * the guest has full access to them for the virtual timer. + + * CNTx_CTL_EL0: Counter-timer Timer Control Register + * CNTx_CVAL_EL0: Counter-timer Timer CompareValue Register + */ + uint64_t cntx_cval_el0; + uint64_t cntx_ctl_el0; +}; + +struct vtimer_cpu { + struct vtimer_timer phys_timer; + struct vtimer_timer virt_timer; + + uint32_t cntkctl_el1; +}; + +int vtimer_init(uint64_t cnthctl_el2); +void vtimer_vminit(struct hyp *); +void vtimer_cpuinit(struct hypctx *); +void vtimer_cpucleanup(struct hypctx *); +void vtimer_vmcleanup(struct hyp *); +void vtimer_cleanup(void); +void vtimer_sync_hwstate(struct hypctx *hypctx); + +int vtimer_phys_ctl_read(struct vcpu *vcpu, uint64_t *rval, void *arg); +int vtimer_phys_ctl_write(struct vcpu *vcpu, uint64_t wval, void *arg); +int vtimer_phys_cnt_read(struct vcpu *vcpu, uint64_t *rval, void *arg); +int vtimer_phys_cnt_write(struct vcpu *vcpu, uint64_t wval, void *arg); +int vtimer_phys_cval_read(struct vcpu *vcpu, uint64_t *rval, void *arg); +int vtimer_phys_cval_write(struct vcpu *vcpu, uint64_t wval, void *arg); +int vtimer_phys_tval_read(struct vcpu *vcpu, uint64_t *rval, void *arg); +int vtimer_phys_tval_write(struct vcpu *vcpu, uint64_t wval, void *arg); +#endif diff --git a/sys/arm64/vmm/io/vtimer.c b/sys/arm64/vmm/io/vtimer.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vtimer.c @@ -0,0 +1,503 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2017 The FreeBSD Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "vgic.h" +#include "vtimer.h" + +#define RES1 0xffffffffffffffffUL + +#define timer_enabled(ctl) \ + (!((ctl) & CNTP_CTL_IMASK) && ((ctl) & CNTP_CTL_ENABLE)) + +static uint64_t cnthctl_el2_reg; +static uint32_t tmr_frq; + +#define timer_condition_met(ctl) ((ctl) & CNTP_CTL_ISTATUS) + +static void vtimer_schedule_irq(struct hypctx *hypctx, bool phys); + +static int +vtimer_virtual_timer_intr(void *arg) +{ + struct hypctx *hypctx; + uint64_t cntpct_el0; + uint32_t cntv_ctl; + + hypctx = arm64_get_active_vcpu(); + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + + if (!hypctx) { + /* vm_destroy() was called. */ + eprintf("No active vcpu\n"); + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + goto out; + } + if (!timer_enabled(cntv_ctl)) { + eprintf("Timer not enabled\n"); + goto out; + } + if (!timer_condition_met(cntv_ctl)) { + eprintf("Timer condition not met\n"); + goto out; + } + + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - + hypctx->hyp->vtimer.cntvoff_el2; + if (hypctx->vtimer_cpu.virt_timer.cntx_cval_el0 < cntpct_el0) + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + GT_VIRT_IRQ, true); + + cntv_ctl = hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0; + +out: + /* + * Disable the timer interrupt. This will prevent the interrupt from + * being reasserted as soon as we exit the handler and getting stuck + * in an infinite loop. + * + * This is safe to do because the guest disabled the timer, and then + * enables it as part of the interrupt handling routine. + */ + cntv_ctl &= ~CNTP_CTL_ENABLE; + WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl); + + return (FILTER_HANDLED); +} + +int +vtimer_init(uint64_t cnthctl_el2) +{ + cnthctl_el2_reg = cnthctl_el2; + /* + * The guest *MUST* use the same timer frequency as the host. The + * register CNTFRQ_EL0 is accessible to the guest and a different value + * in the guest dts file might have unforseen consequences. + */ + tmr_frq = READ_SPECIALREG(cntfrq_el0); + + return (0); +} + +void +vtimer_vminit(struct hyp *hyp) +{ + uint64_t now; + + /* + * Configure the Counter-timer Hypervisor Control Register for the VM. 
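+ * Both bits are cleared below, so guest EL1 accesses to the physical
+ * timer and counter trap to EL2 and are emulated in software by the
+ * vtimer_phys_*() accessors later in this file; the virtual timer
+ * remains directly accessible to the guest.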
+ * + * CNTHCTL_EL1PCEN: trap access to CNTP_{CTL, CVAL, TVAL}_EL0 from EL1 + * CNTHCTL_EL1PCTEN: trap access to CNTPCT_EL0 + */ + hyp->vtimer.cnthctl_el2 = cnthctl_el2_reg & ~CNTHCTL_EL1PCEN; + hyp->vtimer.cnthctl_el2 &= ~CNTHCTL_EL1PCTEN; + + now = READ_SPECIALREG(cntpct_el0); + hyp->vtimer.cntvoff_el2 = now; + + return; +} + +void +vtimer_cpuinit(struct hypctx *hypctx) +{ + struct vtimer_cpu *vtimer_cpu; + + vtimer_cpu = &hypctx->vtimer_cpu; + /* + * Configure physical timer interrupts for the VCPU. + * + * CNTP_CTL_IMASK: mask interrupts + * ~CNTP_CTL_ENABLE: disable the timer + */ + vtimer_cpu->phys_timer.cntx_ctl_el0 = CNTP_CTL_IMASK & ~CNTP_CTL_ENABLE; + + mtx_init(&vtimer_cpu->phys_timer.mtx, "vtimer phys callout mutex", NULL, + MTX_DEF); + callout_init_mtx(&vtimer_cpu->phys_timer.callout, + &vtimer_cpu->phys_timer.mtx, 0); + vtimer_cpu->phys_timer.irqid = GT_PHYS_NS_IRQ; + + mtx_init(&vtimer_cpu->virt_timer.mtx, "vtimer virt callout mutex", NULL, + MTX_DEF); + callout_init_mtx(&vtimer_cpu->virt_timer.callout, + &vtimer_cpu->virt_timer.mtx, 0); + vtimer_cpu->virt_timer.irqid = GT_VIRT_IRQ; +} + +void +vtimer_cpucleanup(struct hypctx *hypctx) +{ + struct vtimer_cpu *vtimer_cpu; + + vtimer_cpu = &hypctx->vtimer_cpu; + callout_drain(&vtimer_cpu->phys_timer.callout); + callout_drain(&vtimer_cpu->virt_timer.callout); + mtx_destroy(&vtimer_cpu->phys_timer.mtx); + mtx_destroy(&vtimer_cpu->virt_timer.mtx); +} + +void +vtimer_vmcleanup(struct hyp *hyp) +{ + struct hypctx *hypctx; + uint32_t cntv_ctl; + + hypctx = arm64_get_active_vcpu(); + if (!hypctx) { + /* The active VM was destroyed, stop the timer. */ + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + cntv_ctl &= ~CNTP_CTL_ENABLE; + WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl); + } +} + +void +vtimer_cleanup(void) +{ +} + +void +vtimer_sync_hwstate(struct hypctx *hypctx) +{ + struct vtimer_timer *timer; + uint64_t cntpct_el0; + + timer = &hypctx->vtimer_cpu.virt_timer; + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - + hypctx->hyp->vtimer.cntvoff_el2; + if (!timer_enabled(timer->cntx_ctl_el0)) { + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + timer->irqid, false); + } else if (timer->cntx_cval_el0 < cntpct_el0) { + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + timer->irqid, true); + } else { + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + timer->irqid, false); + vtimer_schedule_irq(hypctx, false); + } +} + +static void +vtimer_inject_irq_callout_phys(void *context) +{ + struct hypctx *hypctx; + + hypctx = context; + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + hypctx->vtimer_cpu.phys_timer.irqid, true); +} + +static void +vtimer_inject_irq_callout_virt(void *context) +{ + struct hypctx *hypctx; + + hypctx = context; + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + hypctx->vtimer_cpu.virt_timer.irqid, true); +} + +static void +vtimer_schedule_irq(struct hypctx *hypctx, bool phys) +{ + sbintime_t time; + struct vtimer_timer *timer; + uint64_t cntpct_el0; + uint64_t diff; + + if (phys) + timer = &hypctx->vtimer_cpu.phys_timer; + else + timer = &hypctx->vtimer_cpu.virt_timer; + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - + hypctx->hyp->vtimer.cntvoff_el2; + if (timer->cntx_cval_el0 < cntpct_el0) { + /* Timer set in the past, trigger interrupt */ + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(hypctx->vcpu), + timer->irqid, true); + } else { + diff = timer->cntx_cval_el0 - cntpct_el0; + time = diff * SBT_1S / tmr_frq; + if (phys) + callout_reset_sbt(&timer->callout, time, 0, + 
vtimer_inject_irq_callout_phys, hypctx, 0); + else + callout_reset_sbt(&timer->callout, time, 0, + vtimer_inject_irq_callout_virt, hypctx, 0); + } +} + +static void +vtimer_remove_irq(struct hypctx *hypctx, struct vcpu *vcpu) +{ + struct vtimer_cpu *vtimer_cpu; + struct vtimer_timer *timer; + + vtimer_cpu = &hypctx->vtimer_cpu; + timer = &vtimer_cpu->phys_timer; + + callout_drain(&timer->callout); + /* + * The interrupt needs to be deactivated here regardless of the callout + * function having been executed. The timer interrupt can be masked with + * the CNTP_CTL_EL0.IMASK bit instead of reading the IAR register. + * Masking the interrupt doesn't remove it from the list registers. + */ + vgic_inject_irq(hypctx->hyp, vcpu_vcpuid(vcpu), timer->irqid, false); +} + +/* + * Timer emulation functions. + * + * The guest should use the virtual timer, however some software, e.g. u-boot, + * used the physical timer. Emulate this in software for the guest to use. + * + * Adjust for cntvoff_el2 so the physical and virtual timers are at similar + * times. This simplifies interrupt handling in the virtual timer as the + * adjustment will have already happened. + */ + +int +vtimer_phys_ctl_read(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + uint64_t cntpct_el0; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vtimer_cpu = &hypctx->vtimer_cpu; + + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; + if (vtimer_cpu->phys_timer.cntx_cval_el0 < cntpct_el0) + /* Timer condition met */ + *rval = vtimer_cpu->phys_timer.cntx_ctl_el0 | CNTP_CTL_ISTATUS; + else + *rval = vtimer_cpu->phys_timer.cntx_ctl_el0 & ~CNTP_CTL_ISTATUS; + + return (0); +} + +int +vtimer_phys_ctl_write(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + uint64_t ctl_el0; + bool timer_toggled_on; + + hypctx = vcpu_get_cookie(vcpu); + vtimer_cpu = &hypctx->vtimer_cpu; + + timer_toggled_on = false; + ctl_el0 = vtimer_cpu->phys_timer.cntx_ctl_el0; + + if (!timer_enabled(ctl_el0) && timer_enabled(wval)) + timer_toggled_on = true; + else if (timer_enabled(ctl_el0) && !timer_enabled(wval)) + vtimer_remove_irq(hypctx, vcpu); + + vtimer_cpu->phys_timer.cntx_ctl_el0 = wval; + + if (timer_toggled_on) + vtimer_schedule_irq(hypctx, true); + + return (0); +} + +int +vtimer_phys_cnt_read(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + struct vm *vm; + struct hyp *hyp; + + vm = vcpu_vm(vcpu); + hyp = vm_get_cookie(vm); + *rval = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; + return (0); +} + +int +vtimer_phys_cnt_write(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + return (0); +} + +int +vtimer_phys_cval_read(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + + hypctx = vcpu_get_cookie(vcpu); + vtimer_cpu = &hypctx->vtimer_cpu; + + *rval = vtimer_cpu->phys_timer.cntx_cval_el0; + + return (0); +} + +int +vtimer_phys_cval_write(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + + hypctx = vcpu_get_cookie(vcpu); + vtimer_cpu = &hypctx->vtimer_cpu; + + vtimer_cpu->phys_timer.cntx_cval_el0 = wval; + + vtimer_remove_irq(hypctx, vcpu); + if (timer_enabled(vtimer_cpu->phys_timer.cntx_ctl_el0)) { + vtimer_schedule_irq(hypctx, true); + } + + return (0); +} + +int +vtimer_phys_tval_read(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; 
+ struct vtimer_cpu *vtimer_cpu; + uint32_t cntpct_el0; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vtimer_cpu = &hypctx->vtimer_cpu; + + if (!(vtimer_cpu->phys_timer.cntx_ctl_el0 & CNTP_CTL_ENABLE)) { + /* + * ARMv8 Architecture Manual, p. D7-2702: the result of reading + * TVAL when the timer is disabled is UNKNOWN. I have chosen to + * return the maximum value possible on 32 bits which means the + * timer will fire very far into the future. + */ + *rval = (uint32_t)RES1; + } else { + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - + hyp->vtimer.cntvoff_el2; + *rval = vtimer_cpu->phys_timer.cntx_cval_el0 - cntpct_el0; + } + + return (0); +} + +int +vtimer_phys_tval_write(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + uint64_t cntpct_el0; + + hypctx = vcpu_get_cookie(vcpu); + hyp = hypctx->hyp; + vtimer_cpu = &hypctx->vtimer_cpu; + + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; + vtimer_cpu->phys_timer.cntx_cval_el0 = (int32_t)wval + cntpct_el0; + + vtimer_remove_irq(hypctx, vcpu); + if (timer_enabled(vtimer_cpu->phys_timer.cntx_ctl_el0)) { + vtimer_schedule_irq(hypctx, true); + } + + return (0); +} + +struct vtimer_softc { + struct resource *res; + void *ihl; + int rid; +}; + +static int +vtimer_probe(device_t dev) +{ + device_set_desc(dev, "Virtual timer"); + return (BUS_PROBE_DEFAULT); +} + +static int +vtimer_attach(device_t dev) +{ + struct vtimer_softc *sc; + + sc = device_get_softc(dev); + + sc->rid = 0; + sc->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &sc->rid, RF_ACTIVE); + if (sc->res == NULL) + return (ENXIO); + + bus_setup_intr(dev, sc->res, INTR_TYPE_CLK, vtimer_virtual_timer_intr, + NULL, NULL, &sc->ihl); + + return (0); +} + +static device_method_t vtimer_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vtimer_probe), + DEVMETHOD(device_attach, vtimer_attach), + + /* End */ + DEVMETHOD_END +}; + +DEFINE_CLASS_0(vtimer, vtimer_driver, vtimer_methods, + sizeof(struct vtimer_softc)); + +DRIVER_MODULE(vtimer, generic_timer, vtimer_driver, 0, 0); diff --git a/sys/arm64/vmm/mmu.h b/sys/arm64/vmm/mmu.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/mmu.h @@ -0,0 +1,52 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2017 Alexandru Elisei + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_MMU_H_ +#define _VMM_MMU_H_ + +#include +#include +#include + +#include "hyp.h" + +extern char vmm_hyp_code; +extern char vmm_hyp_code_end; + +extern char _vmm_start; +extern char _vmm_end; + +bool vmmpmap_init(void); +void vmmpmap_fini(void); +uint64_t vmmpmap_to_ttbr0(void); +bool vmmpmap_enter(vm_offset_t, vm_size_t, vm_paddr_t, vm_prot_t); +void vmmpmap_remove(vm_offset_t, vm_size_t, bool); + +#endif diff --git a/sys/arm64/vmm/reset.h b/sys/arm64/vmm/reset.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/reset.h @@ -0,0 +1,33 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2018 Alexandru Elisei + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _VMM_RESET_H_ +#define _VMM_RESET_H_ + +void reset_vm_el01_regs(void *vcpu); +void reset_vm_el2_regs(void *vcpu); + +#endif diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm.c @@ -0,0 +1,1799 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vmm_ktr.h" +#include "vmm_stat.h" +#include "arm64.h" +#include "mmu.h" + +#include "io/vgic.h" +#include "io/vtimer.h" + +struct vcpu { + int flags; + enum vcpu_state state; + struct mtx mtx; + int hostcpu; /* host cpuid this vcpu last ran on */ + int vcpuid; + void *stats; + struct vm_exit exitinfo; + uint64_t nextpc; /* (x) next instruction to execute */ + struct vm *vm; /* (o) */ + void *cookie; /* (i) cpu-specific data */ + struct vfpstate *guestfpu; /* (a,i) guest fpu state */ +}; + +#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) +#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) + +struct mem_seg { + uint64_t gpa; + size_t len; + bool wired; + bool sysmem; + vm_object_t object; +}; +#define VM_MAX_MEMSEGS 3 + +struct mem_map { + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff; + int segid; + int prot; + int flags; +}; +#define VM_MAX_MEMMAPS 4 + +struct vmm_mmio_region { + uint64_t start; + uint64_t end; + mem_region_read_t read; + mem_region_write_t write; +}; +#define VM_MAX_MMIO_REGIONS 4 + +struct vmm_special_reg { + uint32_t esr_iss; + uint32_t esr_mask; + reg_read_t reg_read; + reg_write_t reg_write; + void *arg; +}; +#define VM_MAX_SPECIAL_REGS 16 + +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + */ +struct vm { + void *cookie; /* (i) cpu-specific data */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ + int suspend; /* (i) stop VM execution */ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ + struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ + struct vmspace *vmspace; /* (o) guest's address space */ + char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + struct vcpu **vcpu; /* (i) guest vcpus */ + struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS]; + /* (o) guest MMIO regions */ + struct vmm_special_reg 
special_reg[VM_MAX_SPECIAL_REGS]; + /* The following describe the vm cpu topology */ + uint16_t sockets; /* (o) num of sockets */ + uint16_t cores; /* (o) num of cores/socket */ + uint16_t threads; /* (o) num of threads/core */ + uint16_t maxcpus; /* (o) max pluggable cpus */ + struct sx mem_segs_lock; /* (o) */ + struct sx vcpus_init_lock; /* (o) */ +}; + +static bool vmm_initialized = false; + +static int vm_handle_wfi(struct vcpu *vcpu, + struct vm_exit *vme, bool *retu); + +static MALLOC_DEFINE(M_VMM, "vmm", "vmm"); + +/* statistics */ +static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +struct vmm_regs { + uint64_t id_aa64afr0; + uint64_t id_aa64afr1; + uint64_t id_aa64dfr0; + uint64_t id_aa64dfr1; + uint64_t id_aa64isar0; + uint64_t id_aa64isar1; + uint64_t id_aa64isar2; + uint64_t id_aa64mmfr0; + uint64_t id_aa64mmfr1; + uint64_t id_aa64mmfr2; + uint64_t id_aa64pfr0; + uint64_t id_aa64pfr1; +}; + +static const struct vmm_regs vmm_arch_regs_masks = { + .id_aa64dfr0 = + ID_AA64DFR0_CTX_CMPs_MASK | + ID_AA64DFR0_WRPs_MASK | + ID_AA64DFR0_BRPs_MASK | + ID_AA64DFR0_PMUVer_3 | + ID_AA64DFR0_DebugVer_8, + .id_aa64isar0 = + ID_AA64ISAR0_TLB_TLBIOSR | + ID_AA64ISAR0_SHA3_IMPL | + ID_AA64ISAR0_RDM_IMPL | + ID_AA64ISAR0_Atomic_IMPL | + ID_AA64ISAR0_CRC32_BASE | + ID_AA64ISAR0_SHA2_512 | + ID_AA64ISAR0_SHA1_BASE | + ID_AA64ISAR0_AES_PMULL, + .id_aa64mmfr0 = + ID_AA64MMFR0_TGran4_IMPL | + ID_AA64MMFR0_TGran64_IMPL | + ID_AA64MMFR0_TGran16_IMPL | + ID_AA64MMFR0_ASIDBits_16 | + ID_AA64MMFR0_PARange_4P, + .id_aa64mmfr1 = + ID_AA64MMFR1_SpecSEI_IMPL | + ID_AA64MMFR1_PAN_ATS1E1 | + ID_AA64MMFR1_HAFDBS_AF, + .id_aa64pfr0 = + ID_AA64PFR0_GIC_CPUIF_NONE | + ID_AA64PFR0_AdvSIMD_HP | + ID_AA64PFR0_FP_HP | + ID_AA64PFR0_EL3_64 | + ID_AA64PFR0_EL2_64 | + ID_AA64PFR0_EL1_64 | + ID_AA64PFR0_EL0_64, +}; + +/* Host registers masked by vmm_arch_regs_masks. */ +static struct vmm_regs vmm_arch_regs; + +u_int vm_maxcpu; +SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &vm_maxcpu, 0, "Maximum number of vCPUs"); + +static void vm_free_memmap(struct vm *vm, int ident); +static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); +static void vcpu_notify_event_locked(struct vcpu *vcpu); + +/* + * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this + * is a safe value for now. 
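 * Whatever limit is chosen here is also clamped to CPU_SETSIZE by the
 * VM_MAXCPU definition below, so the cpuset_t fields used to track the
 * active, debug and suspended vcpus can represent every vcpu.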
+ */ +#define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) + +static int +vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks) +{ +#define _FETCH_KERN_REG(reg, field) do { \ + regs->field = vmm_arch_regs_masks.field; \ + if (!get_kernel_reg_masked(reg, ®s->field, masks->field)) \ + regs->field = 0; \ +} while (0) + _FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0); + _FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1); + _FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0); + _FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1); + _FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0); + _FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1); + _FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2); + _FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0); + _FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1); + _FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2); + _FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0); + _FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1); +#undef _FETCH_KERN_REG + return (0); +} + +static void +vcpu_cleanup(struct vcpu *vcpu, bool destroy) +{ + vmmops_vcpu_cleanup(vcpu->cookie); + vcpu->cookie = NULL; + if (destroy) { + vmm_stat_free(vcpu->stats); + fpu_save_area_free(vcpu->guestfpu); + vcpu_lock_destroy(vcpu); + } +} + +static struct vcpu * +vcpu_alloc(struct vm *vm, int vcpu_id) +{ + struct vcpu *vcpu; + + KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, + ("vcpu_alloc: invalid vcpu %d", vcpu_id)); + + vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO); + vcpu_lock_init(vcpu); + vcpu->state = VCPU_IDLE; + vcpu->hostcpu = NOCPU; + vcpu->vcpuid = vcpu_id; + vcpu->vm = vm; + vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->stats = vmm_stat_alloc(); + return (vcpu); +} + +static void +vcpu_init(struct vcpu *vcpu) +{ + vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid); + MPASS(vcpu->cookie != NULL); + fpu_save_area_reset(vcpu->guestfpu); + vmm_stat_init(vcpu->stats); +} + +struct vm_exit * +vm_exitinfo(struct vcpu *vcpu) +{ + return (&vcpu->exitinfo); +} + +static int +vmm_init(void) +{ + int error; + + vm_maxcpu = mp_ncpus; + TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); + + if (vm_maxcpu > VM_MAXCPU) { + printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); + vm_maxcpu = VM_MAXCPU; + } + if (vm_maxcpu == 0) + vm_maxcpu = 1; + + error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks); + if (error != 0) + return (error); + + return (vmmops_modinit(0)); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + /* TODO: if (vmm_is_hw_supported()) { */ + vmmdev_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = true; + break; + case MOD_UNLOAD: + /* TODO: if (vmm_is_hw_supported()) { */ + error = vmmdev_cleanup(); + if (error == 0 && vmm_initialized) { + error = vmmops_modcleanup(); + if (error) + vmm_initialized = false; + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - HYP initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). 
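 * The DECLARE_MODULE() below therefore uses SI_SUB_SMP + 1, so MOD_LOAD
 * (and with it vmm_init()) only runs once the secondary CPUs are online.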
+ */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +static void +vm_init(struct vm *vm, bool create) +{ + int i; + + vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); + MPASS(vm->cookie != NULL); + + CPU_ZERO(&vm->active_cpus); + CPU_ZERO(&vm->debug_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + memset(vm->mmio_region, 0, sizeof(vm->mmio_region)); + memset(vm->special_reg, 0, sizeof(vm->special_reg)); + + if (!create) { + for (i = 0; i < vm->maxcpus; i++) { + if (vm->vcpu[i] != NULL) + vcpu_init(vm->vcpu[i]); + } + } +} + +struct vcpu * +vm_alloc_vcpu(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) + return (NULL); + + vcpu = atomic_load_ptr(&vm->vcpu[vcpuid]); + if (__predict_true(vcpu != NULL)) + return (vcpu); + + sx_xlock(&vm->vcpus_init_lock); + vcpu = vm->vcpu[vcpuid]; + if (vcpu == NULL/* && !vm->dying*/) { + vcpu = vcpu_alloc(vm, vcpuid); + vcpu_init(vcpu); + + /* + * Ensure vCPU is fully created before updating pointer + * to permit unlocked reads above. + */ + atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid], + (uintptr_t)vcpu); + } + sx_xunlock(&vm->vcpus_init_lock); + return (vcpu); +} + +void +vm_slock_vcpus(struct vm *vm) +{ + sx_slock(&vm->vcpus_init_lock); +} + +void +vm_unlock_vcpus(struct vm *vm) +{ + sx_unlock(&vm->vcpus_init_lock); +} + +int +vm_create(const char *name, struct vm **retvm) +{ + struct vm *vm; + struct vmspace *vmspace; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. + */ + if (!vmm_initialized) + return (ENXIO); + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (EINVAL); + + vmspace = vmmops_vmspace_alloc(0, 1ul << 39); + if (vmspace == NULL) + return (ENOMEM); + + vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->vmspace = vmspace; + sx_init(&vm->mem_segs_lock, "vm mem_segs"); + sx_init(&vm->vcpus_init_lock, "vm vcpus"); + + vm->sockets = 1; + vm->cores = 1; /* XXX backwards compatibility */ + vm->threads = 1; /* XXX backwards compatibility */ + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ + + vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM, + M_WAITOK | M_ZERO); + + vm_init(vm, true); + + *retvm = vm; + return (0); +} + +void +vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus) +{ + *sockets = vm->sockets; + *cores = vm->cores; + *threads = vm->threads; + *maxcpus = vm->maxcpus; +} + +uint16_t +vm_get_maxcpus(struct vm *vm) +{ + return (vm->maxcpus); +} + +int +vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus) +{ + /* Ignore maxcpus. */ + if ((sockets * cores * threads) > vm->maxcpus) + return (EINVAL); + vm->sockets = sockets; + vm->cores = cores; + vm->threads = threads; + return(0); +} + +static void +vm_cleanup(struct vm *vm, bool destroy) +{ + struct mem_map *mm; + pmap_t pmap __diagused; + int i; + + if (destroy) { + pmap = vmspace_pmap(vm->vmspace); + sched_pin(); + PCPU_SET(curvmpmap, NULL); + sched_unpin(); + CPU_FOREACH(i) { + MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap); + } + } + + vgic_detach_from_vm(vm->cookie); + + for (i = 0; i < vm->maxcpus; i++) { + if (vm->vcpu[i] != NULL) + vcpu_cleanup(vm->vcpu[i], destroy); + } + + vmmops_cleanup(vm->cookie); + + /* + * System memory is removed from the guest address space only when + * the VM is destroyed. 
This is because the mapping remains the same + * across VM reset. + * + * Device memory can be relocated by the guest (e.g. using PCI BARs) + * so those mappings are removed on a VM reset. + */ + if (!destroy) { + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (destroy || !sysmem_mapping(vm, mm)) + vm_free_memmap(vm, i); + } + } + + if (destroy) { + for (i = 0; i < VM_MAX_MEMSEGS; i++) + vm_free_memseg(vm, i); + + vmmops_vmspace_free(vm->vmspace); + vm->vmspace = NULL; + + for (i = 0; i < vm->maxcpus; i++) + free(vm->vcpu[i], M_VMM); + free(vm->vcpu, M_VMM); + sx_destroy(&vm->vcpus_init_lock); + sx_destroy(&vm->mem_segs_lock); + } +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); + free(vm, M_VMM); +} + +int +vm_reinit(struct vm *vm) +{ + int error; + + /* + * A virtual machine can be reset only if all vcpus are suspended. + */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + vm_cleanup(vm, false); + vm_init(vm, false); + error = 0; + } else { + error = EBUSY; + } + + return (error); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +void +vm_slock_memsegs(struct vm *vm) +{ + sx_slock(&vm->mem_segs_lock); +} + +void +vm_xlock_memsegs(struct vm *vm) +{ + sx_xlock(&vm->mem_segs_lock); +} + +void +vm_unlock_memsegs(struct vm *vm) +{ + sx_unlock(&vm->mem_segs_lock); +} + +/* + * Return 'true' if 'gpa' is allocated in the guest address space. + * + * This function is called in the context of a running vcpu which acts as + * an implicit lock on 'vm->mem_maps[]'. + */ +bool +vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa) +{ + struct vm *vm = vcpu->vm; + struct mem_map *mm; + int i; + +#ifdef INVARIANTS + int hostcpu, state; + state = vcpu_get_state(vcpu, &hostcpu); + KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, + ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); +#endif + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) + return (true); /* 'gpa' is sysmem or devmem */ + } + + return (false); +} + +int +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) +{ + struct mem_seg *seg; + vm_object_t obj; + + sx_assert(&vm->mem_segs_lock, SX_XLOCKED); + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + + if (len == 0 || (len & PAGE_MASK)) + return (EINVAL); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + if (seg->len == len && seg->sysmem == sysmem) + return (EEXIST); + else + return (EINVAL); + } + + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + if (obj == NULL) + return (ENOMEM); + + seg->len = len; + seg->object = obj; + seg->sysmem = sysmem; + return (0); +} + +int +vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + vm_object_t *objptr) +{ + struct mem_seg *seg; + + sx_assert(&vm->mem_segs_lock, SX_LOCKED); + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[ident]; + if (len) + *len = seg->len; + if (sysmem) + *sysmem = seg->sysmem; + if (objptr) + *objptr = seg->object; + return (0); +} + +void +vm_free_memseg(struct vm *vm, int ident) +{ + struct mem_seg *seg; + + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + vm_object_deallocate(seg->object); + bzero(seg, sizeof(struct mem_seg)); + } +} + +int +vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, + size_t len, int prot, 
int flags) +{ + struct mem_seg *seg; + struct mem_map *m, *map; + vm_ooffset_t last; + int i, error; + + if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) + return (EINVAL); + + if (flags & ~VM_MEMMAP_F_WIRED) + return (EINVAL); + + if (segid < 0 || segid >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[segid]; + if (seg->object == NULL) + return (EINVAL); + + last = first + len; + if (first < 0 || first >= last || last > seg->len) + return (EINVAL); + + if ((gpa | first | last) & PAGE_MASK) + return (EINVAL); + + map = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->len == 0) { + map = m; + break; + } + } + + if (map == NULL) + return (ENOSPC); + + error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, + len, 0, VMFS_NO_SPACE, prot, prot, 0); + if (error != KERN_SUCCESS) + return (EFAULT); + + vm_object_reference(seg->object); + + if (flags & VM_MEMMAP_F_WIRED) { + error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (error != KERN_SUCCESS) { + vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); + return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM : + EFAULT); + } + } + + map->gpa = gpa; + map->len = len; + map->segoff = first; + map->segid = segid; + map->prot = prot; + map->flags = flags; + return (0); +} + +int +vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + struct mem_map *m; + int i; + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->gpa == gpa && m->len == len) { + vm_free_memmap(vm, i); + return (0); + } + } + + return (EINVAL); +} + +int +vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) +{ + struct mem_map *mm, *mmnext; + int i; + + mmnext = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len == 0 || mm->gpa < *gpa) + continue; + if (mmnext == NULL || mm->gpa < mmnext->gpa) + mmnext = mm; + } + + if (mmnext != NULL) { + *gpa = mmnext->gpa; + if (segid) + *segid = mmnext->segid; + if (segoff) + *segoff = mmnext->segoff; + if (len) + *len = mmnext->len; + if (prot) + *prot = mmnext->prot; + if (flags) + *flags = mmnext->flags; + return (0); + } else { + return (ENOENT); + } +} + +static void +vm_free_memmap(struct vm *vm, int ident) +{ + struct mem_map *mm; + int error __diagused; + + mm = &vm->mem_maps[ident]; + if (mm->len) { + error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, + mm->gpa + mm->len); + KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", + __func__, error)); + bzero(mm, sizeof(struct mem_map)); + } +} + +static __inline bool +sysmem_mapping(struct vm *vm, struct mem_map *mm) +{ + + if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) + return (true); + else + return (false); +} + +vm_paddr_t +vmm_sysmem_maxaddr(struct vm *vm) +{ + struct mem_map *mm; + vm_paddr_t maxaddr; + int i; + + maxaddr = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm)) { + if (maxaddr < mm->gpa + mm->len) + maxaddr = mm->gpa + mm->len; + } + } + return (maxaddr); +} + +int +vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault) +{ + + vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault); + return (0); +} + +static int +vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + *rval = 0; + return (0); +} + +static int +vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg) +{ + *rval = 
*(uint64_t *)arg; + return (0); +} + +static int +vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg) +{ + return (0); +} + +static const struct vmm_special_reg vmm_special_regs[] = { +#define SPECIAL_REG(_reg, _read, _write) \ + { \ + .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \ + ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ + ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ + ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ + ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \ + .esr_mask = ISS_MSR_REG_MASK, \ + .reg_read = (_read), \ + .reg_write = (_write), \ + .arg = NULL, \ + } +#define ID_SPECIAL_REG(_reg, _name) \ + { \ + .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \ + ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ + ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ + ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ + ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \ + .esr_mask = ISS_MSR_REG_MASK, \ + .reg_read = vmm_reg_read_arg, \ + .reg_write = vmm_reg_wi, \ + .arg = &(vmm_arch_regs._name), \ + } + + /* ID registers */ + ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0), + ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0), + ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0), + ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0), + ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1), + + /* + * All other ID registers are read as zero. + * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space. + */ + { + .esr_iss = (3 << ISS_MSR_OP0_SHIFT) | + (0 << ISS_MSR_OP1_SHIFT) | + (0 << ISS_MSR_CRn_SHIFT) | + (0 << ISS_MSR_CRm_SHIFT), + .esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK | + ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT), + .reg_read = vmm_reg_raz, + .reg_write = vmm_reg_wi, + .arg = NULL, + }, + + /* Counter physical registers */ + SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write), + SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read, + vtimer_phys_cval_write), + SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read, + vtimer_phys_tval_write), + SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write), +#undef SPECIAL_REG +}; + +void +vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask, + reg_read_t reg_read, reg_write_t reg_write, void *arg) +{ + int i; + + for (i = 0; i < nitems(vm->special_reg); i++) { + if (vm->special_reg[i].esr_iss == 0 && + vm->special_reg[i].esr_mask == 0) { + vm->special_reg[i].esr_iss = iss; + vm->special_reg[i].esr_mask = mask; + vm->special_reg[i].reg_read = reg_read; + vm->special_reg[i].reg_write = reg_write; + vm->special_reg[i].arg = arg; + return; + } + } + + panic("%s: No free special register slot", __func__); +} + +void +vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask) +{ + int i; + + for (i = 0; i < nitems(vm->special_reg); i++) { + if (vm->special_reg[i].esr_iss == iss && + vm->special_reg[i].esr_mask == mask) { + memset(&vm->special_reg[i], 0, + sizeof(vm->special_reg[i])); + return; + } + } + + panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss, + mask); +} + +static int +vm_handle_reg_emul(struct vcpu *vcpu, bool *retu) +{ + struct vm *vm; + struct vm_exit *vme; + struct vre *vre; + int i, rv; + + vm = vcpu->vm; + vme = &vcpu->exitinfo; + vre = &vme->u.reg_emul.vre; + + for (i = 0; i < nitems(vm->special_reg); i++) { + if (vm->special_reg[i].esr_iss == 0 && + vm->special_reg[i].esr_mask == 0) + continue; + + if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) == + vm->special_reg[i].esr_iss) { + rv = vmm_emulate_register(vcpu, vre, + vm->special_reg[i].reg_read, + vm->special_reg[i].reg_write, + 
vm->special_reg[i].arg); + if (rv == 0) { + *retu = false; + } + return (rv); + } + } + for (i = 0; i < nitems(vmm_special_regs); i++) { + if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) == + vmm_special_regs[i].esr_iss) { + rv = vmm_emulate_register(vcpu, vre, + vmm_special_regs[i].reg_read, + vmm_special_regs[i].reg_write, + vmm_special_regs[i].arg); + if (rv == 0) { + *retu = false; + } + return (rv); + } + } + + + *retu = true; + return (0); +} + +void +vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size, + mem_region_read_t mmio_read, mem_region_write_t mmio_write) +{ + int i; + + for (i = 0; i < nitems(vm->mmio_region); i++) { + if (vm->mmio_region[i].start == 0 && + vm->mmio_region[i].end == 0) { + vm->mmio_region[i].start = start; + vm->mmio_region[i].end = start + size; + vm->mmio_region[i].read = mmio_read; + vm->mmio_region[i].write = mmio_write; + return; + } + } + + panic("%s: No free MMIO region", __func__); +} + +void +vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size) +{ + int i; + + for (i = 0; i < nitems(vm->mmio_region); i++) { + if (vm->mmio_region[i].start == start && + vm->mmio_region[i].end == start + size) { + memset(&vm->mmio_region[i], 0, + sizeof(vm->mmio_region[i])); + return; + } + } + + panic("%s: Invalid MMIO region: %lx - %lx", __func__, start, + start + size); +} + +static int +vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) +{ + struct vm *vm; + struct vm_exit *vme; + struct vie *vie; + struct hyp *hyp; + uint64_t fault_ipa; + struct vm_guest_paging *paging; + struct vmm_mmio_region *vmr; + int error, i; + + vm = vcpu->vm; + hyp = vm->cookie; + if (!hyp->vgic_attached) + goto out_user; + + vme = &vcpu->exitinfo; + vie = &vme->u.inst_emul.vie; + paging = &vme->u.inst_emul.paging; + + fault_ipa = vme->u.inst_emul.gpa; + + vmr = NULL; + for (i = 0; i < nitems(vm->mmio_region); i++) { + if (vm->mmio_region[i].start <= fault_ipa && + vm->mmio_region[i].end > fault_ipa) { + vmr = &vm->mmio_region[i]; + break; + } + } + if (vmr == NULL) + goto out_user; + + error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging, + vmr->read, vmr->write, retu); + return (error); + +out_user: + *retu = true; + return (0); +} + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + int i; + + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { + VM_CTR2(vm, "virtual machine already suspended %d/%d", + vm->suspend, how); + return (EALREADY); + } + + VM_CTR1(vm, "virtual machine successfully suspended %d", how); + + /* + * Notify all active vcpus that they are now suspended. 
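+	 * vcpu_notify_event() either wakes a sleeping vcpu thread or sends
+	 * an IPI to the host cpu a running vcpu is on, so every vcpu
+	 * observes vm->suspend and leaves guest context promptly.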
+ */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm_vcpu(vm, i)); + } + + return (0); +} + +void +vm_exit_suspended(struct vcpu *vcpu, uint64_t pc) +{ + struct vm *vm = vcpu->vm; + struct vm_exit *vmexit; + + KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, + ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); + + vmexit = vm_exitinfo(vcpu); + vmexit->pc = pc; + vmexit->inst_length = 4; + vmexit->exitcode = VM_EXITCODE_SUSPENDED; + vmexit->u.suspended.how = vm->suspend; +} + +void +vm_exit_debug(struct vcpu *vcpu, uint64_t pc) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vcpu); + vmexit->pc = pc; + vmexit->inst_length = 4; + vmexit->exitcode = VM_EXITCODE_DEBUG; +} + +int +vm_activate_cpu(struct vcpu *vcpu) +{ + struct vm *vm = vcpu->vm; + + if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) + return (EBUSY); + + CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); + return (0); + +} + +int +vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) +{ + if (vcpu == NULL) { + vm->debug_cpus = vm->active_cpus; + for (int i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm_vcpu(vm, i)); + } + } else { + if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) + return (EINVAL); + + CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); + vcpu_notify_event(vcpu); + } + return (0); +} + +int +vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) +{ + + if (vcpu == NULL) { + CPU_ZERO(&vm->debug_cpus); + } else { + if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) + return (EINVAL); + + CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); + } + return (0); +} + +int +vcpu_debugged(struct vcpu *vcpu) +{ + + return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +cpuset_t +vm_debug_cpus(struct vm *vm) +{ + + return (vm->debug_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + + +void * +vcpu_stats(struct vcpu *vcpu) +{ + + return (vcpu->stats); +} + +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. + */ +static void +vcpu_notify_event_locked(struct vcpu *vcpu) +{ + int hostcpu; + + hostcpu = vcpu->hostcpu; + if (vcpu->state == VCPU_RUNNING) { + KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); + if (hostcpu != curcpu) { + ipi_cpu(hostcpu, vmm_ipinum); + } else { + /* + * If the 'vcpu' is running on 'curcpu' then it must + * be sending a notification to itself (e.g. SELF_IPI). + * The pending event will be picked up when the vcpu + * transitions back to guest context. 
+ */ + } + } else { + KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " + "with hostcpu %d", vcpu->state, hostcpu)); + if (vcpu->state == VCPU_SLEEPING) + wakeup_one(vcpu); + } +} + +void +vcpu_notify_event(struct vcpu *vcpu) +{ + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu); + vcpu_unlock(vcpu); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + + /* flush host state to the pcb */ + vfp_save_state(curthread, curthread->td_pcb); + /* Ensure the VFP state will be re-loaded when exiting the guest */ + PCPU_SET(fpcurthread, NULL); + + /* restore guest FPU state */ + vfp_enable(); + vfp_restore(vcpu->guestfpu); + + /* + * The FPU is now "dirty" with the guest's state so turn on emulation + * to trap any access to the FPU by the host. + */ + vfp_disable(); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) != + CPACR_FPEN_TRAP_ALL1) + panic("VFP not enabled in host!"); + + /* save guest FPU state */ + vfp_enable(); + vfp_store(vcpu->guestfpu); + vfp_disable(); + + KASSERT(PCPU_GET(fpcurthread) == NULL, + ("%s: fpcurthread set with guest registers", __func__)); +} +static int +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, + bool from_idle) +{ + int error; + + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. + */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) { + vcpu_notify_event_locked(vcpu); + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); + } + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error) + return (EBUSY); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE) + wakeup(&vcpu->state); + + return (0); +} + +static void +vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) + panic("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) + panic("Error %d setting state to %d", error, newstate); +} + +int +vm_get_capability(struct vcpu *vcpu, int type, int *retval) +{ + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (vmmops_getcap(vcpu->cookie, type, retval)); +} + +int +vm_set_capability(struct vcpu *vcpu, int type, int val) +{ + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (vmmops_setcap(vcpu->cookie, type, val)); +} + +struct vm * +vcpu_vm(struct vcpu *vcpu) +{ + return (vcpu->vm); +} + +int 
+vcpu_vcpuid(struct vcpu *vcpu) +{ + return (vcpu->vcpuid); +} + +void * +vcpu_get_cookie(struct vcpu *vcpu) +{ + return (vcpu->cookie); +} + +struct vcpu * +vm_vcpu(struct vm *vm, int vcpuid) +{ + return (vm->vcpu[vcpuid]); +} + +int +vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) +{ + int error; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vcpu, newstate, from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vcpu *vcpu, int *hostcpu) +{ + enum vcpu_state state; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +static void * +_vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ + int i, count, pageoff; + struct mem_map *mm; + vm_page_t m; + + pageoff = gpa & PAGE_MASK; + if (len > PAGE_SIZE - pageoff) + panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + count = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && + gpa < mm->gpa + mm->len) { + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + break; + } + } + + if (count == 1) { + *cookie = m; + return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); + } else { + *cookie = NULL; + return (NULL); + } +} + +void * +vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ +#ifdef INVARIANTS + /* + * The current vcpu should be frozen to ensure 'vm_memmap[]' + * stability. + */ + int state = vcpu_get_state(vcpu, NULL); + KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", + __func__, state)); +#endif + return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie)); +} + +void * +vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ + sx_assert(&vm->mem_segs_lock, SX_LOCKED); + return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie)); +} + +void +vm_gpa_release(void *cookie) +{ + vm_page_t m = cookie; + + vm_page_unwire(m, PQ_ACTIVE); +} + +int +vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) +{ + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (vmmops_getreg(vcpu->cookie, reg, retval)); +} + +int +vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) +{ + int error; + + if (reg >= VM_REG_LAST) + return (EINVAL); + error = vmmops_setreg(vcpu->cookie, reg, val); + if (error || reg != VM_REG_GUEST_PC) + return (error); + + vcpu->nextpc = val; + + return (0); +} + +void * +vm_get_cookie(struct vm *vm) +{ + return (vm->cookie); +} + +int +vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far) +{ + return (vmmops_exception(vcpu->cookie, esr, far)); +} + +int +vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr) +{ + return (vgic_attach_to_vm(vm->cookie, descr)); +} + +int +vm_assert_irq(struct vm *vm, uint32_t irq) +{ + return (vgic_inject_irq(vm->cookie, -1, irq, true)); +} + +int +vm_deassert_irq(struct vm *vm, uint32_t irq) +{ + return (vgic_inject_irq(vm->cookie, -1, irq, false)); +} + +int +vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, + int func) +{ + /* TODO: Should we raise an SError? 
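+	 * The bus/slot/func arguments are currently unused; the MSI data
+	 * and address are handed straight to the vgic code.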
*/ + return (vgic_inject_msi(vm->cookie, msg, addr)); +} + +static int +vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) +{ + struct hypctx *hypctx; + int i; + + hypctx = vcpu_get_cookie(vcpu); + + if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0) + return (1); + + vme->exitcode = VM_EXITCODE_SMCCC; + vme->u.smccc_call.func_id = hypctx->tf.tf_x[0]; + for (i = 0; i < nitems(vme->u.smccc_call.args); i++) + vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1]; + + *retu = true; + return (0); +} + +static int +vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) +{ + vcpu_lock(vcpu); + while (1) { + if (vgic_has_pending_irq(vcpu->cookie)) + break; + + if (vcpu_should_yield(vcpu)) + break; + + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz); + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + } + vcpu_unlock(vcpu); + + *retu = false; + return (0); +} + +static int +vm_handle_paging(struct vcpu *vcpu, bool *retu) +{ + struct vm *vm = vcpu->vm; + struct vm_exit *vme; + struct vm_map *map; + uint64_t addr, esr; + pmap_t pmap; + int ftype, rv; + + vme = &vcpu->exitinfo; + + pmap = vmspace_pmap(vcpu->vm->vmspace); + addr = vme->u.paging.gpa; + esr = vme->u.paging.esr; + + /* The page exists, but the page table needs to be updated. */ + if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS) + return (0); + + switch (ESR_ELx_EXCEPTION(esr)) { + case EXCP_INSN_ABORT_L: + case EXCP_DATA_ABORT_L: + ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE; + break; + default: + panic("%s: Invalid exception (esr = %lx)", __func__, esr); + } + + map = &vm->vmspace->vm_map; + rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); + if (rv != KERN_SUCCESS) + return (EFAULT); + + return (0); +} + +int +vm_run(struct vcpu *vcpu) +{ + struct vm *vm = vcpu->vm; + struct vm_eventinfo evinfo; + int error, vcpuid; + struct vm_exit *vme; + bool retu; + pmap_t pmap; + + vcpuid = vcpu->vcpuid; + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) + return (EINVAL); + + pmap = vmspace_pmap(vm->vmspace); + vme = &vcpu->exitinfo; + evinfo.rptr = NULL; + evinfo.sptr = &vm->suspend; + evinfo.iptr = NULL; +restart: + critical_enter(); + + restore_guest_fpustate(vcpu); + + vcpu_require_state(vcpu, VCPU_RUNNING); + error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo); + vcpu_require_state(vcpu, VCPU_FROZEN); + + save_guest_fpustate(vcpu); + + critical_exit(); + + if (error == 0) { + retu = false; + switch (vme->exitcode) { + case VM_EXITCODE_INST_EMUL: + vcpu->nextpc = vme->pc + vme->inst_length; + error = vm_handle_inst_emul(vcpu, &retu); + break; + + case VM_EXITCODE_REG_EMUL: + vcpu->nextpc = vme->pc + vme->inst_length; + error = vm_handle_reg_emul(vcpu, &retu); + break; + + case VM_EXITCODE_HVC: + /* + * The HVC instruction saves the address for the + * next instruction as the return address. + */ + vcpu->nextpc = vme->pc; + /* + * The PSCI call can change the exit information in the + * case of suspend/reset/poweroff/cpu off/cpu on. 
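+			 * vm_handle_smccc_call() only copies the function ID
+			 * and arguments out of the trap frame and sets retu,
+			 * so the call is completed after returning to
+			 * userspace.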
+ */ + error = vm_handle_smccc_call(vcpu, vme, &retu); + break; + + case VM_EXITCODE_WFI: + vcpu->nextpc = vme->pc + vme->inst_length; + error = vm_handle_wfi(vcpu, vme, &retu); + break; + + case VM_EXITCODE_PAGING: + vcpu->nextpc = vme->pc; + error = vm_handle_paging(vcpu, &retu); + break; + + default: + /* Handle in userland */ + vcpu->nextpc = vme->pc; + retu = true; + break; + } + } + + if (error == 0 && retu == false) + goto restart; + + return (error); +} diff --git a/sys/arm64/vmm/vmm_arm64.c b/sys/arm64/vmm/vmm_arm64.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_arm64.c @@ -0,0 +1,1308 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mmu.h" +#include "arm64.h" +#include "hyp.h" +#include "reset.h" +#include "io/vgic.h" +#include "io/vgic_v3.h" +#include "io/vtimer.h" +#include "vmm_stat.h" + +#define HANDLED 1 +#define UNHANDLED 0 + +/* Number of bits in an EL2 virtual address */ +#define EL2_VIRT_BITS 48 +CTASSERT((1ul << EL2_VIRT_BITS) >= HYP_VM_MAX_ADDRESS); + +/* TODO: Move the host hypctx off the stack */ +#define VMM_STACK_PAGES 4 +#define VMM_STACK_SIZE (VMM_STACK_PAGES * PAGE_SIZE) + +static int vmm_pmap_levels, vmm_virt_bits, vmm_max_ipa_bits; + +/* Register values passed to arm_setup_vectors to set in the hypervisor */ +struct vmm_init_regs { + uint64_t tcr_el2; + uint64_t vtcr_el2; +}; + +MALLOC_DEFINE(M_HYP, "ARM VMM HYP", "ARM VMM HYP"); + +extern char hyp_init_vectors[]; +extern char hyp_vectors[]; +extern char hyp_stub_vectors[]; + +static vm_paddr_t hyp_code_base; +static size_t hyp_code_len; + +static char *stack[MAXCPU]; +static vm_offset_t stack_hyp_va[MAXCPU]; + +static vmem_t *el2_mem_alloc; + +static void arm_setup_vectors(void *arg); +static void vmm_pmap_clean_stage2_tlbi(void); +static void vmm_pmap_invalidate_range(uint64_t, vm_offset_t, vm_offset_t, bool); +static void vmm_pmap_invalidate_all(uint64_t); + +DPCPU_DEFINE_STATIC(struct hypctx *, vcpu); + +static inline void +arm64_set_active_vcpu(struct hypctx *hypctx) +{ + DPCPU_SET(vcpu, hypctx); +} + +struct hypctx * +arm64_get_active_vcpu(void) +{ + return (DPCPU_GET(vcpu)); +} + +static void +arm_setup_vectors(void *arg) +{ + struct vmm_init_regs *el2_regs; + uintptr_t stack_top; + uint32_t sctlr_el2; + register_t daif; + + el2_regs = arg; + arm64_set_active_vcpu(NULL); + + daif = intr_disable(); + + /* + * Install the temporary vectors which will be responsible for + * initializing the VMM when we next trap into EL2. + * + * x0: the exception vector table responsible for hypervisor + * initialization on the next call. + */ + vmm_call_hyp(vtophys(&vmm_hyp_code)); + + /* Create and map the hypervisor stack */ + stack_top = stack_hyp_va[PCPU_GET(cpuid)] + VMM_STACK_SIZE; + + /* + * Configure the system control register for EL2: + * + * SCTLR_EL2_M: MMU on + * SCTLR_EL2_C: Data cacheability not affected + * SCTLR_EL2_I: Instruction cacheability not affected + * SCTLR_EL2_A: Instruction alignment check + * SCTLR_EL2_SA: Stack pointer alignment check + * SCTLR_EL2_WXN: Treat writable memory as execute never + * ~SCTLR_EL2_EE: Data accesses are little-endian + */ + sctlr_el2 = SCTLR_EL2_RES1; + sctlr_el2 |= SCTLR_EL2_M | SCTLR_EL2_C | SCTLR_EL2_I; + sctlr_el2 |= SCTLR_EL2_A | SCTLR_EL2_SA; + sctlr_el2 |= SCTLR_EL2_WXN; + sctlr_el2 &= ~SCTLR_EL2_EE; + + /* Special call to initialize EL2 */ + vmm_call_hyp(vmmpmap_to_ttbr0(), stack_top, el2_regs->tcr_el2, + sctlr_el2, el2_regs->vtcr_el2); + + intr_restore(daif); +} + +static void +arm_teardown_vectors(void *arg) +{ + register_t daif; + + /* + * vmm_cleanup() will disable the MMU. For the next few instructions, + * before the hardware disables the MMU, one of the following is + * possible: + * + * a. The instruction addresses are fetched with the MMU disabled, + * and they must represent the actual physical addresses. 
This will work + * because we call the vmm_cleanup() function by its physical address. + * + * b. The instruction addresses are fetched using the old translation + * tables. This will work because we have an identity mapping in place + * in the translation tables and vmm_cleanup() is called by its physical + * address. + */ + daif = intr_disable(); + /* TODO: Invalidate the cache */ + vmm_call_hyp(HYP_CLEANUP, vtophys(hyp_stub_vectors)); + intr_restore(daif); + + arm64_set_active_vcpu(NULL); +} + +static uint64_t +vmm_vtcr_el2_sl(u_int levels) +{ +#if PAGE_SIZE == PAGE_SIZE_4K + switch (levels) { + case 2: + return (VTCR_EL2_SL0_4K_LVL2); + case 3: + return (VTCR_EL2_SL0_4K_LVL1); + case 4: + return (VTCR_EL2_SL0_4K_LVL0); + default: + panic("%s: Invalid number of page table levels %u", __func__, + levels); + } +#elif PAGE_SIZE == PAGE_SIZE_16K + switch (levels) { + case 2: + return (VTCR_EL2_SL0_16K_LVL2); + case 3: + return (VTCR_EL2_SL0_16K_LVL1); + case 4: + return (VTCR_EL2_SL0_16K_LVL0); + default: + panic("%s: Invalid number of page table levels %u", __func__, + levels); + } +#else +#error Unsupported page size +#endif +} + +int +vmmops_modinit(int ipinum) +{ + struct vmm_init_regs el2_regs; + vm_offset_t next_hyp_va; + vm_paddr_t vmm_base; + uint64_t id_aa64mmfr0_el1, pa_range_bits, pa_range_field; + uint64_t cnthctl_el2; + register_t daif; + int cpu, i; + bool rv __diagused; + + if (!virt_enabled()) { + printf( + "vmm: Processor doesn't have support for virtualization\n"); + return (ENXIO); + } + + if (!vgic_present()) { + printf("vmm: No vgic found\n"); + return (ENODEV); + } + + if (!get_kernel_reg(ID_AA64MMFR0_EL1, &id_aa64mmfr0_el1)) { + printf("vmm: Unable to read ID_AA64MMFR0_EL1\n"); + return (ENXIO); + } + pa_range_field = ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1); + /* + * Use 3 levels to give us up to 39 bits with 4k pages, or + * 47 bits with 16k pages. + */ + /* TODO: Check the number of levels for 64k pages */ + vmm_pmap_levels = 3; + switch (pa_range_field) { + case ID_AA64MMFR0_PARange_4G: + printf("vmm: Not enough physical address bits\n"); + return (ENXIO); + case ID_AA64MMFR0_PARange_64G: + vmm_virt_bits = 36; +#if PAGE_SIZE == PAGE_SIZE_16K + /* TODO: Test */ + vmm_pmap_levels = 2; +#endif + break; + default: + vmm_virt_bits = 39; + break; + } + pa_range_bits = pa_range_field >> ID_AA64MMFR0_PARange_SHIFT; + + /* Initialise the EL2 MMU */ + if (!vmmpmap_init()) { + printf("vmm: Failed to init the EL2 MMU\n"); + return (ENOMEM); + } + + /* Set up the stage 2 pmap callbacks */ + MPASS(pmap_clean_stage2_tlbi == NULL); + pmap_clean_stage2_tlbi = vmm_pmap_clean_stage2_tlbi; + pmap_stage2_invalidate_range = vmm_pmap_invalidate_range; + pmap_stage2_invalidate_all = vmm_pmap_invalidate_all; + + /* + * Create an allocator for the virtual address space used by EL2. + * EL2 code is identity-mapped; the allocator is used to find space for + * VM structures. + */ + el2_mem_alloc = vmem_create("VMM EL2", 0, 0, PAGE_SIZE, 0, M_WAITOK); + + /* Create the mappings for the hypervisor translation table. 
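
The PARange switch above only needs to tell the 4G and 64G encodings apart, since every larger encoding is covered by 39 intermediate-address bits and three translation levels. For reference, the full ID_AA64MMFR0_EL1.PARange field maps to physical address sizes as in this self-contained sketch; the values are architectural (Arm ARM) and the helper is illustrative, not part of the patch.

/* ID_AA64MMFR0_EL1.PARange field value -> implemented physical address bits. */
static int
parange_to_bits(unsigned int parange)
{
        static const int bits[] = { 32, 36, 40, 42, 44, 48, 52 };

        if (parange >= sizeof(bits) / sizeof(bits[0]))
                return (-1);            /* reserved encoding */
        return (bits[parange]);
}
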
*/ + hyp_code_len = round_page(&vmm_hyp_code_end - &vmm_hyp_code); + + /* We need an physical identity mapping for when we activate the MMU */ + hyp_code_base = vmm_base = vtophys(&vmm_hyp_code); + rv = vmmpmap_enter(vmm_base, hyp_code_len, vmm_base, + VM_PROT_READ | VM_PROT_EXECUTE); + MPASS(rv); + + next_hyp_va = roundup2(vmm_base + hyp_code_len, L2_SIZE); + + /* Create a per-CPU hypervisor stack */ + CPU_FOREACH(cpu) { + stack[cpu] = malloc(VMM_STACK_SIZE, M_HYP, M_WAITOK | M_ZERO); + stack_hyp_va[cpu] = next_hyp_va; + + for (i = 0; i < VMM_STACK_PAGES; i++) { + rv = vmmpmap_enter(stack_hyp_va[cpu] + ptoa(i), + PAGE_SIZE, vtophys(stack[cpu] + ptoa(i)), + VM_PROT_READ | VM_PROT_WRITE); + MPASS(rv); + } + next_hyp_va += L2_SIZE; + } + + el2_regs.tcr_el2 = TCR_EL2_RES1; + el2_regs.tcr_el2 |= min(pa_range_bits << TCR_EL2_PS_SHIFT, + TCR_EL2_PS_52BITS); + el2_regs.tcr_el2 |= TCR_EL2_T0SZ(64 - EL2_VIRT_BITS); + el2_regs.tcr_el2 |= TCR_EL2_IRGN0_WBWA | TCR_EL2_ORGN0_WBWA; +#if PAGE_SIZE == PAGE_SIZE_4K + el2_regs.tcr_el2 |= TCR_EL2_TG0_4K; +#elif PAGE_SIZE == PAGE_SIZE_16K + el2_regs.tcr_el2 |= TCR_EL2_TG0_16K; +#else +#error Unsupported page size +#endif +#ifdef SMP + el2_regs.tcr_el2 |= TCR_EL2_SH0_IS; +#endif + + switch (el2_regs.tcr_el2 & TCR_EL2_PS_MASK) { + case TCR_EL2_PS_32BITS: + vmm_max_ipa_bits = 32; + break; + case TCR_EL2_PS_36BITS: + vmm_max_ipa_bits = 36; + break; + case TCR_EL2_PS_40BITS: + vmm_max_ipa_bits = 40; + break; + case TCR_EL2_PS_42BITS: + vmm_max_ipa_bits = 42; + break; + case TCR_EL2_PS_44BITS: + vmm_max_ipa_bits = 44; + break; + case TCR_EL2_PS_48BITS: + vmm_max_ipa_bits = 48; + break; + case TCR_EL2_PS_52BITS: + default: + vmm_max_ipa_bits = 52; + break; + } + + /* + * Configure the Stage 2 translation control register: + * + * VTCR_IRGN0_WBWA: Translation table walks access inner cacheable + * normal memory + * VTCR_ORGN0_WBWA: Translation table walks access outer cacheable + * normal memory + * VTCR_EL2_TG0_4K/16K: Stage 2 uses the same page size as the kernel + * VTCR_EL2_SL0_4K_LVL1: Stage 2 uses concatenated level 1 tables + * VTCR_EL2_SH0_IS: Memory associated with Stage 2 walks is inner + * shareable + */ + el2_regs.vtcr_el2 = VTCR_EL2_RES1; + el2_regs.vtcr_el2 |= + min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_48BIT); + el2_regs.vtcr_el2 |= VTCR_EL2_IRGN0_WBWA | VTCR_EL2_ORGN0_WBWA; + el2_regs.vtcr_el2 |= VTCR_EL2_T0SZ(64 - vmm_virt_bits); + el2_regs.vtcr_el2 |= vmm_vtcr_el2_sl(vmm_pmap_levels); +#if PAGE_SIZE == PAGE_SIZE_4K + el2_regs.vtcr_el2 |= VTCR_EL2_TG0_4K; +#elif PAGE_SIZE == PAGE_SIZE_16K + el2_regs.vtcr_el2 |= VTCR_EL2_TG0_16K; +#else +#error Unsupported page size +#endif +#ifdef SMP + el2_regs.vtcr_el2 |= VTCR_EL2_SH0_IS; +#endif + + smp_rendezvous(NULL, arm_setup_vectors, NULL, &el2_regs); + + /* Add memory to the vmem allocator (checking there is space) */ + if (vmm_base > (L2_SIZE + PAGE_SIZE)) { + /* + * Ensure there is an L2 block before the vmm code to check + * for buffer overflows on earlier data. Include the PAGE_SIZE + * of the minimum we can allocate. + */ + vmm_base -= L2_SIZE + PAGE_SIZE; + vmm_base = rounddown2(vmm_base, L2_SIZE); + + /* + * Check there is memory before the vmm code to add. + * + * Reserve the L2 block at address 0 so NULL dereference will + * raise an exception. + */ + if (vmm_base > L2_SIZE) + vmem_add(el2_mem_alloc, L2_SIZE, vmm_base - L2_SIZE, + M_WAITOK); + } + + /* + * Add the memory after the stacks. 
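
The starting level programmed into VTCR_EL2 in this function follows directly from the geometry: with 4K pages each level resolves granule_shift - 3 = 9 bits of input address, so the 39 IA bits chosen above need howmany(39 - 12, 9) = 3 levels, which is why vmm_vtcr_el2_sl(3) selects VTCR_EL2_SL0_4K_LVL1. A small sketch of that arithmetic, using the same 8-byte-PTE convention as vmmops_gla2gpa() later in this file; the helper itself is illustrative only.

/* Lookup levels needed for a stage 2 input-address size (sketch only). */
static int
stage2_levels(int ia_bits, int granule_shift)
{
        int bits_per_level = granule_shift - 3; /* log2(PTEs per table page) */

        return ((ia_bits - granule_shift + bits_per_level - 1) / bits_per_level);
}
/* stage2_levels(39, 12) == 3; stage2_levels(36, 14) == 2, as set up above. */
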
There is most of an L2 block + * between the last stack and the first allocation so this should + * be safe without adding more padding. + */ + if (next_hyp_va < HYP_VM_MAX_ADDRESS - PAGE_SIZE) + vmem_add(el2_mem_alloc, next_hyp_va, + HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK); + + daif = intr_disable(); + cnthctl_el2 = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_CNTHCTL); + intr_restore(daif); + + vgic_init(); + vtimer_init(cnthctl_el2); + + return (0); +} + +int +vmmops_modcleanup(void) +{ + int cpu; + + smp_rendezvous(NULL, arm_teardown_vectors, NULL, NULL); + + CPU_FOREACH(cpu) { + vmmpmap_remove(stack_hyp_va[cpu], VMM_STACK_PAGES * PAGE_SIZE, + false); + } + + vmmpmap_remove(hyp_code_base, hyp_code_len, false); + + vtimer_cleanup(); + + vmmpmap_fini(); + + CPU_FOREACH(cpu) + free(stack[cpu], M_HYP); + + pmap_clean_stage2_tlbi = NULL; + pmap_stage2_invalidate_range = NULL; + pmap_stage2_invalidate_all = NULL; + + return (0); +} + +static vm_size_t +el2_hyp_size(struct vm *vm) +{ + return (round_page(sizeof(struct hyp) + + sizeof(struct hypctx *) * vm_get_maxcpus(vm))); +} + +static vm_size_t +el2_hypctx_size(void) +{ + return (round_page(sizeof(struct hypctx))); +} + +static vm_offset_t +el2_map_enter(vm_offset_t data, vm_size_t size, vm_prot_t prot) +{ + vmem_addr_t addr; + int err __diagused; + bool rv __diagused; + + err = vmem_alloc(el2_mem_alloc, size, M_NEXTFIT | M_WAITOK, &addr); + MPASS(err == 0); + rv = vmmpmap_enter(addr, size, vtophys(data), prot); + MPASS(rv); + + return (addr); +} + +void * +vmmops_init(struct vm *vm, pmap_t pmap) +{ + struct hyp *hyp; + vm_size_t size; + + size = el2_hyp_size(vm); + hyp = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO); + + hyp->vm = vm; + hyp->vgic_attached = false; + + vtimer_vminit(hyp); + vgic_vminit(hyp); + + hyp->el2_addr = el2_map_enter((vm_offset_t)hyp, size, + VM_PROT_READ | VM_PROT_WRITE); + + return (hyp); +} + +void * +vmmops_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid) +{ + struct hyp *hyp = vmi; + struct hypctx *hypctx; + vm_size_t size; + + size = el2_hypctx_size(); + hypctx = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO); + + KASSERT(vcpuid >= 0 && vcpuid < vm_get_maxcpus(hyp->vm), + ("%s: Invalid vcpuid %d", __func__, vcpuid)); + hyp->ctx[vcpuid] = hypctx; + + hypctx->hyp = hyp; + hypctx->vcpu = vcpu1; + + reset_vm_el01_regs(hypctx); + reset_vm_el2_regs(hypctx); + + vtimer_cpuinit(hypctx); + vgic_cpuinit(hypctx); + + hypctx->el2_addr = el2_map_enter((vm_offset_t)hypctx, size, + VM_PROT_READ | VM_PROT_WRITE); + + return (hypctx); +} + +static int +arm_vmm_pinit(pmap_t pmap) +{ + + pmap_pinit_stage(pmap, PM_STAGE2, vmm_pmap_levels); + return (1); +} + +struct vmspace * +vmmops_vmspace_alloc(vm_offset_t min, vm_offset_t max) +{ + return (vmspace_alloc(min, max, arm_vmm_pinit)); +} + +void +vmmops_vmspace_free(struct vmspace *vmspace) +{ + + pmap_remove_pages(vmspace_pmap(vmspace)); + vmspace_free(vmspace); +} + +static void +vmm_pmap_clean_stage2_tlbi(void) +{ + vmm_call_hyp(HYP_CLEAN_S2_TLBI); +} + +static void +vmm_pmap_invalidate_range(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, + bool final_only) +{ + MPASS(eva > sva); + vmm_call_hyp(HYP_S2_TLBI_RANGE, vttbr, sva, eva, final_only); +} + +static void +vmm_pmap_invalidate_all(uint64_t vttbr) +{ + vmm_call_hyp(HYP_S2_TLBI_ALL, vttbr); +} + +static inline void +arm64_print_hyp_regs(struct vm_exit *vme) +{ + printf("esr_el2: 0x%016lx\n", vme->u.hyp.esr_el2); + printf("far_el2: 0x%016lx\n", vme->u.hyp.far_el2); + printf("hpfar_el2: 0x%016lx\n", 
vme->u.hyp.hpfar_el2); + printf("elr_el2: 0x%016lx\n", vme->pc); +} + +static void +arm64_gen_inst_emul_data(struct hypctx *hypctx, uint32_t esr_iss, + struct vm_exit *vme_ret) +{ + struct vm_guest_paging *paging; + struct vie *vie; + uint32_t esr_sas, reg_num; + + /* + * Get the page address from HPFAR_EL2. + */ + vme_ret->u.inst_emul.gpa = + HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2); + /* Bits [11:0] are the same as bits [11:0] from the virtual address. */ + vme_ret->u.inst_emul.gpa += hypctx->exit_info.far_el2 & + FAR_EL2_HPFAR_PAGE_MASK; + + esr_sas = (esr_iss & ISS_DATA_SAS_MASK) >> ISS_DATA_SAS_SHIFT; + reg_num = (esr_iss & ISS_DATA_SRT_MASK) >> ISS_DATA_SRT_SHIFT; + + vie = &vme_ret->u.inst_emul.vie; + vie->access_size = 1 << esr_sas; + vie->sign_extend = (esr_iss & ISS_DATA_SSE) ? 1 : 0; + vie->dir = (esr_iss & ISS_DATA_WnR) ? VM_DIR_WRITE : VM_DIR_READ; + vie->reg = reg_num; + + paging = &vme_ret->u.inst_emul.paging; + paging->ttbr0_addr = hypctx->ttbr0_el1 & ~(TTBR_ASID_MASK | TTBR_CnP); + paging->ttbr1_addr = hypctx->ttbr1_el1 & ~(TTBR_ASID_MASK | TTBR_CnP); + paging->tcr_el1 = hypctx->tcr_el1; + paging->tcr2_el1 = hypctx->tcr2_el1; + paging->flags = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32); + if ((hypctx->sctlr_el1 & SCTLR_M) != 0) + paging->flags |= VM_GP_MMU_ENABLED; +} + +static void +arm64_gen_reg_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret) +{ + uint32_t reg_num; + struct vre *vre; + + /* u.hyp member will be replaced by u.reg_emul */ + vre = &vme_ret->u.reg_emul.vre; + + vre->inst_syndrome = esr_iss; + /* ARMv8 Architecture Manual, p. D7-2273: 1 means read */ + vre->dir = (esr_iss & ISS_MSR_DIR) ? VM_DIR_READ : VM_DIR_WRITE; + reg_num = ISS_MSR_Rt(esr_iss); + vre->reg = reg_num; +} + +static void +raise_data_insn_abort(struct hypctx *hypctx, uint64_t far, bool dabort, int fsc) +{ + uint64_t esr; + + if ((hypctx->tf.tf_spsr & PSR_M_MASK) == PSR_M_EL0t) + esr = EXCP_INSN_ABORT_L << ESR_ELx_EC_SHIFT; + else + esr = EXCP_INSN_ABORT << ESR_ELx_EC_SHIFT; + /* Set the bit that changes from insn -> data abort */ + if (dabort) + esr |= EXCP_DATA_ABORT_L << ESR_ELx_EC_SHIFT; + /* Set the IL bit if set by hardware */ + esr |= hypctx->tf.tf_esr & ESR_ELx_IL; + + vmmops_exception(hypctx, esr | fsc, far); +} + +static int +handle_el1_sync_excp(struct hypctx *hypctx, struct vm_exit *vme_ret, + pmap_t pmap) +{ + uint64_t gpa; + uint32_t esr_ec, esr_iss; + + esr_ec = ESR_ELx_EXCEPTION(hypctx->tf.tf_esr); + esr_iss = hypctx->tf.tf_esr & ESR_ELx_ISS_MASK; + + switch (esr_ec) { + case EXCP_UNKNOWN: + vmm_stat_incr(hypctx->vcpu, VMEXIT_UNKNOWN, 1); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + case EXCP_TRAP_WFI_WFE: + if ((hypctx->tf.tf_esr & 0x3) == 0) { /* WFI */ + vmm_stat_incr(hypctx->vcpu, VMEXIT_WFI, 1); + vme_ret->exitcode = VM_EXITCODE_WFI; + } else { + vmm_stat_incr(hypctx->vcpu, VMEXIT_WFE, 1); + vme_ret->exitcode = VM_EXITCODE_HYP; + } + break; + case EXCP_HVC: + vmm_stat_incr(hypctx->vcpu, VMEXIT_HVC, 1); + vme_ret->exitcode = VM_EXITCODE_HVC; + break; + case EXCP_MSR: + vmm_stat_incr(hypctx->vcpu, VMEXIT_MSR, 1); + arm64_gen_reg_emul_data(esr_iss, vme_ret); + vme_ret->exitcode = VM_EXITCODE_REG_EMUL; + break; + + case EXCP_INSN_ABORT_L: + case EXCP_DATA_ABORT_L: + vmm_stat_incr(hypctx->vcpu, esr_ec == EXCP_DATA_ABORT_L ? 
+ VMEXIT_DATA_ABORT : VMEXIT_INSN_ABORT, 1); + switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) { + case ISS_DATA_DFSC_TF_L0: + case ISS_DATA_DFSC_TF_L1: + case ISS_DATA_DFSC_TF_L2: + case ISS_DATA_DFSC_TF_L3: + case ISS_DATA_DFSC_AFF_L1: + case ISS_DATA_DFSC_AFF_L2: + case ISS_DATA_DFSC_AFF_L3: + case ISS_DATA_DFSC_PF_L1: + case ISS_DATA_DFSC_PF_L2: + case ISS_DATA_DFSC_PF_L3: + gpa = HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2); + /* Check the IPA is valid */ + if (gpa >= (1ul << vmm_max_ipa_bits)) { + raise_data_insn_abort(hypctx, + hypctx->exit_info.far_el2, + esr_ec == EXCP_DATA_ABORT_L, + ISS_DATA_DFSC_ASF_L0); + vme_ret->inst_length = 0; + return (HANDLED); + } + + if (vm_mem_allocated(hypctx->vcpu, gpa)) { + vme_ret->exitcode = VM_EXITCODE_PAGING; + vme_ret->inst_length = 0; + vme_ret->u.paging.esr = hypctx->tf.tf_esr; + vme_ret->u.paging.gpa = gpa; + } else if (esr_ec == EXCP_INSN_ABORT_L) { + /* + * Raise an external abort. Device memory is + * not executable + */ + raise_data_insn_abort(hypctx, + hypctx->exit_info.far_el2, false, + ISS_DATA_DFSC_EXT); + vme_ret->inst_length = 0; + return (HANDLED); + } else { + arm64_gen_inst_emul_data(hypctx, esr_iss, + vme_ret); + vme_ret->exitcode = VM_EXITCODE_INST_EMUL; + } + break; + default: + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + break; + + default: + vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_SYNC, 1); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + /* We don't don't do any instruction emulation here */ + return (UNHANDLED); +} + +static int +arm64_handle_world_switch(struct hypctx *hypctx, int excp_type, + struct vm_exit *vme, pmap_t pmap) +{ + int handled; + + switch (excp_type) { + case EXCP_TYPE_EL1_SYNC: + /* The exit code will be set by handle_el1_sync_excp(). */ + handled = handle_el1_sync_excp(hypctx, vme, pmap); + break; + + case EXCP_TYPE_EL1_IRQ: + case EXCP_TYPE_EL1_FIQ: + /* The host kernel will handle IRQs and FIQs. */ + vmm_stat_incr(hypctx->vcpu, + excp_type == EXCP_TYPE_EL1_IRQ ? 
VMEXIT_IRQ : VMEXIT_FIQ,1); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + + case EXCP_TYPE_EL1_ERROR: + case EXCP_TYPE_EL2_SYNC: + case EXCP_TYPE_EL2_IRQ: + case EXCP_TYPE_EL2_FIQ: + case EXCP_TYPE_EL2_ERROR: + vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_EL2, 1); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + + default: + vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED, 1); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + } + + return (handled); +} + +static void +ptp_release(void **cookie) +{ + if (*cookie != NULL) { + vm_gpa_release(*cookie); + *cookie = NULL; + } +} + +static void * +ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) +{ + void *ptr; + + ptp_release(cookie); + ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie); + return (ptr); +} + +/* log2 of the number of bytes in a page table entry */ +#define PTE_SHIFT 3 +int +vmmops_gla2gpa(void *vcpui, struct vm_guest_paging *paging, uint64_t gla, + int prot, uint64_t *gpa, int *is_fault) +{ + struct hypctx *hypctx; + void *cookie; + uint64_t mask, *ptep, pte, pte_addr; + int address_bits, granule_shift, ia_bits, levels, pte_shift, tsz; + bool is_el0; + + /* Check if the MMU is off */ + if ((paging->flags & VM_GP_MMU_ENABLED) == 0) { + *is_fault = 0; + *gpa = gla; + return (0); + } + + is_el0 = (paging->flags & PSR_M_MASK) == PSR_M_EL0t; + + if (ADDR_IS_KERNEL(gla)) { + /* If address translation is disabled raise an exception */ + if ((paging->tcr_el1 & TCR_EPD1) != 0) { + *is_fault = 1; + return (0); + } + if (is_el0 && (paging->tcr_el1 & TCR_E0PD1) != 0) { + *is_fault = 1; + return (0); + } + pte_addr = paging->ttbr1_addr; + tsz = (paging->tcr_el1 & TCR_T1SZ_MASK) >> TCR_T1SZ_SHIFT; + /* Clear the top byte if TBI is on */ + if ((paging->tcr_el1 & TCR_TBI1) != 0) + gla |= (0xfful << 56); + switch (paging->tcr_el1 & TCR_TG1_MASK) { + case TCR_TG1_4K: + granule_shift = PAGE_SHIFT_4K; + break; + case TCR_TG1_16K: + granule_shift = PAGE_SHIFT_16K; + break; + case TCR_TG1_64K: + granule_shift = PAGE_SHIFT_64K; + break; + default: + *is_fault = 1; + return (EINVAL); + } + } else { + /* If address translation is disabled raise an exception */ + if ((paging->tcr_el1 & TCR_EPD0) != 0) { + *is_fault = 1; + return (0); + } + if (is_el0 && (paging->tcr_el1 & TCR_E0PD0) != 0) { + *is_fault = 1; + return (0); + } + pte_addr = paging->ttbr0_addr; + tsz = (paging->tcr_el1 & TCR_T0SZ_MASK) >> TCR_T0SZ_SHIFT; + /* Clear the top byte if TBI is on */ + if ((paging->tcr_el1 & TCR_TBI0) != 0) + gla &= ~(0xfful << 56); + switch (paging->tcr_el1 & TCR_TG0_MASK) { + case TCR_TG0_4K: + granule_shift = PAGE_SHIFT_4K; + break; + case TCR_TG0_16K: + granule_shift = PAGE_SHIFT_16K; + break; + case TCR_TG0_64K: + granule_shift = PAGE_SHIFT_64K; + break; + default: + *is_fault = 1; + return (EINVAL); + } + } + + /* + * TODO: Support FEAT_TTST for smaller tsz values and FEAT_LPA2 + * for larger values. + */ + switch (granule_shift) { + case PAGE_SHIFT_4K: + /* + * See "Table D8-11 4KB granule, determining stage 1 initial + * lookup level" from the "Arm Architecture Reference Manual + * for A-Profile architecture" revision I.a for the minimum + * and maximum values. + */ + if (tsz < 16 || tsz > 39) { + *is_fault = 1; + return (EINVAL); + } + break; + /* TODO: Support non-4k granule */ + default: + *is_fault = 1; + return (EINVAL); + } + + /* + * Calculate the input address bits. These are 64 bit in an address + * with the top tsz bits being all 0 or all 1. 
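
As a concrete example of the quantities derived in the code that follows: with a 4K granule (granule_shift = 12) and TCR_EL1.T0SZ = 25, ia_bits = 39, address_bits = 27 and levels = 3, and each iteration of the walk consumes 9 bits of the masked virtual address. The sketch below mirrors the pte_shift/idx computation in the walk loop; it is illustrative only and assumes the same PTE_SHIFT = 3 convention.

#include <stdint.h>

/* Table index used at one iteration of the walk (levels counts down). */
static int
walk_index(uint64_t gla, int levels, int granule_shift)
{
        int bits_per_level = granule_shift - 3;
        int pte_shift = (levels - 1) * bits_per_level + granule_shift;

        return ((gla >> pte_shift) & ((1ul << bits_per_level) - 1));
}
/*
 * With granule_shift = 12 and levels counting 3, 2, 1 this selects virtual
 * address bits [38:30], [29:21] and [20:12], i.e. walk levels 1, 2 and 3.
 */
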
+ */ + ia_bits = 64 - tsz; + + /* + * Calculate the number of address bits used in the page table + * calculation. This is ia_bits minus the bottom granule_shift + * bits that are passed to the output address. + */ + address_bits = ia_bits - granule_shift; + + /* + * Calculate the number of levels. Each level uses + * granule_shift - PTE_SHIFT bits of the input address. + * This is because the table is 1 << granule_shift and each + * entry is 1 << PTE_SHIFT bytes. + */ + levels = howmany(address_bits, granule_shift - PTE_SHIFT); + + /* Mask of the upper unused bits in the virtual address */ + gla &= (1ul << ia_bits) - 1; + hypctx = (struct hypctx *)vcpui; + cookie = NULL; + /* TODO: Check if the level supports block descriptors */ + for (;levels > 0; levels--) { + int idx; + + /* TODO: ptp_hold works on host pages */ + ptep = ptp_hold(hypctx->vcpu, pte_addr, 1 << granule_shift, + &cookie); + pte_shift = (levels - 1) * (granule_shift - PTE_SHIFT) + + granule_shift; + idx = (gla >> pte_shift) & + ((1ul << (granule_shift - PTE_SHIFT)) - 1); + pte = ptep[idx]; + + /* Calculate the level we are looking at */ + switch (levels) { + default: + goto fault; + /* TODO: Level -1 when FEAT_LPA2 is implemented */ + case 4: /* Level 0 */ + /* TODO: support FEAT_??? */ + if ((pte & ATTR_DESCR_MASK) != L0_TABLE) + goto fault; + /* FALLTHROUGH */ + case 3: /* Level 1 */ + case 2: /* Level 2 */ + switch (pte & ATTR_DESCR_MASK) { + /* Use L1 macro as all levels are the same */ + case L1_TABLE: + /* Check if EL0 can access this address space */ + if (is_el0 && + (pte & TATTR_AP_TABLE_NO_EL0) != 0) + goto fault; + /* Check if the address space is writable */ + if ((prot & PROT_WRITE) != 0 && + (pte & TATTR_AP_TABLE_RO) != 0) + goto fault; + if ((prot & PROT_EXEC) != 0) { + /* Check the table exec attribute */ + if ((is_el0 && + (pte & TATTR_UXN_TABLE) != 0) || + (!is_el0 && + (pte & TATTR_PXN_TABLE) != 0)) + goto fault; + } + pte_addr = pte & ~ATTR_MASK; + break; + case L1_BLOCK: + goto done; + default: + goto fault; + } + break; + case 1: /* Level 3 */ + if ((pte & ATTR_DESCR_MASK) == L3_PAGE) + goto done; + goto fault; + } + } + +done: + /* Check if EL0 has access to the block/page */ + if (is_el0 && (pte & ATTR_S1_AP(ATTR_S1_AP_USER)) == 0) + goto fault; + if ((prot & PROT_WRITE) != 0 && (pte & ATTR_S1_AP_RW_BIT) != 0) + goto fault; + if ((prot & PROT_EXEC) != 0) { + if ((is_el0 && (pte & ATTR_S1_UXN) != 0) || + (!is_el0 && (pte & ATTR_S1_PXN) != 0)) + goto fault; + } + mask = (1ul << pte_shift) - 1; + *gpa = (pte & ~ATTR_MASK) | (gla & mask); + *is_fault = 0; + ptp_release(&cookie); + return (0); + +fault: + *is_fault = 1; + ptp_release(&cookie); + return (0); +} + +int +vmmops_run(void *vcpui, register_t pc, pmap_t pmap, struct vm_eventinfo *evinfo) +{ + uint64_t excp_type; + int handled; + register_t daif; + struct hyp *hyp; + struct hypctx *hypctx; + struct vcpu *vcpu; + struct vm_exit *vme; + int mode; + + hypctx = (struct hypctx *)vcpui; + hyp = hypctx->hyp; + vcpu = hypctx->vcpu; + vme = vm_exitinfo(vcpu); + + hypctx->tf.tf_elr = (uint64_t)pc; + + for (;;) { + if (hypctx->has_exception) { + hypctx->has_exception = false; + hypctx->elr_el1 = hypctx->tf.tf_elr; + + mode = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32); + + if (mode == PSR_M_EL1t) { + hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x0; + } else if (mode == PSR_M_EL1h) { + hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x200; + } else if ((mode & PSR_M_32) == PSR_M_64) { + /* 64-bit EL0 */ + hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x400; + } else { + /* 
32-bit EL0 */ + hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x600; + } + + /* Set the new spsr */ + hypctx->spsr_el1 = hypctx->tf.tf_spsr; + + /* Set the new cpsr */ + hypctx->tf.tf_spsr = hypctx->spsr_el1 & PSR_FLAGS; + /* TODO: DIT, PAN, SSBS */ + hypctx->tf.tf_spsr |= PSR_DAIF | PSR_M_EL1h; + } + + daif = intr_disable(); + + /* Check if the vcpu is suspended */ + if (vcpu_suspended(evinfo)) { + intr_restore(daif); + vm_exit_suspended(vcpu, pc); + break; + } + + if (vcpu_debugged(vcpu)) { + intr_restore(daif); + vm_exit_debug(vcpu, pc); + break; + } + + /* Activate the stage2 pmap so the vmid is valid */ + pmap_activate_vm(pmap); + hyp->vttbr_el2 = pmap_to_ttbr0(pmap); + + /* + * TODO: What happens if a timer interrupt is asserted exactly + * here, but for the previous VM? + */ + arm64_set_active_vcpu(hypctx); + vgic_flush_hwstate(hypctx); + + /* Call into EL2 to switch to the guest */ + excp_type = vmm_call_hyp(HYP_ENTER_GUEST, + hyp->el2_addr, hypctx->el2_addr); + + vgic_sync_hwstate(hypctx); + vtimer_sync_hwstate(hypctx); + + /* + * Deactivate the stage2 pmap. vmm_pmap_clean_stage2_tlbi + * depends on this meaning we activate the VM before entering + * the vm again + */ + PCPU_SET(curvmpmap, NULL); + intr_restore(daif); + + vmm_stat_incr(vcpu, VMEXIT_COUNT, 1); + if (excp_type == EXCP_TYPE_MAINT_IRQ) + continue; + + vme->pc = hypctx->tf.tf_elr; + vme->inst_length = INSN_SIZE; + vme->u.hyp.exception_nr = excp_type; + vme->u.hyp.esr_el2 = hypctx->tf.tf_esr; + vme->u.hyp.far_el2 = hypctx->exit_info.far_el2; + vme->u.hyp.hpfar_el2 = hypctx->exit_info.hpfar_el2; + + handled = arm64_handle_world_switch(hypctx, excp_type, vme, + pmap); + if (handled == UNHANDLED) + /* Exit loop to emulate instruction. */ + break; + else + /* Resume guest execution from the next instruction. */ + hypctx->tf.tf_elr += vme->inst_length; + } + + return (0); +} + +static void +arm_pcpu_vmcleanup(void *arg) +{ + struct hyp *hyp; + int i, maxcpus; + + hyp = arg; + maxcpus = vm_get_maxcpus(hyp->vm); + for (i = 0; i < maxcpus; i++) { + if (arm64_get_active_vcpu() == hyp->ctx[i]) { + arm64_set_active_vcpu(NULL); + break; + } + } +} + +void +vmmops_vcpu_cleanup(void *vcpui) +{ + struct hypctx *hypctx = vcpui; + + vtimer_cpucleanup(hypctx); + vgic_cpucleanup(hypctx); + + vmmpmap_remove(hypctx->el2_addr, el2_hypctx_size(), true); + + free(hypctx, M_HYP); +} + +void +vmmops_cleanup(void *vmi) +{ + struct hyp *hyp = vmi; + + vtimer_vmcleanup(hyp); + vgic_vmcleanup(hyp); + + smp_rendezvous(NULL, arm_pcpu_vmcleanup, NULL, hyp); + + vmmpmap_remove(hyp->el2_addr, el2_hyp_size(hyp->vm), true); + + free(hyp, M_HYP); +} + +/* + * Return register value. Registers have different sizes and an explicit cast + * must be made to ensure proper conversion. + */ +static uint64_t * +hypctx_regptr(struct hypctx *hypctx, int reg) +{ + switch (reg) { + case VM_REG_GUEST_X0 ... 
VM_REG_GUEST_X29: + return (&hypctx->tf.tf_x[reg]); + case VM_REG_GUEST_LR: + return (&hypctx->tf.tf_lr); + case VM_REG_GUEST_SP: + return (&hypctx->tf.tf_sp); + case VM_REG_GUEST_CPSR: + return (&hypctx->tf.tf_spsr); + case VM_REG_GUEST_PC: + return (&hypctx->tf.tf_elr); + case VM_REG_GUEST_SCTLR_EL1: + return (&hypctx->sctlr_el1); + case VM_REG_GUEST_TTBR0_EL1: + return (&hypctx->ttbr0_el1); + case VM_REG_GUEST_TTBR1_EL1: + return (&hypctx->ttbr1_el1); + case VM_REG_GUEST_TCR_EL1: + return (&hypctx->tcr_el1); + case VM_REG_GUEST_TCR2_EL1: + return (&hypctx->tcr2_el1); + default: + break; + } + return (NULL); +} + +int +vmmops_getreg(void *vcpui, int reg, uint64_t *retval) +{ + uint64_t *regp; + int running, hostcpu; + struct hypctx *hypctx = vcpui; + + running = vcpu_is_running(hypctx->vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("arm_getreg: %s%d is running", vm_name(hypctx->hyp->vm), + vcpu_vcpuid(hypctx->vcpu)); + + regp = hypctx_regptr(hypctx, reg); + if (regp == NULL) + return (EINVAL); + + *retval = *regp; + return (0); +} + +int +vmmops_setreg(void *vcpui, int reg, uint64_t val) +{ + uint64_t *regp; + struct hypctx *hypctx = vcpui; + int running, hostcpu; + + running = vcpu_is_running(hypctx->vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("arm_setreg: %s%d is running", vm_name(hypctx->hyp->vm), + vcpu_vcpuid(hypctx->vcpu)); + + regp = hypctx_regptr(hypctx, reg); + if (regp == NULL) + return (EINVAL); + + *regp = val; + return (0); +} + +int +vmmops_exception(void *vcpui, uint64_t esr, uint64_t far) +{ + struct hypctx *hypctx = vcpui; + int running, hostcpu; + + running = vcpu_is_running(hypctx->vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("%s: %s%d is running", __func__, vm_name(hypctx->hyp->vm), + vcpu_vcpuid(hypctx->vcpu)); + + hypctx->far_el1 = far; + hypctx->esr_el1 = esr; + hypctx->has_exception = true; + + return (0); +} + +int +vmmops_getcap(void *vcpui, int num, int *retval) +{ + int ret; + + ret = ENOENT; + + switch (num) { + case VM_CAP_UNRESTRICTED_GUEST: + *retval = 1; + ret = 0; + break; + default: + break; + } + + return (ret); +} + +int +vmmops_setcap(void *vcpui, int num, int val) +{ + + return (ENOENT); +} diff --git a/sys/arm64/vmm/vmm_call.S b/sys/arm64/vmm/vmm_call.S new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_call.S @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2017 Alexandru Elisei + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include + + .text + +ENTRY(vmm_call_hyp) + hvc #0 + ret +END(vmm_call_hyp) diff --git a/sys/arm64/vmm/vmm_dev.c b/sys/arm64/vmm/vmm_dev.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_dev.c @@ -0,0 +1,1046 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
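
vmm_call_hyp() above is nothing more than an HVC: once the hypervisor vectors are installed, the first argument generally names the EL2 operation and the remaining arguments travel in the usual argument registers, with the result coming back in x0. A hedged sketch of two typical call sites, assuming the HYP_* operation codes and the vmm_call_hyp() prototype provided by hyp.h elsewhere in this series.

/* Sketch only: driving the EL2 entry point from the EL1 kernel. */
#include "hyp.h"

static void
example_hyp_calls(uint64_t vttbr, uint64_t sva, uint64_t eva)
{
        uint64_t cnthctl;

        /* Read an EL2-only register through the hypervisor. */
        cnthctl = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_CNTHCTL);

        /* Invalidate a guest's stage 2 TLB entries for a VA range. */
        vmm_call_hyp(HYP_S2_TLBI_RANGE, vttbr, sva, eva, true);

        (void)cnthctl;
}
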
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "vmm_stat.h" + +#include "io/vgic.h" + +struct devmem_softc { + int segid; + char *name; + struct cdev *cdev; + struct vmmdev_softc *sc; + SLIST_ENTRY(devmem_softc) link; +}; + +struct vmmdev_softc { + struct vm *vm; /* vm instance cookie */ + struct cdev *cdev; + SLIST_ENTRY(vmmdev_softc) link; + SLIST_HEAD(, devmem_softc) devmem; + int flags; +}; +#define VSC_LINKED 0x01 + +static SLIST_HEAD(, vmmdev_softc) head; + +static unsigned pr_allow_flag; +static struct mtx vmmdev_mtx; + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); + +SYSCTL_DECL(_hw_vmm); + +static int vmm_priv_check(struct ucred *ucred); +static int devmem_create_cdev(const char *vmname, int id, char *devmem); +static void devmem_destroy(void *arg); + +static int +vmm_priv_check(struct ucred *ucred) +{ + + if (jailed(ucred) && + !(ucred->cr_prison->pr_allow & pr_allow_flag)) + return (EPERM); + + return (0); +} + +static int +vcpu_lock_one(struct vcpu *vcpu) +{ + int error; + + error = vcpu_set_state(vcpu, VCPU_FROZEN, true); + return (error); +} + +static void +vcpu_unlock_one(struct vcpu *vcpu) +{ + enum vcpu_state state; + + state = vcpu_get_state(vcpu, NULL); + if (state != VCPU_FROZEN) { + panic("vcpu %s(%d) has invalid state %d", + vm_name(vcpu_vm(vcpu)), vcpu_vcpuid(vcpu), state); + } + + vcpu_set_state(vcpu, VCPU_IDLE, false); +} + +static int +vcpu_lock_all(struct vmmdev_softc *sc) +{ + struct vcpu *vcpu; + int error; + uint16_t i, maxcpus; + + vm_slock_vcpus(sc->vm); + maxcpus = vm_get_maxcpus(sc->vm); + for (i = 0; i < maxcpus; i++) { + vcpu = vm_vcpu(sc->vm, i); + if (vcpu == NULL) + continue; + error = vcpu_lock_one(vcpu); + if (error) + break; + } + + if (error) { + while (--i >= 0) { + vcpu = vm_vcpu(sc->vm, i); + if (vcpu == NULL) + continue; + vcpu_unlock_one(vcpu); + } + vm_unlock_vcpus(sc->vm); + } + + return (error); +} + +static void +vcpu_unlock_all(struct vmmdev_softc *sc) +{ + struct vcpu *vcpu; + uint16_t i, maxcpus; + + maxcpus = vm_get_maxcpus(sc->vm); + for (i = 0; i < maxcpus; i++) { + vcpu = vm_vcpu(sc->vm, i); + if (vcpu == NULL) + continue; + vcpu_unlock_one(vcpu); + } + vm_unlock_vcpus(sc->vm); +} + +static struct vmmdev_softc * +vmmdev_lookup(const char *name) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(name, vm_name(sc->vm)) == 0) + break; + } + + return (sc); +} + +static struct vmmdev_softc * +vmmdev_lookup2(struct cdev *cdev) +{ + + return (cdev->si_drv1); +} + +static int +vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) +{ + int error, off, c, prot; + vm_paddr_t gpa, maxaddr; + void *hpa, *cookie; + struct vmmdev_softc *sc; + + error = vmm_priv_check(curthread->td_ucred); + if (error) + return (error); + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) + return (ENXIO); + + /* + * Get a read lock on the guest memory map. + */ + vm_slock_memsegs(sc->vm); + + prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); + maxaddr = vmm_sysmem_maxaddr(sc->vm); + while (uio->uio_resid > 0 && error == 0) { + gpa = uio->uio_offset; + off = gpa & PAGE_MASK; + c = min(uio->uio_resid, PAGE_SIZE - off); + + /* + * The VM has a hole in its physical memory map. 
If we want to + * use 'dd' to inspect memory beyond the hole we need to + * provide bogus data for memory that lies in the hole. + * + * Since this device does not support lseek(2), dd(1) will + * read(2) blocks of data to simulate the lseek(2). + */ + hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie); + if (hpa == NULL) { + if (uio->uio_rw == UIO_READ && gpa < maxaddr) + error = uiomove(__DECONST(void *, zero_region), + c, uio); + else + error = EFAULT; + } else { + error = uiomove(hpa, c, uio); + vm_gpa_release(cookie); + } + } + vm_unlock_memsegs(sc->vm); + return (error); +} + +static int +get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) +{ + struct devmem_softc *dsc; + int error; + bool sysmem; + + error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); + if (error || mseg->len == 0) + return (error); + + if (!sysmem) { + SLIST_FOREACH(dsc, &sc->devmem, link) { + if (dsc->segid == mseg->segid) + break; + } + KASSERT(dsc != NULL, ("%s: devmem segment %d not found", + __func__, mseg->segid)); + error = copystr(dsc->name, mseg->name, sizeof(mseg->name), + NULL); + } else { + bzero(mseg->name, sizeof(mseg->name)); + } + + return (error); +} + +static int +alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) +{ + char *name; + int error; + bool sysmem; + + error = 0; + name = NULL; + sysmem = true; + + /* + * The allocation is lengthened by 1 to hold a terminating NUL. It'll + * by stripped off when devfs processes the full string. + */ + if (VM_MEMSEG_NAME(mseg)) { + sysmem = false; + name = malloc(sizeof(mseg->name), M_VMMDEV, M_WAITOK); + error = copystr(mseg->name, name, sizeof(mseg->name), NULL); + if (error) + goto done; + } + + error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); + if (error) + goto done; + + if (VM_MEMSEG_NAME(mseg)) { + error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); + if (error) + vm_free_memseg(sc->vm, mseg->segid); + else + name = NULL; /* freed when 'cdev' is destroyed */ + } +done: + free(name, M_VMMDEV); + return (error); +} + +static int +vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, + uint64_t *regval) +{ + int error, i; + + error = 0; + for (i = 0; i < count; i++) { + error = vm_get_register(vcpu, regnum[i], ®val[i]); + if (error) + break; + } + return (error); +} + +static int +vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum, + uint64_t *regval) +{ + int error, i; + + error = 0; + for (i = 0; i < count; i++) { + error = vm_set_register(vcpu, regnum[i], regval[i]); + if (error) + break; + } + return (error); +} + +static int +vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + int error, vcpuid, size; + cpuset_t *cpuset; + struct vmmdev_softc *sc; + struct vcpu *vcpu; + struct vm_register *vmreg; + struct vm_register_set *vmregset; + struct vm_run *vmrun; + struct vm_vgic_version *vgv; + struct vm_vgic_descr *vgic; + struct vm_cpuset *vm_cpuset; + struct vm_irq *vi; + struct vm_capability *vmcap; + struct vm_stats *vmstats; + struct vm_stat_desc *statdesc; + struct vm_suspend *vmsuspend; + struct vm_exception *vmexc; + struct vm_gla2gpa *gg; + struct vm_memmap *mm; + struct vm_munmap *mu; + struct vm_msi *vmsi; + struct vm_cpu_topology *topology; + uint64_t *regvals; + int *regnums; + enum { NONE, SINGLE, ALL } vcpus_locked; + bool memsegs_locked; + + error = vmm_priv_check(curthread->td_ucred); + if (error) + return (error); + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) + return (ENXIO); + + 
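
The two helpers above back the VM_GET_REGISTER_SET and VM_SET_REGISTER_SET ioctls handled further down in this function. A userspace sketch of reading a couple of guest registers in one call follows; the struct vm_register_set field names (cpuid, count, regnums, regvals) are inferred from the handler below and may not match the installed header exactly.

/* Userspace sketch: fetch X0 and the PC of vCPU 0 in one ioctl. */
#include <sys/types.h>
#include <sys/ioctl.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <err.h>
#include <stdint.h>
#include <stdio.h>

static void
dump_regs(int vmfd)
{
        int regnums[2] = { VM_REG_GUEST_X0, VM_REG_GUEST_PC };
        uint64_t regvals[2];
        struct vm_register_set vmregset;

        vmregset.cpuid = 0;             /* assumed field name */
        vmregset.count = 2;
        vmregset.regnums = regnums;
        vmregset.regvals = regvals;

        if (ioctl(vmfd, VM_GET_REGISTER_SET, &vmregset) != 0)
                err(1, "VM_GET_REGISTER_SET");
        printf("x0 0x%lx pc 0x%lx\n", regvals[0], regvals[1]);
}
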
error = 0; + vcpuid = -1; + vcpu = NULL; + vcpus_locked = NONE; + memsegs_locked = false; + + /* + * Some VMM ioctls can operate only on vcpus that are not running. + */ + switch (cmd) { + case VM_RUN: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_REGISTER_SET: + case VM_SET_REGISTER_SET: + case VM_INJECT_EXCEPTION: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_GLA2GPA_NOFAULT: + case VM_ACTIVATE_CPU: + /* + * ioctls that can operate only on vcpus that are not running. + */ + vcpuid = *(int *)data; + vcpu = vm_alloc_vcpu(sc->vm, vcpuid); + if (vcpu == NULL) { + error = EINVAL; + goto done; + } + error = vcpu_lock_one(vcpu); + if (error) + goto done; + vcpus_locked = SINGLE; + break; + + case VM_ALLOC_MEMSEG: + case VM_MMAP_MEMSEG: + case VM_MUNMAP_MEMSEG: + case VM_REINIT: + case VM_ATTACH_VGIC: + /* + * ioctls that modify the memory map must lock memory + * segments exclusively. + */ + vm_xlock_memsegs(sc->vm); + memsegs_locked = true; + + /* + * ioctls that operate on the entire virtual machine must + * prevent all vcpus from running. + */ + error = vcpu_lock_all(sc); + if (error) + goto done; + vcpus_locked = ALL; + break; + case VM_GET_MEMSEG: + case VM_MMAP_GETNEXT: + /* + * Lock the memory map while it is being inspected. + */ + vm_slock_memsegs(sc->vm); + memsegs_locked = true; + break; + + case VM_STATS: + /* + * These do not need the vCPU locked but do operate on + * a specific vCPU. + */ + vcpuid = *(int *)data; + vcpu = vm_alloc_vcpu(sc->vm, vcpuid); + if (vcpu == NULL) { + error = EINVAL; + goto done; + } + break; + + case VM_SUSPEND_CPU: + case VM_RESUME_CPU: + /* + * These can either operate on all CPUs via a vcpuid of + * -1 or on a specific vCPU. + */ + vcpuid = *(int *)data; + if (vcpuid == -1) + break; + vcpu = vm_alloc_vcpu(sc->vm, vcpuid); + if (vcpu == NULL) { + error = EINVAL; + goto done; + } + break; + + case VM_ASSERT_IRQ: + vi = (struct vm_irq *)data; + error = vm_assert_irq(sc->vm, vi->irq); + break; + case VM_DEASSERT_IRQ: + vi = (struct vm_irq *)data; + error = vm_deassert_irq(sc->vm, vi->irq); + break; + default: + break; + } + + switch (cmd) { + case VM_RUN: { + struct vm_exit *vme; + + vmrun = (struct vm_run *)data; + vme = vm_exitinfo(vcpu); + + error = vm_run(vcpu); + if (error != 0) + break; + + error = copyout(vme, vmrun->vm_exit, sizeof(*vme)); + if (error != 0) + break; + break; + } + case VM_SUSPEND: + vmsuspend = (struct vm_suspend *)data; + error = vm_suspend(sc->vm, vmsuspend->how); + break; + case VM_REINIT: + error = vm_reinit(sc->vm); + break; + case VM_STAT_DESC: { + statdesc = (struct vm_stat_desc *)data; + error = vmm_stat_desc_copy(statdesc->index, + statdesc->desc, sizeof(statdesc->desc)); + break; + } + case VM_STATS: { + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); + vmstats = (struct vm_stats *)data; + getmicrotime(&vmstats->tv); + error = vmm_stat_copy(vcpu, vmstats->index, + nitems(vmstats->statbuf), + &vmstats->num_entries, vmstats->statbuf); + break; + } + case VM_MMAP_GETNEXT: + mm = (struct vm_memmap *)data; + error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, + &mm->segoff, &mm->len, &mm->prot, &mm->flags); + break; + case VM_MMAP_MEMSEG: + mm = (struct vm_memmap *)data; + error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, + mm->len, mm->prot, mm->flags); + break; + case VM_MUNMAP_MEMSEG: + mu = (struct vm_munmap *)data; + error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len); + break; + case VM_ALLOC_MEMSEG: + error = alloc_memseg(sc, (struct vm_memseg *)data); + break; + case 
VM_GET_MEMSEG: + error = get_memseg(sc, (struct vm_memseg *)data); + break; + case VM_GET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval); + break; + case VM_SET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval); + break; + case VM_GET_REGISTER_SET: + vmregset = (struct vm_register_set *)data; + if (vmregset->count > VM_REG_LAST) { + error = EINVAL; + break; + } + regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * + vmregset->count); + if (error == 0) + error = vm_get_register_set(vcpu, vmregset->count, + regnums, regvals); + if (error == 0) + error = copyout(regvals, vmregset->regvals, + sizeof(regvals[0]) * vmregset->count); + free(regvals, M_VMMDEV); + free(regnums, M_VMMDEV); + break; + case VM_SET_REGISTER_SET: + vmregset = (struct vm_register_set *)data; + if (vmregset->count > VM_REG_LAST) { + error = EINVAL; + break; + } + regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * + vmregset->count); + if (error == 0) + error = copyin(vmregset->regvals, regvals, + sizeof(regvals[0]) * vmregset->count); + if (error == 0) + error = vm_set_register_set(vcpu, vmregset->count, + regnums, regvals); + free(regvals, M_VMMDEV); + free(regnums, M_VMMDEV); + break; + case VM_GET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_get_capability(vcpu, + vmcap->captype, + &vmcap->capval); + break; + case VM_SET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_set_capability(vcpu, + vmcap->captype, + vmcap->capval); + break; + case VM_INJECT_EXCEPTION: + vmexc = (struct vm_exception *)data; + error = vm_inject_exception(vcpu, vmexc->esr, vmexc->far); + break; + case VM_GLA2GPA_NOFAULT: + gg = (struct vm_gla2gpa *)data; + error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla, + gg->prot, &gg->gpa, &gg->fault); + KASSERT(error == 0 || error == EFAULT, + ("%s: vm_gla2gpa unknown error %d", __func__, error)); + break; + case VM_ACTIVATE_CPU: + error = vm_activate_cpu(vcpu); + break; + case VM_GET_CPUS: + error = 0; + vm_cpuset = (struct vm_cpuset *)data; + size = vm_cpuset->cpusetsize; + if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { + error = ERANGE; + break; + } + cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + if (vm_cpuset->which == VM_ACTIVE_CPUS) + *cpuset = vm_active_cpus(sc->vm); + else if (vm_cpuset->which == VM_SUSPENDED_CPUS) + *cpuset = vm_suspended_cpus(sc->vm); + else if (vm_cpuset->which == VM_DEBUG_CPUS) + *cpuset = vm_debug_cpus(sc->vm); + else + error = EINVAL; + if (error == 0) + error = copyout(cpuset, vm_cpuset->cpus, size); + free(cpuset, M_TEMP); + break; + case VM_SUSPEND_CPU: + error = vm_suspend_cpu(sc->vm, vcpu); + break; + case VM_RESUME_CPU: + error = vm_resume_cpu(sc->vm, vcpu); + break; + case VM_GET_VGIC_VERSION: + vgv = (struct vm_vgic_version *)data; + /* TODO: Query the vgic driver for this */ + vgv->version = 3; + vgv->flags = 0; + error = 0; + break; + case VM_ATTACH_VGIC: + vgic = (struct vm_vgic_descr *)data; + error = vm_attach_vgic(sc->vm, vgic); + break; + case VM_RAISE_MSI: + vmsi = (struct vm_msi *)data; + error = vm_raise_msi(sc->vm, vmsi->msg, vmsi->addr, 
vmsi->bus, + vmsi->slot, vmsi->func); + break; + case VM_SET_TOPOLOGY: + topology = (struct vm_cpu_topology *)data; + error = vm_set_topology(sc->vm, topology->sockets, + topology->cores, topology->threads, topology->maxcpus); + break; + case VM_GET_TOPOLOGY: + topology = (struct vm_cpu_topology *)data; + vm_get_topology(sc->vm, &topology->sockets, &topology->cores, + &topology->threads, &topology->maxcpus); + error = 0; + break; + default: + error = ENOTTY; + break; + } + +done: + if (vcpus_locked == SINGLE) + vcpu_unlock_one(vcpu); + else if (vcpus_locked == ALL) + vcpu_unlock_all(sc); + if (memsegs_locked) + vm_unlock_memsegs(sc->vm); + + /* + * Make sure that no handler returns a kernel-internal + * error value to userspace. + */ + KASSERT(error == ERESTART || error >= 0, + ("vmmdev_ioctl: invalid error return %d", error)); + return (error); +} + +static int +vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, + struct vm_object **objp, int nprot) +{ + struct vmmdev_softc *sc; + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff, first, last; + int error, found, segid; + bool sysmem; + + error = vmm_priv_check(curthread->td_ucred); + if (error) + return (error); + + first = *offset; + last = first + mapsize; + if ((nprot & PROT_EXEC) || first < 0 || first >= last) + return (EINVAL); + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) { + /* virtual machine is in the process of being created */ + return (EINVAL); + } + + /* + * Get a read lock on the guest memory map. + */ + vm_slock_memsegs(sc->vm); + + gpa = 0; + found = 0; + while (!found) { + error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, + NULL, NULL); + if (error) + break; + + if (first >= gpa && last <= gpa + len) + found = 1; + else + gpa += len; + } + + if (found) { + error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); + KASSERT(error == 0 && *objp != NULL, + ("%s: invalid memory segment %d", __func__, segid)); + if (sysmem) { + vm_object_reference(*objp); + *offset = segoff + (first - gpa); + } else { + error = EINVAL; + } + } + vm_unlock_memsegs(sc->vm); + return (error); +} + +static void +vmmdev_destroy(void *arg) +{ + struct vmmdev_softc *sc = arg; + struct devmem_softc *dsc; + int error __diagused; + + error = vcpu_lock_all(sc); + KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); + vm_unlock_vcpus(sc->vm); + + while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { + KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); + SLIST_REMOVE_HEAD(&sc->devmem, link); + free(dsc->name, M_VMMDEV); + free(dsc, M_VMMDEV); + } + + if (sc->cdev != NULL) + destroy_dev(sc->cdev); + + if (sc->vm != NULL) + vm_destroy(sc->vm); + + if ((sc->flags & VSC_LINKED) != 0) { + mtx_lock(&vmmdev_mtx); + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + mtx_unlock(&vmmdev_mtx); + } + + free(sc, M_VMMDEV); +} + +static int +sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) +{ + struct devmem_softc *dsc; + struct vmmdev_softc *sc; + struct cdev *cdev; + char *buf; + int error, buflen; + + error = vmm_priv_check(req->td->td_ucred); + if (error) + return (error); + + buflen = VM_MAX_NAMELEN + 1; + buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); + strlcpy(buf, "beavis", buflen); + error = sysctl_handle_string(oidp, buf, buflen, req); + if (error != 0 || req->newptr == NULL) + goto out; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + if (sc == NULL || sc->cdev == NULL) { + mtx_unlock(&vmmdev_mtx); + error = EINVAL; + goto out; + } + + /* + * The 'cdev' will be destroyed asynchronously 
when 'si_threadcount' + * goes down to 0 so we should not do it again in the callback. + * + * Setting 'sc->cdev' to NULL is also used to indicate that the VM + * is scheduled for destruction. + */ + cdev = sc->cdev; + sc->cdev = NULL; + mtx_unlock(&vmmdev_mtx); + + /* + * Schedule all cdevs to be destroyed: + * + * - any new operations on the 'cdev' will return an error (ENXIO). + * + * - when the 'si_threadcount' dwindles down to zero the 'cdev' will + * be destroyed and the callback will be invoked in a taskqueue + * context. + * + * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' + */ + SLIST_FOREACH(dsc, &sc->devmem, link) { + KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); + destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc); + } + destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); + error = 0; + +out: + free(buf, M_VMMDEV); + return (error); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, + NULL, 0, sysctl_vmm_destroy, "A", + NULL); + +static struct cdevsw vmmdevsw = { + .d_name = "vmmdev", + .d_version = D_VERSION, + .d_ioctl = vmmdev_ioctl, + .d_mmap_single = vmmdev_mmap_single, + .d_read = vmmdev_rw, + .d_write = vmmdev_rw, +}; + +static int +sysctl_vmm_create(SYSCTL_HANDLER_ARGS) +{ + struct vm *vm; + struct cdev *cdev; + struct vmmdev_softc *sc, *sc2; + char *buf; + int error, buflen; + + error = vmm_priv_check(req->td->td_ucred); + if (error) + return (error); + + buflen = VM_MAX_NAMELEN + 1; + buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); + strlcpy(buf, "beavis", buflen); + error = sysctl_handle_string(oidp, buf, buflen, req); + if (error != 0 || req->newptr == NULL) + goto out; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + mtx_unlock(&vmmdev_mtx); + if (sc != NULL) { + error = EEXIST; + goto out; + } + + error = vm_create(buf, &vm); + if (error != 0) + goto out; + + sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); + sc->vm = vm; + SLIST_INIT(&sc->devmem); + + /* + * Lookup the name again just in case somebody sneaked in when we + * dropped the lock. 
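
Creating and destroying a virtual machine is driven entirely through the hw.vmm.create and hw.vmm.destroy sysctls defined in this file, with the control node appearing under /dev/vmm/<name> once sysctl_vmm_create() below has run. A small userspace sketch, assuming nothing beyond sysctlbyname(3) and that cdev:

/* Userspace sketch: create a VM, open its control device, destroy it. */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        const char *name = "test";
        char path[64];
        int fd;

        if (sysctlbyname("hw.vmm.create", NULL, NULL, name,
            strlen(name) + 1) != 0)
                err(1, "hw.vmm.create");

        snprintf(path, sizeof(path), "/dev/vmm/%s", name);
        fd = open(path, O_RDWR);
        if (fd == -1)
                err(1, "%s", path);

        /* ... configure memory and vCPUs via ioctl(fd, ...) here ... */

        close(fd);
        if (sysctlbyname("hw.vmm.destroy", NULL, NULL, name,
            strlen(name) + 1) != 0)
                err(1, "hw.vmm.destroy");
        return (0);
}
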
+ */ + mtx_lock(&vmmdev_mtx); + sc2 = vmmdev_lookup(buf); + if (sc2 == NULL) { + SLIST_INSERT_HEAD(&head, sc, link); + sc->flags |= VSC_LINKED; + } + mtx_unlock(&vmmdev_mtx); + + if (sc2 != NULL) { + vmmdev_destroy(sc); + error = EEXIST; + goto out; + } + + error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, + UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); + if (error != 0) { + vmmdev_destroy(sc); + goto out; + } + + mtx_lock(&vmmdev_mtx); + sc->cdev = cdev; + sc->cdev->si_drv1 = sc; + mtx_unlock(&vmmdev_mtx); + +out: + free(buf, M_VMMDEV); + return (error); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, create, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, + NULL, 0, sysctl_vmm_create, "A", + NULL); + +void +vmmdev_init(void) +{ + mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); + pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, + "Allow use of vmm in a jail."); +} + +int +vmmdev_cleanup(void) +{ + int error; + + if (SLIST_EMPTY(&head)) + error = 0; + else + error = EBUSY; + + return (error); +} + +static int +devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, + struct vm_object **objp, int nprot) +{ + struct devmem_softc *dsc; + vm_ooffset_t first, last; + size_t seglen; + int error; + bool sysmem; + + dsc = cdev->si_drv1; + if (dsc == NULL) { + /* 'cdev' has been created but is not ready for use */ + return (ENXIO); + } + + first = *offset; + last = *offset + len; + if ((nprot & PROT_EXEC) || first < 0 || first >= last) + return (EINVAL); + + vm_slock_memsegs(dsc->sc->vm); + + error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); + KASSERT(error == 0 && !sysmem && *objp != NULL, + ("%s: invalid devmem segment %d", __func__, dsc->segid)); + + if (seglen >= last) + vm_object_reference(*objp); + else + error = 0; + vm_unlock_memsegs(dsc->sc->vm); + return (error); +} + +static struct cdevsw devmemsw = { + .d_name = "devmem", + .d_version = D_VERSION, + .d_mmap_single = devmem_mmap_single, +}; + +static int +devmem_create_cdev(const char *vmname, int segid, char *devname) +{ + struct devmem_softc *dsc; + struct vmmdev_softc *sc; + struct cdev *cdev; + int error; + + error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, + UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); + if (error) + return (error); + + dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(vmname); + KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); + if (sc->cdev == NULL) { + /* virtual machine is being created or destroyed */ + mtx_unlock(&vmmdev_mtx); + free(dsc, M_VMMDEV); + destroy_dev_sched_cb(cdev, NULL, 0); + return (ENODEV); + } + + dsc->segid = segid; + dsc->name = devname; + dsc->cdev = cdev; + dsc->sc = sc; + SLIST_INSERT_HEAD(&sc->devmem, dsc, link); + mtx_unlock(&vmmdev_mtx); + + /* The 'cdev' is ready for use after 'si_drv1' is initialized */ + cdev->si_drv1 = dsc; + return (0); +} + +static void +devmem_destroy(void *arg) +{ + struct devmem_softc *dsc = arg; + + KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); + dsc->cdev = NULL; + dsc->sc = NULL; +} diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_hyp.c @@ -0,0 +1,750 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Andrew Turner + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include + +#include "arm64.h" +#include "hyp.h" + +struct hypctx; + +uint64_t vmm_hyp_enter(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, + uint64_t, uint64_t, uint64_t); +uint64_t vmm_enter_guest(struct hypctx *); + +static void +vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest) +{ + uint64_t dfr0; + + /* Store the guest VFP registers */ + if (guest) { + /* Store the timer registers */ + hypctx->vtimer_cpu.cntkctl_el1 = READ_SPECIALREG(cntkctl_el1); + hypctx->vtimer_cpu.virt_timer.cntx_cval_el0 = + READ_SPECIALREG(cntv_cval_el0); + hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0 = + READ_SPECIALREG(cntv_ctl_el0); + + /* Store the GICv3 registers */ + hypctx->vgic_v3_regs.ich_eisr_el2 = + READ_SPECIALREG(ich_eisr_el2); + hypctx->vgic_v3_regs.ich_elrsr_el2 = + READ_SPECIALREG(ich_elrsr_el2); + hypctx->vgic_v3_regs.ich_hcr_el2 = + READ_SPECIALREG(ich_hcr_el2); + hypctx->vgic_v3_regs.ich_misr_el2 = + READ_SPECIALREG(ich_misr_el2); + hypctx->vgic_v3_regs.ich_vmcr_el2 = + READ_SPECIALREG(ich_vmcr_el2); + switch (hypctx->vgic_v3_regs.ich_lr_num - 1) { +#define STORE_LR(x) \ + case x: \ + hypctx->vgic_v3_regs.ich_lr_el2[x] = \ + READ_SPECIALREG(ich_lr ## x ##_el2) + STORE_LR(15); + STORE_LR(14); + STORE_LR(13); + STORE_LR(12); + STORE_LR(11); + STORE_LR(10); + STORE_LR(9); + STORE_LR(8); + STORE_LR(7); + STORE_LR(6); + STORE_LR(5); + STORE_LR(4); + STORE_LR(3); + STORE_LR(2); + STORE_LR(1); + default: + STORE_LR(0); +#undef STORE_LR + } + + switch (hypctx->vgic_v3_regs.ich_apr_num - 1) { +#define STORE_APR(x) \ + case x: \ + hypctx->vgic_v3_regs.ich_ap0r_el2[x] = \ + READ_SPECIALREG(ich_ap0r ## x ##_el2); \ + hypctx->vgic_v3_regs.ich_ap1r_el2[x] = \ + READ_SPECIALREG(ich_ap1r ## x ##_el2) + STORE_APR(3); + STORE_APR(2); + STORE_APR(1); + default: + STORE_APR(0); +#undef STORE_APR + } + } + + dfr0 = READ_SPECIALREG(id_aa64dfr0_el1); + switch (ID_AA64DFR0_BRPs_VAL(dfr0) - 1) { +#define STORE_DBG_BRP(x) \ + case x: \ + hypctx->dbgbcr_el1[x] = \ + READ_SPECIALREG(dbgbcr ## x ## _el1); \ + hypctx->dbgbvr_el1[x] = \ + READ_SPECIALREG(dbgbvr ## x ## _el1) + STORE_DBG_BRP(15); + STORE_DBG_BRP(14); + STORE_DBG_BRP(13); + STORE_DBG_BRP(12); + STORE_DBG_BRP(11); + 
STORE_DBG_BRP(10); + STORE_DBG_BRP(9); + STORE_DBG_BRP(8); + STORE_DBG_BRP(7); + STORE_DBG_BRP(6); + STORE_DBG_BRP(5); + STORE_DBG_BRP(4); + STORE_DBG_BRP(3); + STORE_DBG_BRP(2); + STORE_DBG_BRP(1); + default: + STORE_DBG_BRP(0); +#undef STORE_DBG_BRP + } + + switch (ID_AA64DFR0_WRPs_VAL(dfr0) - 1) { +#define STORE_DBG_WRP(x) \ + case x: \ + hypctx->dbgwcr_el1[x] = \ + READ_SPECIALREG(dbgwcr ## x ## _el1); \ + hypctx->dbgwvr_el1[x] = \ + READ_SPECIALREG(dbgwvr ## x ## _el1) + STORE_DBG_WRP(15); + STORE_DBG_WRP(14); + STORE_DBG_WRP(13); + STORE_DBG_WRP(12); + STORE_DBG_WRP(11); + STORE_DBG_WRP(10); + STORE_DBG_WRP(9); + STORE_DBG_WRP(8); + STORE_DBG_WRP(7); + STORE_DBG_WRP(6); + STORE_DBG_WRP(5); + STORE_DBG_WRP(4); + STORE_DBG_WRP(3); + STORE_DBG_WRP(2); + STORE_DBG_WRP(1); + default: + STORE_DBG_WRP(0); +#undef STORE_DBG_WRP + } + + /* Store the PMU registers */ + hypctx->pmcr_el0 = READ_SPECIALREG(pmcr_el0); + hypctx->pmccntr_el0 = READ_SPECIALREG(pmccntr_el0); + hypctx->pmccfiltr_el0 = READ_SPECIALREG(pmccfiltr_el0); + hypctx->pmcntenset_el0 = READ_SPECIALREG(pmcntenset_el0); + hypctx->pmintenset_el1 = READ_SPECIALREG(pmintenset_el1); + hypctx->pmovsset_el0 = READ_SPECIALREG(pmovsset_el0); + hypctx->pmuserenr_el0 = READ_SPECIALREG(pmuserenr_el0); + switch ((hypctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT) { +#define STORE_PMU(x) \ + case (x + 1): \ + hypctx->pmevcntr_el0[x] = \ + READ_SPECIALREG(pmevcntr ## x ## _el0); \ + hypctx->pmevtyper_el0[x] = \ + READ_SPECIALREG(pmevtyper ## x ## _el0) + STORE_PMU(30); + STORE_PMU(29); + STORE_PMU(28); + STORE_PMU(27); + STORE_PMU(26); + STORE_PMU(25); + STORE_PMU(24); + STORE_PMU(23); + STORE_PMU(22); + STORE_PMU(21); + STORE_PMU(20); + STORE_PMU(19); + STORE_PMU(18); + STORE_PMU(17); + STORE_PMU(16); + STORE_PMU(15); + STORE_PMU(14); + STORE_PMU(13); + STORE_PMU(12); + STORE_PMU(11); + STORE_PMU(10); + STORE_PMU(9); + STORE_PMU(8); + STORE_PMU(7); + STORE_PMU(6); + STORE_PMU(5); + STORE_PMU(4); + STORE_PMU(3); + STORE_PMU(2); + STORE_PMU(1); + STORE_PMU(0); + default: /* N == 0 when only PMCCNTR_EL0 is available */ + break; +#undef STORE_PMU + } + + /* Store the special to from the trapframe */ + hypctx->tf.tf_sp = READ_SPECIALREG(sp_el1); + hypctx->tf.tf_elr = READ_SPECIALREG(elr_el2); + hypctx->tf.tf_spsr = READ_SPECIALREG(spsr_el2); + if (guest) { + hypctx->tf.tf_esr = READ_SPECIALREG(esr_el2); + } + + /* Store the guest special registers */ + hypctx->elr_el1 = READ_SPECIALREG(elr_el1); + hypctx->sp_el0 = READ_SPECIALREG(sp_el0); + hypctx->tpidr_el0 = READ_SPECIALREG(tpidr_el0); + hypctx->tpidrro_el0 = READ_SPECIALREG(tpidrro_el0); + hypctx->tpidr_el1 = READ_SPECIALREG(tpidr_el1); + hypctx->vbar_el1 = READ_SPECIALREG(vbar_el1); + + hypctx->actlr_el1 = READ_SPECIALREG(actlr_el1); + hypctx->afsr0_el1 = READ_SPECIALREG(afsr0_el1); + hypctx->afsr1_el1 = READ_SPECIALREG(afsr1_el1); + hypctx->amair_el1 = READ_SPECIALREG(amair_el1); + hypctx->contextidr_el1 = READ_SPECIALREG(contextidr_el1); + hypctx->cpacr_el1 = READ_SPECIALREG(cpacr_el1); + hypctx->csselr_el1 = READ_SPECIALREG(csselr_el1); + hypctx->esr_el1 = READ_SPECIALREG(esr_el1); + hypctx->far_el1 = READ_SPECIALREG(far_el1); + hypctx->mair_el1 = READ_SPECIALREG(mair_el1); + hypctx->mdccint_el1 = READ_SPECIALREG(mdccint_el1); + hypctx->mdscr_el1 = READ_SPECIALREG(mdscr_el1); + hypctx->par_el1 = READ_SPECIALREG(par_el1); + hypctx->sctlr_el1 = READ_SPECIALREG(sctlr_el1); + hypctx->spsr_el1 = READ_SPECIALREG(spsr_el1); + hypctx->tcr_el1 = READ_SPECIALREG(tcr_el1); + /* TODO: Support when this is 
not res0 */ + hypctx->tcr2_el1 = 0; + hypctx->ttbr0_el1 = READ_SPECIALREG(ttbr0_el1); + hypctx->ttbr1_el1 = READ_SPECIALREG(ttbr1_el1); + + hypctx->cptr_el2 = READ_SPECIALREG(cptr_el2); + hypctx->hcr_el2 = READ_SPECIALREG(hcr_el2); + hypctx->vpidr_el2 = READ_SPECIALREG(vpidr_el2); + hypctx->vmpidr_el2 = READ_SPECIALREG(vmpidr_el2); +} + +static void +vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest) +{ + uint64_t dfr0; + + /* Restore the special registers */ + WRITE_SPECIALREG(elr_el1, hypctx->elr_el1); + WRITE_SPECIALREG(sp_el0, hypctx->sp_el0); + WRITE_SPECIALREG(tpidr_el0, hypctx->tpidr_el0); + WRITE_SPECIALREG(tpidrro_el0, hypctx->tpidrro_el0); + WRITE_SPECIALREG(tpidr_el1, hypctx->tpidr_el1); + WRITE_SPECIALREG(vbar_el1, hypctx->vbar_el1); + + WRITE_SPECIALREG(actlr_el1, hypctx->actlr_el1); + WRITE_SPECIALREG(afsr0_el1, hypctx->afsr0_el1); + WRITE_SPECIALREG(afsr1_el1, hypctx->afsr1_el1); + WRITE_SPECIALREG(amair_el1, hypctx->amair_el1); + WRITE_SPECIALREG(contextidr_el1, hypctx->contextidr_el1); + WRITE_SPECIALREG(cpacr_el1, hypctx->cpacr_el1); + WRITE_SPECIALREG(csselr_el1, hypctx->csselr_el1); + WRITE_SPECIALREG(esr_el1, hypctx->esr_el1); + WRITE_SPECIALREG(far_el1, hypctx->far_el1); + WRITE_SPECIALREG(mdccint_el1, hypctx->mdccint_el1); + WRITE_SPECIALREG(mdscr_el1, hypctx->mdscr_el1); + WRITE_SPECIALREG(mair_el1, hypctx->mair_el1); + WRITE_SPECIALREG(par_el1, hypctx->par_el1); + WRITE_SPECIALREG(sctlr_el1, hypctx->sctlr_el1); + WRITE_SPECIALREG(tcr_el1, hypctx->tcr_el1); + /* TODO: tcr2_el1 */ + WRITE_SPECIALREG(ttbr0_el1, hypctx->ttbr0_el1); + WRITE_SPECIALREG(ttbr1_el1, hypctx->ttbr1_el1); + WRITE_SPECIALREG(spsr_el1, hypctx->spsr_el1); + + WRITE_SPECIALREG(cptr_el2, hypctx->cptr_el2); + WRITE_SPECIALREG(hcr_el2, hypctx->hcr_el2); + WRITE_SPECIALREG(vpidr_el2, hypctx->vpidr_el2); + WRITE_SPECIALREG(vmpidr_el2, hypctx->vmpidr_el2); + + /* Load the special regs from the trapframe */ + WRITE_SPECIALREG(sp_el1, hypctx->tf.tf_sp); + WRITE_SPECIALREG(elr_el2, hypctx->tf.tf_elr); + WRITE_SPECIALREG(spsr_el2, hypctx->tf.tf_spsr); + + /* Restore the PMU registers */ + WRITE_SPECIALREG(pmcr_el0, hypctx->pmcr_el0); + WRITE_SPECIALREG(pmccntr_el0, hypctx->pmccntr_el0); + WRITE_SPECIALREG(pmccfiltr_el0, hypctx->pmccfiltr_el0); + /* Clear all events/interrupts then enable them */ + WRITE_SPECIALREG(pmcntenclr_el0, 0xfffffffful); + WRITE_SPECIALREG(pmcntenset_el0, hypctx->pmcntenset_el0); + WRITE_SPECIALREG(pmintenclr_el1, 0xfffffffful); + WRITE_SPECIALREG(pmintenset_el1, hypctx->pmintenset_el1); + WRITE_SPECIALREG(pmovsclr_el0, 0xfffffffful); + WRITE_SPECIALREG(pmovsset_el0, hypctx->pmovsset_el0); + + switch ((hypctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT) { +#define LOAD_PMU(x) \ + case (x + 1): \ + WRITE_SPECIALREG(pmevcntr ## x ## _el0, \ + hypctx->pmevcntr_el0[x]); \ + WRITE_SPECIALREG(pmevtyper ## x ## _el0, \ + hypctx->pmevtyper_el0[x]) + LOAD_PMU(30); + LOAD_PMU(29); + LOAD_PMU(28); + LOAD_PMU(27); + LOAD_PMU(26); + LOAD_PMU(25); + LOAD_PMU(24); + LOAD_PMU(23); + LOAD_PMU(22); + LOAD_PMU(21); + LOAD_PMU(20); + LOAD_PMU(19); + LOAD_PMU(18); + LOAD_PMU(17); + LOAD_PMU(16); + LOAD_PMU(15); + LOAD_PMU(14); + LOAD_PMU(13); + LOAD_PMU(12); + LOAD_PMU(11); + LOAD_PMU(10); + LOAD_PMU(9); + LOAD_PMU(8); + LOAD_PMU(7); + LOAD_PMU(6); + LOAD_PMU(5); + LOAD_PMU(4); + LOAD_PMU(3); + LOAD_PMU(2); + LOAD_PMU(1); + LOAD_PMU(0); + default: /* N == 0 when only PMCCNTR_EL0 is available */ + break; +#undef LOAD_PMU + } + + dfr0 = READ_SPECIALREG(id_aa64dfr0_el1); + switch 
(ID_AA64DFR0_BRPs_VAL(dfr0) - 1) { +#define LOAD_DBG_BRP(x) \ + case x: \ + WRITE_SPECIALREG(dbgbcr ## x ## _el1, \ + hypctx->dbgbcr_el1[x]); \ + WRITE_SPECIALREG(dbgbvr ## x ## _el1, \ + hypctx->dbgbvr_el1[x]) + LOAD_DBG_BRP(15); + LOAD_DBG_BRP(14); + LOAD_DBG_BRP(13); + LOAD_DBG_BRP(12); + LOAD_DBG_BRP(11); + LOAD_DBG_BRP(10); + LOAD_DBG_BRP(9); + LOAD_DBG_BRP(8); + LOAD_DBG_BRP(7); + LOAD_DBG_BRP(6); + LOAD_DBG_BRP(5); + LOAD_DBG_BRP(4); + LOAD_DBG_BRP(3); + LOAD_DBG_BRP(2); + LOAD_DBG_BRP(1); + default: + LOAD_DBG_BRP(0); +#undef LOAD_DBG_BRP + } + + switch (ID_AA64DFR0_WRPs_VAL(dfr0) - 1) { +#define LOAD_DBG_WRP(x) \ + case x: \ + WRITE_SPECIALREG(dbgwcr ## x ## _el1, \ + hypctx->dbgwcr_el1[x]); \ + WRITE_SPECIALREG(dbgwvr ## x ## _el1, \ + hypctx->dbgwvr_el1[x]) + LOAD_DBG_WRP(15); + LOAD_DBG_WRP(14); + LOAD_DBG_WRP(13); + LOAD_DBG_WRP(12); + LOAD_DBG_WRP(11); + LOAD_DBG_WRP(10); + LOAD_DBG_WRP(9); + LOAD_DBG_WRP(8); + LOAD_DBG_WRP(7); + LOAD_DBG_WRP(6); + LOAD_DBG_WRP(5); + LOAD_DBG_WRP(4); + LOAD_DBG_WRP(3); + LOAD_DBG_WRP(2); + LOAD_DBG_WRP(1); + default: + LOAD_DBG_WRP(0); +#undef LOAD_DBG_WRP + } + + if (guest) { + /* Load the timer registers */ + WRITE_SPECIALREG(cntkctl_el1, hypctx->vtimer_cpu.cntkctl_el1); + WRITE_SPECIALREG(cntv_cval_el0, + hypctx->vtimer_cpu.virt_timer.cntx_cval_el0); + WRITE_SPECIALREG(cntv_ctl_el0, + hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0); + WRITE_SPECIALREG(cnthctl_el2, hyp->vtimer.cnthctl_el2); + WRITE_SPECIALREG(cntvoff_el2, hyp->vtimer.cntvoff_el2); + + /* Load the GICv3 registers */ + WRITE_SPECIALREG(ich_hcr_el2, hypctx->vgic_v3_regs.ich_hcr_el2); + WRITE_SPECIALREG(ich_vmcr_el2, + hypctx->vgic_v3_regs.ich_vmcr_el2); + switch (hypctx->vgic_v3_regs.ich_lr_num - 1) { +#define LOAD_LR(x) \ + case x: \ + WRITE_SPECIALREG(ich_lr ## x ##_el2, \ + hypctx->vgic_v3_regs.ich_lr_el2[x]) + LOAD_LR(15); + LOAD_LR(14); + LOAD_LR(13); + LOAD_LR(12); + LOAD_LR(11); + LOAD_LR(10); + LOAD_LR(9); + LOAD_LR(8); + LOAD_LR(7); + LOAD_LR(6); + LOAD_LR(5); + LOAD_LR(4); + LOAD_LR(3); + LOAD_LR(2); + LOAD_LR(1); + default: + LOAD_LR(0); +#undef LOAD_LR + } + + switch (hypctx->vgic_v3_regs.ich_apr_num - 1) { +#define LOAD_APR(x) \ + case x: \ + WRITE_SPECIALREG(ich_ap0r ## x ##_el2, \ + hypctx->vgic_v3_regs.ich_ap0r_el2[x]); \ + WRITE_SPECIALREG(ich_ap1r ## x ##_el2, \ + hypctx->vgic_v3_regs.ich_ap1r_el2[x]) + LOAD_APR(3); + LOAD_APR(2); + LOAD_APR(1); + default: + LOAD_APR(0); +#undef LOAD_APR + } + } +} + +static uint64_t +vmm_hyp_call_guest(struct hyp *hyp, struct hypctx *hypctx) +{ + struct hypctx host_hypctx; + uint64_t cntvoff_el2; + uint64_t ich_hcr_el2, ich_vmcr_el2, cnthctl_el2, cntkctl_el1; + uint64_t ret; + uint64_t s1e1r, hpfar_el2; + bool hpfar_valid; + + vmm_hyp_reg_store(&host_hypctx, NULL, false); + + /* Save the host special registers */ + cnthctl_el2 = READ_SPECIALREG(cnthctl_el2); + cntkctl_el1 = READ_SPECIALREG(cntkctl_el1); + cntvoff_el2 = READ_SPECIALREG(cntvoff_el2); + + ich_hcr_el2 = READ_SPECIALREG(ich_hcr_el2); + ich_vmcr_el2 = READ_SPECIALREG(ich_vmcr_el2); + + vmm_hyp_reg_restore(hypctx, hyp, true); + + /* Load the common hypervisor registers */ + WRITE_SPECIALREG(vttbr_el2, hyp->vttbr_el2); + + host_hypctx.mdcr_el2 = READ_SPECIALREG(mdcr_el2); + WRITE_SPECIALREG(mdcr_el2, hypctx->mdcr_el2); + + /* Call into the guest */ + ret = vmm_enter_guest(hypctx); + + WRITE_SPECIALREG(mdcr_el2, host_hypctx.mdcr_el2); + isb(); + + /* Store the exit info */ + hypctx->exit_info.far_el2 = READ_SPECIALREG(far_el2); + vmm_hyp_reg_store(hypctx, hyp, true); + + 
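The STORE_*/LOAD_* blocks above save or restore exactly as many list registers, breakpoints and watchpoints as the hardware implements by switching on the count minus one and letting every case fall through, with index 0 living under default: so at least one slot is always handled. A scaled-down, standalone C model of that idiom (a plain array stands in for the ICH_LR<n>_EL2 registers, and only four slots are modelled instead of sixteen):

#include <stdint.h>
#include <stdio.h>

#define MAX_LR	4		/* scaled down from 16 for the sketch */

static uint64_t hw_lr[MAX_LR] = { 0xa0, 0xa1, 0xa2, 0xa3 };

static void
store_lrs(uint64_t *dst, int count)
{
	/* Falls through from the highest implemented index down to 0. */
	switch (count - 1) {
#define	STORE_LR(x)	case x: dst[x] = hw_lr[x]
	STORE_LR(3);
	STORE_LR(2);
	STORE_LR(1);
	default:
		STORE_LR(0);
#undef STORE_LR
	}
}

int
main(void)
{
	uint64_t saved[MAX_LR] = { 0 };
	int i;

	store_lrs(saved, 2);		/* hardware implements only two LRs */
	for (i = 0; i < MAX_LR; i++)
		printf("lr%d = %#jx\n", i, (uintmax_t)saved[i]);
	return (0);
}

Running it with a count of 2 copies only indices 1 and 0, which is the same effect the macros have when ich_lr_num (or the BRP/WRP/PMU counter count) is smaller than the maximum.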
hpfar_valid = true; + if (ret == EXCP_TYPE_EL1_SYNC) { + switch (ESR_ELx_EXCEPTION(hypctx->tf.tf_esr)) { + case EXCP_INSN_ABORT_L: + case EXCP_DATA_ABORT_L: + /* + * The hpfar_el2 register is valid for: + * - Translation and Access faults. + * - Translation, Access, and permission faults on + * the translation table walk on the stage 1 tables. + * - A stage 2 Address size fault. + * + * As we only need it in the first 2 cases we can just + * exclude it on permission faults that are not from + * the stage 1 table walk. + * + * TODO: Add a case for Arm erratum 834220. + */ + if ((hypctx->tf.tf_esr & ISS_DATA_S1PTW) != 0) + break; + switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) { + case ISS_DATA_DFSC_PF_L1: + case ISS_DATA_DFSC_PF_L2: + case ISS_DATA_DFSC_PF_L3: + hpfar_valid = false; + break; + } + break; + } + } + if (hpfar_valid) { + hypctx->exit_info.hpfar_el2 = READ_SPECIALREG(hpfar_el2); + } else { + /* + * TODO: There is a risk the at instruction could cause an + * exception here. We should handle it & return a failure. + */ + s1e1r = + arm64_address_translate_s1e1r(hypctx->exit_info.far_el2); + if (PAR_SUCCESS(s1e1r)) { + hpfar_el2 = (s1e1r & PAR_PA_MASK) >> PAR_PA_SHIFT; + hpfar_el2 <<= HPFAR_EL2_FIPA_SHIFT; + hypctx->exit_info.hpfar_el2 = hpfar_el2; + } else { + ret = EXCP_TYPE_REENTER; + } + } + + vmm_hyp_reg_restore(&host_hypctx, NULL, false); + + /* Restore the host special registers */ + WRITE_SPECIALREG(ich_hcr_el2, ich_hcr_el2); + WRITE_SPECIALREG(ich_vmcr_el2, ich_vmcr_el2); + + WRITE_SPECIALREG(cnthctl_el2, cnthctl_el2); + WRITE_SPECIALREG(cntkctl_el1, cntkctl_el1); + WRITE_SPECIALREG(cntvoff_el2, cntvoff_el2); + + return (ret); +} + +static uint64_t +vmm_hyp_read_reg(uint64_t reg) +{ + switch (reg) { + case HYP_REG_ICH_VTR: + return (READ_SPECIALREG(ich_vtr_el2)); + case HYP_REG_CNTHCTL: + return (READ_SPECIALREG(cnthctl_el2)); + } + + return (0); +} + +static bool +vmm_is_vpipt_cache(void) +{ + /* TODO: Implement */ + return (0); +} + +static int +vmm_clean_s2_tlbi(void) +{ + dsb(ishst); + __asm __volatile("tlbi alle1is"); + + /* + * If we have a VPIPT icache it will use the VMID to tag cachelines. + * As we are changing the allocated VMIDs we need to invalidate the + * icache lines containing all old values. + */ + if (vmm_is_vpipt_cache()) + __asm __volatile("ic ialluis"); + dsb(ish); + + return (0); +} + +static int +vm_s2_tlbi_range(uint64_t vttbr, vm_offset_t sva, vm_size_t eva, + bool final_only) +{ + uint64_t end, r, start; + uint64_t host_vttbr; + +#define TLBI_VA_SHIFT 12 +#define TLBI_VA_MASK ((1ul << 44) - 1) +#define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK) +#define TLBI_VA_L3_INCR (L3_SIZE >> TLBI_VA_SHIFT) + + /* Switch to the guest vttbr */ + /* TODO: Handle Cortex-A57/A72 erratum 131936 */ + host_vttbr = READ_SPECIALREG(vttbr_el2); + WRITE_SPECIALREG(vttbr_el2, vttbr); + isb(); + + /* + * The CPU can cache the stage 1 + 2 combination so we need to ensure + * the stage 2 is invalidated first, then when this has completed we + * invalidate the stage 1 TLB. As we don't know which stage 1 virtual + * addresses point at the stage 2 IPA we need to invalidate the entire + * stage 1 TLB. 
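For the exit path just above: when HPFAR_EL2 can be trusted (translation and access faults, or faults on the stage 1 walk) it is read directly; for the remaining permission faults the code re-translates FAR_EL2 with an AT S1E1R and rebuilds an HPFAR-format value from PAR_EL1. Either way the stored value holds a 4 KiB-aligned IPA that is presumably combined later with the page-offset bits of FAR_EL2. A minimal sketch of that recombination; the shift and mask values are written out locally for the sketch rather than taken from armreg.h/hypervisor.h.

#include <stdint.h>
#include <stdio.h>

#define SK_HPFAR_FIPA_SHIFT	4
#define SK_HPFAR_FIPA_MASK	(0xffffffffffUL << SK_HPFAR_FIPA_SHIFT)
#define SK_PAGE_MASK		0xfffUL

/* 4 KiB-aligned IPA recorded in an HPFAR_EL2-format value. */
static uint64_t
hpfar_to_page(uint64_t hpfar)
{
	return (((hpfar & SK_HPFAR_FIPA_MASK) >> SK_HPFAR_FIPA_SHIFT) << 12);
}

int
main(void)
{
	uint64_t hpfar, far, gpa;

	/* Fault at guest physical 0x80045678: page 0x80045000 in HPFAR. */
	hpfar = (0x80045000UL >> 12) << SK_HPFAR_FIPA_SHIFT;
	far = 0xffff000012345678UL;	/* only its low 12 bits matter here */

	gpa = hpfar_to_page(hpfar) | (far & SK_PAGE_MASK);
	printf("fault gpa = %#jx\n", (uintmax_t)gpa);	/* 0x80045678 */
	return (0);
}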
+ */ + + start = TLBI_VA(sva); + end = TLBI_VA(eva); + for (r = start; r < end; r += TLBI_VA_L3_INCR) { + /* Invalidate the stage 2 TLB entry */ + if (final_only) + __asm __volatile("tlbi ipas2le1is, %0" : : "r"(r)); + else + __asm __volatile("tlbi ipas2e1is, %0" : : "r"(r)); + } + /* Ensure the entry has been invalidated */ + dsb(ish); + /* Invalidate the stage 1 TLB. */ + __asm __volatile("tlbi vmalle1is"); + dsb(ish); + isb(); + + /* Switch back t othe host vttbr */ + WRITE_SPECIALREG(vttbr_el2, host_vttbr); + isb(); + + return (0); +} + +static int +vm_s2_tlbi_all(uint64_t vttbr) +{ + uint64_t host_vttbr; + + /* Switch to the guest vttbr */ + /* TODO: Handle Cortex-A57/A72 erratum 131936 */ + host_vttbr = READ_SPECIALREG(vttbr_el2); + WRITE_SPECIALREG(vttbr_el2, vttbr); + isb(); + + __asm __volatile("tlbi vmalls12e1is"); + dsb(ish); + isb(); + + /* Switch back t othe host vttbr */ + WRITE_SPECIALREG(vttbr_el2, host_vttbr); + isb(); + + return (0); +} + +static int +vmm_dc_civac(uint64_t start, uint64_t len) +{ + size_t line_size, end; + uint64_t ctr; + + ctr = READ_SPECIALREG(ctr_el0); + line_size = sizeof(int) << CTR_DLINE_SIZE(ctr); + end = start + len; + dsb(ishst); + /* Clean and Invalidate the D-cache */ + for (; start < end; start += line_size) + __asm __volatile("dc civac, %0" :: "r" (start) : "memory"); + dsb(ish); + return (0); +} + +static int +vmm_el2_tlbi(uint64_t type, uint64_t start, uint64_t len) +{ + uint64_t end, r; + + dsb(ishst); + switch (type) { + default: + case HYP_EL2_TLBI_ALL: + __asm __volatile("tlbi alle2" ::: "memory"); + break; + case HYP_EL2_TLBI_VA: + end = TLBI_VA(start + len); + start = TLBI_VA(start); + for (r = start; r < end; r += TLBI_VA_L3_INCR) { + __asm __volatile("tlbi vae2is, %0" :: "r"(r)); + } + break; + } + dsb(ish); + + return (0); +} + +uint64_t +vmm_hyp_enter(uint64_t handle, uint64_t x1, uint64_t x2, uint64_t x3, + uint64_t x4, uint64_t x5, uint64_t x6, uint64_t x7) +{ + uint64_t ret; + + switch (handle) { + case HYP_ENTER_GUEST: + do { + ret = vmm_hyp_call_guest((struct hyp *)x1, + (struct hypctx *)x2); + } while (ret == EXCP_TYPE_REENTER); + return (ret); + case HYP_READ_REGISTER: + return (vmm_hyp_read_reg(x1)); + case HYP_CLEAN_S2_TLBI: + return (vmm_clean_s2_tlbi()); + case HYP_DC_CIVAC: + return (vmm_dc_civac(x1, x2)); + case HYP_EL2_TLBI: + return (vmm_el2_tlbi(x1, x2, x3)); + case HYP_S2_TLBI_RANGE: + return (vm_s2_tlbi_range(x1, x2, x3, x4)); + case HYP_S2_TLBI_ALL: + return (vm_s2_tlbi_all(x1)); + case HYP_CLEANUP: /* Handled in vmm_hyp_exception.S */ + default: + break; + } + + return (0); +} diff --git a/sys/arm64/vmm/vmm_hyp_el2.S b/sys/arm64/vmm/vmm_hyp_el2.S new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_hyp_el2.S @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Andrew Turner + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
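vm_s2_tlbi_range() above turns the [sva, eva) byte range into TLBI IPAS2E1IS operands: the address is shifted right by 12 and masked to the low 44 bits, and the loop advances one last-level (4 KiB) page per iteration because TLBI_VA_L3_INCR is L3_SIZE >> TLBI_VA_SHIFT. A standalone sketch of just that arithmetic, with the constants restated locally:

#include <stdint.h>
#include <stdio.h>

#define SK_TLBI_VA_SHIFT	12
#define SK_TLBI_VA_MASK		((1UL << 44) - 1)
#define SK_TLBI_VA(addr)	(((addr) >> SK_TLBI_VA_SHIFT) & SK_TLBI_VA_MASK)
#define SK_L3_SIZE		(1UL << 12)		/* 4 KiB granule */
#define SK_TLBI_VA_L3_INCR	(SK_L3_SIZE >> SK_TLBI_VA_SHIFT)

static void
tlbi_range(uint64_t sva, uint64_t eva)
{
	uint64_t r, start, end;

	start = SK_TLBI_VA(sva);
	end = SK_TLBI_VA(eva);
	for (r = start; r < end; r += SK_TLBI_VA_L3_INCR) {
		/* The kernel issues "tlbi ipas2e1is, r" here. */
		printf("tlbi operand %#jx (page %#jx)\n",
		    (uintmax_t)r, (uintmax_t)(r << SK_TLBI_VA_SHIFT));
	}
}

int
main(void)
{
	tlbi_range(0x80045000UL, 0x80048000UL);	/* three 4 KiB pages */
	return (0);
}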
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + + .rodata + .align PAGE_SHIFT + .globl vmm_hyp_code +vmm_hyp_code: + .incbin "vmm_hyp_blob.bin" + .globl vmm_hyp_code_end +vmm_hyp_code_end: diff --git a/sys/arm64/vmm/vmm_hyp_exception.S b/sys/arm64/vmm/vmm_hyp_exception.S new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_hyp_exception.S @@ -0,0 +1,384 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2017 Alexandru Elisei + * Copyright (c) 2021 Andrew Turner + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
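vmm_hyp_el2.S above wraps the separately linked EL2 image between the vmm_hyp_code and vmm_hyp_code_end labels, so C code can size and copy it with ordinary pointer arithmetic. A runnable model of that consumption pattern; a local array stands in for the .incbin'd vmm_hyp_blob.bin, and the copy destination is invented for the sketch.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

static const unsigned char fake_blob[] = { 0x1f, 0x20, 0x03, 0xd5 }; /* nop */

/* In the kernel these are the assembler labels, not C objects. */
static const unsigned char *vmm_hyp_code = fake_blob;
static const unsigned char *vmm_hyp_code_end = fake_blob + sizeof(fake_blob);

int
main(void)
{
	size_t size = (size_t)(vmm_hyp_code_end - vmm_hyp_code);
	unsigned char dst[64];

	/* The driver copies the image into pages it later maps at EL2. */
	memcpy(dst, vmm_hyp_code, size);
	printf("EL2 blob size: %zu bytes\n", size);
	return (0);
}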
+ */ + + +#include +#include + +#include "assym.inc" +#include "hyp.h" + +.macro save_host_registers + /* TODO: Only store callee saved registers */ + sub sp, sp, #(32 * 8) + str x30, [sp, #(30 * 8)] + stp x28, x29, [sp, #(28 * 8)] + stp x26, x27, [sp, #(26 * 8)] + stp x24, x25, [sp, #(24 * 8)] + stp x22, x23, [sp, #(22 * 8)] + stp x20, x21, [sp, #(20 * 8)] + stp x18, x19, [sp, #(18 * 8)] + stp x16, x17, [sp, #(16 * 8)] + stp x14, x15, [sp, #(14 * 8)] + stp x12, x13, [sp, #(12 * 8)] + stp x10, x11, [sp, #(10 * 8)] + stp x8, x9, [sp, #(8 * 8)] + stp x6, x7, [sp, #(6 * 8)] + stp x4, x5, [sp, #(4 * 8)] + stp x2, x3, [sp, #(2 * 8)] + stp x0, x1, [sp, #(0 * 8)] +.endm + +.macro restore_host_registers + /* TODO: Only restore callee saved registers */ + ldp x0, x1, [sp, #(0 * 8)] + ldp x2, x3, [sp, #(2 * 8)] + ldp x4, x5, [sp, #(4 * 8)] + ldp x6, x7, [sp, #(6 * 8)] + ldp x8, x9, [sp, #(8 * 8)] + ldp x10, x11, [sp, #(10 * 8)] + ldp x12, x13, [sp, #(12 * 8)] + ldp x14, x15, [sp, #(14 * 8)] + ldp x16, x17, [sp, #(16 * 8)] + ldp x18, x19, [sp, #(18 * 8)] + ldp x20, x21, [sp, #(20 * 8)] + ldp x22, x23, [sp, #(22 * 8)] + ldp x24, x25, [sp, #(24 * 8)] + ldp x26, x27, [sp, #(26 * 8)] + ldp x28, x29, [sp, #(28 * 8)] + ldr x30, [sp, #(30 * 8)] + add sp, sp, #(32 * 8) +.endm + +.macro save_guest_registers + /* Back up x0 so we can use it as a temporary register */ + stp x0, x1, [sp, #-(2 * 8)]! + + /* Restore the hypctx pointer */ + mrs x0, tpidr_el2 + + stp x2, x3, [x0, #(TF_X + 2 * 8)] + stp x4, x5, [x0, #(TF_X + 4 * 8)] + stp x6, x7, [x0, #(TF_X + 6 * 8)] + stp x8, x9, [x0, #(TF_X + 8 * 8)] + stp x10, x11, [x0, #(TF_X + 10 * 8)] + stp x12, x13, [x0, #(TF_X + 12 * 8)] + stp x14, x15, [x0, #(TF_X + 14 * 8)] + stp x16, x17, [x0, #(TF_X + 16 * 8)] + stp x18, x19, [x0, #(TF_X + 18 * 8)] + stp x20, x21, [x0, #(TF_X + 20 * 8)] + stp x22, x23, [x0, #(TF_X + 22 * 8)] + stp x24, x25, [x0, #(TF_X + 24 * 8)] + stp x26, x27, [x0, #(TF_X + 26 * 8)] + stp x28, x29, [x0, #(TF_X + 28 * 8)] + + str lr, [x0, #(TF_LR)] + + /* Restore the saved x0 & x1 and save them */ + ldp x2, x3, [sp], #(2 * 8) + stp x2, x3, [x0, #(TF_X + 0 * 8)] +.endm + +.macro restore_guest_registers + /* + * Copy the guest x0 and x1 to the stack so we can restore them + * after loading the other registers. + */ + ldp x2, x3, [x0, #(TF_X + 0 * 8)] + stp x2, x3, [sp, #-(2 * 8)]! 
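save_guest_registers above stores x0-x29 at TF_X + n*8 and the link register at TF_LR; those constants come out of genassym.c (this change adds the TF_LR entry) and must stay in sync with struct trapframe. A small illustration of that offsetof() contract; the struct below is a simplified stand-in, not the real arm64 trapframe layout.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct sketch_trapframe {
	uint64_t tf_sp;
	uint64_t tf_lr;
	uint64_t tf_elr;
	uint64_t tf_spsr;
	uint64_t tf_esr;
	uint64_t tf_far;
	uint64_t tf_x[30];
};

#define	SK_TF_LR	offsetof(struct sketch_trapframe, tf_lr)
#define	SK_TF_X		offsetof(struct sketch_trapframe, tf_x)

int
main(void)
{
	/* "str lr, [x0, #(TF_LR)]" and "stp ..., [x0, #(TF_X + 2 * 8)]" */
	printf("TF_LR = %zu, TF_X + 2*8 = %zu\n", SK_TF_LR, SK_TF_X + 2 * 8);
	/* x[n] really does live 8*n bytes past TF_X. */
	static_assert(sizeof(uint64_t) == 8, "x registers are 64-bit");
	return (0);
}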
+ + ldr lr, [x0, #(TF_LR)] + + ldp x28, x29, [x0, #(TF_X + 28 * 8)] + ldp x26, x27, [x0, #(TF_X + 26 * 8)] + ldp x24, x25, [x0, #(TF_X + 24 * 8)] + ldp x22, x23, [x0, #(TF_X + 22 * 8)] + ldp x20, x21, [x0, #(TF_X + 20 * 8)] + ldp x18, x19, [x0, #(TF_X + 18 * 8)] + ldp x16, x17, [x0, #(TF_X + 16 * 8)] + ldp x14, x15, [x0, #(TF_X + 14 * 8)] + ldp x12, x13, [x0, #(TF_X + 12 * 8)] + ldp x10, x11, [x0, #(TF_X + 10 * 8)] + ldp x8, x9, [x0, #(TF_X + 8 * 8)] + ldp x6, x7, [x0, #(TF_X + 6 * 8)] + ldp x4, x5, [x0, #(TF_X + 4 * 8)] + ldp x2, x3, [x0, #(TF_X + 2 * 8)] + + ldp x0, x1, [sp], #(2 * 8) +.endm + +.macro vempty + .align 7 + 1: b 1b +.endm + +.macro vector name + .align 7 + b handle_\name +.endm + + .section ".vmm_vectors","ax" + .align 11 +hyp_init_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* Error EL2t */ + + vempty /* Synchronous EL2h */ + vempty /* IRQ EL2h */ + vempty /* FIQ EL2h */ + vempty /* Error EL2h */ + + vector hyp_init /* Synchronous 64-bit EL1 */ + vempty /* IRQ 64-bit EL1 */ + vempty /* FIQ 64-bit EL1 */ + vempty /* Error 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* Error 32-bit EL1 */ + + .text + .align 11 +hyp_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* Error EL2t */ + + vector el2_el2h_sync /* Synchronous EL2h */ + vector el2_el2h_irq /* IRQ EL2h */ + vector el2_el2h_fiq /* FIQ EL2h */ + vector el2_el2h_error /* Error EL2h */ + + vector el2_el1_sync64 /* Synchronous 64-bit EL1 */ + vector el2_el1_irq64 /* IRQ 64-bit EL1 */ + vector el2_el1_fiq64 /* FIQ 64-bit EL1 */ + vector el2_el1_error64 /* Error 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* Error 32-bit EL1 */ + +/* + * Initialize the hypervisor mode with a new exception vector table, translation + * table and stack. + * + * Expecting: + * x0 - translation tables physical address + * x1 - stack top virtual address + * x2 - TCR_EL2 value + * x3 - SCTLR_EL2 value + * x4 - VTCR_EL2 value + */ +LENTRY(handle_hyp_init) + /* Install the new exception vectors */ + adrp x6, hyp_vectors + add x6, x6, :lo12:hyp_vectors + msr vbar_el2, x6 + /* Set the stack top address */ + mov sp, x1 + /* Use the host VTTBR_EL2 to tell the host and the guests apart */ + mov x9, #VTTBR_HOST + msr vttbr_el2, x9 + /* Load the base address for the translation tables */ + msr ttbr0_el2, x0 + /* Invalidate the TLB */ + tlbi alle2 + /* Use the same memory attributes as EL1 */ + mrs x9, mair_el1 + msr mair_el2, x9 + /* Configure address translation */ + msr tcr_el2, x2 + isb + /* Set the system control register for EL2 */ + msr sctlr_el2, x3 + /* Set the Stage 2 translation control register */ + msr vtcr_el2, x4 + /* Return success */ + mov x0, #0 + /* MMU is up and running */ + ERET +LEND(handle_hyp_init) + +.macro do_world_switch_to_host + save_guest_registers + restore_host_registers + + /* Restore host VTTBR */ + mov x9, #VTTBR_HOST + msr vttbr_el2, x9 +.endm + + +.macro handle_el2_excp type + /* Save registers before modifying so we can restore them */ + str x9, [sp, #-16]! 
+ + /* Test if the exception happened when the host was running */ + mrs x9, vttbr_el2 + cmp x9, #VTTBR_HOST + beq 1f + + /* We got the exception while the guest was running */ + ldr x9, [sp], #16 + do_world_switch_to_host + mov x0, \type + ret + +1: + /* We got the exception while the host was running */ + ldr x9, [sp], #16 + mov x0, \type + ERET +.endm + + +LENTRY(handle_el2_el2h_sync) + handle_el2_excp #EXCP_TYPE_EL2_SYNC +LEND(handle_el2_el2h_sync) + +LENTRY(handle_el2_el2h_irq) + handle_el2_excp #EXCP_TYPE_EL2_IRQ +LEND(handle_el2_el2h_irq) + +LENTRY(handle_el2_el2h_fiq) + handle_el2_excp #EXCP_TYPE_EL2_FIQ +LEND(handle_el2_el2h_fiq) + +LENTRY(handle_el2_el2h_error) + handle_el2_excp #EXCP_TYPE_EL2_ERROR +LEND(handle_el2_el2h_error) + + +LENTRY(handle_el2_el1_sync64) + /* Save registers before modifying so we can restore them */ + str x9, [sp, #-16]! + + /* Check for host hypervisor call */ + mrs x9, vttbr_el2 + cmp x9, #VTTBR_HOST + ldr x9, [sp], #16 /* Restore the temp register */ + bne 1f + + /* + * Called from the host + */ + + /* Check if this is a cleanup call and handle in a controlled state */ + cmp x0, #(HYP_CLEANUP) + b.eq vmm_cleanup + + str lr, [sp, #-16]! + bl vmm_hyp_enter + ldr lr, [sp], #16 + ERET + +1: /* Guest exception taken to EL2 */ + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_SYNC + ret +LEND(handle_el2_el1_sync64) + +/* + * We only trap IRQ, FIQ and SError exceptions when a guest is running. Do a + * world switch to host to handle these exceptions. + */ + +LENTRY(handle_el2_el1_irq64) + do_world_switch_to_host + str x9, [sp, #-16]! + mrs x9, ich_misr_el2 + cmp x9, xzr + beq 1f + mov x0, #EXCP_TYPE_MAINT_IRQ + b 2f +1: + mov x0, #EXCP_TYPE_EL1_IRQ +2: + ldr x9, [sp], #16 + ret +LEND(handle_el2_el1_irq) + +LENTRY(handle_el2_el1_fiq64) + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_FIQ + ret +LEND(handle_el2_el1_fiq64) + +LENTRY(handle_el2_el1_error64) + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_ERROR + ret +LEND(handle_el2_el1_error64) + + +/* + * Usage: + * uint64_t vmm_enter_guest(struct hypctx *hypctx) + * + * Expecting: + * x0 - hypctx address + */ +ENTRY(vmm_enter_guest) + /* Save hypctx address */ + msr tpidr_el2, x0 + + save_host_registers + restore_guest_registers + + /* Enter guest */ + ERET +END(vmm_enter_guest) + +/* + * Usage: + * void vmm_cleanup(uint64_t handle, void *hyp_stub_vectors) + * + * Expecting: + * x1 - physical address of hyp_stub_vectors + */ +LENTRY(vmm_cleanup) + /* Restore the stub vectors */ + msr vbar_el2, x1 + + /* Disable the MMU */ + dsb sy + mrs x2, sctlr_el2 + bic x2, x2, #SCTLR_EL2_M + msr sctlr_el2, x2 + isb + + ERET +LEND(vmm_cleanup) diff --git a/sys/arm64/vmm/vmm_instruction_emul.c b/sys/arm64/vmm/vmm_instruction_emul.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_instruction_emul.c @@ -0,0 +1,99 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
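The EL2 handlers above all start from the same test: compare VTTBR_EL2 against VTTBR_HOST. A synchronous exception taken with the host vttbr installed is a hypercall and falls into vmm_hyp_enter() (or vmm_cleanup for HYP_CLEANUP); anything else means the guest was running, so do_world_switch_to_host runs and an EXCP_TYPE_* code is returned to vmm_hyp_call_guest(). For IRQ exits, a non-zero ICH_MISR_EL2 is reported as a GIC maintenance interrupt instead of a plain guest IRQ. (As an aside, the LEND() for handle_el2_el1_irq64 appears to drop the 64 suffix and so may not match its LENTRY().) A C model of just this classification, with illustrative constants standing in for the real ones:

#include <stdint.h>
#include <stdio.h>

#define SK_VTTBR_HOST	0		/* stands in for VTTBR_HOST */

enum sk_excp {
	SK_EXCP_EL1_SYNC,	/* guest synchronous exception: world switch */
	SK_EXCP_EL1_IRQ,	/* guest IRQ */
	SK_EXCP_MAINT_IRQ,	/* GIC maintenance interrupt (ICH_MISR != 0) */
	SK_EXCP_HOST_CALL	/* host hypercall, no world switch needed */
};

static enum sk_excp
classify_sync(uint64_t vttbr_el2)
{
	if (vttbr_el2 == SK_VTTBR_HOST)
		return (SK_EXCP_HOST_CALL);	/* falls into vmm_hyp_enter() */
	return (SK_EXCP_EL1_SYNC);		/* do_world_switch_to_host */
}

static enum sk_excp
classify_irq(uint64_t ich_misr_el2)
{
	/* A pending maintenance interrupt is reported separately. */
	return (ich_misr_el2 != 0 ? SK_EXCP_MAINT_IRQ : SK_EXCP_EL1_IRQ);
}

int
main(void)
{
	printf("%d %d %d\n", classify_sync(0), classify_sync(0x1234),
	    classify_irq(0));
	return (0);
}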
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef _KERNEL +#include +#include +#include +#include + +#include + +#include +#include +#else +#include +#include +#include + +#include + +#include +#include +#include +#include +#endif + +#include + +int +vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging __unused, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + uint64_t val; + int error; + + if (vie->dir == VM_DIR_READ) { + error = memread(vcpu, gpa, &val, vie->access_size, memarg); + if (error) + goto out; + error = vm_set_register(vcpu, vie->reg, val); + } else { + error = vm_get_register(vcpu, vie->reg, &val); + if (error) + goto out; + error = memwrite(vcpu, gpa, val, vie->access_size, memarg); + } + +out: + return (error); +} + +int +vmm_emulate_register(struct vcpu *vcpu, struct vre *vre, reg_read_t regread, + reg_write_t regwrite, void *regarg) +{ + uint64_t val; + int error; + + if (vre->dir == VM_DIR_READ) { + error = regread(vcpu, &val, regarg); + if (error) + goto out; + error = vm_set_register(vcpu, vre->reg, val); + } else { + error = vm_get_register(vcpu, vre->reg, &val); + if (error) + goto out; + error = regwrite(vcpu, val, regarg); + } + +out: + return (error); +} diff --git a/sys/arm64/vmm/vmm_ktr.h b/sys/arm64/vmm/vmm_ktr.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_ktr.h @@ -0,0 +1,69 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
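vmm_emulate_instruction() above defers the actual device access to memread/memwrite callbacks supplied by the caller and only moves the value between the callback and the trapped register. A sketch of what such a callback pair looks like against an imaginary one-register device; the signatures are simplified stand-ins for the real mem_region_read_t/mem_region_write_t types, which also carry the vcpu.

#include <stdint.h>
#include <stdio.h>

struct toy_device {
	uint64_t ctrl;			/* one emulated 8-byte register */
};

static int
toy_read(uint64_t gpa, uint64_t *rval, int size, void *arg)
{
	struct toy_device *dev = arg;

	(void)gpa; (void)size;
	*rval = dev->ctrl;
	return (0);
}

static int
toy_write(uint64_t gpa, uint64_t wval, int size, void *arg)
{
	struct toy_device *dev = arg;

	(void)gpa; (void)size;
	dev->ctrl = wval;
	return (0);
}

int
main(void)
{
	struct toy_device dev = { .ctrl = 0 };
	uint64_t val;

	/* What the emulation path does for a trapped store, then a load. */
	toy_write(0x09000000, 0xabcd, 8, &dev);
	toy_read(0x09000000, &val, 8, &dev);
	printf("ctrl = %#jx\n", (uintmax_t)val);
	return (0);
}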
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include +#include + +#ifndef KTR_VMM +#define KTR_VMM KTR_GEN +#endif + +#define VCPU_CTR0(vm, vcpuid, format) \ +CTR2(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid)) + +#define VCPU_CTR1(vm, vcpuid, format, p1) \ +CTR3(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1)) + +#define VCPU_CTR2(vm, vcpuid, format, p1, p2) \ +CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2)) + +#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2), (p3)) + +#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \ +CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3), (p4)) + +#define VM_CTR0(vm, format) \ +CTR1(KTR_VMM, "vm %s: " format, vm_name((vm))) + +#define VM_CTR1(vm, format, p1) \ +CTR2(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1)) + +#define VM_CTR2(vm, format, p1, p2) \ +CTR3(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2)) + +#define VM_CTR3(vm, format, p1, p2, p3) \ +CTR4(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3)) + +#define VM_CTR4(vm, format, p1, p2, p3, p4) \ +CTR5(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3), (p4)) +#endif diff --git a/sys/arm64/vmm/vmm_mmu.c b/sys/arm64/vmm/vmm_mmu.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_mmu.c @@ -0,0 +1,430 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2017 Alexandru Elisei + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mmu.h" +#include "arm64.h" + +static struct mtx vmmpmap_mtx; +static pt_entry_t *l0; +static vm_paddr_t l0_paddr; + +bool +vmmpmap_init(void) +{ + vm_page_t m; + + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (false); + + l0_paddr = VM_PAGE_TO_PHYS(m); + l0 = (pd_entry_t *)PHYS_TO_DMAP(l0_paddr); + + mtx_init(&vmmpmap_mtx, "vmm pmap", NULL, MTX_DEF); + + return (true); +} + +static void +vmmpmap_release_l3(pd_entry_t l2e) +{ + pt_entry_t *l3 __diagused; + vm_page_t m; + int i; + + l3 = (pd_entry_t *)PHYS_TO_DMAP(l2e & ~ATTR_MASK); + for (i = 0; i < Ln_ENTRIES; i++) { + KASSERT(l3[i] == 0, ("%s: l3 still mapped: %p %lx", __func__, + &l3[i], l3[i])); + } + + m = PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +static void +vmmpmap_release_l2(pd_entry_t l1e) +{ + pt_entry_t *l2; + vm_page_t m; + int i; + + l2 = (pd_entry_t *)PHYS_TO_DMAP(l1e & ~ATTR_MASK); + for (i = 0; i < Ln_ENTRIES; i++) { + if (l2[i] != 0) { + vmmpmap_release_l3(l2[i]); + } + } + + m = PHYS_TO_VM_PAGE(l1e & ~ATTR_MASK); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +static void +vmmpmap_release_l1(pd_entry_t l0e) +{ + pt_entry_t *l1; + vm_page_t m; + int i; + + l1 = (pd_entry_t *)PHYS_TO_DMAP(l0e & ~ATTR_MASK); + for (i = 0; i < Ln_ENTRIES; i++) { + if (l1[i] != 0) { + vmmpmap_release_l2(l1[i]); + } + } + + m = PHYS_TO_VM_PAGE(l0e & ~ATTR_MASK); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +void +vmmpmap_fini(void) +{ + vm_page_t m; + int i; + + /* Remove the remaining entries */ + for (i = 0; i < L0_ENTRIES; i++) { + if (l0[i] != 0) { + vmmpmap_release_l1(l0[i]); + } + } + + m = PHYS_TO_VM_PAGE(l0_paddr); + vm_page_unwire_noq(m); + vm_page_free(m); + + mtx_destroy(&vmmpmap_mtx); +} + +uint64_t +vmmpmap_to_ttbr0(void) +{ + + return (l0_paddr); +} + +/* Returns a pointer to the level 1 table, allocating if needed. 
*/ +static pt_entry_t * +vmmpmap_l1_table(vm_offset_t va) +{ + pt_entry_t new_l0e, l0e, *l1; + vm_page_t m; + int rv; + + m = NULL; +again: + l0e = atomic_load_64(&l0[pmap_l0_index(va)]); + if ((l0e & ATTR_DESCR_VALID) == 0) { + /* Allocate a page for the level 1 table */ + if (m == NULL) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (NULL); + } + + new_l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE; + + mtx_lock(&vmmpmap_mtx); + rv = atomic_cmpset_64(&l0[pmap_l0_index(va)], l0e, new_l0e); + mtx_unlock(&vmmpmap_mtx); + /* We may have raced another thread, try again */ + if (rv == 0) + goto again; + + /* The cmpset succeeded */ + l0e = new_l0e; + } else if (m != NULL) { + /* We allocated a page that wasn't used */ + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } + + l1 = (pd_entry_t *)PHYS_TO_DMAP(l0e & ~ATTR_MASK); + return (l1); +} + +static pt_entry_t * +vmmpmap_l2_table(vm_offset_t va) +{ + pt_entry_t new_l1e, l1e, *l1, *l2; + vm_page_t m; + int rv; + + l1 = vmmpmap_l1_table(va); + if (l1 == NULL) + return (NULL); + + m = NULL; +again: + l1e = atomic_load_64(&l1[pmap_l1_index(va)]); + if ((l1e & ATTR_DESCR_VALID) == 0) { + /* Allocate a page for the level 2 table */ + if (m == NULL) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (NULL); + } + + new_l1e = VM_PAGE_TO_PHYS(m) | L1_TABLE; + + mtx_lock(&vmmpmap_mtx); + rv = atomic_cmpset_64(&l1[pmap_l1_index(va)], l1e, new_l1e); + mtx_unlock(&vmmpmap_mtx); + /* We may have raced another thread, try again */ + if (rv == 0) + goto again; + + /* The cmpset succeeded */ + l1e = new_l1e; + } else if (m != NULL) { + /* We allocated a page that wasn't used */ + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } + + l2 = (pd_entry_t *)PHYS_TO_DMAP(l1e & ~ATTR_MASK); + return (l2); +} + +static pd_entry_t * +vmmpmap_l3_table(vm_offset_t va) +{ + pt_entry_t new_l2e, l2e, *l2, *l3; + vm_page_t m; + int rv; + + l2 = vmmpmap_l2_table(va); + if (l2 == NULL) + return (NULL); + + m = NULL; +again: + l2e = atomic_load_64(&l2[pmap_l2_index(va)]); + if ((l2e & ATTR_DESCR_VALID) == 0) { + /* Allocate a page for the level 3 table */ + if (m == NULL) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (NULL); + } + + new_l2e = VM_PAGE_TO_PHYS(m) | L2_TABLE; + + mtx_lock(&vmmpmap_mtx); + rv = atomic_cmpset_64(&l2[pmap_l2_index(va)], l2e, new_l2e); + mtx_unlock(&vmmpmap_mtx); + /* We may have raced another thread, try again */ + if (rv == 0) + goto again; + + /* The cmpset succeeded */ + l2e = new_l2e; + } else if (m != NULL) { + /* We allocated a page that wasn't used */ + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } + + l3 = (pt_entry_t *)PHYS_TO_DMAP(l2e & ~ATTR_MASK); + return (l3); +} + +/* + * Creates an EL2 entry in the hyp_pmap. Similar to pmap_kenter. + */ +bool +vmmpmap_enter(vm_offset_t va, vm_size_t size, vm_paddr_t pa, vm_prot_t prot) +{ + pd_entry_t l3e, *l3; + + KASSERT((pa & L3_OFFSET) == 0, + ("%s: Invalid physical address", __func__)); + KASSERT((va & L3_OFFSET) == 0, + ("%s: Invalid virtual address", __func__)); + KASSERT((size & PAGE_MASK) == 0, + ("%s: Mapping is not page-sized", __func__)); + + l3e = ATTR_DEFAULT | L3_PAGE; + /* This bit is res1 at EL2 */ + l3e |= ATTR_S1_AP(ATTR_S1_AP_USER); + /* Only normal memory is used at EL2 */ + l3e |= ATTR_S1_IDX(VM_MEMATTR_DEFAULT); + + if ((prot & VM_PROT_EXECUTE) == 0) { + /* PXN is res0 at EL2. 
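vmmpmap_l1_table() and the l2/l3 variants above allocate intermediate table pages on demand: load the parent entry, allocate a zeroed page if the entry is invalid, publish it with atomic_cmpset_64 and retry on failure, and free the page if another thread installed one first. A userspace model of that allocate/compare-and-swap/retry pattern using C11 atomics; malloc stands in for vm_page_alloc_noobj, and the kernel additionally holds vmmpmap_mtx around the cmpset.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SK_DESCR_VALID	1UL

static _Atomic uint64_t parent_entry;	/* stands in for l0[pmap_l0_index(va)] */

static void *
get_child_table(void)
{
	uint64_t expected, newe;
	void *page;

	page = NULL;
again:
	expected = atomic_load(&parent_entry);
	if ((expected & SK_DESCR_VALID) == 0) {
		if (page == NULL) {
			page = calloc(1, 4096);
			if (page == NULL)
				return (NULL);
		}
		newe = (uint64_t)(uintptr_t)page | SK_DESCR_VALID;
		if (!atomic_compare_exchange_strong(&parent_entry, &expected,
		    newe))
			goto again;	/* raced another thread, retry */
		expected = newe;
	} else if (page != NULL) {
		free(page);		/* allocated a page that wasn't used */
	}
	return ((void *)(uintptr_t)(expected & ~SK_DESCR_VALID));
}

int
main(void)
{
	printf("child table at %p\n", get_child_table());
	return (0);
}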
UXN is XN */ + l3e |= ATTR_S1_UXN; + } + if ((prot & VM_PROT_WRITE) == 0) { + l3e |= ATTR_S1_AP(ATTR_S1_AP_RO); + } + + while (size > 0) { + l3 = vmmpmap_l3_table(va); + if (l3 == NULL) + return (false); + +#ifdef INVARIANTS + /* + * Ensure no other threads can write to l3 between the KASSERT + * and store. + */ + mtx_lock(&vmmpmap_mtx); +#endif + KASSERT(atomic_load_64(&l3[pmap_l3_index(va)]) == 0, + ("%s: VA already mapped", __func__)); + + atomic_store_64(&l3[pmap_l3_index(va)], l3e | pa); +#ifdef INVARIANTS + mtx_unlock(&vmmpmap_mtx); +#endif + + size -= PAGE_SIZE; + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + + return (true); +} + +void +vmmpmap_remove(vm_offset_t va, vm_size_t size, bool invalidate) +{ + pt_entry_t l0e, *l1, l1e, *l2, l2e; + pd_entry_t *l3, l3e, **l3_list; + vm_offset_t eva, va_next, sva; + size_t i; + + KASSERT((va & L3_OFFSET) == 0, + ("%s: Invalid virtual address", __func__)); + KASSERT((size & PAGE_MASK) == 0, + ("%s: Mapping is not page-sized", __func__)); + + if (invalidate) { + l3_list = malloc((size / PAGE_SIZE) * sizeof(l3_list[0]), + M_TEMP, M_WAITOK | M_ZERO); + } + + sva = va; + eva = va + size; + mtx_lock(&vmmpmap_mtx); + for (i = 0; va < eva; va = va_next) { + l0e = atomic_load_64(&l0[pmap_l0_index(va)]); + if (l0e == 0) { + va_next = (va + L0_SIZE) & ~L0_OFFSET; + if (va_next < va) + va_next = eva; + continue; + } + MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE); + + l1 = (pd_entry_t *)PHYS_TO_DMAP(l0e & ~ATTR_MASK); + l1e = atomic_load_64(&l1[pmap_l1_index(va)]); + if (l1e == 0) { + va_next = (va + L1_SIZE) & ~L1_OFFSET; + if (va_next < va) + va_next = eva; + continue; + } + MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE); + + l2 = (pd_entry_t *)PHYS_TO_DMAP(l1e & ~ATTR_MASK); + l2e = atomic_load_64(&l2[pmap_l2_index(va)]); + if (l2e == 0) { + va_next = (va + L2_SIZE) & ~L2_OFFSET; + if (va_next < va) + va_next = eva; + continue; + } + MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE); + + l3 = (pd_entry_t *)PHYS_TO_DMAP(l2e & ~ATTR_MASK); + if (invalidate) { + l3e = atomic_load_64(&l3[pmap_l3_index(va)]); + MPASS(l3e != 0); + /* + * Mark memory as read-only so we can invalidate + * the cache. + */ + l3e &= ~ATTR_S1_AP_MASK; + l3e |= ATTR_S1_AP(ATTR_S1_AP_RO); + atomic_store_64(&l3[pmap_l3_index(va)], l3e); + + l3_list[i] = &l3[pmap_l3_index(va)]; + i++; + } else { + /* + * The caller is responsible for clearing the cache & + * handling the TLB + */ + atomic_store_64(&l3[pmap_l3_index(va)], 0); + } + + va_next = (va + L3_SIZE) & ~L3_OFFSET; + if (va_next < va) + va_next = eva; + } + mtx_unlock(&vmmpmap_mtx); + + if (invalidate) { + /* Invalidate the memory from the D-cache */ + vmm_call_hyp(HYP_DC_CIVAC, sva, size); + + for (i = 0; i < (size / PAGE_SIZE); i++) { + atomic_store_64(l3_list[i], 0); + } + + vmm_call_hyp(HYP_EL2_TLBI, HYP_EL2_TLBI_VA, sva, size); + + free(l3_list, M_TEMP); + } +} diff --git a/sys/arm64/vmm/vmm_reset.c b/sys/arm64/vmm/vmm_reset.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_reset.c @@ -0,0 +1,177 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2018 Alexandru Elisei + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "arm64.h" +#include "reset.h" + +/* + * Make the architecturally UNKNOWN value 0. As a bonus, we don't have to + * manually set all those RES0 fields. + */ +#define ARCH_UNKNOWN 0 +#define set_arch_unknown(reg) (memset(&(reg), ARCH_UNKNOWN, sizeof(reg))) + +void +reset_vm_el01_regs(void *vcpu) +{ + struct hypctx *el2ctx; + + el2ctx = vcpu; + + set_arch_unknown(el2ctx->tf); + + set_arch_unknown(el2ctx->actlr_el1); + set_arch_unknown(el2ctx->afsr0_el1); + set_arch_unknown(el2ctx->afsr1_el1); + set_arch_unknown(el2ctx->amair_el1); + set_arch_unknown(el2ctx->contextidr_el1); + set_arch_unknown(el2ctx->cpacr_el1); + set_arch_unknown(el2ctx->csselr_el1); + set_arch_unknown(el2ctx->elr_el1); + set_arch_unknown(el2ctx->esr_el1); + set_arch_unknown(el2ctx->far_el1); + set_arch_unknown(el2ctx->mair_el1); + set_arch_unknown(el2ctx->mdccint_el1); + set_arch_unknown(el2ctx->mdscr_el1); + set_arch_unknown(el2ctx->par_el1); + + /* + * Guest starts with: + * ~SCTLR_M: MMU off + * ~SCTLR_C: data cache off + * SCTLR_CP15BEN: memory barrier instruction enable from EL0; RAO/WI + * ~SCTLR_I: instruction cache off + */ + el2ctx->sctlr_el1 = SCTLR_RES1; + el2ctx->sctlr_el1 &= ~SCTLR_M & ~SCTLR_C & ~SCTLR_I; + el2ctx->sctlr_el1 |= SCTLR_CP15BEN; + + set_arch_unknown(el2ctx->sp_el0); + set_arch_unknown(el2ctx->tcr_el1); + set_arch_unknown(el2ctx->tpidr_el0); + set_arch_unknown(el2ctx->tpidr_el1); + set_arch_unknown(el2ctx->tpidrro_el0); + set_arch_unknown(el2ctx->ttbr0_el1); + set_arch_unknown(el2ctx->ttbr1_el1); + set_arch_unknown(el2ctx->vbar_el1); + set_arch_unknown(el2ctx->spsr_el1); + + set_arch_unknown(el2ctx->dbgbcr_el1); + set_arch_unknown(el2ctx->dbgbvr_el1); + set_arch_unknown(el2ctx->dbgwcr_el1); + set_arch_unknown(el2ctx->dbgwvr_el1); + + el2ctx->pmcr_el0 = READ_SPECIALREG(pmcr_el0) & PMCR_N_MASK; + /* PMCR_LC is unknown when AArch32 is supported or RES1 otherwise */ + el2ctx->pmcr_el0 |= PMCR_LC; + set_arch_unknown(el2ctx->pmccntr_el0); + set_arch_unknown(el2ctx->pmccfiltr_el0); + set_arch_unknown(el2ctx->pmcntenset_el0); + set_arch_unknown(el2ctx->pmintenset_el1); + set_arch_unknown(el2ctx->pmovsset_el0); + set_arch_unknown(el2ctx->pmuserenr_el0); + memset(el2ctx->pmevcntr_el0, 0, sizeof(el2ctx->pmevcntr_el0)); + memset(el2ctx->pmevtyper_el0, 0, sizeof(el2ctx->pmevtyper_el0)); +} + +void +reset_vm_el2_regs(void *vcpu) +{ + struct hypctx *el2ctx; + uint64_t cpu_aff, vcpuid; + + el2ctx = vcpu; + vcpuid = vcpu_vcpuid(el2ctx->vcpu); + + 
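reset_vm_el01_regs() above derives the guest's power-on SCTLR_EL1 from the RES1 mask with the MMU, data cache and instruction cache cleared and CP15BEN set. The same computation as a tiny standalone program; the M/C/I/CP15BEN bit positions are the architectural ones, but the RES1 constant here is only a placeholder for SCTLR_RES1.

#include <stdint.h>
#include <stdio.h>

#define SK_SCTLR_M	(1UL << 0)	/* MMU enable */
#define SK_SCTLR_C	(1UL << 2)	/* data cache enable */
#define SK_SCTLR_CP15BEN (1UL << 5)	/* CP15 barrier enable (RAO/WI) */
#define SK_SCTLR_I	(1UL << 12)	/* instruction cache enable */
#define SK_SCTLR_RES1	0x30d00800UL	/* illustrative RES1 mask only */

int
main(void)
{
	uint64_t sctlr;

	sctlr = SK_SCTLR_RES1;
	sctlr &= ~SK_SCTLR_M & ~SK_SCTLR_C & ~SK_SCTLR_I;
	sctlr |= SK_SCTLR_CP15BEN;
	printf("guest reset sctlr_el1 = %#jx\n", (uintmax_t)sctlr);
	return (0);
}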
/* + * Set the Hypervisor Configuration Register: + * + * HCR_RW: use AArch64 for EL1 + * HCR_TID3: handle ID registers in the vmm to privide a common + * set of featers on all vcpus + * HCR_TWI: Trap WFI to the hypervisor + * HCR_BSU_IS: barrier instructions apply to the inner shareable + * domain + * HCR_FB: broadcast maintenance operations + * HCR_AMO: route physical SError interrupts to EL2 + * HCR_IMO: route physical IRQ interrupts to EL2 + * HCR_FMO: route physical FIQ interrupts to EL2 + * HCR_SWIO: turn set/way invalidate into set/way clean and + * invalidate + * HCR_VM: use stage 2 translation + */ + el2ctx->hcr_el2 = HCR_RW | HCR_TID3 | HCR_TWI | HCR_BSU_IS | HCR_FB | + HCR_AMO | HCR_IMO | HCR_FMO | HCR_SWIO | HCR_VM; + + /* TODO: Trap all extensions we don't support */ + el2ctx->mdcr_el2 = 0; + /* PMCR_EL0.N is read from MDCR_EL2.HPMN */ + el2ctx->mdcr_el2 |= (el2ctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT; + + el2ctx->vmpidr_el2 = VMPIDR_EL2_RES1; + /* The guest will detect a multi-core, single-threaded CPU */ + el2ctx->vmpidr_el2 &= ~VMPIDR_EL2_U & ~VMPIDR_EL2_MT; + /* + * Generate the guest MPIDR value. We only support 16 CPUs at affinity + * level 0 to simplify the vgicv3 driver (see writing sgi1r_el1). + */ + cpu_aff = (vcpuid & 0xf) << MPIDR_AFF0_SHIFT | + ((vcpuid >> 4) & 0xff) << MPIDR_AFF1_SHIFT | + ((vcpuid >> 12) & 0xff) << MPIDR_AFF2_SHIFT | + ((vcpuid >> 20) & 0xff) << MPIDR_AFF3_SHIFT; + el2ctx->vmpidr_el2 |= cpu_aff; + + /* Use the same CPU identification information as the host */ + el2ctx->vpidr_el2 = CPU_IMPL_TO_MIDR(CPU_IMPL_ARM); + el2ctx->vpidr_el2 |= CPU_VAR_TO_MIDR(0); + el2ctx->vpidr_el2 |= CPU_ARCH_TO_MIDR(0xf); + el2ctx->vpidr_el2 |= CPU_PART_TO_MIDR(CPU_PART_FOUNDATION); + el2ctx->vpidr_el2 |= CPU_REV_TO_MIDR(0); + + /* + * Don't trap accesses to CPACR_EL1, trace, SVE, Advanced SIMD + * and floating point functionality to EL2. + */ + el2ctx->cptr_el2 = CPTR_RES1; + /* + * Disable interrupts in the guest. The guest OS will re-enable + * them. + */ + el2ctx->tf.tf_spsr = PSR_D | PSR_A | PSR_I | PSR_F; + /* Use the EL1 stack when taking exceptions to EL1 */ + el2ctx->tf.tf_spsr |= PSR_M_EL1h; +} diff --git a/sys/arm64/vmm/vmm_stat.h b/sys/arm64/vmm/vmm_stat.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_stat.h @@ -0,0 +1,145 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
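reset_vm_el2_regs() above packs the vcpu id into the guest MPIDR affinity fields, keeping 16 vcpus per Aff0 cluster to suit the vgicv3 SGI register format. The same arithmetic as a standalone function; the shift values are the architectural MPIDR affinity field offsets.

#include <stdint.h>
#include <stdio.h>

#define SK_MPIDR_AFF0_SHIFT	0
#define SK_MPIDR_AFF1_SHIFT	8
#define SK_MPIDR_AFF2_SHIFT	16
#define SK_MPIDR_AFF3_SHIFT	32

static uint64_t
vcpuid_to_affinity(uint64_t vcpuid)
{
	return ((vcpuid & 0xf) << SK_MPIDR_AFF0_SHIFT |
	    ((vcpuid >> 4) & 0xff) << SK_MPIDR_AFF1_SHIFT |
	    ((vcpuid >> 12) & 0xff) << SK_MPIDR_AFF2_SHIFT |
	    ((vcpuid >> 20) & 0xff) << SK_MPIDR_AFF3_SHIFT);
}

int
main(void)
{
	/* vcpu 21 -> Aff1 1, Aff0 5; vcpu 4096 -> Aff2 1, Aff1 0, Aff0 0. */
	printf("%#jx %#jx\n", (uintmax_t)vcpuid_to_affinity(21),
	    (uintmax_t)vcpuid_to_affinity(4096));
	return (0);
}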
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ + +enum vmm_stat_scope { + VMM_STAT_SCOPE_ANY, +}; + +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vcpu *vcpu, + struct vmm_stat_type *stat); + +struct vmm_stat_type { + int index; /* position in the stats buffer */ + int nelems; /* standalone or array */ + const char *desc; /* description of statistic */ + vmm_stat_func_t func; + enum vmm_stat_scope scope; +}; + +void vmm_stat_register(void *arg); + +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ + struct vmm_stat_type type[1] = { \ + { -1, nelems, desc, func, scope } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) + +#define VMM_STAT_DECLARE(type) \ + extern struct vmm_stat_type type[1] + +#define VMM_STAT(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) + +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + +#define VMM_STAT_ARRAY(type, nelems, desc) \ + VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) + +void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); +void vmm_stat_free(void *vp); + +int vmm_stat_copy(struct vcpu *vcpu, int index, int count, + int *num_stats, uint64_t *buf); +int vmm_stat_desc_copy(int index, char *buf, int buflen); + +static void __inline +vmm_stat_array_incr(struct vcpu *vcpu, struct vmm_stat_type *vst, int statidx, + uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] += x; +#endif +} + +static void __inline +vmm_stat_array_set(struct vcpu *vcpu, struct vmm_stat_type *vst, int statidx, + uint64_t val) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] = val; +#endif +} + +static void __inline +vmm_stat_incr(struct vcpu *vcpu, struct vmm_stat_type *vst, uint64_t x) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_incr(vcpu, vst, 0, x); +#endif +} + +static void __inline +vmm_stat_set(struct vcpu *vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_set(vcpu, vst, 0, val); +#endif +} + +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_WFI); +VMM_STAT_DECLARE(VMEXIT_WFE); +VMM_STAT_DECLARE(VMEXIT_HVC); +VMM_STAT_DECLARE(VMEXIT_MSR); +VMM_STAT_DECLARE(VMEXIT_DATA_ABORT); +VMM_STAT_DECLARE(VMEXIT_INSN_ABORT); +VMM_STAT_DECLARE(VMEXIT_UNHANDLED_SYNC); +VMM_STAT_DECLARE(VMEXIT_IRQ); +VMM_STAT_DECLARE(VMEXIT_FIQ); +VMM_STAT_DECLARE(VMEXIT_UNHANDLED_EL2); +VMM_STAT_DECLARE(VMEXIT_UNHANDLED); +#endif diff --git a/sys/arm64/vmm/vmm_stat.c b/sys/arm64/vmm/vmm_stat.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_stat.c @@ -0,0 
+1,165 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include "vmm_stat.h" + +/* + * 'vst_num_elems' is the total number of addressable statistic elements + * 'vst_num_types' is the number of unique statistic types + * + * It is always true that 'vst_num_elems' is greater than or equal to + * 'vst_num_types'. This is because a stat type may represent more than + * one element (for e.g. VMM_STAT_ARRAY). 
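The stats machinery registered above hands each vmm_stat_type a slice of a flat per-vcpu uint64_t array: vmm_stat_register() assigns the next free index at SYSINIT time, and the inline helpers in vmm_stat.h then bump stats[index + statidx]. A compact userspace model of that registration and increment path, with SYSINIT ordering and the per-vcpu storage replaced by explicit calls and a static buffer.

#include <stdint.h>
#include <stdio.h>

#define SK_MAX_ELEMS	64

struct sk_stat_type {
	int index;		/* assigned at registration, -1 before */
	int nelems;
	const char *desc;
};

static uint64_t sk_stats[SK_MAX_ELEMS];
static int sk_num_elems;

static void
sk_stat_register(struct sk_stat_type *st)
{
	if (sk_num_elems + st->nelems >= SK_MAX_ELEMS)
		return;			/* table full, stat stays disabled */
	st->index = sk_num_elems;
	sk_num_elems += st->nelems;
}

static void
sk_stat_incr(struct sk_stat_type *st, int statidx, uint64_t x)
{
	if (st->index >= 0 && statidx < st->nelems)
		sk_stats[st->index + statidx] += x;
}

int
main(void)
{
	struct sk_stat_type exits = { -1, 1, "total number of vm exits" };
	struct sk_stat_type irqs = { -1, 16, "per-irq exit counts" };

	sk_stat_register(&exits);
	sk_stat_register(&irqs);
	sk_stat_incr(&exits, 0, 1);
	sk_stat_incr(&irqs, 3, 1);
	printf("%s: %ju\n", exits.desc, (uintmax_t)sk_stats[exits.index]);
	return (0);
}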
+ */ +static int vst_num_elems, vst_num_types; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t)) + +void +vmm_stat_register(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { + printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vst_num_elems; + vst_num_elems += vst->nelems; + + vsttab[vst_num_types++] = vst; +} + +int +vmm_stat_copy(struct vcpu *vcpu, int index, int count, int *num_stats, + uint64_t *buf) +{ + struct vmm_stat_type *vst; + uint64_t *stats; + int i, tocopy; + + if (index < 0 || count < 0) + return (EINVAL); + + if (index > vst_num_elems) + return (ENOENT); + + if (index == vst_num_elems) { + *num_stats = 0; + return (0); + } + + tocopy = min(vst_num_elems - index, count); + + /* Let stats functions update their counters */ + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (vst->func != NULL) + (*vst->func)(vcpu, vst); + } + + /* Copy over the stats */ + stats = vcpu_stats(vcpu); + memcpy(buf, stats + index, tocopy * sizeof(stats[0])); + *num_stats = tocopy; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + + return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); +} + +void +vmm_stat_init(void *vp) +{ + + bzero(vp, vst_size); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +int +vmm_stat_desc_copy(int index, char *buf, int bufsize) +{ + int i; + struct vmm_stat_type *vst; + + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (index >= vst->index && index < vst->index + vst->nelems) { + if (vst->nelems > 1) { + snprintf(buf, bufsize, "%s[%d]", + vst->desc, index - vst->index); + } else { + strlcpy(buf, vst->desc, bufsize); + } + return (0); /* found it */ + } + } + + return (EINVAL); +} + +/* global statistics */ +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception"); +VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted"); +VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted"); +VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted"); +VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted"); +VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a data abort"); +VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort"); +VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception"); +VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq"); +VMM_STAT(VMEXIT_FIQ, "number of vmexits for an interrupt"); +VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception"); +VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception"); diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -116,6 +116,39 @@ dev/iommu/busdma_iommu.c optional iommu dev/iommu/iommu_gas.c optional iommu +arm64/vmm/vmm.c optional vmm +arm64/vmm/vmm_dev.c optional vmm +arm64/vmm/vmm_instruction_emul.c optional vmm +arm64/vmm/vmm_stat.c optional vmm +arm64/vmm/vmm_arm64.c optional vmm +arm64/vmm/vmm_reset.c optional vmm +arm64/vmm/vmm_call.S optional vmm +arm64/vmm/vmm_hyp_exception.S optional vmm \ + compile-with "${NORMAL_C:N-fsanitize*:N-mbranch-protection*} -fpie" \ + no-obj +arm64/vmm/vmm_hyp.c 
+	compile-with "${NORMAL_C:N-fsanitize*:N-mbranch-protection*} -fpie" \
+	no-obj
+vmm_hyp_blob.elf.full		optional vmm \
+	dependency "vmm_hyp.o vmm_hyp_exception.o" \
+	compile-with "${SYSTEM_LD_BASECMD} -o ${.TARGET} ${.ALLSRC} --defsym=text_start='0x0'" \
+	no-obj no-implicit-rule
+vmm_hyp_blob.elf		optional vmm \
+	dependency "vmm_hyp_blob.elf.full" \
+	compile-with "${OBJCOPY} --strip-debug ${.ALLSRC} ${.TARGET}" \
+	no-obj no-implicit-rule
+vmm_hyp_blob.bin		optional vmm \
+	dependency vmm_hyp_blob.elf \
+	compile-with "${OBJCOPY} --output-target=binary ${.ALLSRC} ${.TARGET}" \
+	no-obj no-implicit-rule
+arm64/vmm/vmm_hyp_el2.S		optional vmm \
+	dependency vmm_hyp_blob.bin
+arm64/vmm/vmm_mmu.c		optional vmm
+arm64/vmm/io/vgic.c		optional vmm
+arm64/vmm/io/vgic_v3.c		optional vmm
+arm64/vmm/io/vgic_if.m		optional vmm
+arm64/vmm/io/vtimer.c		optional vmm
+
 crypto/armv8/armv8_crypto.c	optional armv8crypto
 armv8_crypto_wrap.o		optional armv8crypto \
 	dependency "$S/crypto/armv8/armv8_crypto_wrap.c" \
diff --git a/sys/conf/ldscript.arm64 b/sys/conf/ldscript.arm64
--- a/sys/conf/ldscript.arm64
+++ b/sys/conf/ldscript.arm64
@@ -6,6 +6,7 @@
 {
   /* Read-only sections, merged into text segment: */
   . = text_start; /* This is set using --defsym= on the command line. */
+  .vmm_vectors : { *(.vmm_vectors) }
   .text :
   {
     *(.text)
@@ -16,6 +17,7 @@
   } =0x9090
   _etext = .;
   PROVIDE (etext = .);
+  .fini : { *(.fini) } =0x9090
   .rodata : { *(.rodata*) *(.gnu.linkonce.r*) }
   .rodata1 : { *(.rodata1) }
diff --git a/sys/conf/options.arm64 b/sys/conf/options.arm64
--- a/sys/conf/options.arm64
+++ b/sys/conf/options.arm64
@@ -19,6 +19,9 @@
 # EFI Runtime services support
 EFIRT			opt_efirt.h
 
+# Bhyve
+VMM			opt_global.h
+
 # SoC Support
 SOC_ALLWINNER_A64	opt_soc.h
 SOC_ALLWINNER_H5	opt_soc.h
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -833,7 +833,9 @@
 _sgx_linux=	sgx_linux
 _smartpqi=	smartpqi
 _p2sb=		p2sb
+.endif
+
+.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64"
 .if ${MK_BHYVE} != "no" || defined(ALL_MODULES)
 .if ${KERN_OPTS:MSMP}
 _vmm=		vmm
diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile
--- a/sys/modules/vmm/Makefile
+++ b/sys/modules/vmm/Makefile
@@ -3,31 +3,79 @@
 KMOD=	vmm
 
-SRCS=	opt_acpi.h opt_bhyve_snapshot.h opt_ddb.h
-SRCS+=	device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h vnode_if.h
-DPSRCS+=	vmx_assym.h svm_assym.h
-DPSRCS+=	vmx_genassym.c svm_genassym.c offset.inc
+SRCS=	opt_acpi.h opt_ddb.h device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h
 
 CFLAGS+= -DVMM_KEEP_STATS
-CFLAGS+= -I${SRCTOP}/sys/amd64/vmm
-CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/io
-CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel
-CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd
+CFLAGS+= -I${SRCTOP}/sys/${MACHINE}/vmm
+CFLAGS+= -I${SRCTOP}/sys/${MACHINE}/vmm/io
 
 # generic vmm support
-.PATH: ${SRCTOP}/sys/amd64/vmm
+.PATH: ${SRCTOP}/sys/${MACHINE}/vmm
 SRCS+=	vmm.c \
 	vmm_dev.c \
-	vmm_host.c \
 	vmm_instruction_emul.c \
+	vmm_stat.c
+
+.if ${MACHINE_CPUARCH} == "aarch64"
+DPSRCS+=	assym.inc
+
+# TODO: Add the new EL2 code
+SRCS+=	vmm_arm64.c \
+	vmm_reset.c \
+	vmm_call.S \
+	vmm_mmu.c \
+	vmm_hyp_el2.S
+
+.PATH: ${SRCTOP}/sys/${MACHINE}/vmm/io
+SRCS+=	vgic.c \
+	vgic_if.h \
+	vgic_if.c \
+	vgic_v3.c \
+	vtimer.c
+
+SRCS+=	vmm_hyp_exception.S vmm_hyp.c
+
+CLEANFILES+=	vmm_hyp_blob.elf.full
+CLEANFILES+=	vmm_hyp_blob.elf vmm_hyp_blob.bin
+
+vmm_hyp_exception.o: vmm_hyp_exception.S
+	${CC} -c -x assembler-with-cpp -DLOCORE \
+	    ${CFLAGS:N-fsanitize*:N-mbranch-protection*}
\ + ${.IMPSRC} -o ${.TARGET} -fpie + +vmm_hyp.o: vmm_hyp.c + ${CC} -c ${CFLAGS:N-fsanitize*:N-mbranch-protection*} \ + ${.IMPSRC} -o ${.TARGET} -fpie + +vmm_hyp_blob.elf.full: vmm_hyp_exception.o vmm_hyp.o + ${LD} -m ${LD_EMULATION} -Bdynamic -T ${SYSDIR}/conf/ldscript.arm64 \ + ${_LDFLAGS} --no-warn-mismatch --warn-common --export-dynamic \ + --dynamic-linker /red/herring -X -o ${.TARGET} ${.ALLSRC} \ + --defsym=text_start='0x0' + +vmm_hyp_blob.elf: vmm_hyp_blob.elf.full + ${OBJCOPY} --strip-debug ${.ALLSRC} ${.TARGET} + +vmm_hyp_blob.bin: vmm_hyp_blob.elf + ${OBJCOPY} --output-target=binary ${.ALLSRC} ${.TARGET} + +vmm_hyp_el2.o: vmm_hyp_blob.bin + +.elif ${MACHINE_CPUARCH} == "amd64" +DPSRCS+= vmx_assym.h svm_assym.h +DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc + +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd + +SRCS+= vmm_host.c \ vmm_ioport.c \ vmm_lapic.c \ vmm_mem.c \ - vmm_stat.c \ vmm_util.c \ x86.c -.PATH: ${SRCTOP}/sys/amd64/vmm/io +.PATH: ${SRCTOP}/sys/${MACHINE}/vmm/io SRCS+= iommu.c \ ppt.c \ vatpic.c \ @@ -62,10 +110,11 @@ SRCS.BHYVE_SNAPSHOT= vmm_snapshot.c -CLEANFILES= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o +CLEANFILES+= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h OBJS_DEPEND_GUESS.svm_support.o+= svm_assym.h +.endif vmx_assym.h: vmx_genassym.o sh ${SYSDIR}/kern/genassym.sh vmx_genassym.o > ${.TARGET} @@ -81,6 +130,9 @@ ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} +hyp_genassym.o: offset.inc + ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} + vmx_genassym.o: offset.inc ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC}
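A quick sketch (not taken from this patch) of how the stat plumbing above is meant to be driven on arm64, for reviewers who have not worked with the amd64 code: counters declared with VMM_STAT() register themselves through vmm_stat_register() at module load, exit handlers bump them per vcpu, and the ioctl path drains them with vmm_stat_copy() and vmm_stat_desc_copy(). The fragment assumes the arm64 vmm_stat.h keeps the amd64-style vmm_stat_incr() helper and MAX_VMM_STAT_ELEMS constant; both function names below are hypothetical and exist only to illustrate the API.

static int
handle_wfi_trap(struct vcpu *vcpu)		/* hypothetical exit handler */
{
	/*
	 * VMEXIT_WFI was given an element in the per-vcpu stats array by
	 * the VMM_STAT() registration above; vmm_stat_incr() adds to it.
	 */
	vmm_stat_incr(vcpu, VMEXIT_WFI, 1);
	return (0);
}

static void
dump_vcpu_stats(struct vcpu *vcpu)		/* hypothetical consumer */
{
	uint64_t buf[MAX_VMM_STAT_ELEMS];
	char desc[128];
	int i, num;

	/* Mirrors what the VM_STATS/VM_STAT_DESC ioctls do on amd64. */
	if (vmm_stat_copy(vcpu, 0, MAX_VMM_STAT_ELEMS, &num, buf) != 0)
		return;
	for (i = 0; i < num; i++) {
		if (vmm_stat_desc_copy(i, desc, sizeof(desc)) == 0)
			printf("%s: %ju\n", desc, (uintmax_t)buf[i]);
	}
}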