diff --git a/sys/arm64/arm64/genassym.c b/sys/arm64/arm64/genassym.c --- a/sys/arm64/arm64/genassym.c +++ b/sys/arm64/arm64/genassym.c @@ -73,6 +73,7 @@ ASSYM(TF_SIZE, sizeof(struct trapframe)); ASSYM(TF_SP, offsetof(struct trapframe, tf_sp)); +ASSYM(TF_LR, offsetof(struct trapframe, tf_lr)); ASSYM(TF_ELR, offsetof(struct trapframe, tf_elr)); ASSYM(TF_SPSR, offsetof(struct trapframe, tf_spsr)); ASSYM(TF_X, offsetof(struct trapframe, tf_x)); diff --git a/sys/arm64/arm64/identcpu.c b/sys/arm64/arm64/identcpu.c --- a/sys/arm64/arm64/identcpu.c +++ b/sys/arm64/arm64/identcpu.c @@ -104,8 +104,6 @@ SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, sizeof(cpu_model), "Machine model"); -#define MAX_CACHES 8 /* Maximum number of caches supported - architecturally. */ /* * Per-CPU affinity as provided in MPIDR_EL1 * Indexed by CPU number in logical order selected by the system. @@ -119,32 +117,6 @@ uint64_t __cpu_affinity[MAXCPU]; static u_int cpu_aff_levels; -struct cpu_desc { - uint64_t mpidr; - uint64_t id_aa64afr0; - uint64_t id_aa64afr1; - uint64_t id_aa64dfr0; - uint64_t id_aa64dfr1; - uint64_t id_aa64isar0; - uint64_t id_aa64isar1; - uint64_t id_aa64isar2; - uint64_t id_aa64mmfr0; - uint64_t id_aa64mmfr1; - uint64_t id_aa64mmfr2; - uint64_t id_aa64pfr0; - uint64_t id_aa64pfr1; - uint64_t id_aa64zfr0; - uint64_t ctr; -#ifdef COMPAT_FREEBSD32 - uint64_t id_isar5; - uint64_t mvfr0; - uint64_t mvfr1; -#endif - uint64_t clidr; - uint32_t ccsidr[MAX_CACHES][2]; /* 2 possible types. */ - bool have_sve; -}; - static struct cpu_desc cpu_desc[MAXCPU]; static struct cpu_desc kern_cpu_desc; static struct cpu_desc user_cpu_desc; @@ -1824,6 +1796,27 @@ } } +void +update_cpu_desc(struct cpu_desc *desc) +{ + struct mrs_field *fields; + uint64_t desc_val, kern_val; + int i, j; + + for (i = 0; i < nitems(user_regs); i++) { + kern_val = CPU_DESC_FIELD(kern_cpu_desc, i); + desc_val = CPU_DESC_FIELD(*desc, i); + + fields = user_regs[i].fields; + for (j = 0; fields[j].type != 0; j++) { + desc_val = update_lower_register(desc_val, kern_val, + fields[j].shift, 4, fields[j].sign); + } + + CPU_DESC_FIELD(*desc, i) = desc_val; + } +} + /* HWCAP */ bool __read_frequently lse_supported = false; diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h --- a/sys/arm64/include/armreg.h +++ b/sys/arm64/include/armreg.h @@ -503,6 +503,14 @@ #define ID_AA64DFR0_TraceFilt_NONE (UL(0x0) << ID_AA64DFR0_TraceFilt_SHIFT) #define ID_AA64DFR0_TraceFilt_8_4 (UL(0x1) << ID_AA64DFR0_TraceFilt_SHIFT) +/* ID_AA64DFR1_EL1 */ +#define ID_AA64DFR1_EL1 MRS_REG(ID_AA64DFR0_EL1) +#define ID_AA64DFR1_EL1_op0 3 +#define ID_AA64DFR1_EL1_op1 0 +#define ID_AA64DFR1_EL1_CRn 0 +#define ID_AA64DFR1_EL1_CRm 5 +#define ID_AA64DFR1_EL1_op2 1 + /* ID_AA64ISAR0_EL1 */ #define ID_AA64ISAR0_EL1 MRS_REG(ID_AA64ISAR0_EL1) #define ID_AA64ISAR0_EL1_op0 0x3 diff --git a/sys/arm64/include/cpu.h b/sys/arm64/include/cpu.h --- a/sys/arm64/include/cpu.h +++ b/sys/arm64/include/cpu.h @@ -178,6 +178,36 @@ #define CPU_MATCH_ERRATA_CAVIUM_THUNDERX_1_1 0 #endif +#define MAX_CACHES 8 /* Maximum number of caches supported + architecturally. 
*/ + +struct cpu_desc { + uint64_t mpidr; + uint64_t id_aa64afr0; + uint64_t id_aa64afr1; + uint64_t id_aa64dfr0; + uint64_t id_aa64dfr1; + uint64_t id_aa64isar0; + uint64_t id_aa64isar1; + uint64_t id_aa64isar2; + uint64_t id_aa64mmfr0; + uint64_t id_aa64mmfr1; + uint64_t id_aa64mmfr2; + uint64_t id_aa64pfr0; + uint64_t id_aa64pfr1; + uint64_t id_aa64zfr0; + uint64_t ctr; +#ifdef COMPAT_FREEBSD32 + uint64_t id_isar5; + uint64_t mvfr0; + uint64_t mvfr1; +#endif + uint64_t clidr; + uint32_t ccsidr[MAX_CACHES][2]; /* 2 possible types. */ + bool have_sve; +}; + + extern char btext[]; extern char etext[]; @@ -217,6 +247,7 @@ /* Functions to read the sanitised view of the special registers */ void update_special_regs(u_int); +void update_cpu_desc(struct cpu_desc *desc); bool extract_user_id_field(u_int, u_int, uint8_t *); bool get_kernel_reg(u_int, uint64_t *); diff --git a/sys/arm64/include/pcpu.h b/sys/arm64/include/pcpu.h --- a/sys/arm64/include/pcpu.h +++ b/sys/arm64/include/pcpu.h @@ -47,6 +47,7 @@ pcpu_ssbd pc_ssbd; \ struct pmap *pc_curpmap; \ struct pmap *pc_curvmpmap; \ + void *pc_vcpu; \ u_int pc_bcast_tlbi_workaround; \ /* Store as two u_int values to preserve KBI */ \ u_int pc_mpidr_low; /* lower MPIDR 32 bits */ \ diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h new file mode 100644 --- /dev/null +++ b/sys/arm64/include/vmm.h @@ -0,0 +1,443 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#include +#include +#include + +#include "pte.h" +#include "pmap.h" + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. 
+ */ +enum vm_reg_name { + VM_REG_GUEST_X0, + VM_REG_GUEST_X1, + VM_REG_GUEST_X2, + VM_REG_GUEST_X3, + VM_REG_GUEST_X4, + VM_REG_GUEST_X5, + VM_REG_GUEST_X6, + VM_REG_GUEST_X7, + VM_REG_GUEST_X8, + VM_REG_GUEST_X9, + VM_REG_GUEST_X10, + VM_REG_GUEST_X11, + VM_REG_GUEST_X12, + VM_REG_GUEST_X13, + VM_REG_GUEST_X14, + VM_REG_GUEST_X15, + VM_REG_GUEST_X16, + VM_REG_GUEST_X17, + VM_REG_GUEST_X18, + VM_REG_GUEST_X19, + VM_REG_GUEST_X20, + VM_REG_GUEST_X21, + VM_REG_GUEST_X22, + VM_REG_GUEST_X23, + VM_REG_GUEST_X24, + VM_REG_GUEST_X25, + VM_REG_GUEST_X26, + VM_REG_GUEST_X27, + VM_REG_GUEST_X28, + VM_REG_GUEST_X29, + VM_REG_GUEST_LR, + VM_REG_GUEST_SP, + VM_REG_GUEST_ELR, + VM_REG_GUEST_SPSR, + VM_REG_ELR_EL2, + VM_REG_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + +#define VM_MAX_SUFFIXLEN 15 + +#define VM_GUEST_BASE_IPA 0x80000000UL /* Guest kernel start ipa */ + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_exception; +struct vm_exit; +struct vm_run; +struct vm_object; +struct pmap; + +struct vm_eventinfo { + void *rptr; /* rendezvous cookie */ + int *sptr; /* suspend cookie */ + int *iptr; /* reqidle cookie */ +}; + +typedef int (*vmm_init_func_t)(int ipinum); +typedef int (*vmm_cleanup_func_t)(void); +typedef void (*vmm_resume_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct pmap *pmap, struct vm_eventinfo *evinfo); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef void (*vmi_mmap_set_func_t)(void *arg, vm_offset_t va, + vm_offset_t pa, size_t len, + vm_prot_t prot); +typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *arg, vm_offset_t va); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); +typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); +typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); + +struct vmm_ops { + /* Module-wide functions */ + vmm_init_func_t init; + vmm_cleanup_func_t cleanup; + vmm_resume_func_t resume; + /* VM specific functions */ + vmi_init_func_t vminit; + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; + vmi_vmspace_alloc vmspace_alloc; + vmi_vmspace_free vmspace_free; +}; + +extern struct vmm_ops vmm_ops_arm; + +int vm_create(const char *name, struct vm **retvm); +void vm_destroy(struct vm *vm); +int vm_reinit(struct vm *vm); +const char *vm_name(struct vm *vm); + +/* + * APIs that modify the guest memory map require all vcpus to be frozen. 
+ */ +int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot, int flags); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +void vm_free_memseg(struct vm *vm, int ident); +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +int vmm_map_gpa(struct vm *vm, vm_offset_t va, vm_paddr_t gpa, int pages, + vm_page_t *ma); +void vmm_unmap_gpa(struct vm *vm, vm_offset_t va, size_t pages, vm_page_t *ma); + +/* + * APIs that inspect the guest memory map require only a *single* vcpu to + * be frozen. This acts like a read lock on the guest memory map since any + * modification requires *all* vcpus to be frozen. + */ +int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + struct vm_object **objptr); +vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); +void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, + int prot, void **cookie); +void vm_gpa_release(void *cookie); +bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); + +uint16_t vm_get_maxcpus(struct vm *vm); +void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); +int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); +void* vm_get_cookie(struct vm *vm); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_activate_cpu(struct vm *vm, int vcpu); +int vm_suspend_cpu(struct vm *vm, int vcpu); +int vm_resume_cpu(struct vm *vm, int vcpu); +int vm_attach_vgic(struct vm *vm, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size); +int vm_assert_irq(struct vm *vm, uint32_t irq); +int vm_deassert_irq(struct vm *vm, uint32_t irq); +int vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, + int func); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); + +#ifdef _SYS__CPUSET_H_ +/* + * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. + * The rendezvous 'func(arg)' is not allowed to do anything that will + * cause the thread to be put to sleep. + * + * If the rendezvous is being initiated from a vcpu context then the + * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. + * + * The caller cannot hold any locks when initiating the rendezvous. + * + * The implementation of this API may cause vcpus other than those specified + * by 'dest' to be stalled. The caller should not rely on any vcpus making + * forward progress when the rendezvous is in progress. 
+ */ +typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); +void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, + vm_rendezvous_func_t func, void *arg); +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_debug_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ + +static __inline bool +virt_enabled() +{ + + return (has_hyp()); +} + +static __inline int +vcpu_rendezvous_pending(struct vm_eventinfo *info) +{ + + return (*((uintptr_t *)(info->rptr)) != 0); +} + +static __inline int +vcpu_suspended(struct vm_eventinfo *info) +{ + + return (*info->sptr); +} + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, + bool from_idle); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +#ifdef _SYS_PROC_H_ +static int __inline +vcpu_should_yield(struct vm *vm, int vcpu) +{ + struct thread *td; + + td = curthread; + return (td->td_ast != 0 || td->td_owepreempt != 0); +} +#endif + +void *vcpu_stats(struct vm *vm, int vcpu); +void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); + +/* + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. + * + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); + +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. 
+ */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +int vcpu_trace_exceptions(struct vm *vm, int vcpuid); + +#endif /* _KERNEL */ + +#define VM_MAXCPU 4 + +#define VM_DIR_READ 0 +#define VM_DIR_WRITE 1 + +#define VM_GP_M_MASK 0x1f +#define VM_GP_MMU_ENABLED (1 << 5) + +struct vm_guest_paging { + uint64_t far; + uint64_t ttbr0_el1; + uint64_t ttbr1_el1; + int flags; + int padding; +}; + +struct vie { + uint8_t access_size:4, sign_extend:1, dir:1, unused:2; + enum vm_reg_name reg; +}; + +struct vre { + uint32_t inst_syndrome; + uint8_t dir:1, unused:7; + enum vm_reg_name reg; +}; + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_MAX +}; + +enum vm_exitcode { + VM_EXITCODE_BOGUS, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_REG_EMUL, + VM_EXITCODE_HVC, + VM_EXITCODE_SUSPENDED, + VM_EXITCODE_HYP, + VM_EXITCODE_WFI, + VM_EXITCODE_PAGING, + VM_EXITCODE_SMCCC, + VM_EXITCODE_MAX +}; + +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; + uint64_t pc; + union { + /* + * ARM specific payload. + */ + struct { + uint32_t exception_nr; + uint32_t esr_el2; /* Exception Syndrome Register */ + uint64_t far_el2; /* Fault Address Register */ + uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ + } hyp; + struct { + struct vre vre; + } reg_emul; + struct { + uint64_t gpa; + uint64_t esr; + } paging; + struct { + uint64_t gpa; + struct vm_guest_paging paging; + struct vie vie; + } inst_emul; + + /* + * A SMCCC call, e.g. starting a core via PSCI. + * Further arguments can be read by asking the kernel for + * all register values. + */ + struct { + uint64_t func_id; + uint64_t args[3]; + } smccc_call; + + struct { + enum vm_suspend_how how; + } suspended; + } u; +}; + +#endif /* _VMM_H_ */ diff --git a/sys/arm64/include/vmm_dev.h b/sys/arm64/include/vmm_dev.h new file mode 100644 --- /dev/null +++ b/sys/arm64/include/vmm_dev.h @@ -0,0 +1,249 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +int vmmdev_cleanup(void); +#endif + +struct vm_memmap { + vm_paddr_t gpa; + int segid; /* memory segment */ + vm_ooffset_t segoff; /* offset into memory segment */ + size_t len; /* mmap length */ + int prot; /* RWX */ + int flags; +}; +#define VM_MEMMAP_F_WIRED 0x01 + +#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL) +struct vm_memseg { + int segid; + size_t len; + char name[VM_MAX_SUFFIXLEN + 1]; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_register_set { + int cpuid; + unsigned int count; + const int *regnums; /* enum vm_reg_name */ + uint64_t *regvals; +}; + +struct vm_run { + int cpuid; + uint64_t pc; + struct vm_exit vm_exit; + +}; + +struct vm_exception { + int cpuid; + int vector; + uint32_t error_code; + int error_code_valid; + int restart_instruction; +}; + +struct vm_msi { + uint64_t msg; + uint64_t addr; + int bus; + int slot; + int func; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int index; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +struct vm_suspend { + enum vm_suspend_how how; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_cpuset { + int which; + int cpusetsize; + cpuset_t *cpus; +}; +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 +#define VM_DEBUG_CPUS 2 + +struct vm_attach_vgic { + uint64_t dist_start; + size_t dist_size; + uint64_t redist_start; + size_t redist_size; +}; + +struct vm_irq { + uint32_t irq; +}; + +struct vm_cpu_topology { + uint16_t sockets; + uint16_t cores; + uint16_t threads; + uint16_t maxcpus; +}; + +enum { + /* general routines */ + IOCNUM_ABIVERS = 0, + IOCNUM_RUN = 1, + IOCNUM_SET_CAPABILITY = 2, + IOCNUM_GET_CAPABILITY = 3, + IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, + + /* memory apis */ + IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA = 13, + IOCNUM_ALLOC_MEMSEG = 14, + IOCNUM_GET_MEMSEG = 15, + IOCNUM_MMAP_MEMSEG = 16, + IOCNUM_MMAP_GETNEXT = 17, + + /* register/state accessors */ + IOCNUM_SET_REGISTER = 20, + IOCNUM_GET_REGISTER = 21, + IOCNUM_SET_REGISTER_SET = 24, + IOCNUM_GET_REGISTER_SET = 25, + + /* statistics */ + IOCNUM_VM_STATS = 50, + IOCNUM_VM_STAT_DESC = 51, + + /* CPU Topology */ + IOCNUM_SET_TOPOLOGY = 63, + IOCNUM_GET_TOPOLOGY = 64, + + /* interrupt injection */ + IOCNUM_ASSERT_IRQ = 80, + IOCNUM_DEASSERT_IRQ = 81, + IOCNUM_RAISE_MSI = 82, + + /* vm_cpuset */ + IOCNUM_ACTIVATE_CPU = 90, + IOCNUM_GET_CPUSET = 91, + IOCNUM_SUSPEND_CPU = 92, + IOCNUM_RESUME_CPU = 93, + + /* vm_attach_vgic */ + 
IOCNUM_ATTACH_VGIC = 110, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define VM_ALLOC_MEMSEG \ + _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) +#define VM_GET_MEMSEG \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) +#define VM_MMAP_MEMSEG \ + _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) +#define VM_MMAP_GETNEXT \ + _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_REGISTER_SET \ + _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) +#define VM_GET_REGISTER_SET \ + _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_ASSERT_IRQ \ + _IOW('v', IOCNUM_ASSERT_IRQ, struct vm_irq) +#define VM_DEASSERT_IRQ \ + _IOW('v', IOCNUM_DEASSERT_IRQ, struct vm_irq) +#define VM_RAISE_MSI \ + _IOW('v', IOCNUM_RAISE_MSI, struct vm_msi) +#define VM_SET_TOPOLOGY \ + _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_TOPOLOGY \ + _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GLA2GPA \ + _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_ACTIVATE_CPU \ + _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) +#define VM_GET_CPUS \ + _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_SUSPEND_CPU \ + _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) +#define VM_RESUME_CPU \ + _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) +#define VM_ATTACH_VGIC \ + _IOW('v', IOCNUM_ATTACH_VGIC, struct vm_attach_vgic) +#endif diff --git a/sys/arm64/include/vmm_instruction_emul.h b/sys/arm64/include/vmm_instruction_emul.h new file mode 100644 --- /dev/null +++ b/sys/arm64/include/vmm_instruction_emul.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Callback functions to read and write registers. + */ +typedef int (*reg_read_t)(void *vm, int cpuid, uint64_t *rval, void *arg); +typedef int (*reg_write_t)(void *vm, int cpuid, uint64_t wval, void *arg); + +/* + * Emulate the decoded 'vie' instruction when it contains a memory operation. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t mrr, + mem_region_write_t mrw, void *mrarg); + +/* + * Emulate the decoded 'vre' instruction when it contains a register access. + * + * The callbacks 'regread' and 'regwrite' emulate reads and writes to the + * register from 'vie'. 'regarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_register(void *vm, int vcpuid, struct vre *vre, reg_read_t regread, + reg_write_t regwrite, void *regarg); + +#ifdef _KERNEL +void vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size, + mem_region_read_t mmio_read, mem_region_write_t mmio_write); +void vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size); +#endif + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/sys/arm64/include/vmm_snapshot.h b/sys/arm64/include/vmm_snapshot.h new file mode 100644 --- /dev/null +++ b/sys/arm64/include/vmm_snapshot.h @@ -0,0 +1 @@ +/* $FreeBSD$ */ diff --git a/sys/arm64/vmm/arm64.h b/sys/arm64/vmm/arm64.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/arm64.h @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _VMM_ARM64_H_ +#define _VMM_ARM64_H_ + +#include +#include +#include +#include + +#include "mmu.h" +#include "io/vgic_v3.h" +#include "io/vtimer.h" + +struct hypctx { + struct trapframe tf; + + /* + * EL1 control registers. + * Be careful changing the layout of these as we access them from + * assembly when switching between the host and guest. + */ + uint64_t elr_el1; /* Exception Link Register */ + uint64_t sp_el0; /* Stack pointer */ + uint64_t tpidr_el0; /* EL0 Software ID Register */ + uint64_t tpidrro_el0; /* Read-only Thread ID Register */ + uint64_t tpidr_el1; /* EL1 Software ID Register */ + uint64_t vbar_el1; /* Vector Base Address Register */ + + uint64_t actlr_el1; /* Auxiliary Control Register */ + uint64_t afsr0_el1; /* Auxiliary Fault Status Register 0 */ + uint64_t afsr1_el1; /* Auxiliary Fault Status Register 1 */ + uint64_t amair_el1; /* Auxiliary Memory Attribute Indirection Register */ + uint64_t contextidr_el1; /* Current Process Identifier */ + uint64_t cpacr_el1; /* Architectural Feature Access Control Register */ + uint64_t csselr_el1; /* Cache Size Selection Register */ + uint64_t esr_el1; /* Exception Syndrome Register */ + uint64_t far_el1; /* Fault Address Register */ + uint64_t mair_el1; /* Memory Attribute Indirection Register */ + uint64_t mdccint_el1; /* Monitor DCC Interrupt Enable Register */ + uint64_t mdscr_el1; /* Monitor Debug System Control Register */ + uint64_t par_el1; /* Physical Address Register */ + uint64_t sctlr_el1; /* System Control Register */ + uint64_t tcr_el1; /* Translation Control Register */ + uint64_t ttbr0_el1; /* Translation Table Base Register 0 */ + uint64_t ttbr1_el1; /* Translation Table Base Register 1 */ + uint64_t spsr_el1; /* Saved Program Status Register */ + + uint64_t pmcr_el0; /* Performance Monitors Control Register */ + uint64_t pmccntr_el0; + uint64_t pmccfiltr_el0; + uint64_t pmcntenset_el0; + uint64_t pmintenset_el1; + uint64_t pmovsset_el0; + uint64_t pmselr_el0; + uint64_t pmuserenr_el0; + uint64_t pmevcntr_el0[31]; + uint64_t pmevtyper_el0[31]; + + uint64_t dbgbcr_el1[16]; /* Debug Breakpoint Control Registers */ + uint64_t dbgbvr_el1[16]; /* Debug Breakpoint Value Registers */ + uint64_t dbgwcr_el1[16]; /* Debug Watchpoint Control Registers */ + uint64_t dbgwvr_el1[16]; /* Debug Watchpoint Value Registers */ + + /* EL2 control registers */ + uint64_t cptr_el2; /* Architectural Feature Trap Register */ + uint64_t hcr_el2; /* Hypervisor Configuration Register */ + uint64_t mdcr_el2; /* Monitor Debug Configuration Register */ + uint64_t vpidr_el2; /* Virtualization Processor ID Register */ + uint64_t vmpidr_el2; /* Virtualization Multiprocessor ID Register */ + uint32_t vcpu; + struct hyp *hyp; + struct { + uint64_t far_el2; /* Fault 
Address Register */ + uint64_t hpfar_el2; /* Hypervisor IPA Fault Address Register */ + } exit_info; + + struct vtimer_cpu vtimer_cpu; + struct vgic_v3_cpu_if vgic_cpu_if; + struct vgic_v3_redist vgic_redist; +#ifdef VFP + struct vfpstate vfpstate; +#endif +}; + +struct hyp { + struct hypctx ctx[VM_MAXCPU]; + struct vgic_v3_dist vgic_dist; + struct vm *vm; + struct vtimer vtimer; + uint64_t vmid_generation; + uint64_t vttbr_el2; + uint64_t el2_addr; /* The address of this in el2 space */ + bool vgic_attached; +}; + +uint64_t vmm_call_hyp(uint64_t, ...); +void vmm_cleanup(void *hyp_stub_vectors); +uint64_t vmm_enter_guest(struct hypctx *hypctx); +uint64_t vmm_read_ich_vtr_el2(void); +uint64_t vmm_read_cnthctl_el2(void); +uint64_t vmm_read_tcr_el2(void); + +#define eprintf(fmt, ...) printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__) +//#define eprintf(fmt, ...) do {} while(0) + +#define VMID_GENERATION_MASK ((1UL<<8) - 1) +#define build_vttbr(vmid, ptaddr) \ + ((((vmid) & VMID_GENERATION_MASK) << VTTBR_VMID_SHIFT) | \ + (uint64_t)(ptaddr)) + +#define MPIDR_SMP_MASK (0x3 << 30) +#define MPIDR_AFF1_LEVEL(x) (((x) >> 2) << 8) +#define MPIDR_AFF0_LEVEL(x) (((x) & 0x3) << 0) + +/* + * Return true if the exception was caused by a translation fault in the stage 2 + * translation regime. The DFSC encoding for a translation fault has the format + * 0b0001LL, where LL (bits [1:0]) represents the level where the fault occured + * (page D7-2280 of the ARMv8 Architecture Manual). + */ +#define ISS_DATA_DFSC_TF(esr_iss) \ + (!((esr_iss) & 0b111000) && ((esr_iss) & 0b000100)) +#define FAR_EL2_PAGE_OFFSET(x) ((x) & PAGE_MASK) + +#define DEBUG_ME 0 + +#define arm64_get_active_vcpu() ((struct hypctx *)PCPU_GET(vcpu)) + +#endif /* !_VMM_ARM64_H_ */ diff --git a/sys/arm64/vmm/hyp.h b/sys/arm64/vmm/hyp.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/hyp.h @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2017 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_HYP_H_ +#define _VMM_HYP_H_ + +/* + * The translation tables for the hypervisor mode will hold mappings for kernel + * virtual addresses and an identity mapping (VA == PA) necessary when + * enabling/disabling the MMU. 
+ * + * When in EL2 exception level the translation table base register is TTBR0_EL2 + * and the virtual addresses generated by the CPU must be at the bottom of the + * memory, with the first 16 bits all set to zero: + * + * 0x0000ffffffffffff End hyp address space + * 0x0000000000000000 Start of hyp address space + * + * To run code in hyp mode we need to convert kernel virtual addresses to + * addresses that fit into this address space. + * + * The kernel virtual address range is: + * + * 0xffff007fffffffff End of KVA + * 0xffff000000000000 Kernel base address & start of KVA + * + * (see /sys/arm64/include/vmparam.h). + * + * We could convert the kernel virtual addresses to valid EL2 addresses by + * setting the first 16 bits to zero and thus mapping the kernel addresses in + * the bottom half of the EL2 address space, but then they might clash with the + * identity mapping addresses. Instead we map the kernel addresses in the upper + * half of the EL2 address space. + * + * The hypervisor address space will look like this: + * + * 0x0000807fffffffff End of KVA mapping + * 0x0000800000000000 Start of KVA mapping + * + * 0x00007fffffffffff End of identity mapping + * 0x0000000000000000 Start of identity mapping + * + * With the scheme we have 47 bits at our disposable for the identity map and + * another 47 bits for the kernel virtual addresses. For a maximum physical + * memory size of 128TB we are guaranteed to not have any clashes between + * addresses. + */ +#define HYP_VM_MIN_ADDRESS 0x0000000000000000 +#define HYP_VM_MAX_ADDRESS 0x0001000000000000 + +/* + * When the vmm code is installed the following handles can be used by + * the host to call into EL2. + */ +#define HYP_CLEANUP 0x00000001 +#define HYP_ENTER_GUEST 0x00000002 +#define HYP_READ_REGISTER 0x00000003 +#define HYP_REG_ICH_VTR 0x1 +#define HYP_REG_CNTHCTL 0x2 +#define HYP_CLEAN_S2_TLBI 0x00000004 +#define HYP_DC_CIVAC 0x00000005 +#define HYP_EL2_TLBI 0x00000006 +#define HYP_EL2_TLBI_ALL 0x1 +#define HYP_EL2_TLBI_VA 0x2 +#define HYP_S2_TLBI_RANGE 0x00000010 +#define HYP_S2_TLBI_ALL 0x00000011 + +/* + * When taking asynchronous exceptions, or interrupts, with the exception of the + * SError interrupt, the exception syndrome register is not updated with the + * exception code. We need to differentiate between the different exception + * types taken to EL2. + */ +#define EXCP_TYPE_EL1_SYNC 0 +#define EXCP_TYPE_EL1_IRQ 1 +#define EXCP_TYPE_EL1_FIQ 2 +#define EXCP_TYPE_EL1_ERROR 3 + +#define EXCP_TYPE_EL2_SYNC 4 +#define EXCP_TYPE_EL2_IRQ 5 +#define EXCP_TYPE_EL2_FIQ 6 +#define EXCP_TYPE_EL2_ERROR 7 + +#define EXCP_TYPE_MAINT_IRQ 8 +/* Used internally in vmm_hyp.c */ +#define EXCP_TYPE_REENTER 9 + +#define HYP_GET_VECTOR_TABLE -1 + +#endif /* !_VMM_HYP_H_ */ diff --git a/sys/arm64/vmm/io/vgic_v3.h b/sys/arm64/vmm/io/vgic_v3.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vgic_v3.h @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_VGIC_V3_H_ +#define _VMM_VGIC_V3_H_ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +struct hypctx; + +int vgic_v3_icc_sgi1r_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vgic_v3_icc_sgi1r_write(void *vm, int vcpuid, uint64_t rval, void *arg); + +#define VGIC_SGI_NUM (GIC_LAST_SGI - GIC_FIRST_SGI + 1) +#define VGIC_PPI_NUM (GIC_LAST_PPI - GIC_FIRST_PPI + 1) +#define VGIC_SPI_NUM (GIC_LAST_SPI - GIC_FIRST_SPI + 1) +#define VGIC_PRV_I_NUM (VGIC_SGI_NUM + VGIC_PPI_NUM) +#define VGIC_SHR_I_NUM (VGIC_SPI_NUM) + +#define VGIC_ICH_LR_NUM_MAX 16 +#define VGIC_ICH_APR_NUM_MAX 4 + +struct vgic_v3_irq { + /* List of IRQs that are active or pending */ + TAILQ_ENTRY(vgic_v3_irq) act_pend_list; + struct mtx irq_spinmtx; + uint64_t mpidr; + int target_vcpu; + uint32_t irq; + bool active; + bool pending; + bool enabled; + bool level; + bool on_aplist; + uint8_t priority; + uint8_t config; +#define VGIC_CONFIG_MASK 0x2 +#define VGIC_CONFIG_LEVEL 0x0 +#define VGIC_CONFIG_EDGE 0x2 +}; + +struct vgic_v3_lpi { + struct vgic_v3_irq irq; + SLIST_ENTRY(vgic_v3_lpi) next; +}; + +struct vgic_mmio_region { + vm_offset_t start; + vm_offset_t end; + mem_region_read_t read; + mem_region_write_t write; +}; + +struct vm; +struct vm_exit; +struct hyp; + +struct vgic_v3_dist { + struct mtx dist_mtx; + + uint64_t start; + size_t end; + + uint32_t gicd_ctlr; /* Distributor Control Register */ + + struct vgic_v3_irq *irqs; + SLIST_HEAD(, vgic_v3_lpi) lpis; +}; + +#define aff_routing_en(distp) (distp->gicd_ctlr & GICD_CTLR_ARE_NS) + +struct vgic_v3_redist { + uint64_t start; + uint64_t end; + + uint64_t gicr_typer; /* Redistributor Type Register */ +}; + +struct vgic_v3_irq; + +struct vgic_v3_cpu_if { + uint32_t ich_eisr_el2; /* End of Interrupt Status Register */ + uint32_t ich_elrsr_el2; /* Empty List register Status Register (ICH_ELRSR_EL2) */ + uint32_t ich_hcr_el2; /* Hyp Control Register */ + uint32_t ich_misr_el2; /* Maintenance Interrupt State Register */ + uint32_t ich_vmcr_el2; /* Virtual Machine Control Register */ + + /* + * The List Registers are part of the VM context and are modified on a + * world switch. They need to be allocated statically so they are + * mapped in the EL2 translation tables when struct hypctx is mapped. + */ + uint64_t ich_lr_el2[VGIC_ICH_LR_NUM_MAX]; + size_t ich_lr_num; + + /* + * We need a mutex for accessing the list registers because they are + * modified asynchronously by the virtual timer. + * + * Note that the mutex *MUST* be a spin mutex because an interrupt can + * be injected by a callout callback function, thereby modifying the + * list registers from a context where sleeping is forbidden. 
+ */ + struct mtx lr_mtx; + + /* Active Priorities Registers for Group 0 and 1 interrupts */ + size_t ich_apr_num; + uint32_t ich_ap0r_el2[VGIC_ICH_APR_NUM_MAX]; + uint32_t ich_ap1r_el2[VGIC_ICH_APR_NUM_MAX]; + + struct vgic_v3_irq private_irqs[VGIC_PRV_I_NUM]; + TAILQ_HEAD(, vgic_v3_irq) irq_act_pend; + u_int ich_lr_used; +}; + +int vgic_v3_attach_to_vm(struct vm *vm, uint64_t dist_start, + size_t dist_size, uint64_t redist_start, size_t redist_size); +void vgic_v3_detach_from_vm(struct vm *vm); + +bool vgic_present(void); +void vgic_v3_init(uint64_t ich_vtr_el2); +void vgic_v3_vminit(struct hyp *); +void vgic_v3_cpuinit(struct hypctx *, bool last_vcpu); +void vgic_v3_cpucleanup(struct hypctx *); +void vgic_v3_vmcleanup(struct hyp *); +void vgic_v3_flush_hwstate(void *arg); +void vgic_v3_sync_hwstate(void *arg); + +bool vgic_v3_vcpu_pending_irq(struct hypctx *hypctx); +int vgic_v3_inject_irq(struct hyp *hyp, int vcpuid, uint32_t irqid, + bool level); +int vgic_v3_inject_msi(struct hyp *hyp, uint64_t msg, uint64_t addr); + +void vgic_v3_group_toggle_enabled(bool enabled, struct hyp *hyp); +int vgic_v3_irq_toggle_enabled(uint32_t irq, bool enabled, + struct hyp *hyp, int vcpuid); + +DECLARE_CLASS(arm_vgic_driver); + +#endif /* !_VMM_VGIC_V3_H_ */ diff --git a/sys/arm64/vmm/io/vgic_v3.c b/sys/arm64/vmm/io/vgic_v3.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vgic_v3.c @@ -0,0 +1,2033 @@ +/* + * Copyright (C) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "vgic_v3.h" +#include "vgic_v3_reg.h" + +MALLOC_DEFINE(M_VGIC_V3, "ARM VMM VGIC V3", "ARM VMM VGIC V3"); + +static bool have_vgic = false; + +struct vgic_v3_virt_features { + uint8_t min_prio; + size_t ich_lr_num; + size_t ich_apr_num; +}; + +/* How many IRQs we support (SGIs + PPIs + SPIs). 
Not including LPIs */ +#define VGIC_NIRQS 1023 +/* Pretend to be an Arm design */ +#define VGIC_IIDR 0x43b + +typedef void (register_read)(struct hyp *, int, u_int, uint64_t *, void *); +typedef void (register_write)(struct hyp *, int, u_int, u_int, u_int, uint64_t, + void *); + +#define VGIC_8_BIT (1 << 0) +/* (1 << 1) is reserved for 16 bit accesses */ +#define VGIC_32_BIT (1 << 2) +#define VGIC_64_BIT (1 << 3) + +struct vgic_register { + u_int start; /* Start within a memory region */ + u_int end; + u_int size; + u_int flags; + register_read *read; + register_write *write; +}; + +#define VGIC_REGISTER_RANGE(reg_start, reg_end, reg_size, reg_flags, readf, \ + writef) \ +{ \ + .start = (reg_start), \ + .end = (reg_end), \ + .size = (reg_size), \ + .flags = (reg_flags), \ + .read = (readf), \ + .write = (writef), \ +} + +#define VGIC_REGISTER_RANGE_RAZ_WI(reg_start, reg_end, reg_size, reg_flags) \ + VGIC_REGISTER_RANGE(reg_start, reg_end, reg_size, reg_flags, \ + gic_zero_read, gic_ignore_write) + +#define VGIC_REGISTER(start_addr, reg_size, reg_flags, readf, writef) \ + VGIC_REGISTER_RANGE(start_addr, (start_addr) + (reg_size), \ + reg_size, reg_flags, readf, writef) + +#define VGIC_REGISTER_RAZ_WI(start_addr, reg_size, reg_flags) \ + VGIC_REGISTER_RANGE_RAZ_WI(start_addr, \ + (start_addr) + (reg_size), reg_size, reg_flags) + +static register_read gic_pidr2_read; +static register_read gic_zero_read; +static register_write gic_ignore_write; + +/* GICD_CTLR */ +static register_read dist_ctlr_read; +static register_write dist_ctlr_write; +/* GICD_TYPER */ +static register_read dist_typer_read; +/* GICD_IIDR */ +static register_read dist_iidr_read; +/* GICD_STATUSR - RAZ/WI as we don't report errors (yet) */ +/* GICD_SETSPI_NSR & GICD_CLRSPI_NSR */ +static register_write dist_setclrspi_nsr_write; +/* GICD_SETSPI_SR - RAZ/WI */ +/* GICD_CLRSPI_SR - RAZ/WI */ +/* GICD_IGROUPR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_ISENABLER */ +static register_read dist_isenabler_read; +static register_write dist_isenabler_write; +/* GICD_ICENABLER */ +static register_read dist_icenabler_read; +static register_write dist_icenabler_write; +/* GICD_ISPENDR */ +static register_read dist_ispendr_read; +static register_write dist_ispendr_write; +/* GICD_ICPENDR */ +static register_read dist_icpendr_read; +static register_write dist_icpendr_write; +/* GICD_ISACTIVER */ +static register_read dist_isactiver_read; +static register_write dist_isactiver_write; +/* GICD_ICACTIVER */ +static register_read dist_icactiver_read; +static register_write dist_icactiver_write; +/* GICD_IPRIORITYR */ +static register_read dist_ipriorityr_read; +static register_write dist_ipriorityr_write; +/* GICD_ITARGETSR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_ICFGR */ +static register_read dist_icfgr_read; +static register_write dist_icfgr_write; +/* GICD_IGRPMODR - RAZ/WI from non-secure mode */ +/* GICD_NSACR - RAZ/WI from non-secure mode */ +/* GICD_SGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_CPENDSGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_SPENDSGIR - RAZ/WI as GICD_CTLR.ARE == 1 */ +/* GICD_IROUTER */ +static register_read dist_irouter_read; +static register_write dist_irouter_write; + +static struct vgic_register dist_registers[] = { + VGIC_REGISTER(GICD_CTLR, 4, VGIC_32_BIT, dist_ctlr_read, + dist_ctlr_write), + VGIC_REGISTER(GICD_TYPER, 4, VGIC_32_BIT, dist_typer_read, + gic_ignore_write), + VGIC_REGISTER(GICD_IIDR, 4, VGIC_32_BIT, dist_iidr_read, + gic_ignore_write), + VGIC_REGISTER_RAZ_WI(GICD_STATUSR, 4, VGIC_32_BIT), 
+ VGIC_REGISTER(GICD_SETSPI_NSR, 4, VGIC_32_BIT, gic_zero_read, + dist_setclrspi_nsr_write), + VGIC_REGISTER(GICD_CLRSPI_NSR, 4, VGIC_32_BIT, gic_zero_read, + dist_setclrspi_nsr_write), + VGIC_REGISTER_RAZ_WI(GICD_SETSPI_SR, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICD_CLRSPI_SR, 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE_RAZ_WI(GICD_IGROUPR(0), GICD_IGROUPR(1024), 4, + VGIC_32_BIT), + + VGIC_REGISTER_RAZ_WI(GICD_ISENABLER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ISENABLER(32), GICD_ISENABLER(1024), 4, + VGIC_32_BIT, dist_isenabler_read, dist_isenabler_write), + + VGIC_REGISTER_RAZ_WI(GICD_ICENABLER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICENABLER(32), GICD_ICENABLER(1024), 4, + VGIC_32_BIT, dist_icenabler_read, dist_icenabler_write), + + VGIC_REGISTER_RAZ_WI(GICD_ISPENDR(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ISPENDR(32), GICD_ISPENDR(1024), 4, + VGIC_32_BIT, dist_ispendr_read, dist_ispendr_write), + + VGIC_REGISTER_RAZ_WI(GICD_ICPENDR(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICPENDR(32), GICD_ICPENDR(1024), 4, + VGIC_32_BIT, dist_icpendr_read, dist_icpendr_write), + + VGIC_REGISTER_RAZ_WI(GICD_ISACTIVER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ISACTIVER(32), GICD_ISACTIVER(1024), 4, + VGIC_32_BIT, dist_isactiver_read, dist_isactiver_write), + + VGIC_REGISTER_RAZ_WI(GICD_ICACTIVER(0), 4, VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICACTIVER(32), GICD_ICACTIVER(1024), 4, + VGIC_32_BIT, dist_icactiver_read, dist_icactiver_write), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_IPRIORITYR(0), GICD_IPRIORITYR(32), 4, + VGIC_32_BIT | VGIC_8_BIT), + VGIC_REGISTER_RANGE(GICD_IPRIORITYR(32), GICD_IPRIORITYR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT, dist_ipriorityr_read, + dist_ipriorityr_write), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_ITARGETSR(0), GICD_ITARGETSR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_ICFGR(0), GICD_ICFGR(32), 4, + VGIC_32_BIT), + VGIC_REGISTER_RANGE(GICD_ICFGR(32), GICD_ICFGR(1024), 4, + VGIC_32_BIT, dist_icfgr_read, dist_icfgr_write), +/* + VGIC_REGISTER_RANGE(GICD_IGRPMODR(0), GICD_IGRPMODR(1024), 4, + VGIC_32_BIT, dist_igrpmodr_read, dist_igrpmodr_write), + VGIC_REGISTER_RANGE(GICD_NSACR(0), GICD_NSACR(1024), 4, + VGIC_32_BIT, dist_nsacr_read, dist_nsacr_write), +*/ + VGIC_REGISTER_RAZ_WI(GICD_SGIR, 4, VGIC_32_BIT), +/* + VGIC_REGISTER_RANGE(GICD_CPENDSGIR(0), GICD_CPENDSGIR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT, dist_cpendsgir_read, + dist_cpendsgir_write), + VGIC_REGISTER_RANGE(GICD_SPENDSGIR(0), GICD_SPENDSGIR(1024), 4, + VGIC_32_BIT | VGIC_8_BIT, dist_spendsgir_read, + dist_spendsgir_write), +*/ + VGIC_REGISTER_RANGE(GICD_IROUTER(32), GICD_IROUTER(1024), 8, + VGIC_64_BIT | VGIC_32_BIT, dist_irouter_read, dist_irouter_write), + + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR4, GICD_PIDR2, 4, VGIC_32_BIT), + VGIC_REGISTER(GICD_PIDR2, 4, VGIC_32_BIT, gic_pidr2_read, + gic_ignore_write), + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR2 + 4, GICD_SIZE, 4, VGIC_32_BIT), +}; + +/* GICR_CTLR - Ignore writes as no bits can be set */ +static register_read redist_ctlr_read; +/* GICR_IIDR */ +static register_read redist_iidr_read; +/* GICR_TYPER */ +static register_read redist_typer_read; +/* GICR_STATUSR - RAZ/WI as we don't report errors (yet) */ +/* GICR_WAKER - RAZ/WI from non-secure mode */ +/* GICR_SETLPIR - RAZ/WI as no LPIs are supported */ +/* GICR_CLRLPIR - RAZ/WI as no LPIs are supported */ +/* GICR_PROPBASER - RAZ/WI as no LPIs are supported */ +/* GICR_PENDBASER - RAZ/WI as no LPIs are supported */ +/* GICR_INVLPIR - RAZ/WI as no LPIs are 
supported */ +/* GICR_INVALLR - RAZ/WI as no LPIs are supported */ +/* GICR_SYNCR - RAZ/WI as no LPIs are supported */ + +static struct vgic_register redist_rd_registers[] = { + VGIC_REGISTER(GICR_CTLR, 4, VGIC_32_BIT, redist_ctlr_read, + gic_ignore_write), + VGIC_REGISTER(GICR_IIDR, 4, VGIC_32_BIT, redist_iidr_read, + gic_ignore_write), + VGIC_REGISTER(GICR_TYPER, 8, VGIC_64_BIT | VGIC_32_BIT, + redist_typer_read, gic_ignore_write), + VGIC_REGISTER_RAZ_WI(GICR_STATUSR, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_WAKER, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_SETLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_CLRLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_PROPBASER, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_PENDBASER, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_INVLPIR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_INVALLR, 8, VGIC_64_BIT | VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_SYNCR, 4, VGIC_32_BIT), + + /* These are identical to the dist registers */ + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR4, GICD_PIDR2, 4, VGIC_32_BIT), + VGIC_REGISTER(GICD_PIDR2, 4, VGIC_32_BIT, gic_pidr2_read, + gic_ignore_write), + VGIC_REGISTER_RANGE_RAZ_WI(GICD_PIDR2 + 4, GICD_SIZE, 4, + VGIC_32_BIT), +}; + +/* GICR_IGROUPR0 - RAZ/WI from non-secure mode */ +/* GICR_ISENABLER0 */ +static register_read redist_ienabler0_read; +static register_write redist_isenabler0_write; +/* GICR_ICENABLER0 */ +static register_write redist_icenabler0_write; +/* GICR_ISPENDR0 */ +static register_read redist_ipendr0_read; +static register_write redist_ispendr0_write; +/* GICR_ICPENDR0 */ +static register_write redist_icpendr0_write; +/* GICR_ISACTIVER0 */ +static register_read redist_iactiver0_read; +static register_write redist_isactiver0_write; +/* GICR_ICACTIVER0 */ +static register_write redist_icactiver0_write; +/* GICR_IPRIORITYR */ +static register_read redist_ipriorityr_read; +static register_write redist_ipriorityr_write; +/* GICR_ICFGR0 - RAZ/WI from non-secure mode */ +/* GICR_ICFGR1 */ +static register_read redist_icfgr1_read; +static register_write redist_icfgr1_write; +/* GICR_IGRPMODR0 - RAZ/WI from non-secure mode */ +/* GICR_NSCAR - RAZ/WI from non-secure mode */ + +static struct vgic_register redist_sgi_registers[] = { + VGIC_REGISTER_RAZ_WI(GICR_IGROUPR0, 4, VGIC_32_BIT), + VGIC_REGISTER(GICR_ISENABLER0, 4, VGIC_32_BIT, redist_ienabler0_read, + redist_isenabler0_write), + VGIC_REGISTER(GICR_ICENABLER0, 4, VGIC_32_BIT, redist_ienabler0_read, + redist_icenabler0_write), + VGIC_REGISTER(GICR_ISPENDR0, 4, VGIC_32_BIT, redist_ipendr0_read, + redist_ispendr0_write), + VGIC_REGISTER(GICR_ICPENDR0, 4, VGIC_32_BIT, redist_ipendr0_read, + redist_icpendr0_write), + VGIC_REGISTER(GICR_ISACTIVER0, 4, VGIC_32_BIT, redist_iactiver0_read, + redist_isactiver0_write), + VGIC_REGISTER(GICR_ICACTIVER0, 4, VGIC_32_BIT, redist_iactiver0_read, + redist_icactiver0_write), + VGIC_REGISTER_RANGE(GICR_IPRIORITYR(0), GICR_IPRIORITYR(32), 4, + VGIC_32_BIT | VGIC_8_BIT, redist_ipriorityr_read, + redist_ipriorityr_write), + VGIC_REGISTER_RAZ_WI(GICR_ICFGR0, 4, VGIC_32_BIT), + VGIC_REGISTER(GICR_ICFGR1, 4, VGIC_32_BIT, redist_icfgr1_read, + redist_icfgr1_write), + VGIC_REGISTER_RAZ_WI(GICR_IGRPMODR0, 4, VGIC_32_BIT), + VGIC_REGISTER_RAZ_WI(GICR_NSACR, 4, VGIC_32_BIT), +}; + +static struct vgic_v3_virt_features virt_features; + +static struct vgic_v3_irq *vgic_v3_get_irq(struct hyp *, int, uint32_t); +static void vgic_v3_release_irq(struct vgic_v3_irq *); + 
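+/*
+ * Map a guest MPIDR affinity value to its vCPU index by matching it against
+ * the affinity fields of each vCPU's VMPIDR_EL2; returns -1 if no vCPU
+ * matches.
+ */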
+/* TODO: Move to a common file */ +static int +mpidr_to_vcpu(struct hyp *hyp, uint64_t mpidr) +{ + struct vm *vm; + + vm = hyp->vm; + for (int i = 0; i < vm_get_maxcpus(vm); i++) { + if ((hyp->ctx[i].vmpidr_el2 & GICD_AFF) == mpidr) + return (i); + } + return (-1); +} + +void +vgic_v3_vminit(struct hyp *hyp) +{ + struct vgic_v3_dist *dist = &hyp->vgic_dist; + + /* + * Configure the Distributor control register. The register resets to an + * architecturally UNKNOWN value, so we reset to 0 to disable all + * functionality controlled by the register. + * + * The exception is GICD_CTLR.DS, which is RA0/WI when the Distributor + * supports one security state (ARM GIC Architecture Specification for + * GICv3 and GICv4, p. 4-464) + */ + dist->gicd_ctlr = 0; + + mtx_init(&dist->dist_mtx, "VGICv3 Distributor lock", NULL, MTX_SPIN); +} + +void +vgic_v3_cpuinit(struct hypctx *hypctx, bool last_vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &hypctx->vgic_cpu_if; + struct vgic_v3_redist *redist = &hypctx->vgic_redist; + struct vgic_v3_irq *irq; + uint64_t aff, vmpidr_el2; + int i, irqid; + + vmpidr_el2 = hypctx->vmpidr_el2; + KASSERT(vmpidr_el2 != 0, + ("Trying to init this CPU's vGIC before the vCPU")); + /* + * Get affinity for the current CPU. The guest CPU affinity is taken + * from VMPIDR_EL2. The Redistributor corresponding to this CPU is + * the Redistributor with the same affinity from GICR_TYPER. + */ + aff = (CPU_AFF3(vmpidr_el2) << 24) | (CPU_AFF2(vmpidr_el2) << 16) | + (CPU_AFF1(vmpidr_el2) << 8) | CPU_AFF0(vmpidr_el2); + + /* Set up GICR_TYPER. */ + redist->gicr_typer = aff << GICR_TYPER_AFF_SHIFT; + /* Set the vcpu as the processsor ID */ + redist->gicr_typer |= hypctx->vcpu << GICR_TYPER_CPUNUM_SHIFT; + + if (last_vcpu) + /* Mark the last Redistributor */ + redist->gicr_typer |= GICR_TYPER_LAST; + + mtx_init(&cpu_if->lr_mtx, "VGICv3 ICH_LR_EL2 lock", NULL, MTX_SPIN); + + /* Set the SGI and PPI state */ + for (irqid = 0; irqid < VGIC_PRV_I_NUM; irqid++) { + irq = &cpu_if->private_irqs[irqid]; + + mtx_init(&irq->irq_spinmtx, "VGIC IRQ spinlock", NULL, + MTX_SPIN); + irq->irq = irqid; + irq->mpidr = hypctx->vmpidr_el2 & GICD_AFF; + irq->target_vcpu = mpidr_to_vcpu(hypctx->hyp, irq->mpidr); + if (irqid < VGIC_SGI_NUM) { + /* SGIs */ + irq->enabled = true; + irq->config = VGIC_CONFIG_EDGE; + } else { + /* PPIs */ + irq->config = VGIC_CONFIG_LEVEL; + } + irq->priority = 0; + } + + /* + * Configure the Interrupt Controller Hyp Control Register. + * + * ICH_HCR_EL2_En: enable virtual CPU interface. + * + * Maintenance interrupts are disabled. + */ + cpu_if->ich_hcr_el2 = ICH_HCR_EL2_En; + + /* + * Configure the Interrupt Controller Virtual Machine Control Register. + * + * ICH_VMCR_EL2_VPMR: lowest priority mask for the VCPU interface + * ICH_VMCR_EL2_VBPR1_NO_PREEMPTION: disable interrupt preemption for + * Group 1 interrupts + * ICH_VMCR_EL2_VBPR0_NO_PREEMPTION: disable interrupt preemption for + * Group 0 interrupts + * ~ICH_VMCR_EL2_VEOIM: writes to EOI registers perform priority drop + * and interrupt deactivation. + * ICH_VMCR_EL2_VENG0: virtual Group 0 interrupts enabled. + * ICH_VMCR_EL2_VENG1: virtual Group 1 interrupts enabled. 
+ */ + cpu_if->ich_vmcr_el2 = \ + (virt_features.min_prio << ICH_VMCR_EL2_VPMR_SHIFT) | \ + ICH_VMCR_EL2_VBPR1_NO_PREEMPTION | ICH_VMCR_EL2_VBPR0_NO_PREEMPTION; + cpu_if->ich_vmcr_el2 &= ~ICH_VMCR_EL2_VEOIM; + cpu_if->ich_vmcr_el2 |= ICH_VMCR_EL2_VENG0 | ICH_VMCR_EL2_VENG1; + + cpu_if->ich_lr_num = virt_features.ich_lr_num; + for (i = 0; i < cpu_if->ich_lr_num; i++) + cpu_if->ich_lr_el2[i] = 0UL; + cpu_if->ich_lr_used = 0; + TAILQ_INIT(&cpu_if->irq_act_pend); + + cpu_if->ich_apr_num = virt_features.ich_apr_num; +} + +void +vgic_v3_cpucleanup(struct hypctx *hypctx) +{ + struct vgic_v3_cpu_if *cpu_if; + struct vgic_v3_irq *irq; + int irqid; + + cpu_if = &hypctx->vgic_cpu_if; + for (irqid = 0; irqid < VGIC_PRV_I_NUM; irqid++) { + irq = &cpu_if->private_irqs[irqid]; + mtx_destroy(&irq->irq_spinmtx); + } + + mtx_destroy(&cpu_if->lr_mtx); +} + +void +vgic_v3_vmcleanup(struct hyp *hyp) +{ + struct vgic_v3_dist *dist = &hyp->vgic_dist; + + mtx_destroy(&dist->dist_mtx); +} + +static bool +vgic_v3_irq_pending(struct vgic_v3_irq *irq) +{ + if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_LEVEL) { + return (irq->pending || irq->level); + } else { + return (irq->pending); + } +} + +static bool +vgic_v3_queue_irq(struct hyp *hyp, struct vgic_v3_cpu_if *cpu_if, + int vcpuid, struct vgic_v3_irq *irq) +{ + MPASS(vcpuid >= 0); + MPASS(vcpuid < VM_MAXCPU); + + mtx_assert(&cpu_if->lr_mtx, MA_OWNED); + mtx_assert(&irq->irq_spinmtx, MA_OWNED); + + /* No need to queue the IRQ */ + if (!irq->level && !irq->pending) + return (false); + + if (!irq->on_aplist) { + irq->on_aplist = true; + TAILQ_INSERT_TAIL(&cpu_if->irq_act_pend, irq, act_pend_list); + } + return (true); +} + +static uint64_t +gic_reg_value_64(uint64_t field, uint64_t val, u_int offset, u_int size) +{ + uint32_t mask; + + if (offset != 0 || size != 8) { + mask = ((1ul << (size * 8)) - 1) << (offset * 8); + /* Shift the new bits to the correct place */ + val <<= (offset * 8); + /* Keep only the interesting bits */ + val &= mask; + /* Add the bits we are keeping from the old value */ + val |= field & ~mask; + } + + return (val); +} + +static void +gic_pidr2_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = GICR_PIDR2_ARCH_GICv3 << GICR_PIDR2_ARCH_SHIFT; +} + +/* Common read-only/write-ignored helpers */ +static void +gic_zero_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = 0; +} + +static void +gic_ignore_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + /* Nothing to do */ +} + +static uint64_t +read_enabler(struct hyp *hyp, int vcpuid, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + irq = vgic_v3_get_irq(hyp, vcpuid, irq_base + i); + if (irq == NULL) + continue; + + if (!irq->enabled) + ret |= 1u << i; + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_enabler(struct hyp *hyp, int vcpuid, int n, bool set, uint64_t val) +{ + struct vgic_v3_irq *irq; + uint32_t irq_base; + int i; + + irq_base = n * 32; + for (i = 0; i < 32; i++) { + /* We only change interrupts when the appropriate bit is set */ + if ((val & (1u << i)) == 0) + continue; + + /* Find the interrupt this bit represents */ + irq = vgic_v3_get_irq(hyp, vcpuid, irq_base + i); + if (irq == NULL) + continue; + + irq->enabled = set; + vgic_v3_release_irq(irq); + } +} + +static uint64_t +read_pendr(struct hyp *hyp, int vcpuid, int n) +{ + 
struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + irq = vgic_v3_get_irq(hyp, vcpuid, irq_base + i); + if (irq == NULL) + continue; + + if (vgic_v3_irq_pending(irq)) + ret |= 1u << i; + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static uint64_t +write_pendr(struct hyp *hyp, int vcpuid, int n, bool set, uint64_t val) +{ + struct vgic_v3_cpu_if *cpu_if; + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int target_vcpu, i; + bool notify; + + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + /* We only change interrupts when the appropriate bit is set */ + if ((val & (1u << i)) == 0) + continue; + + irq = vgic_v3_get_irq(hyp, vcpuid, irq_base + i); + if (irq == NULL) + continue; + + notify = false; + target_vcpu = irq->target_vcpu; + if (target_vcpu < 0) + goto next_irq; + cpu_if = &hyp->ctx[target_vcpu].vgic_cpu_if; + + if (!set) { + /* pending -> not pending */ + irq->pending = false; + } else { + irq->pending = true; + mtx_lock_spin(&cpu_if->lr_mtx); + notify = vgic_v3_queue_irq(hyp, cpu_if, target_vcpu, + irq); + mtx_unlock_spin(&cpu_if->lr_mtx); + } +next_irq: + vgic_v3_release_irq(irq); + + if (notify) + vcpu_notify_event(hyp->vm, target_vcpu, false); + } + + return (ret); +} + +static uint64_t +read_activer(struct hyp *hyp, int vcpuid, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 32; + for (i = 0; i < 32; i++) { + irq = vgic_v3_get_irq(hyp, vcpuid, irq_base + i); + if (irq == NULL) + continue; + + if (irq->active) + ret |= 1u << i; + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_activer(struct hyp *hyp, int vcpuid, u_int n, bool set, uint64_t val) +{ + struct vgic_v3_cpu_if *cpu_if; + struct vgic_v3_irq *irq; + uint32_t irq_base; + int target_vcpu, i; + bool notify; + + irq_base = n * 32; + for (i = 0; i < 32; i++) { + /* We only change interrupts when the appropriate bit is set */ + if ((val & (1u << i)) == 0) + continue; + + irq = vgic_v3_get_irq(hyp, vcpuid, irq_base + i); + if (irq == NULL) + continue; + + notify = false; + target_vcpu = irq->target_vcpu; + if (target_vcpu < 0) + goto next_irq; + cpu_if = &hyp->ctx[target_vcpu].vgic_cpu_if; + + if (!set) { + /* active -> not active */ + irq->active = false; + } else { + /* not active -> active */ + irq->active = true; + mtx_lock_spin(&cpu_if->lr_mtx); + notify = vgic_v3_queue_irq(hyp, cpu_if, target_vcpu, + irq); + mtx_unlock_spin(&cpu_if->lr_mtx); + } +next_irq: + vgic_v3_release_irq(irq); + + if (notify) + vcpu_notify_event(hyp->vm, target_vcpu, false); + } +} + +static uint64_t +read_priorityr(struct hyp *hyp, int vcpuid, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 4; + for (i = 0; i < 4; i++) { + irq = vgic_v3_get_irq(hyp, vcpuid, irq_base + i); + if (irq == NULL) + continue; + + ret |= ((uint64_t)irq->priority) << (i * 8); + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_priorityr(struct hyp *hyp, int vcpuid, u_int irq_base, u_int size, + uint64_t val) +{ + struct vgic_v3_irq *irq; + int i; + + for (i = 0; i < size; i++) { + irq = vgic_v3_get_irq(hyp, vcpuid, irq_base + i); + if (irq == NULL) + continue; + + /* Set the priority. 
We support 32 priority steps (5 bits) */ + irq->priority = (val >> (i * 8)) & 0xf8; + vgic_v3_release_irq(irq); + } +} + +static uint64_t +read_config(struct hyp *hyp, int vcpuid, int n) +{ + struct vgic_v3_irq *irq; + uint64_t ret; + uint32_t irq_base; + int i; + + ret = 0; + irq_base = n * 16; + for (i = 0; i < 16; i++) { + irq = vgic_v3_get_irq(hyp, vcpuid, irq_base + i); + if (irq == NULL) + continue; + + ret |= ((uint64_t)irq->config) << (i * 2); + vgic_v3_release_irq(irq); + } + + return (ret); +} + +static void +write_config(struct hyp *hyp, int vcpuid, int n, uint64_t val) +{ + struct vgic_v3_irq *irq; + uint32_t irq_base; + int i; + + irq_base = n * 16; + for (i = 0; i < 16; i++) { + /* + * The config can't be changed for SGIs and PPIs. SGIs have + * an edge-triggered behaviour, and the register is + * implementation defined to be read-only for PPIs. + */ + if (irq_base + i < VGIC_PRV_I_NUM) + continue; + + irq = vgic_v3_get_irq(hyp, vcpuid, irq_base + i); + if (irq == NULL) + continue; + + /* Bit 0 is RES0 */ + irq->config = (val >> (i * 2)) & VGIC_CONFIG_MASK; + vgic_v3_release_irq(irq); + } +} + +static uint64_t +read_route(struct hyp *hyp, int vcpuid, int n) +{ + struct vgic_v3_irq *irq; + uint64_t mpidr; + + irq = vgic_v3_get_irq(hyp, vcpuid, n); + if (irq == NULL) + return (0); + + mpidr = irq->mpidr; + vgic_v3_release_irq(irq); + + return (mpidr); +} + +static void +write_route(struct hyp *hyp, int vcpuid, int n, uint64_t val, u_int offset, + u_int size) +{ + struct vgic_v3_irq *irq; + + irq = vgic_v3_get_irq(hyp, vcpuid, n); + if (irq == NULL) + return; + + irq->mpidr = gic_reg_value_64(irq->mpidr, val, offset, size) & GICD_AFF; + irq->target_vcpu = mpidr_to_vcpu(hyp, irq->mpidr); + /* + * If the interrupt is pending we can either use the old mpidr, or + * the new mpidr. To simplify this code we use the old value so we + * don't need to move the interrupt until the next time it is + * moved to the pending state. + */ + vgic_v3_release_irq(irq); +} + +/* + * Distributor register handlers. + */ +/* GICD_CTLR */ +static void +dist_ctlr_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + struct vgic_v3_dist *dist; + + dist = &hyp->vgic_dist; + mtx_lock_spin(&dist->dist_mtx); + *rval = dist->gicd_ctlr; + mtx_unlock_spin(&dist->dist_mtx); + + /* Writes are never pending */ + *rval &= ~GICD_CTLR_RWP; +} + +static void +dist_ctlr_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + struct vgic_v3_dist *dist; + + MPASS(offset == 0); + MPASS(size == 4); + dist = &hyp->vgic_dist; + + /* + * GICv2 backwards compatibility is not implemented so + * ARE_NS is RAO/WI. This means EnableGrp1 is RES0. + * + * EnableGrp1A is supported, and RWP is read-only. + * + * All other bits are RES0 from non-secure mode as we + * implement as if we are in a system with two security + * states. 
+ */
+ wval &= GICD_CTLR_G1A;
+ wval |= GICD_CTLR_ARE_NS;
+ mtx_lock_spin(&dist->dist_mtx);
+ dist->gicd_ctlr = wval;
+ /* TODO: Wake any vcpus that have interrupts pending */
+ mtx_unlock_spin(&dist->dist_mtx);
+}
+
+/* GICD_TYPER */
+static void
+dist_typer_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval,
+ void *arg)
+{
+ uint32_t typer;
+
+ typer = (10 - 1) << GICD_TYPER_IDBITS_SHIFT;
+ typer |= GICD_TYPER_MBIS;
+ /* ITLinesNumber: */
+ typer |= howmany(VGIC_NIRQS + 1, 32) - 1;
+
+ *rval = typer;
+}
+
+/* GICD_IIDR */
+static void
+dist_iidr_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval,
+ void *arg)
+{
+ *rval = VGIC_IIDR;
+}
+
+/* GICD_SETSPI_NSR & GICD_CLRSPI_NSR */
+static void
+dist_setclrspi_nsr_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset,
+ u_int size, uint64_t wval, void *arg)
+{
+ uint32_t irqid;
+
+ MPASS(offset == 0);
+ MPASS(size == 4);
+ irqid = wval & GICD_SPI_INTID_MASK;
+ vgic_v3_inject_irq(hyp, vcpuid, irqid, reg == GICD_SETSPI_NSR);
+}
+
+/* GICD_ISENABLER */
+static void
+dist_isenabler_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval,
+ void *arg)
+{
+ int n;
+
+ n = (reg - GICD_ISENABLER(0)) / 4;
+ /* GICD_ISENABLER0 is RAZ/WI so handled separately */
+ MPASS(n > 0);
+ *rval = read_enabler(hyp, vcpuid, n);
+}
+
+static void
+dist_isenabler_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset,
+ u_int size, uint64_t wval, void *arg)
+{
+ int n;
+
+ MPASS(offset == 0);
+ MPASS(size == 4);
+ n = (reg - GICD_ISENABLER(0)) / 4;
+ /* GICD_ISENABLER0 is RAZ/WI so handled separately */
+ MPASS(n > 0);
+ write_enabler(hyp, vcpuid, n, true, wval);
+}
+
+/* GICD_ICENABLER */
+static void
+dist_icenabler_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval,
+ void *arg)
+{
+ int n;
+
+ n = (reg - GICD_ICENABLER(0)) / 4;
+ /* GICD_ICENABLER0 is RAZ/WI so handled separately */
+ MPASS(n > 0);
+ *rval = read_enabler(hyp, vcpuid, n);
+}
+
+static void
+dist_icenabler_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset,
+ u_int size, uint64_t wval, void *arg)
+{
+ int n;
+
+ MPASS(offset == 0);
+ MPASS(size == 4);
+ n = (reg - GICD_ICENABLER(0)) / 4;
+ /* GICD_ICENABLER0 is RAZ/WI so handled separately */
+ MPASS(n > 0);
+ write_enabler(hyp, vcpuid, n, false, wval);
+}
+
+/* GICD_ISPENDR */
+static void
+dist_ispendr_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval,
+ void *arg)
+{
+ int n;
+
+ n = (reg - GICD_ISPENDR(0)) / 4;
+ /* GICD_ISPENDR0 is RAZ/WI so handled separately */
+ MPASS(n > 0);
+ *rval = read_pendr(hyp, vcpuid, n);
+}
+
+static void
+dist_ispendr_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset,
+ u_int size, uint64_t wval, void *arg)
+{
+ int n;
+
+ MPASS(offset == 0);
+ MPASS(size == 4);
+ n = (reg - GICD_ISPENDR(0)) / 4;
+ /* GICD_ISPENDR0 is RAZ/WI so handled separately */
+ MPASS(n > 0);
+ write_pendr(hyp, vcpuid, n, true, wval);
+}
+
+/* GICD_ICPENDR */
+static void
+dist_icpendr_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval,
+ void *arg)
+{
+ int n;
+
+ n = (reg - GICD_ICPENDR(0)) / 4;
+ /* GICD_ICPENDR0 is RAZ/WI so handled separately */
+ MPASS(n > 0);
+ *rval = read_pendr(hyp, vcpuid, n);
+}
+
+static void
+dist_icpendr_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset,
+ u_int size, uint64_t wval, void *arg)
+{
+ int n;
+
+ MPASS(offset == 0);
+ MPASS(size == 4);
+ n = (reg - GICD_ICPENDR(0)) / 4;
+ /* GICD_ICPENDR0 is RAZ/WI so handled separately */
+ MPASS(n > 0);
+ write_pendr(hyp, vcpuid, n, false, wval);
+}
+
+/* GICD_ISACTIVER */
+/*
Affinity routing is enabled so isactiver0 is RAZ/WI */ +static void +dist_isactiver_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICD_ISACTIVER(0)) / 4; + /* GICD_ISACTIVER0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_activer(hyp, vcpuid, n); +} + +static void +dist_isactiver_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ISACTIVER(0)) / 4; + /* GICD_ISACTIVE0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_activer(hyp, vcpuid, n, true, wval); +} + +/* GICD_ICACTIVER */ +static void +dist_icactiver_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICD_ICACTIVER(0)) / 4; + /* GICD_ICACTIVE0 is RAZ/WI so handled separately */ + MPASS(n > 0); + *rval = read_activer(hyp, vcpuid, n); +} + +static void +dist_icactiver_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ICACTIVER(0)) / 4; + /* GICD_ICACTIVE0 is RAZ/WI so handled separately */ + MPASS(n > 0); + write_activer(hyp, vcpuid, n, false, wval); +} + +/* GICD_IPRIORITYR */ +/* Affinity routing is enabled so ipriorityr0-7 is RAZ/WI */ +static void +dist_ipriorityr_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICD_IPRIORITYR(0)) / 4; + /* GICD_IPRIORITY0-7 is RAZ/WI so handled separately */ + MPASS(n > 7); + *rval = read_priorityr(hyp, vcpuid, n); +} + +static void +dist_ipriorityr_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + u_int irq_base; + + irq_base = (reg - GICD_IPRIORITYR(0)) + offset; + /* GICD_IPRIORITY0-7 is RAZ/WI so handled separately */ + MPASS(irq_base > 31); + write_priorityr(hyp, vcpuid, irq_base, size, wval); +} + +/* GICD_ICFGR */ +static void +dist_icfgr_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICD_ICFGR(0)) / 4; + /* GICD_ICFGR0-1 are RAZ/WI so handled separately */ + MPASS(n > 1); + *rval = read_config(hyp, vcpuid, n); +} + +static void +dist_icfgr_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + int n; + + MPASS(offset == 0); + MPASS(size == 4); + n = (reg - GICD_ICFGR(0)) / 4; + /* GICD_ICFGR0-1 are RAZ/WI so handled separately */ + MPASS(n > 1); + write_config(hyp, vcpuid, n, wval); +} + +/* GICD_IROUTER */ +static void +dist_irouter_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICD_IROUTER(0)) / 8; + /* GICD_IROUTER0-31 don't exist */ + MPASS(n > 31); + *rval = read_route(hyp, vcpuid, n); +} + +static void +dist_irouter_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + int n; + + n = (reg - GICD_IROUTER(0)) / 8; + /* GICD_IROUTER0-31 don't exist */ + MPASS(n > 31); + write_route(hyp, vcpuid, n, wval, offset, size); +} + +static bool +vgic_register_read(struct hyp *hyp, struct vgic_register *reg_list, + u_int reg_list_size, int vcpuid, u_int reg, u_int size, + uint64_t *rval, void *arg) +{ + u_int i, offset; + + for (i = 0; i < reg_list_size; i++) { + if (reg_list[i].start <= reg && reg_list[i].end >= reg + size) { + offset = reg & reg_list[i].size - 1; + reg -= offset; + if ((reg_list[i].flags & 
size) != 0) { + reg_list[i].read(hyp, vcpuid, reg, rval, NULL); + + /* Move the bits into the correct place */ + *rval >>= (offset * 8); + if (size < 8) { + *rval &= (1ul << (size * 8)) - 1; + } + } else { + panic("TODO: Handle invalid register size: " + "reg %x size %d", reg, size); + } + return (true); + } + } + return (false); +} + +static bool +vgic_register_write(struct hyp *hyp, struct vgic_register *reg_list, + u_int reg_list_size, int vcpuid, u_int reg, u_int size, + uint64_t wval, void *arg) +{ + u_int i, offset; + + for (i = 0; i < reg_list_size; i++) { + if (reg_list[i].start <= reg && reg_list[i].end >= reg + size) { + offset = reg & reg_list[i].size - 1; + reg -= offset; + if ((reg_list[i].flags & size) != 0) { + reg_list[i].write(hyp, vcpuid, reg, offset, + size, wval, NULL); + } else { + panic("TODO: Handle invalid register size: " + "reg %x size %d", reg, size); + } + return (true); + } + } + return (false); +} + +static int +dist_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp = vm_get_cookie(vm); + struct vgic_v3_dist *dist = &hyp->vgic_dist; + uint64_t reg; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < dist->start || fault_ipa + size > dist->end) { + return (EINVAL); + } + + reg = fault_ipa - dist->start; + /* Check the register is correctly aligned */ + if ((reg & (size - 1)) != 0) + return (EINVAL); + + if (vgic_register_read(hyp, dist_registers, nitems(dist_registers), + vcpuid, reg, size, rval, NULL)) + return (0); + + /* TODO: Check the correct behaviour */ + printf("%s: %lx\n", __func__, fault_ipa - dist->start); + *rval = 0; + + return (0); +} + +static int +dist_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp = vm_get_cookie(vm); + struct vgic_v3_dist *dist = &hyp->vgic_dist; + uint64_t reg; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < dist->start || fault_ipa + size > dist->end) { + return (EINVAL); + } + + reg = fault_ipa - dist->start; + /* Check the register is correctly aligned */ + if ((reg & (size - 1)) != 0) + return (EINVAL); + + if (vgic_register_write(hyp, dist_registers, nitems(dist_registers), + vcpuid, reg, size, wval, NULL)) + return (0); + + panic("%s: %lx\n", __func__, fault_ipa - dist->start); + return (0); +} + +/* + * Redistributor register handlers. 
+ * + * RD_base: + */ +/* GICR_CTLR */ +static void +redist_ctlr_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + /* LPIs not supported */ + *rval = 0; +} + +/* GICR_IIDR */ +static void +redist_iidr_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = VGIC_IIDR; +} + +/* GICR_TYPER */ +static void +redist_typer_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + struct vgic_v3_redist *redist; + + redist = &hyp->ctx[vcpuid].vgic_redist; + *rval = redist->gicr_typer; +} + +/* + * SGI_base: + */ +/* GICR_ISENABLER0 */ +static void +redist_ienabler0_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = read_enabler(hyp, vcpuid, 0); +} + +static void +redist_isenabler0_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_enabler(hyp, vcpuid, 0, true, wval); +} + +/* GICR_ICENABLER0 */ +static void +redist_icenabler0_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_enabler(hyp, vcpuid, 0, false, wval); +} + +/* GICR_ISPENDR0 */ +static void +redist_ipendr0_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = read_pendr(hyp, vcpuid, 0); +} + +static void +redist_ispendr0_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_pendr(hyp, vcpuid, 0, true, wval); +} + +/* GICR_ICPENDR0 */ +static void +redist_icpendr0_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_pendr(hyp, vcpuid, 0, false, wval); +} + +/* GICR_ISACTIVER0 */ +static void +redist_iactiver0_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = read_activer(hyp, vcpuid, 0); +} + +static void +redist_isactiver0_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + write_activer(hyp, vcpuid, 0, true, wval); +} + +/* GICR_ICACTIVER0 */ +static void +redist_icactiver0_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + write_activer(hyp, vcpuid, 0, false, wval); +} + +/* GICR_IPRIORITYR */ +static void +redist_ipriorityr_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + int n; + + n = (reg - GICR_IPRIORITYR(0)) / 4; + *rval = read_priorityr(hyp, vcpuid, n); +} + +static void +redist_ipriorityr_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + u_int irq_base; + + irq_base = (reg - GICR_IPRIORITYR(0)) + offset; + write_priorityr(hyp, vcpuid, irq_base, size, wval); +} + +/* GICR_ICFGR1 */ +static void +redist_icfgr1_read(struct hyp *hyp, int vcpuid, u_int reg, uint64_t *rval, + void *arg) +{ + *rval = read_config(hyp, vcpuid, 0); +} + +static void +redist_icfgr1_write(struct hyp *hyp, int vcpuid, u_int reg, u_int offset, + u_int size, uint64_t wval, void *arg) +{ + MPASS(offset == 0); + MPASS(size == 4); + write_config(hyp, vcpuid, 0, wval); +} + +static int +redist_read(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t *rval, + int size, void *arg) +{ + struct hyp *hyp = vm_get_cookie(vm); + struct vgic_v3_redist *redist = &hyp->ctx[vcpuid].vgic_redist; + uint64_t 
reg; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < redist->start || fault_ipa + size > redist->end) { + return (EINVAL); + } + + reg = fault_ipa - redist->start; + /* Check the register is correctly aligned */ + if ((reg & (size - 1)) != 0) + return (EINVAL); + + if (reg < GICR_RD_BASE_SIZE) { + if (vgic_register_read(hyp, redist_rd_registers, + nitems(redist_rd_registers), vcpuid, reg, size, rval, NULL)) + return (0); + } else if (reg < (GICR_SGI_BASE + GICR_SGI_BASE_SIZE)) { + if (vgic_register_read(hyp, redist_sgi_registers, + nitems(redist_sgi_registers), vcpuid, + reg - GICR_SGI_BASE, size, rval, NULL)) + return (0); + } + + panic("%s: %lx", __func__, reg); +} + +static int +redist_write(void *vm, int vcpuid, uint64_t fault_ipa, uint64_t wval, + int size, void *arg) +{ + struct hyp *hyp = vm_get_cookie(vm); + struct vgic_v3_redist *redist = &hyp->ctx[vcpuid].vgic_redist; + uint64_t reg; + + /* Check the register is one of ours and is the correct size */ + if (fault_ipa < redist->start || fault_ipa + size > redist->end) { + return (EINVAL); + } + + reg = fault_ipa - redist->start; + /* Check the register is correctly aligned */ + if ((reg & (size - 1)) != 0) + return (EINVAL); + + if (reg < GICR_RD_BASE_SIZE) { + if (vgic_register_write(hyp, redist_rd_registers, + nitems(redist_rd_registers), vcpuid, reg, size, wval, NULL)) + return (0); + } else if (reg < (GICR_SGI_BASE + GICR_SGI_BASE_SIZE)) { + if (vgic_register_write(hyp, redist_sgi_registers, + nitems(redist_sgi_registers), vcpuid, + reg - GICR_SGI_BASE, size, wval, NULL)) + return (0); + } + + panic("%s: %lx", __func__, reg); +} + +int +vgic_v3_icc_sgi1r_read(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + /* + * TODO: Inject an unknown exception. + */ + *rval = 0; + return (0); +} + +/* vgic_v3_icc_sgi1r_write currently only handles 16 CPUs */ +CTASSERT(VM_MAXCPU <= 16); +int +vgic_v3_icc_sgi1r_write(void *vm, int vcpuid, uint64_t rval, void *arg) +{ + struct hyp *hyp; + cpuset_t active_cpus; + uint32_t irqid; + int cpus, vcpu; + + hyp = vm_get_cookie(vm); + active_cpus = vm_active_cpus(vm); + irqid = (rval >> ICC_SGI1R_EL1_SGIID_SHIFT) & ICC_SGI1R_EL1_SGIID_MASK; + if ((rval & ICC_SGI1R_EL1_IRM) == 0) { + /* + * TODO: Support on more than 16 CPUs. This is the mask for the + * affinity bits. These should be 0. 
+ */ + if ((rval & 0xff00ff00ff000ul) != 0) + return (0); + cpus = rval & 0xff; + vcpu = 0; + while (cpus > 0) { + if (CPU_ISSET(vcpu, &active_cpus) && vcpu != vcpuid) { + vgic_v3_inject_irq(hyp, vcpu, irqid, true); + } + vcpu++; + cpus >>= 1; + } + } else { + /* Send an IPI to all CPUs other than the current CPU */ + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { + if (CPU_ISSET(vcpu, &active_cpus) && vcpu != vcpuid) { + vgic_v3_inject_irq(hyp, vcpu, irqid, true); + } + } + } + + return (0); +} + +static void +vgic_v3_mmio_init(struct hyp *hyp) +{ + struct vgic_v3_dist *dist; + struct vgic_v3_irq *irq; + int i; + + /* Allocate memory for the SPIs */ + dist = &hyp->vgic_dist; + dist->irqs = malloc((VGIC_NIRQS - VGIC_PRV_I_NUM) * + sizeof(*dist->irqs), M_VGIC_V3, M_WAITOK | M_ZERO); + + for (i = 0; i < VGIC_NIRQS - VGIC_PRV_I_NUM; i++) { + irq = &dist->irqs[i]; + + mtx_init(&irq->irq_spinmtx, "VGIC IRQ spinlock", NULL, + MTX_SPIN); + + irq->irq = i + VGIC_PRV_I_NUM; + } +} + +static void +vgic_v3_mmio_destroy(struct hyp *hyp) +{ + struct vgic_v3_dist *dist = &hyp->vgic_dist; + struct vgic_v3_irq *irq; + int i; + + for (i = 0; i < VGIC_NIRQS - VGIC_PRV_I_NUM; i++) { + irq = &dist->irqs[i]; + + mtx_destroy(&irq->irq_spinmtx); + } + + free(dist->irqs, M_VGIC_V3); +} + +int +vgic_v3_attach_to_vm(struct vm *vm, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size) +{ + struct hyp *hyp = vm_get_cookie(vm); + struct vgic_v3_dist *dist = &hyp->vgic_dist; + struct vgic_v3_redist *redist; + int i; + + /* The register bases need to be 64k aligned */ + if (!__is_aligned(dist_start, PAGE_SIZE_64K) || + !__is_aligned(redist_start, PAGE_SIZE_64K)) + return (EINVAL); + + /* The dist register space is 1 64k block */ + if (dist_size != PAGE_SIZE_64K) + return (EINVAL); + + /* The redist register space is 2 64k blocks */ + if (redist_size != PAGE_SIZE_64K * 2) + return (EINVAL); + + /* Set the distributor address and size for trapping guest access. */ + dist->start = dist_start; + dist->end = dist_start + dist_size; + + for (i = 0; i < VM_MAXCPU; i++) { + redist = &hyp->ctx[i].vgic_redist; + /* Set the redistributor address and size. 
*/ + redist->start = redist_start; + redist->end = redist_start + redist_size; + } + + vm_register_inst_handler(vm, dist_start, dist_size, dist_read, + dist_write); + vm_register_inst_handler(vm, redist_start, redist_size, redist_read, + redist_write); + + vgic_v3_mmio_init(hyp); + + hyp->vgic_attached = true; + + return (0); +} + +void +vgic_v3_detach_from_vm(struct vm *vm) +{ + struct hyp *hyp = vm_get_cookie(vm); + + if (hyp->vgic_attached) { + hyp->vgic_attached = false; + vgic_v3_mmio_destroy(hyp); + } +} + +static struct vgic_v3_irq * +vgic_v3_get_irq(struct hyp *hyp, int vcpuid, uint32_t irqid) +{ + struct vgic_v3_cpu_if *cpu_if; + struct vgic_v3_dist *dist; + struct vgic_v3_irq *irq; + + if (irqid < VGIC_PRV_I_NUM) { + if (vcpuid < 0 || vcpuid >= nitems(hyp->ctx)) + return (NULL); + + cpu_if = &hyp->ctx[vcpuid].vgic_cpu_if; + irq = &cpu_if->private_irqs[irqid]; + } else if (irqid <= GIC_LAST_SPI) { + dist = &hyp->vgic_dist; + irqid -= VGIC_PRV_I_NUM; + if (irqid >= VGIC_NIRQS) + return (NULL); + irq = &dist->irqs[irqid]; + } else if (irqid < GIC_FIRST_LPI) { + return (NULL); + } else { + /* No support for LPIs */ + return (NULL); + } + + mtx_lock_spin(&irq->irq_spinmtx); + return (irq); +} + +static void +vgic_v3_release_irq(struct vgic_v3_irq *irq) +{ + + mtx_unlock_spin(&irq->irq_spinmtx); +} + +bool +vgic_v3_vcpu_pending_irq(struct hypctx *hypctx) +{ + struct vgic_v3_cpu_if *cpu_if; + bool empty; + + cpu_if = &hypctx->vgic_cpu_if; + mtx_lock_spin(&cpu_if->lr_mtx); + empty = TAILQ_EMPTY(&cpu_if->irq_act_pend); + mtx_unlock_spin(&cpu_if->lr_mtx); + + return (!empty); +} + +static bool +vgic_v3_check_irq(struct vgic_v3_irq *irq, bool level) +{ + /* + * Only inject if: + * - Level-triggered IRQ: level changes low -> high + * - Edge-triggered IRQ: level is high + */ + switch (irq->config & VGIC_CONFIG_MASK) { + case VGIC_CONFIG_LEVEL: + return (level != irq->level); + case VGIC_CONFIG_EDGE: + return (level); + default: + break; + } + + return (false); +} + +int +vgic_v3_inject_irq(struct hyp *hyp, int vcpuid, uint32_t irqid, bool level) +{ + + struct vgic_v3_cpu_if *cpu_if; + struct vgic_v3_irq *irq; + int target_vcpu; + bool notify; + + KASSERT(vcpuid == -1 || irqid < VGIC_PRV_I_NUM, + ("%s: SPI/LPI with vcpuid set: irq %u vcpuid %u", __func__, irqid, + vcpuid)); + + irq = vgic_v3_get_irq(hyp, vcpuid, irqid); + if (irq == NULL) { + eprintf("Malformed IRQ %u.\n", irqid); + return (1); + } + + target_vcpu = irq->target_vcpu; + KASSERT(vcpuid == -1 || vcpuid == target_vcpu, + ("%s: Interrupt %u has bad cpu affinity: vcpu %d target vcpu %d", + __func__, irqid, vcpuid, target_vcpu)); + KASSERT(target_vcpu >= 0 && target_vcpu < VM_MAXCPU, + ("%s: Interrupt %u sent to invalid vcpu %d", __func__, irqid, + target_vcpu)); + + if (vcpuid == -1) + vcpuid = target_vcpu; + /* TODO: Check from 0 to vm->maxcpus */ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) { + vgic_v3_release_irq(irq); + return (1); + } + + notify = false; + cpu_if = &hyp->ctx[vcpuid].vgic_cpu_if; + + mtx_lock_spin(&cpu_if->lr_mtx); + + if (!vgic_v3_check_irq(irq, level)) { + goto out; + } + + if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_LEVEL) + irq->level = level; + else /* VGIC_CONFIG_EDGE */ + irq->pending = true; + + notify = vgic_v3_queue_irq(hyp, cpu_if, vcpuid, irq); + +out: + mtx_unlock_spin(&cpu_if->lr_mtx); + vgic_v3_release_irq(irq); + + if (notify) + vcpu_notify_event(hyp->vm, vcpuid, false); + + return (0); +} + +int +vgic_v3_inject_msi(struct hyp *hyp, uint64_t msg, uint64_t addr) +{ + struct vgic_v3_dist *dist 
= &hyp->vgic_dist; + uint64_t reg; + + /* This is a 4 byte register */ + if (addr < dist->start || addr + 4 > dist->end) { + return (EINVAL); + } + + reg = addr - dist->start; + if (reg != GICD_SETSPI_NSR) + return (EINVAL); + + return (vgic_v3_inject_irq(hyp, -1, msg, true)); +} + +void +vgic_v3_flush_hwstate(void *arg) +{ + struct hypctx *hypctx; + struct vgic_v3_cpu_if *cpu_if; + struct vgic_v3_irq *irq; + int i; + + hypctx = arg; + cpu_if = &hypctx->vgic_cpu_if; + + /* + * All Distributor writes have been executed at this point, do not + * protect Distributor reads with a mutex. + * + * This is callled with all interrupts disabled, so there is no need for + * a List Register spinlock either. + */ + mtx_lock_spin(&cpu_if->lr_mtx); + + cpu_if->ich_hcr_el2 &= ~ICH_HCR_EL2_UIE; + + /* Exit early if there are no buffered interrupts */ + if (TAILQ_EMPTY(&cpu_if->irq_act_pend)) + goto out; + + KASSERT(cpu_if->ich_lr_used == 0, ("%s: Used LR count not zero %u", + __func__, cpu_if->ich_lr_used)); + + i = 0; + cpu_if->ich_elrsr_el2 = (1 << cpu_if->ich_lr_num) - 1; + TAILQ_FOREACH(irq, &cpu_if->irq_act_pend, act_pend_list) { + /* No free list register, stop searching for IRQs */ + if (i == cpu_if->ich_lr_num) + break; + + if (!irq->enabled) + continue; + + cpu_if->ich_lr_el2[i] = ICH_LR_EL2_GROUP1 | + ((uint64_t)irq->priority << ICH_LR_EL2_PRIO_SHIFT) | + irq->irq; + + if (irq->active) { + cpu_if->ich_lr_el2[i] |= ICH_LR_EL2_STATE_ACTIVE; + } + +#ifdef notyet + /* TODO: Check why this is needed */ + if ((irq->config & _MASK) == LEVEL) + cpu_if->ich_lr_el2[i] |= ICH_LR_EL2_EOI; +#endif + + if (!irq->active && vgic_v3_irq_pending(irq)) { + cpu_if->ich_lr_el2[i] |= ICH_LR_EL2_STATE_PENDING; + + /* + * This IRQ is now pending on the guest. Allow for + * another edge that could cause the interrupt to + * be raised again. + */ + if ((irq->config & VGIC_CONFIG_MASK) == + VGIC_CONFIG_EDGE) { + irq->pending = false; + } + } + + i++; + } + cpu_if->ich_lr_used = i; + +out: + mtx_unlock_spin(&cpu_if->lr_mtx); +} + +void +vgic_v3_sync_hwstate(void *arg) +{ + struct hypctx *hypctx; + struct vgic_v3_cpu_if *cpu_if; + struct vgic_v3_irq *irq; + uint64_t lr; + int i; + + hypctx = arg; + cpu_if = &hypctx->vgic_cpu_if; + + /* Exit early if there are no buffered interrupts */ + if (cpu_if->ich_lr_used == 0) + return; + + /* + * Check on the IRQ state after running the guest. ich_lr_used and + * ich_lr_el2 are only ever used within this thread so is safe to + * access unlocked. + */ + for (i = 0; i < cpu_if->ich_lr_used; i++) { + lr = cpu_if->ich_lr_el2[i]; + cpu_if->ich_lr_el2[i] = 0; + + irq = vgic_v3_get_irq(hypctx->hyp, hypctx->vcpu, + ICH_LR_EL2_VINTID(lr)); + if (irq == NULL) + continue; + + irq->active = (lr & ICH_LR_EL2_STATE_ACTIVE) != 0; + + if ((irq->config & VGIC_CONFIG_MASK) == VGIC_CONFIG_EDGE) { + /* + * If we have an edge triggered IRQ preserve the + * pending bit until the IRQ has been handled. + */ + if ((lr & ICH_LR_EL2_STATE_PENDING) != 0) { + irq->pending = true; + } + } else { + /* + * If we have a level triggerend IRQ remove the + * pending bit if the IRQ has been handled. + * The level is separate, so may still be high + * triggering another IRQ. 
+ */
+ if ((lr & ICH_LR_EL2_STATE_PENDING) == 0) {
+ irq->pending = false;
+ }
+ }
+
+ /* Lock to update irq_act_pend */
+ mtx_lock_spin(&cpu_if->lr_mtx);
+ if (irq->active) {
+ /* Ensure the active IRQ is at the head of the list */
+ TAILQ_REMOVE(&cpu_if->irq_act_pend, irq, act_pend_list);
+ TAILQ_INSERT_HEAD(&cpu_if->irq_act_pend, irq,
+ act_pend_list);
+ } else if (!vgic_v3_irq_pending(irq)) {
+ /* Neither pending nor active, remove from the list */
+ TAILQ_REMOVE(&cpu_if->irq_act_pend, irq, act_pend_list);
+ irq->on_aplist = false;
+ }
+ mtx_unlock_spin(&cpu_if->lr_mtx);
+ vgic_v3_release_irq(irq);
+ }
+
+ cpu_if->ich_hcr_el2 &= ~ICH_HCR_EL2_EOICOUNT_MASK;
+ cpu_if->ich_lr_used = 0;
+}
+
+static int
+vgic_probe(device_t dev)
+{
+ if (!gic_get_vgic(dev))
+ return (EINVAL);
+
+ /* We currently only support the GICv3 */
+ if (gic_get_hw_rev(dev) < 3)
+ return (EINVAL);
+
+ device_set_desc(dev, "Virtual GIC");
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+vgic_attach(device_t dev)
+{
+ have_vgic = true;
+ return (0);
+}
+
+static device_method_t vgic_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, vgic_probe),
+ DEVMETHOD(device_attach, vgic_attach),
+
+ /* End */
+ DEVMETHOD_END
+};
+
+DEFINE_CLASS_0(vgic, vgic_driver, vgic_methods, 0);
+
+DRIVER_MODULE(vgic, gic, vgic_driver, 0, 0);
+
+bool
+vgic_present(void)
+{
+ return (have_vgic);
+}
+
+void
+vgic_v3_init(uint64_t ich_vtr_el2)
+{
+ uint32_t pribits, prebits;
+
+ MPASS(have_vgic);
+
+ pribits = ICH_VTR_EL2_PRIBITS(ich_vtr_el2);
+ switch (pribits) {
+ case 5:
+ virt_features.min_prio = 0xf8;
+ break;
+ case 6:
+ virt_features.min_prio = 0xfc;
+ break;
+ case 7:
+ virt_features.min_prio = 0xfe;
+ break;
+ case 8:
+ virt_features.min_prio = 0xff;
+ break;
+ }
+
+ prebits = ICH_VTR_EL2_PREBITS(ich_vtr_el2);
+ switch (prebits) {
+ case 5:
+ virt_features.ich_apr_num = 1;
+ break;
+ case 6:
+ virt_features.ich_apr_num = 2;
+ break;
+ case 7:
+ virt_features.ich_apr_num = 4;
+ break;
+ }
+
+ virt_features.ich_lr_num = ICH_VTR_EL2_LISTREGS(ich_vtr_el2);
+}
diff --git a/sys/arm64/vmm/io/vgic_v3_reg.h b/sys/arm64/vmm/io/vgic_v3_reg.h
new file mode 100644
--- /dev/null
+++ b/sys/arm64/vmm/io/vgic_v3_reg.h
@@ -0,0 +1,99 @@
+#ifndef _VGIC_V3_REG_H_
+#define _VGIC_V3_REG_H_
+
+/* Interrupt Controller End of Interrupt Status Register */
+#define ICH_EISR_EL2_STATUS_MASK 0xffff
+#define ICH_EISR_EL2_EOI_NOT_HANDLED(lr) ((1 << lr) & ICH_EISR_EL2_STATUS_MASK)
+
+/* Interrupt Controller Empty List Register Status Register */
+#define ICH_ELSR_EL2_STATUS_MASK 0xffff
+#define ICH_ELSR_EL2_LR_EMPTY(x) ((1 << x) & ICH_ELSR_EL2_STATUS_MASK)
+
+/* Interrupt Controller Hyp Control Register */
+#define ICH_HCR_EL2_EOICOUNT_SHIFT 27
+#define ICH_HCR_EL2_EOICOUNT_MASK (0x1f << ICH_HCR_EL2_EOICOUNT_SHIFT)
+#define ICH_HCR_EL2_TDIR (1 << 14) /* Trap non-secure EL1 writes to IC{C, V}_DIR_EL1 */
+#define ICH_HCR_EL2_TSEI (1 << 13) /* Trap System Error Interrupts (SEI) to EL2 */
+#define ICH_HCR_EL2_TALL1 (1 << 12) /* Trap non-secure EL1 accesses to IC{C, V}_* for Group 1 interrupts */
+#define ICH_HCR_EL2_TALL0 (1 << 11) /* Trap non-secure EL1 accesses to IC{C, V}_* for Group 0 interrupts */
+#define ICH_HCR_EL2_TC (1 << 10) /* Trap non-secure EL1 accesses to common IC{C, V}_* registers */
+#define ICH_HCR_EL2_VGRP1DIE (1 << 7) /* VM Group 1 Disabled Interrupt Enable */
+#define ICH_HCR_EL2_VGRP1EIE (1 << 6) /* VM Group 1 Enabled Interrupt Enable */
+#define ICH_HCR_EL2_VGRP0DIE (1 << 5) /* VM Group 0 Disabled Interrupt Enable */
+#define ICH_HCR_EL2_VGRP0EIE (1 << 4) /* VM Group 0 Enabled Interrupt Enable */
+#define ICH_HCR_EL2_NPIE (1 << 3) /* No Pending Interrupt Enable */
+#define ICH_HCR_EL2_LRENPIE (1 << 2) /* List Register Entry Not Present Interrupt Enable */
+#define ICH_HCR_EL2_UIE (1 << 1) /* Underflow Interrupt Enable */
+#define ICH_HCR_EL2_En (1 << 0) /* Global enable for the virtual CPU interface */
+
+/* Interrupt Controller List Registers */
+#define ICH_LR_EL2_VINTID_MASK 0xffffffff
+#define ICH_LR_EL2_VINTID(x) ((x) & ICH_LR_EL2_VINTID_MASK)
+#define ICH_LR_EL2_PINTID_SHIFT 32
+#define ICH_LR_EL2_PINTID_MASK (0x3fUL << ICH_LR_EL2_PINTID_SHIFT)
+/* Raise a maintenance IRQ when deactivated (only non-HW virqs) */
+#define ICH_LR_EL2_EOI (1UL << 41)
+#define ICH_LR_EL2_PRIO_SHIFT 48
+#define ICH_LR_EL2_PRIO_MASK (0xffUL << ICH_LR_EL2_PRIO_SHIFT)
+#define ICH_LR_EL2_GROUP_SHIFT 60
+#define ICH_LR_EL2_GROUP1 (1UL << ICH_LR_EL2_GROUP_SHIFT)
+#define ICH_LR_EL2_HW (1UL << 61)
+#define ICH_LR_EL2_STATE_SHIFT 62
+#define ICH_LR_EL2_STATE_MASK (0x3UL << ICH_LR_EL2_STATE_SHIFT)
+#define ICH_LR_EL2_STATE(x) ((x) & ICH_LR_EL2_STATE_MASK)
+#define ICH_LR_EL2_STATE_INACTIVE (0x0UL << ICH_LR_EL2_STATE_SHIFT)
+#define ICH_LR_EL2_STATE_PENDING (0x1UL << ICH_LR_EL2_STATE_SHIFT)
+#define ICH_LR_EL2_STATE_ACTIVE (0x2UL << ICH_LR_EL2_STATE_SHIFT)
+#define ICH_LR_EL2_STATE_PENDING_ACTIVE (0x3UL << ICH_LR_EL2_STATE_SHIFT)
+
+/* Interrupt Controller Maintenance Interrupt State Register */
+#define ICH_MISR_EL2_VGRP1D (1 << 7) /* vPE Group 1 Disabled */
+#define ICH_MISR_EL2_VGRP1E (1 << 6) /* vPE Group 1 Enabled */
+#define ICH_MISR_EL2_VGRP0D (1 << 5) /* vPE Group 0 Disabled */
+#define ICH_MISR_EL2_VGRP0E (1 << 4) /* vPE Group 0 Enabled */
+#define ICH_MISR_EL2_NP (1 << 3) /* No Pending */
+#define ICH_MISR_EL2_LRENP (1 << 2) /* List Register Entry Not Present */
+#define ICH_MISR_EL2_U (1 << 1) /* Underflow */
+#define ICH_MISR_EL2_EOI (1 << 0) /* End Of Interrupt */
+
+/* Interrupt Controller Virtual Machine Control Register */
+#define ICH_VMCR_EL2_VPMR_SHIFT 24
+#define ICH_VMCR_EL2_VPMR_MASK (0xff << ICH_VMCR_EL2_VPMR_SHIFT)
+#define ICH_VMCR_EL2_VPMR_PRIO_LOWEST (0xff << ICH_VMCR_EL2_VPMR_SHIFT)
+#define ICH_VMCR_EL2_VPMR_PRIO_HIGHEST (0x00 << ICH_VMCR_EL2_VPMR_SHIFT)
+#define ICH_VMCR_EL2_VBPR0_SHIFT 21
+#define ICH_VMCR_EL2_VBPR0_MASK (0x7 << ICH_VMCR_EL2_VBPR0_SHIFT)
+#define ICH_VMCR_EL2_VBPR0_NO_PREEMPTION \
+ (0x7 << ICH_VMCR_EL2_VBPR0_SHIFT)
+#define ICH_VMCR_EL2_VBPR1_SHIFT 18
+#define ICH_VMCR_EL2_VBPR1_MASK (0x7 << ICH_VMCR_EL2_VBPR1_SHIFT)
+#define ICH_VMCR_EL2_VBPR1_NO_PREEMPTION \
+ (0x7 << ICH_VMCR_EL2_VBPR1_SHIFT)
+#define ICH_VMCR_EL2_VEOIM (1 << 9) /* Virtual EOI mode */
+#define ICH_VMCR_EL2_VCBPR (1 << 4) /* Virtual Common binary Point Register */
+#define ICH_VMCR_EL2_VFIQEN (1 << 3) /* Virtual FIQ enable */
+#define ICH_VMCR_EL2_VACKCTL (1 << 2) /* Virtual AckCtl */
+#define ICH_VMCR_EL2_VENG1 (1 << 1) /* Virtual Group 1 Interrupt Enable */
+#define ICH_VMCR_EL2_VENG0 (1 << 0) /* Virtual Group 0 Interrupt Enable */
+
+/* Interrupt Controller VGIC Type Register */
+#define ICH_VTR_EL2_PRIBITS_SHIFT 29
+#define ICH_VTR_EL2_PRIBITS_MASK (0x7 << ICH_VTR_EL2_PRIBITS_SHIFT)
+#define ICH_VTR_EL2_PRIBITS(x) \
+ ((((x) & ICH_VTR_EL2_PRIBITS_MASK) >> ICH_VTR_EL2_PRIBITS_SHIFT) + 1)
+#define ICH_VTR_EL2_PREBITS_SHIFT 26
+#define ICH_VTR_EL2_PREBITS_MASK (0x7 << ICH_VTR_EL2_PREBITS_SHIFT)
+#define ICH_VTR_EL2_PREBITS(x) \
+ (((x) & ICH_VTR_EL2_PREBITS_MASK) >> ICH_VTR_EL2_PREBITS_SHIFT)
+#define ICH_VTR_EL2_SEIS (1 << 22) /* System Error Interrupt (SEI) Support */
+#define ICH_VTR_EL2_A3V (1
<< 21) /* Affinity 3 Valid */ +#define ICH_VTR_EL2_NV4 (1 << 20) /* Direct injection of virtual interrupts. RES1 for GICv3 */ +#define ICH_VTR_EL2_TDS (1 << 19) /* Implementation supports ICH_HCR_EL2.TDIR */ +#define ICH_VTR_EL2_LISTREGS_MASK 0x1f +/* + * ICH_VTR_EL2.ListRegs holds the number of list registers, minus one. Add one + * to get the actual number of list registers. + */ +#define ICH_VTR_EL2_LISTREGS(x) (((x) & ICH_VTR_EL2_LISTREGS_MASK) + 1) + +#endif /* !_VGIC_V3_REG_H_ */ diff --git a/sys/arm64/vmm/io/vtimer.h b/sys/arm64/vmm/io/vtimer.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vtimer.h @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 2017 The FreeBSD Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_VTIMER_H_ +#define _VMM_VTIMER_H_ + +#define GT_PHYS_NS_IRQ 30 +#define GT_VIRT_IRQ 27 + +struct vtimer +{ + uint64_t cnthctl_el2; + uint64_t cntvoff_el2; +}; + +struct vtimer_timer +{ + struct callout callout; + struct mtx mtx; + + uint32_t irqid; + + /* + * These registers are either emulated for the physical timer, or + * the guest has full access to them for the virtual timer. 
+ + * CNTx_CTL_EL0: Counter-timer Timer Control Register + * CNTx_CVAL_EL0: Counter-timer Timer CompareValue Register + */ + uint32_t cntx_cval_el0; + uint32_t cntx_ctl_el0; +}; + +struct vtimer_cpu +{ + struct vtimer_timer phys_timer; + struct vtimer_timer virt_timer; + + uint32_t cntkctl_el1; +}; + +int vtimer_init(uint64_t cnthctl_el2); +void vtimer_vminit(struct hyp *); +void vtimer_cpuinit(struct hypctx *); +void vtimer_cpucleanup(struct hypctx *); +void vtimer_vmcleanup(struct hyp *); +void vtimer_cleanup(void); + +int vtimer_phys_ctl_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vtimer_phys_ctl_write(void *vm, int vcpuid, uint64_t wval, void *arg); +int vtimer_phys_cnt_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vtimer_phys_cnt_write(void *vm, int vcpuid, uint64_t wval, void *arg); +int vtimer_phys_cval_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vtimer_phys_cval_write(void *vm, int vcpuid, uint64_t wval, void *arg); +int vtimer_phys_tval_read(void *vm, int vcpuid, uint64_t *rval, void *arg); +int vtimer_phys_tval_write(void *vm, int vcpuid, uint64_t wval, void *arg); +#endif diff --git a/sys/arm64/vmm/io/vtimer.c b/sys/arm64/vmm/io/vtimer.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/io/vtimer.c @@ -0,0 +1,456 @@ +/*- + * Copyright (c) 2017 The FreeBSD Foundation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company nor the name of the author may be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "vgic_v3.h" +#include "vtimer.h" + +#define RES1 0xffffffffffffffffUL + +#define timer_enabled(ctl) \ + (!((ctl) & CNTP_CTL_IMASK) && ((ctl) & CNTP_CTL_ENABLE)) + +static uint64_t cnthctl_el2_reg; +static uint32_t tmr_frq; +static bool have_vtimer = false; + +#define timer_condition_met(ctl) ((ctl) & CNTP_CTL_ISTATUS) + +static int +vtimer_virtual_timer_intr(void *arg) +{ + struct hypctx *hypctx; + uint32_t cntv_ctl; + + /* + * TODO everything here is very strange. 
The relantionship between the + * hardware value and the value in memory is not clear at all. + */ + + hypctx = arm64_get_active_vcpu(); + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + + if (!hypctx) { + /* vm_destroy() was called. */ + eprintf("No active vcpu\n"); + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + goto out; + } + if (!timer_enabled(cntv_ctl)) { + eprintf("Timer not enabled\n"); + goto out; + } + if (!timer_condition_met(cntv_ctl)) { + eprintf("Timer condition not met\n"); + goto out; + } + + vgic_v3_inject_irq(hypctx->hyp, hypctx->vcpu, GT_VIRT_IRQ, true); + + hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0 &= ~CNTP_CTL_ENABLE; + cntv_ctl = hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0; + +out: + /* + * Disable the timer interrupt. This will prevent the interrupt from + * being reasserted as soon as we exit the handler and getting stuck + * in an infinite loop. + * + * This is safe to do because the guest disabled the timer, and then + * enables it as part of the interrupt handling routine. + */ + cntv_ctl &= ~CNTP_CTL_ENABLE; + WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl); + + return (FILTER_HANDLED); +} + +int +vtimer_init(uint64_t cnthctl_el2) +{ + cnthctl_el2_reg = cnthctl_el2; + /* + * The guest *MUST* use the same timer frequency as the host. The + * register CNTFRQ_EL0 is accessible to the guest and a different value + * in the guest dts file might have unforseen consequences. + */ + tmr_frq = READ_SPECIALREG(cntfrq_el0); + + return (0); +} + +void +vtimer_vminit(struct hyp *hyp) +{ + uint64_t now; + + /* + * Configure the Counter-timer Hypervisor Control Register for the VM. + * + * ~CNTHCTL_EL1PCEN: trap access to CNTP_{CTL, CVAL, TVAL}_EL0 from EL1 + * CNTHCTL_EL1PCTEN: don't trap access to CNTPCT_EL0 + */ + hyp->vtimer.cnthctl_el2 = cnthctl_el2_reg & ~CNTHCTL_EL1PCEN; + hyp->vtimer.cnthctl_el2 |= CNTHCTL_EL1PCTEN; + + now = READ_SPECIALREG(cntpct_el0); + hyp->vtimer.cntvoff_el2 = now; + + return; +} + +void +vtimer_cpuinit(struct hypctx *hypctx) +{ + struct vtimer_cpu *vtimer_cpu; + + vtimer_cpu = &hypctx->vtimer_cpu; + /* + * Configure physical timer interrupts for the VCPU. + * + * CNTP_CTL_IMASK: mask interrupts + * ~CNTP_CTL_ENABLE: disable the timer + */ + vtimer_cpu->phys_timer.cntx_ctl_el0 = CNTP_CTL_IMASK & ~CNTP_CTL_ENABLE; + + mtx_init(&vtimer_cpu->phys_timer.mtx, "vtimer phys callout mutex", NULL, + MTX_DEF); + callout_init_mtx(&vtimer_cpu->phys_timer.callout, + &vtimer_cpu->phys_timer.mtx, 0); + vtimer_cpu->phys_timer.irqid = GT_PHYS_NS_IRQ; + + mtx_init(&vtimer_cpu->virt_timer.mtx, "vtimer virt callout mutex", NULL, + MTX_DEF); + callout_init_mtx(&vtimer_cpu->virt_timer.callout, + &vtimer_cpu->virt_timer.mtx, 0); + vtimer_cpu->virt_timer.irqid = GT_VIRT_IRQ; +} + +void +vtimer_cpucleanup(struct hypctx *hypctx) +{ + struct vtimer_cpu *vtimer_cpu; + + vtimer_cpu = &hypctx->vtimer_cpu; + callout_drain(&vtimer_cpu->phys_timer.callout); + callout_drain(&vtimer_cpu->virt_timer.callout); + mtx_destroy(&vtimer_cpu->phys_timer.mtx); + mtx_destroy(&vtimer_cpu->virt_timer.mtx); +} + +void +vtimer_vmcleanup(struct hyp *hyp) +{ + struct hypctx *hypctx; + uint32_t cntv_ctl; + + hypctx = arm64_get_active_vcpu(); + if (!hypctx) { + /* The active VM was destroyed, stop the timer. 
*/ + cntv_ctl = READ_SPECIALREG(cntv_ctl_el0); + cntv_ctl &= ~CNTP_CTL_ENABLE; + WRITE_SPECIALREG(cntv_ctl_el0, cntv_ctl); + } +} + +void +vtimer_cleanup(void) +{ +} + +static void +vtimer_inject_irq_callout_func(void *context) +{ + struct hypctx *hypctx; + + hypctx = context; + vgic_v3_inject_irq(hypctx->hyp, hypctx->vcpu, + hypctx->vtimer_cpu.phys_timer.irqid, true); +} + + +static void +vtimer_schedule_irq(struct vtimer_cpu *vtimer_cpu, struct hyp *hyp, int vcpuid) +{ + sbintime_t time; + struct vtimer_timer *timer; + uint64_t cntpct_el0; + uint64_t diff; + + timer = &vtimer_cpu->phys_timer; + cntpct_el0 = READ_SPECIALREG(cntpct_el0); + if (timer->cntx_cval_el0 < cntpct_el0) { + /* Timer set in the past, trigger interrupt */ + vgic_v3_inject_irq(hyp, vcpuid, timer->irqid, true); + } else { + diff = timer->cntx_cval_el0 - cntpct_el0; + time = diff * SBT_1S / tmr_frq; + callout_reset_sbt(&timer->callout, time, 0, + vtimer_inject_irq_callout_func, &hyp->ctx[vcpuid], 0); + } +} + +static void +vtimer_remove_irq(struct hypctx *hypctx, int vcpuid) +{ + struct vtimer_cpu *vtimer_cpu; + struct vtimer_timer *timer; + + vtimer_cpu = &hypctx->vtimer_cpu; + timer = &vtimer_cpu->phys_timer; + + callout_drain(&timer->callout); + /* + * The interrupt needs to be deactivated here regardless of the callout + * function having been executed. The timer interrupt can be masked with + * the CNTP_CTL_EL0.IMASK bit instead of reading the IAR register. + * Masking the interrupt doesn't remove it from the list registers. + */ + vgic_v3_inject_irq(hypctx->hyp, vcpuid, timer->irqid, false); +} + +/* + * Timer emulation functions. + * + * The guest should use the virtual timer, however some software, e.g. u-boot, + * used the physical timer. Emulate this in software for the guest to use. + * + * Adjust for cntvoff_el2 so the physical and virtual timers are at similar + * times. This simplifies interrupt handling in the virtual timer as the + * adjustment will have already happened. 
+ */ + +int +vtimer_phys_ctl_read(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + struct vtimer_cpu *vtimer_cpu; + uint64_t cntpct_el0; + + hyp = vm_get_cookie(vm); + vtimer_cpu = &hyp->ctx[vcpuid].vtimer_cpu; + + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; + if (vtimer_cpu->phys_timer.cntx_cval_el0 < cntpct_el0) + /* Timer condition met */ + *rval = vtimer_cpu->phys_timer.cntx_ctl_el0 | CNTP_CTL_ISTATUS; + else + *rval = vtimer_cpu->phys_timer.cntx_ctl_el0 & ~CNTP_CTL_ISTATUS; + + return (0); +} + +int +vtimer_phys_ctl_write(void *vm, int vcpuid, uint64_t wval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + uint64_t ctl_el0; + bool timer_toggled_on; + + hyp = vm_get_cookie(vm); + hypctx = &hyp->ctx[vcpuid]; + vtimer_cpu = &hypctx->vtimer_cpu; + + timer_toggled_on = false; + ctl_el0 = vtimer_cpu->phys_timer.cntx_ctl_el0; + + if (!timer_enabled(ctl_el0) && timer_enabled(wval)) + timer_toggled_on = true; + + vtimer_cpu->phys_timer.cntx_ctl_el0 = wval; + + if (timer_toggled_on) + vtimer_schedule_irq(vtimer_cpu, hyp, vcpuid); + + return (0); +} + +int +vtimer_phys_cnt_read(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + + hyp = vm_get_cookie(vm); + *rval = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; + return (0); +} + +int +vtimer_phys_cnt_write(void *vm, int vcpuid, uint64_t wval, void *arg) +{ + return (0); +} + +int +vtimer_phys_cval_read(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + struct vtimer_cpu *vtimer_cpu; + + hyp = vm_get_cookie(vm); + vtimer_cpu = &hyp->ctx[vcpuid].vtimer_cpu; + + *rval = vtimer_cpu->phys_timer.cntx_cval_el0; + + return (0); +} + +int +vtimer_phys_cval_write(void *vm, int vcpuid, uint64_t wval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + + hyp = vm_get_cookie(vm); + hypctx = &hyp->ctx[vcpuid]; + vtimer_cpu = &hypctx->vtimer_cpu; + + vtimer_cpu->phys_timer.cntx_cval_el0 = wval; + + if (timer_enabled(vtimer_cpu->phys_timer.cntx_ctl_el0)) { + vtimer_remove_irq(hypctx, vcpuid); + vtimer_schedule_irq(vtimer_cpu, hyp, vcpuid); + } + + return (0); +} + +int +vtimer_phys_tval_read(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + struct hyp *hyp; + struct vtimer_cpu *vtimer_cpu; + uint32_t cntpct_el0; + + hyp = vm_get_cookie(vm); + vtimer_cpu = &hyp->ctx[vcpuid].vtimer_cpu; + + if (!(vtimer_cpu->phys_timer.cntx_ctl_el0 & CNTP_CTL_ENABLE)) { + /* + * ARMv8 Architecture Manual, p. D7-2702: the result of reading + * TVAL when the timer is disabled is UNKNOWN. I have chosen to + * return the maximum value possible on 32 bits which means the + * timer will fire very far into the future. 
+ */ + *rval = (uint32_t)RES1; + } else { + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - + hyp->vtimer.cntvoff_el2; + *rval = vtimer_cpu->phys_timer.cntx_cval_el0 - cntpct_el0; + } + + return (0); +} + +int +vtimer_phys_tval_write(void *vm, int vcpuid, uint64_t wval, void *arg) +{ + struct hyp *hyp; + struct hypctx *hypctx; + struct vtimer_cpu *vtimer_cpu; + uint64_t cntpct_el0; + + hyp = vm_get_cookie(vm); + hypctx = &hyp->ctx[vcpuid]; + vtimer_cpu = &hypctx->vtimer_cpu; + + cntpct_el0 = READ_SPECIALREG(cntpct_el0) - hyp->vtimer.cntvoff_el2; + vtimer_cpu->phys_timer.cntx_cval_el0 = (int32_t)wval + cntpct_el0; + + if (timer_enabled(vtimer_cpu->phys_timer.cntx_ctl_el0)) { + vtimer_remove_irq(hypctx, vcpuid); + vtimer_schedule_irq(vtimer_cpu, hyp, vcpuid); + } + + return (0); +} + +struct vtimer_softc { + struct resource *res; + void *ihl; + int rid; +}; + +static int +vtimer_probe(device_t dev) +{ + device_set_desc(dev, "Virtual timer"); + return (BUS_PROBE_DEFAULT); +} + +static int +vtimer_attach(device_t dev) +{ + struct vtimer_softc *sc; + + sc = device_get_softc(dev); + + sc->rid = 0; + sc->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &sc->rid, RF_ACTIVE); + if (sc->res == NULL) + return (ENXIO); + + bus_setup_intr(dev, sc->res, INTR_TYPE_CLK, vtimer_virtual_timer_intr, + NULL, NULL, &sc->ihl); + + have_vtimer = true; + return (0); +} + +static device_method_t vtimer_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vtimer_probe), + DEVMETHOD(device_attach, vtimer_attach), + + /* End */ + DEVMETHOD_END +}; + +DEFINE_CLASS_0(vtimer, vtimer_driver, vtimer_methods, + sizeof(struct vtimer_softc)); + +DRIVER_MODULE(vtimer, generic_timer, vtimer_driver, 0, 0); diff --git a/sys/arm64/vmm/mmu.h b/sys/arm64/vmm/mmu.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/mmu.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2017 Alexandru Elisei + * All rights reserved. + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_MMU_H_ +#define _VMM_MMU_H_ + +#include +#include +#include + +#include "hyp.h" + +extern char vmm_hyp_code; +extern char vmm_hyp_code_end; + +extern char _vmm_start; +extern char _vmm_end; + +bool vmmpmap_init(void); +void vmmpmap_fini(void); +uint64_t vmmpmap_to_ttbr0(void); +bool vmmpmap_enter(vm_offset_t, vm_size_t, vm_paddr_t, vm_prot_t); +void vmmpmap_remove(vm_offset_t, vm_size_t, bool); + +#endif diff --git a/sys/arm64/vmm/psci.h b/sys/arm64/vmm/psci.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/psci.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _PSCI_H_ +#define _PSCI_H_ + +#include "arm64.h" + +int psci_handle_call(struct vm *vm, int vcpuid, struct vm_exit *vme, + bool *retu); + +#endif diff --git a/sys/arm64/vmm/reset.h b/sys/arm64/vmm/reset.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/reset.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _VMM_RESET_H_ +#define _VMM_RESET_H_ + +void reset_vm_el01_regs(void *vcpu); +void reset_vm_el2_regs(void *vcpu); + +#endif diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm.c @@ -0,0 +1,1599 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vmm_ktr.h" +#include "vmm_stat.h" +#include "vmm_mem.h" +#include "arm64.h" +#include "mmu.h" +#include "psci.h" + +#include "io/vgic_v3.h" +#include "io/vtimer.h" + +#define BSP 0 /* the boostrap processor */ + +struct vcpu { + int flags; + enum vcpu_state state; + struct mtx mtx; + int hostcpu; /* host cpuid this vcpu last ran on */ + int vcpuid; + void *stats; + struct vm_exit exitinfo; + uint64_t nextpc; /* (x) next instruction to execute */ +}; + +#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) +#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) + +struct mem_seg { + uint64_t gpa; + size_t len; + bool wired; + bool sysmem; + vm_object_t object; +}; +#define VM_MAX_MEMSEGS 3 + +struct mem_map { + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff; + int segid; + int prot; + int flags; +}; +#define VM_MAX_MEMMAPS 4 + +struct vmm_mmio_region { + uint64_t start; + uint64_t end; + mem_region_read_t read; + mem_region_write_t write; +}; +#define VM_MAX_MMIO_REGIONS 4 + +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + */ +struct vm { + void *cookie; /* (i) cpu-specific data */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ + int suspend; /* (i) stop VM execution */ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ + struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ + struct vmspace *vmspace; /* (o) guest's address space */ + char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ + struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS]; + /* (o) guest MMIO regions */ + /* The following describe the vm cpu topology */ + uint16_t sockets; /* (o) num of sockets */ + uint16_t cores; /* (o) num of cores/socket */ + uint16_t threads; /* (o) num of threads/core */ + uint16_t maxcpus; /* (o) max pluggable cpus */ +}; + +static bool vmm_initialized = false; + +static struct vmm_ops *ops = NULL; + +#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) + +#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) +#define VMRUN(vmi, vcpu, pc, pmap, evinfo) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, pc, pmap, evinfo) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMSPACE_ALLOC(min, max) \ + (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) +#define VMSPACE_FREE(vmspace) \ + (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? 
(*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) + +#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) +#define fpu_stop_emulating() clts() + +static int vm_handle_wfi(struct vm *vm, int vcpuid, + struct vm_exit *vme, bool *retu); + +static MALLOC_DEFINE(M_VMM, "vmm", "vmm"); + +/* statistics */ +static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. + */ +static int halt_detection_enabled = 1; +SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, + &halt_detection_enabled, 0, + "Halt VM if all vcpus execute HLT with interrupts disabled"); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +static int trace_guest_exceptions; +SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, + &trace_guest_exceptions, 0, + "Trap into hypervisor on all guest exceptions and reflect them back"); + +static struct cpu_desc vmm_desc = { + .id_aa64afr0 = 0, + .id_aa64afr1 = 0, + .id_aa64dfr0 = + (0xful << ID_AA64DFR0_CTX_CMPs_SHIFT) | + (0xful << ID_AA64DFR0_WRPs_SHIFT) | + (0xful << ID_AA64DFR0_BRPs_SHIFT) | + ID_AA64DFR0_PMUVer_3 | + ID_AA64DFR0_DebugVer_8, + .id_aa64dfr1 = 0, + .id_aa64isar0 = + ID_AA64ISAR0_TLB_TLBIOSR | + ID_AA64ISAR0_SHA3_IMPL | + ID_AA64ISAR0_RDM_IMPL | + ID_AA64ISAR0_Atomic_IMPL | + ID_AA64ISAR0_CRC32_BASE | + ID_AA64ISAR0_SHA2_512 | + ID_AA64ISAR0_SHA1_BASE | + ID_AA64ISAR0_AES_PMULL, + .id_aa64isar1 = 0, + .id_aa64mmfr0 = + ID_AA64MMFR0_TGran4_IMPL | + ID_AA64MMFR0_TGran64_IMPL | + ID_AA64MMFR0_TGran16_IMPL | + ID_AA64MMFR0_ASIDBits_16 | + ID_AA64MMFR0_PARange_4P, + .id_aa64mmfr1 = + ID_AA64MMFR1_SpecSEI_IMPL | + ID_AA64MMFR1_PAN_ATS1E1 | + ID_AA64MMFR1_HAFDBS_AF, + .id_aa64mmfr2 = 0, + .id_aa64pfr0 = + ID_AA64PFR0_GIC_CPUIF_NONE | + ID_AA64PFR0_AdvSIMD_HP | + ID_AA64PFR0_FP_HP | + ID_AA64PFR0_EL3_64 | + ID_AA64PFR0_EL2_64 | + ID_AA64PFR0_EL1_64 | + ID_AA64PFR0_EL0_64, + .id_aa64pfr1 = 0, +}; + +static void vm_free_memmap(struct vm *vm, int ident); +static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); +static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); + +static void +vcpu_cleanup(struct vm *vm, int i, bool destroy) +{ +// struct vcpu *vcpu = &vm->vcpu[i]; +} + +static void +vcpu_init(struct vm *vm, uint32_t vcpu_id, bool create) +{ + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpu_id]; + + if (create) { + KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " + "initialized", vcpu_id)); + vcpu_lock_init(vcpu); + vcpu->hostcpu = NOCPU; + vcpu->vcpuid = vcpu_id; + } +} + +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= vm->maxcpus) + panic("vm_exitinfo: invalid cpuid %d", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + +static int +vmm_init(void) +{ + ops = &vmm_ops_arm; + + update_cpu_desc(&vmm_desc); + + return (VMM_INIT(0)); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = true; + break; + case MOD_UNLOAD: + error = vmmdev_cleanup(); + 
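+		/*
+		 * Tear down the EL2 state only if the vmm device nodes were
+		 * removed successfully and the module had fully initialised.
+		 */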
if (error == 0 && vmm_initialized) { + error = VMM_CLEANUP(); + if (error) + vmm_initialized = false; + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - HYP initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). + */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +static void +vm_init(struct vm *vm, bool create) +{ + int i; + + vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); + + CPU_ZERO(&vm->active_cpus); + CPU_ZERO(&vm->debug_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + memset(vm->mmio_region, 0, sizeof(vm->mmio_region)); + + for (i = 0; i < vm->maxcpus; i++) + vcpu_init(vm, i, create); +} + +int +vm_create(const char *name, struct vm **retvm) +{ + struct vm *vm; + struct vmspace *vmspace; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. + */ + if (!vmm_initialized) + return (ENXIO); + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (EINVAL); + + vmspace = VMSPACE_ALLOC(0, 1ul << 39); + if (vmspace == NULL) + return (ENOMEM); + + vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->vmspace = vmspace; + + vm->sockets = 1; + vm->cores = 1; /* XXX backwards compatibility */ + vm->threads = 1; /* XXX backwards compatibility */ + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ + + vm_init(vm, true); + + *retvm = vm; + return (0); +} + +void +vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus) +{ + *sockets = vm->sockets; + *cores = vm->cores; + *threads = vm->threads; + *maxcpus = vm->maxcpus; +} + +uint16_t +vm_get_maxcpus(struct vm *vm) +{ + return (vm->maxcpus); +} + +int +vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus) +{ + if (maxcpus != 0) + return (EINVAL); /* XXX remove when supported */ + if ((sockets * cores * threads) > vm->maxcpus) + return (EINVAL); + /* XXX need to check sockets * cores * threads == vCPU, how? */ + vm->sockets = sockets; + vm->cores = cores; + vm->threads = threads; + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ + return(0); +} + +static void +vm_cleanup(struct vm *vm, bool destroy) +{ + struct mem_map *mm; + pmap_t pmap; + int i; + + if (destroy) { + pmap = vmspace_pmap(vm->vmspace); + sched_pin(); + PCPU_SET(curvmpmap, NULL); + sched_unpin(); + CPU_FOREACH(i) { + MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap); + } + } + + vgic_v3_detach_from_vm(vm); + + for (i = 0; i < vm->maxcpus; i++) + vcpu_cleanup(vm, i, destroy); + + VMCLEANUP(vm->cookie); + + /* + * System memory is removed from the guest address space only when + * the VM is destroyed. This is because the mapping remains the same + * across VM reset. + * + * Device memory can be relocated by the guest (e.g. using PCI BARs) + * so those mappings are removed on a VM reset. 
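+	 * On destroy the memory segments and the vmspace itself are freed
+	 * below, which also tears down any remaining mappings.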
+ */ + if (!destroy) { + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (destroy || !sysmem_mapping(vm, mm)) + vm_free_memmap(vm, i); + } + } + + if (destroy) { + for (i = 0; i < VM_MAX_MEMSEGS; i++) + vm_free_memseg(vm, i); + + VMSPACE_FREE(vm->vmspace); + vm->vmspace = NULL; + } +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); + free(vm, M_VMM); +} + +int +vm_reinit(struct vm *vm) +{ + int error; + + /* + * A virtual machine can be reset only if all vcpus are suspended. + */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + vm_cleanup(vm, false); + vm_init(vm, false); + error = 0; + } else { + error = EBUSY; + } + + return (error); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + vm_object_t obj; + + if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) + return (ENOMEM); + else + return (0); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + + vmm_mmio_free(vm->vmspace, gpa, len); + return (0); +} + +/* + * Return 'true' if 'gpa' is allocated in the guest address space. + * + * This function is called in the context of a running vcpu which acts as + * an implicit lock on 'vm->mem_maps[]'. + */ +bool +vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) +{ + struct mem_map *mm; + int i; + +#ifdef INVARIANTS + int hostcpu, state; + state = vcpu_get_state(vm, vcpuid, &hostcpu); + KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, + ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); +#endif + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) + return (true); /* 'gpa' is sysmem or devmem */ + } + +#if 0 + if (ppt_is_mmio(vm, gpa)) + return (true); /* 'gpa' is pci passthru mmio */ +#endif + + return (false); +} + +int +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) +{ + struct mem_seg *seg; + vm_object_t obj; + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + + if (len == 0 || (len & PAGE_MASK)) + return (EINVAL); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + if (seg->len == len && seg->sysmem == sysmem) + return (EEXIST); + else + return (EINVAL); + } + + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + if (obj == NULL) + return (ENOMEM); + + seg->len = len; + seg->object = obj; + seg->sysmem = sysmem; + return (0); +} + +int +vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + vm_object_t *objptr) +{ + struct mem_seg *seg; + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[ident]; + if (len) + *len = seg->len; + if (sysmem) + *sysmem = seg->sysmem; + if (objptr) + *objptr = seg->object; + return (0); +} + +void +vm_free_memseg(struct vm *vm, int ident) +{ + struct mem_seg *seg; + + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + vm_object_deallocate(seg->object); + bzero(seg, sizeof(struct mem_seg)); + } +} + +int +vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, + size_t len, int prot, int flags) +{ + struct mem_seg *seg; + struct mem_map *m, *map; + vm_ooffset_t last; + int i, error; + + if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) + return (EINVAL); + + if (flags & ~VM_MEMMAP_F_WIRED) + return (EINVAL); + + if (segid < 0 || segid >= 
VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[segid]; + if (seg->object == NULL) + return (EINVAL); + + last = first + len; + if (first < 0 || first >= last || last > seg->len) + return (EINVAL); + + if ((gpa | first | last) & PAGE_MASK) + return (EINVAL); + + map = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->len == 0) { + map = m; + break; + } + } + + if (map == NULL) + return (ENOSPC); + + error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, + len, 0, VMFS_NO_SPACE, prot, prot, 0); + if (error != KERN_SUCCESS) + return (EFAULT); + + vm_object_reference(seg->object); + + if (flags & VM_MEMMAP_F_WIRED) { + error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (error != KERN_SUCCESS) { + vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); + return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM : + EFAULT); + } + } + + map->gpa = gpa; + map->len = len; + map->segoff = first; + map->segid = segid; + map->prot = prot; + map->flags = flags; + return (0); +} + +int +vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) +{ + struct mem_map *mm, *mmnext; + int i; + + mmnext = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len == 0 || mm->gpa < *gpa) + continue; + if (mmnext == NULL || mm->gpa < mmnext->gpa) + mmnext = mm; + } + + if (mmnext != NULL) { + *gpa = mmnext->gpa; + if (segid) + *segid = mmnext->segid; + if (segoff) + *segoff = mmnext->segoff; + if (len) + *len = mmnext->len; + if (prot) + *prot = mmnext->prot; + if (flags) + *flags = mmnext->flags; + return (0); + } else { + return (ENOENT); + } +} + +static void +vm_free_memmap(struct vm *vm, int ident) +{ + struct mem_map *mm; + int error __diagused; + + mm = &vm->mem_maps[ident]; + if (mm->len) { + error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, + mm->gpa + mm->len); + KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", + __func__, error)); + bzero(mm, sizeof(struct mem_map)); + } +} + +static __inline bool +sysmem_mapping(struct vm *vm, struct mem_map *mm) +{ + + if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) + return (true); + else + return (false); +} + +vm_paddr_t +vmm_sysmem_maxaddr(struct vm *vm) +{ + struct mem_map *mm; + vm_paddr_t maxaddr; + int i; + + maxaddr = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm)) { + if (maxaddr < mm->gpa + mm->len) + maxaddr = mm->gpa + mm->len; + } + } + return (maxaddr); +} + +static int +vmm_reg_raz(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + *rval = 0; + return (0); +} + +static int +vmm_reg_read_arg(void *vm, int vcpuid, uint64_t *rval, void *arg) +{ + *rval = *(uint64_t *)arg; + return (0); +} + +static int +vmm_reg_wi(void *vm, int vcpuid, uint64_t wval, void *arg) +{ + return (0); +} + + +#include +#include + +static struct { + uint32_t esr_iss; + uint32_t esr_mask; + reg_read_t reg_read; + reg_write_t reg_write; + void *arg; +} vmm_special_regs[] = { +#define SPECIAL_REG(_reg, _read, _write) \ + { \ + .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) | \ + ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ + ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ + ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ + ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \ + .esr_mask = ISS_MSR_REG_MASK, \ + .reg_read = (_read), \ + .reg_write = (_write), \ + .arg = NULL, \ + } +#define ID_SPECIAL_REG(_reg, _name) \ + { \ + .esr_iss = ((_reg ## 
_op0) << ISS_MSR_OP0_SHIFT) | \ + ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) | \ + ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) | \ + ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) | \ + ((_reg ## _op2) << ISS_MSR_OP2_SHIFT), \ + .esr_mask = ISS_MSR_REG_MASK, \ + .reg_read = vmm_reg_read_arg, \ + .reg_write = vmm_reg_wi, \ + .arg = &(vmm_desc._name), \ + } + + /* ID registers */ + ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0), + ID_SPECIAL_REG(ID_AA64PFR1_EL1, id_aa64pfr1), + + ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0), + ID_SPECIAL_REG(ID_AA64DFR1_EL1, id_aa64dfr1), + + ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0), + ID_SPECIAL_REG(ID_AA64ISAR1_EL1, id_aa64isar1), + + ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0), + ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1), + + /* + * All other ID registers are read as zero. + * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space. + */ + { + .esr_iss = (3 << ISS_MSR_OP0_SHIFT) | + (0 << ISS_MSR_OP1_SHIFT) | + (0 << ISS_MSR_CRn_SHIFT) | + (0 << ISS_MSR_CRm_SHIFT), + .esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK | + ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT), + .reg_read = vmm_reg_raz, + .reg_write = vmm_reg_wi, + .arg = NULL, + }, + + /* Counter physical registers */ + SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write), + SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read, + vtimer_phys_cval_write), + SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read, + vtimer_phys_tval_write), + SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write), + + /* GICv3 registers */ + SPECIAL_REG(ICC_SGI1R_EL1, vgic_v3_icc_sgi1r_read, + vgic_v3_icc_sgi1r_write), +#undef SPECIAL_REG +}; + +static int +vm_handle_reg_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct vm_exit *vme; + struct vre *vre; + int i, rv; + + vme = vm_exitinfo(vm, vcpuid); + vre = &vme->u.reg_emul.vre; + + for (i = 0; i < nitems(vmm_special_regs); i++) { + if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) == + vmm_special_regs[i].esr_iss) { + rv = vmm_emulate_register(vm, vcpuid, vre, + vmm_special_regs[i].reg_read, + vmm_special_regs[i].reg_write, + vmm_special_regs[i].arg); + if (rv == 0) { + *retu = false; + } + return (rv); + } + } + + + *retu = true; + return (0); +} + +void +vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size, + mem_region_read_t mmio_read, mem_region_write_t mmio_write) +{ + int i; + + for (i = 0; i < nitems(vm->mmio_region); i++) { + if (vm->mmio_region[i].start == 0 && + vm->mmio_region[i].end == 0) { + vm->mmio_region[i].start = start; + vm->mmio_region[i].end = start + size; + vm->mmio_region[i].read = mmio_read; + vm->mmio_region[i].write = mmio_write; + return; + } + } + + panic("%s: No free MMIO region", __func__); +} + +void +vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size) +{ + int i; + + for (i = 0; i < nitems(vm->mmio_region); i++) { + if (vm->mmio_region[i].start == start && + vm->mmio_region[i].end == start + size) { + memset(&vm->mmio_region[i], 0, + sizeof(vm->mmio_region[i])); + return; + } + } + + panic("%s: Invalid MMIO region: %lx - %lx", __func__, start, + start + size); +} + +static int +vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct vm_exit *vme; + struct vie *vie; + struct hyp *hyp = vm->cookie; + uint64_t fault_ipa; + struct vm_guest_paging *paging; + struct vmm_mmio_region *vmr; + int error, i; + + if (!hyp->vgic_attached) + goto out_user; + + vme = vm_exitinfo(vm, vcpuid); + vie = &vme->u.inst_emul.vie; + paging = &vme->u.inst_emul.paging; + + 
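+	/*
+	 * The faulting guest physical address was derived from HPFAR_EL2
+	 * (the page) and FAR_EL2 (the page offset) when the exit was
+	 * generated.
+	 */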
fault_ipa = vme->u.inst_emul.gpa; + + vmr = NULL; + for (i = 0; i < nitems(vm->mmio_region); i++) { + if (vm->mmio_region[i].start <= fault_ipa && + vm->mmio_region[i].end > fault_ipa) { + vmr = &vm->mmio_region[i]; + break; + } + } + if (vmr == NULL) + goto out_user; + + error = vmm_emulate_instruction(vm, vcpuid, fault_ipa, vie, + paging, vmr->read, vmr->write, retu); + return (error); + +out_user: + *retu = true; + return (0); +} + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + int i; + + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { + VM_CTR2(vm, "virtual machine already suspended %d/%d", + vm->suspend, how); + return (EALREADY); + } + + VM_CTR1(vm, "virtual machine successfully suspended %d", how); + + /* + * Notify all active vcpus that they are now suspended. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + + return (0); +} + +void +vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t pc) +{ + struct vm_exit *vmexit; + + KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, + ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->pc = pc; + vmexit->inst_length = 4; + vmexit->exitcode = VM_EXITCODE_SUSPENDED; + vmexit->u.suspended.how = vm->suspend; +} + +int +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EBUSY); + + CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); + return (0); + +} + +int +vm_suspend_cpu(struct vm *vm, int vcpuid) +{ + int i; + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + vm->debug_cpus = vm->active_cpus; + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + } else { + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); + vcpu_notify_event(vm, vcpuid, false); + } + return (0); +} + +int +vm_resume_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + CPU_ZERO(&vm->debug_cpus); + } else { + if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) + return (EINVAL); + + CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); + } + return (0); +} + + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +cpuset_t +vm_debug_cpus(struct vm *vm) +{ + + return (vm->debug_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} + +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. 
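+ * - Otherwise no action is needed; the event is picked up before the vcpu
+ *   next enters the guest.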
+ */ +static void +vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) +{ + int hostcpu; + + KASSERT(lapic_intr == false, ("%s: lapic_intr != false", __func__)); + hostcpu = vcpu->hostcpu; + if (vcpu->state == VCPU_RUNNING) { + KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); + if (hostcpu != curcpu) { +#if 0 + if (lapic_intr) { + vlapic_post_intr(vcpu->vlapic, hostcpu, + vmm_ipinum); + } else +#endif + { + ipi_cpu(hostcpu, vmm_ipinum); + } + } else { + /* + * If the 'vcpu' is running on 'curcpu' then it must + * be sending a notification to itself (e.g. SELF_IPI). + * The pending event will be picked up when the vcpu + * transitions back to guest context. + */ + } + } else { + KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " + "with hostcpu %d", vcpu->state, hostcpu)); + if (vcpu->state == VCPU_SLEEPING) + wakeup_one(vcpu); + } +} + +void +vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu, lapic_intr); + vcpu_unlock(vcpu); +} + +static int +vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + struct vcpu *vcpu; + int error; + + vcpu = &vm->vcpu[vcpuid]; + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. + */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) { + vcpu_notify_event_locked(vcpu, false); + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); + } + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error) + return (EBUSY); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE) + wakeup(&vcpu->state); + + return (0); +} + +static void +vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) + panic("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) + panic("Error %d setting state to %d", error, newstate); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return 
(EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + int error; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) +{ + struct vcpu *vcpu; + enum vcpu_state state; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +void * +vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ + int i, count, pageoff; + struct mem_map *mm; + vm_page_t m; +#ifdef INVARIANTS + /* + * All vcpus are frozen by ioctls that modify the memory map + * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is + * guaranteed if at least one vcpu is in the VCPU_FROZEN state. + */ + int state; + KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d", + __func__, vcpuid)); + for (i = 0; i < vm->maxcpus; i++) { + if (vcpuid != -1 && vcpuid != i) + continue; + state = vcpu_get_state(vm, i, NULL); + KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", + __func__, state)); + } +#endif + pageoff = gpa & PAGE_MASK; + if (len > PAGE_SIZE - pageoff) + panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + count = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && + gpa < mm->gpa + mm->len) { + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + break; + } + } + + if (count == 1) { + *cookie = m; + return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); + } else { + *cookie = NULL; + return (NULL); + } +} + +void +vm_gpa_release(void *cookie) +{ + vm_page_t m = cookie; + + vm_page_unwire(m, PQ_ACTIVE); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= vm->maxcpus) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error || reg != VM_REG_ELR_EL2) + return (error); + + vcpu = &vm->vcpu[vcpuid]; + vcpu->nextpc = val; + + return(0); +} + +void * +vm_get_cookie(struct vm *vm) +{ + return vm->cookie; +} + +int +vm_attach_vgic(struct vm *vm, uint64_t dist_start, size_t dist_size, + uint64_t redist_start, size_t redist_size) +{ + int error; + + error = vgic_v3_attach_to_vm(vm, dist_start, dist_size, redist_start, + redist_size); + + return (error); +} + +int +vm_assert_irq(struct vm *vm, uint32_t irq) +{ + struct hyp *hyp = (struct hyp *)vm->cookie; + int error; + + error = vgic_v3_inject_irq(hyp, -1, irq, true); + + return (error); +} + +int +vm_deassert_irq(struct vm *vm, uint32_t irq) +{ + struct hyp *hyp = (struct hyp *)vm->cookie; + int error; + + error = vgic_v3_inject_irq(hyp, -1, irq, 
false); + + return (error); +} + +int +vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot, + int func) +{ + struct hyp *hyp = (struct hyp *)vm->cookie; + int error; + + if (addr >= hyp->vgic_dist.start && addr < hyp->vgic_dist.end) { + error = vgic_v3_inject_msi(hyp, msg, addr); + if (error == 0) + return (0); + } + + /* TODO: Should we raise an SError? */ + return (EINVAL); +} + +static int +vm_handle_wfi(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu) +{ + struct hyp *hyp; + struct vcpu *vcpu; + struct hypctx *hypctx; + + vcpu = &vm->vcpu[vcpuid]; + hyp = vm->cookie; + hypctx = &hyp->ctx[vcpuid]; + + vcpu_lock(vcpu); + while (1) { + if (vgic_v3_vcpu_pending_irq(hypctx)) + break; + + if (vcpu_should_yield(vm, vcpuid)) + break; + + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz); + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + } + vcpu_unlock(vcpu); + + *retu = false; + return (0); +} + +static int +vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) +{ + struct vm_exit *vme; + struct vm_map *map; + uint64_t addr, esr; + pmap_t pmap; + int ftype, rv; + + vme = vm_exitinfo(vm, vcpuid); + pmap = vmspace_pmap(vm->vmspace); + addr = vme->u.paging.gpa; + esr = vme->u.paging.esr; + + /* The page exists, but the page table needs to be upddated */ + if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS) + return (0); + + switch (ESR_ELx_EXCEPTION(esr)) { + case EXCP_INSN_ABORT_L: + case EXCP_DATA_ABORT_L: + ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE; + break; + default: + panic("%s: Invalid exception (esr = %lx)", __func__, esr); + } + + map = &vm->vmspace->vm_map; + rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); + if (rv != KERN_SUCCESS) + return (EFAULT); + + return (0); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + struct vm_eventinfo evinfo; + int error, vcpuid; + struct vcpu *vcpu; + struct vm_exit *vme; + bool retu; + pmap_t pmap; + + vcpuid = vmrun->cpuid; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) + return (EINVAL); + + pmap = vmspace_pmap(vm->vmspace); + vcpu = &vm->vcpu[vcpuid]; + evinfo.rptr = NULL; + evinfo.sptr = &vm->suspend; + evinfo.iptr = NULL; +restart: + critical_enter(); + vcpu_require_state(vm, vcpuid, VCPU_RUNNING); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextpc, pmap, &evinfo); + vcpu_require_state(vm, vcpuid, VCPU_FROZEN); + critical_exit(); + + vme = vm_exitinfo(vm, vcpuid); + if (error == 0) { + retu = false; + switch (vme->exitcode) { + case VM_EXITCODE_INST_EMUL: + vcpu->nextpc = vme->pc + vme->inst_length; + error = vm_handle_inst_emul(vm, vcpuid, &retu); + break; + + case VM_EXITCODE_REG_EMUL: + vcpu->nextpc = vme->pc + vme->inst_length; + error = vm_handle_reg_emul(vm, vcpuid, &retu); + break; + + case VM_EXITCODE_HVC: + /* + * The HVC instruction saves the address for the + * next instruction as the return address. + */ + vcpu->nextpc = vme->pc; + /* + * The PSCI call can change the exit information in the + * case of suspend/reset/poweroff/cpu off/cpu on. 
+ */ + error = psci_handle_call(vm, vcpuid, vme, &retu); + break; + + case VM_EXITCODE_WFI: + vcpu->nextpc = vme->pc + vme->inst_length; + error = vm_handle_wfi(vm, vcpuid, vme, &retu); + break; + + case VM_EXITCODE_PAGING: + vcpu->nextpc = vme->pc; + error = vm_handle_paging(vm, vcpuid, &retu); + break; + + default: + /* Handle in userland */ + vcpu->nextpc = vme->pc; + retu = true; + break; + } + } + + if (error == 0 && retu == false) + goto restart; + + /* Copy the exit information */ + bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); + + return (error); +} diff --git a/sys/arm64/vmm/vmm_arm64.c b/sys/arm64/vmm/vmm_arm64.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_arm64.c @@ -0,0 +1,1076 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mmu.h" +#include "arm64.h" +#include "hyp.h" +#include "reset.h" +#include "io/vgic_v3.h" +#include "io/vtimer.h" + +#define HANDLED 1 +#define UNHANDLED 0 + +#define UNUSED 0 + +/* Number of bits in an EL2 virtual address */ +#define EL2_VIRT_BITS 48 +CTASSERT((1ul << EL2_VIRT_BITS) >= HYP_VM_MAX_ADDRESS); + +/* TODO: Move the host hypctx off the stack */ +#define VMM_STACK_PAGES 4 +#define VMM_STACK_SIZE (VMM_STACK_PAGES * PAGE_SIZE) + +static int vmm_pmap_levels, vmm_virt_bits; + +/* Register values passed to arm_setup_vectors to set in the hypervisor */ +struct vmm_init_regs { + uint64_t tcr_el2; + uint64_t vtcr_el2; +}; + +MALLOC_DEFINE(M_HYP, "ARM VMM HYP", "ARM VMM HYP"); + +extern char hyp_init_vectors[]; +extern char hyp_vectors[]; +extern char hyp_stub_vectors[]; + +static vm_paddr_t hyp_code_base; +static size_t hyp_code_len; + +static char *stack[MAXCPU]; +static vm_offset_t stack_hyp_va[MAXCPU]; + +static vmem_t *el2_mem_alloc; + +static void arm_setup_vectors(void *arg); +static void vmm_pmap_clean_stage2_tlbi(void); +static void vmm_pmap_invalidate_range(uint64_t, vm_offset_t, vm_offset_t, bool); +static void vmm_pmap_invalidate_all(uint64_t); + +static inline void +arm64_set_active_vcpu(struct hypctx *hypctx) +{ + + PCPU_SET(vcpu, hypctx); +} + +static void +arm_setup_vectors(void *arg) +{ + struct vmm_init_regs *el2_regs; + char *stack_top; + uint32_t sctlr_el2; + register_t daif; + + el2_regs = arg; + arm64_set_active_vcpu(NULL); + + daif = intr_disable(); + + /* + * Install the temporary vectors which will be responsible for + * initializing the VMM when we next trap into EL2. + * + * x0: the exception vector table responsible for hypervisor + * initialization on the next call. + */ + vmm_call_hyp(vtophys(&vmm_hyp_code)); + + /* Create and map the hypervisor stack */ + stack_top = (char *)stack_hyp_va[PCPU_GET(cpuid)] + VMM_STACK_SIZE; + + /* + * Configure the system control register for EL2: + * + * SCTLR_EL2_M: MMU on + * SCTLR_EL2_C: Data cacheability not affected + * SCTLR_EL2_I: Instruction cacheability not affected + * SCTLR_EL2_A: Instruction alignment check + * SCTLR_EL2_SA: Stack pointer alignment check + * SCTLR_EL2_WXN: Treat writable memory as execute never + * ~SCTLR_EL2_EE: Data accesses are little-endian + */ + sctlr_el2 = SCTLR_EL2_RES1; + sctlr_el2 |= SCTLR_EL2_M | SCTLR_EL2_C | SCTLR_EL2_I; + sctlr_el2 |= SCTLR_EL2_A | SCTLR_EL2_SA; + sctlr_el2 |= SCTLR_EL2_WXN; + sctlr_el2 &= ~SCTLR_EL2_EE; + + /* Special call to initialize EL2 */ + vmm_call_hyp(vmmpmap_to_ttbr0(), stack_top, el2_regs->tcr_el2, + sctlr_el2, el2_regs->vtcr_el2); + + intr_restore(daif); +} + +static void +arm_teardown_vectors(void *arg) +{ + register_t daif; + + /* + * vmm_cleanup() will disable the MMU. For the next few instructions, + * before the hardware disables the MMU, one of the following is + * possible: + * + * a. The instruction addresses are fetched with the MMU disabled, + * and they must represent the actual physical addresses. This will work + * because we call the vmm_cleanup() function by its physical address. + * + * b. The instruction addresses are fetched using the old translation + * tables. 
This will work because we have an identity mapping in place + * in the translation tables and vmm_cleanup() is called by its physical + * address. + */ + daif = intr_disable(); + /* TODO: Invalidate the cache */ + vmm_call_hyp(HYP_CLEANUP, vtophys(hyp_stub_vectors)); + intr_restore(daif); + + arm64_set_active_vcpu(NULL); +} + +static uint64_t +vmm_vtcr_el2_sl(u_int levels) +{ +#if PAGE_SIZE == PAGE_SIZE_4K + switch(levels) { + case 2: + return (VTCR_EL2_SL0_4K_LVL2); + case 3: + return (VTCR_EL2_SL0_4K_LVL1); + case 4: + return (VTCR_EL2_SL0_4K_LVL0); + default: + panic("%s: Invalid number of page table levels %u", __func__, + levels); + } +#elif PAGE_SIZE == PAGE_SIZE_16K + switch(levels) { + case 2: + return (VTCR_EL2_SL0_16K_LVL2); + case 3: + return (VTCR_EL2_SL0_16K_LVL1); + case 4: + return (VTCR_EL2_SL0_16K_LVL0); + default: + panic("%s: Invalid number of page table levels %u", __func__, + levels); + } +#else +#error Unsupported page size +#endif +} + +static int +arm_init(int ipinum) +{ + struct vmm_init_regs el2_regs; + vm_offset_t next_hyp_va; + vm_paddr_t vmm_base; + uint64_t id_aa64mmfr0_el1, pa_range_bits, pa_range_field; + uint64_t ich_vtr_el2; + uint64_t cnthctl_el2; + register_t daif; + int cpu, i; + bool rv __diagused; + + if (!virt_enabled()) { + printf("arm_init: Processor doesn't have support for virtualization.\n"); + return (ENXIO); + } + + if (!vgic_present()) { + printf("arm_init: No GICv3 found\n"); + return (ENODEV); + } + + if (!get_kernel_reg(ID_AA64MMFR0_EL1, &id_aa64mmfr0_el1)) { + printf("arm_init: Unable to read ID_AA64MMFR0_EL1\n"); + return (ENXIO); + } + pa_range_field = ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1); + /* + * Use 3 levels to give us up to 39 bits with 4k pages, or + * 47 bits with 16k pages. + */ + /* TODO: Check the number of levels for 64k pages */ + vmm_pmap_levels = 3; + switch (pa_range_field) { + case ID_AA64MMFR0_PARange_4G: + printf("arm_init: Not enough physical address bits\n"); + return (ENXIO); + case ID_AA64MMFR0_PARange_64G: + vmm_virt_bits = 36; +#if PAGE_SIZE == PAGE_SIZE_16K + /* TODO: Test */ + vmm_pmap_levels = 2; +#endif + break; + default: + vmm_virt_bits = 39; + break; + } + pa_range_bits = pa_range_field >> ID_AA64MMFR0_PARange_SHIFT; + + /* Initialise the EL2 MMU */ + if (!vmmpmap_init()) { + printf("arm_init: Failed to init the EL2 MMU\n"); + return (ENOMEM); + } + + /* Set up the stage 2 pmap callbacks */ + MPASS(pmap_clean_stage2_tlbi == NULL); + pmap_clean_stage2_tlbi = vmm_pmap_clean_stage2_tlbi; + pmap_stage2_invalidate_range = vmm_pmap_invalidate_range; + pmap_stage2_invalidate_all = vmm_pmap_invalidate_all; + + /* Create the vmem allocator */ + el2_mem_alloc = vmem_create("VMM EL2", 0, 0, PAGE_SIZE, 0, M_WAITOK); + + /* Create the mappings for the hypervisor translation table. 
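+	 * The EL2 text is mapped at its physical address (an identity
+	 * mapping) and the per-CPU stacks are mapped above it at the next
+	 * L2 block boundary.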
*/ + hyp_code_len = roundup2(&vmm_hyp_code_end - &vmm_hyp_code, PAGE_SIZE); + + /* We need an physical identity mapping for when we activate the MMU */ + hyp_code_base = vmm_base = vtophys(&vmm_hyp_code); + rv = vmmpmap_enter(vmm_base, hyp_code_len, vtophys(&vmm_hyp_code), + VM_PROT_READ | VM_PROT_EXECUTE); + MPASS(rv); + + next_hyp_va = roundup2(vtophys(&vmm_hyp_code) + hyp_code_len, L2_SIZE); + + /* Create a per-CPU hypervisor stack */ + CPU_FOREACH(cpu) { + stack[cpu] = malloc(VMM_STACK_SIZE, M_HYP, M_WAITOK | M_ZERO); + stack_hyp_va[cpu] = next_hyp_va; + + for (i = 0; i < VMM_STACK_PAGES; i++) { + rv = vmmpmap_enter(stack_hyp_va[cpu] + (i * PAGE_SIZE), + PAGE_SIZE, vtophys(stack[cpu] + (i * PAGE_SIZE)), + VM_PROT_READ | VM_PROT_WRITE); + MPASS(rv); + } + next_hyp_va += L2_SIZE; + } + + el2_regs.tcr_el2 = TCR_EL2_RES1; + el2_regs.tcr_el2 |= min(pa_range_bits << TCR_EL2_PS_SHIFT, + TCR_EL2_PS_52BITS); + el2_regs.tcr_el2 |= TCR_EL2_T0SZ(64 - EL2_VIRT_BITS); + el2_regs.tcr_el2 |= TCR_EL2_IRGN0_WBWA | TCR_EL2_ORGN0_WBWA; +#if PAGE_SIZE == PAGE_SIZE_4K + el2_regs.tcr_el2 |= TCR_EL2_TG0_4K; +#elif PAGE_SIZE == PAGE_SIZE_16K + el2_regs.tcr_el2 |= TCR_EL2_TG0_16K; +#else +#error Unsupported page size +#endif +#ifdef SMP + el2_regs.tcr_el2 |= TCR_EL2_SH0_IS; +#endif + + /* + * Configure the Stage 2 translation control register: + * + * VTCR_IRGN0_WBWA: Translation table walks access inner cacheable + * normal memory + * VTCR_ORGN0_WBWA: Translation table walks access outer cacheable + * normal memory + * VTCR_EL2_TG0_4K/16K: Stage 2 uses the same page size as the kernel + * VTCR_EL2_SL0_4K_LVL1: Stage 2 uses concatenated level 1 tables + * VTCR_EL2_SH0_IS: Memory associated with Stage 2 walks is inner + * shareable + */ + el2_regs.vtcr_el2 = VTCR_EL2_RES1; + el2_regs.vtcr_el2 |= + min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_48BIT); + el2_regs.vtcr_el2 |= VTCR_EL2_IRGN0_WBWA | VTCR_EL2_ORGN0_WBWA; + el2_regs.vtcr_el2 |= VTCR_EL2_T0SZ(64 - vmm_virt_bits); + el2_regs.vtcr_el2 |= vmm_vtcr_el2_sl(vmm_pmap_levels); +#if PAGE_SIZE == PAGE_SIZE_4K + el2_regs.vtcr_el2 |= VTCR_EL2_TG0_4K; +#elif PAGE_SIZE == PAGE_SIZE_16K + el2_regs.vtcr_el2 |= VTCR_EL2_TG0_16K; +#else +#error Unsupported page size +#endif +#ifdef SMP + el2_regs.vtcr_el2 |= VTCR_EL2_SH0_IS; +#endif + + smp_rendezvous(NULL, arm_setup_vectors, NULL, &el2_regs); + + /* Add memory to the vmem allocator (checking there is space) */ + if (vmm_base > L2_SIZE) { + /* + * Ensure there is an L2 block before the vmm code to check + * for buffer overflows on earlier data. Include the PAGE_SIZE + * of the minimum we can allocate. + */ + vmm_base -= L2_SIZE + PAGE_SIZE; + vmm_base = rounddown2(vmm_base, L2_SIZE); + + /* + * Check there is memory before the vmm code to add. + * + * Reserve the L2 block at address 0 so NULL dereference will + * raise an exception + */ + if (vmm_base > L2_SIZE) + vmem_add(el2_mem_alloc, L2_SIZE, next_hyp_va - L2_SIZE, + M_WAITOK); + } + + /* + * Add the memory after the stacks. There is most of an L2 block + * between the last stack and the first allocation so this should + * be safe without adding more padding. 
+ */ + if (next_hyp_va < HYP_VM_MAX_ADDRESS - PAGE_SIZE) + vmem_add(el2_mem_alloc, next_hyp_va, + HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK); + + + daif = intr_disable(); + ich_vtr_el2 = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_ICH_VTR); + cnthctl_el2 = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_CNTHCTL); + intr_restore(daif); + + vgic_v3_init(ich_vtr_el2); + vtimer_init(cnthctl_el2); + + return (0); +} + +static int +arm_cleanup(void) +{ + int cpu; + + smp_rendezvous(NULL, arm_teardown_vectors, NULL, NULL); + +#ifdef INVARIANTS + CPU_FOREACH(cpu) { + vmmpmap_remove(stack_hyp_va[cpu], VMM_STACK_PAGES * PAGE_SIZE, + false); + } + + vmmpmap_remove(hyp_code_base, hyp_code_len, false); +#endif + + vtimer_cleanup(); + + vmmpmap_fini(); + for (cpu = 0; cpu < nitems(stack); cpu++) + free(stack[cpu], M_HYP); + + pmap_clean_stage2_tlbi = NULL; + + return (0); +} + +static void * +arm_vminit(struct vm *vm, pmap_t pmap) +{ + struct hyp *hyp; + struct hypctx *hypctx; + vmem_addr_t vm_addr; + vm_size_t size; + bool last_vcpu, rv __diagused; + int err __diagused, i, maxcpus; + + /* Ensure this is the only data on the page */ + size = roundup2(sizeof(struct hyp), PAGE_SIZE); + hyp = malloc(size, M_HYP, M_WAITOK | M_ZERO); + MPASS(((vm_offset_t)hyp & PAGE_MASK) == 0); + + hyp->vm = vm; + hyp->vgic_attached = false; + + maxcpus = vm_get_maxcpus(vm); + for (i = 0; i < maxcpus; i++) { + hypctx = &hyp->ctx[i]; + hypctx->vcpu = i; + hypctx->hyp = hyp; + + reset_vm_el01_regs(hypctx); + reset_vm_el2_regs(hypctx); + } + + vtimer_vminit(hyp); + vgic_v3_vminit(hyp); + for (i = 0; i < VM_MAXCPU; i++) { + hypctx = &hyp->ctx[i]; + vtimer_cpuinit(hypctx); + last_vcpu = (i == VM_MAXCPU - 1); + vgic_v3_cpuinit(hypctx, last_vcpu); + } + + /* XXX: Can this fail? */ + err = vmem_alloc(el2_mem_alloc, size, M_NEXTFIT | M_WAITOK, + &vm_addr); + MPASS(err == 0); + MPASS((vm_addr & PAGE_MASK) == 0); + hyp->el2_addr = vm_addr; + + rv = vmmpmap_enter(hyp->el2_addr, size, vtophys(hyp), + VM_PROT_READ | VM_PROT_WRITE); + MPASS(rv); + + return (hyp); +} + +static int +arm_vmm_pinit(pmap_t pmap) +{ + + pmap_pinit_stage(pmap, PM_STAGE2, vmm_pmap_levels); + return (1); +} + +static struct vmspace * +arm_vmspace_alloc(vm_offset_t min, vm_offset_t max) +{ + return (vmspace_alloc(min, max, arm_vmm_pinit)); +} + +static void +arm_vmspace_free(struct vmspace *vmspace) +{ + + pmap_remove_pages(vmspace_pmap(vmspace)); + vmspace_free(vmspace); +} + +static void +vmm_pmap_clean_stage2_tlbi(void) +{ + vmm_call_hyp(HYP_CLEAN_S2_TLBI); +} + +static void +vmm_pmap_invalidate_range(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva, + bool final_only) +{ + MPASS(eva > sva); + vmm_call_hyp(HYP_S2_TLBI_RANGE, vttbr, sva, eva, final_only); +} + +static void +vmm_pmap_invalidate_all(uint64_t vttbr) +{ + vmm_call_hyp(HYP_S2_TLBI_ALL, vttbr); +} + +static enum vm_reg_name +get_vm_reg_name(uint32_t reg_nr, uint32_t mode __attribute__((unused))) +{ + switch(reg_nr) { + case 0: + return VM_REG_GUEST_X0; + case 1: + return VM_REG_GUEST_X1; + case 2: + return VM_REG_GUEST_X2; + case 3: + return VM_REG_GUEST_X3; + case 4: + return VM_REG_GUEST_X4; + case 5: + return VM_REG_GUEST_X5; + case 6: + return VM_REG_GUEST_X6; + case 7: + return VM_REG_GUEST_X7; + case 8: + return VM_REG_GUEST_X8; + case 9: + return VM_REG_GUEST_X9; + case 10: + return VM_REG_GUEST_X10; + case 11: + return VM_REG_GUEST_X11; + case 12: + return VM_REG_GUEST_X12; + case 13: + return VM_REG_GUEST_X13; + case 14: + return VM_REG_GUEST_X14; + case 15: + return VM_REG_GUEST_X15; + case 16: + return 
VM_REG_GUEST_X16; + case 17: + return VM_REG_GUEST_X17; + case 18: + return VM_REG_GUEST_X18; + case 19: + return VM_REG_GUEST_X19; + case 20: + return VM_REG_GUEST_X20; + case 21: + return VM_REG_GUEST_X21; + case 22: + return VM_REG_GUEST_X22; + case 23: + return VM_REG_GUEST_X23; + case 24: + return VM_REG_GUEST_X24; + case 25: + return VM_REG_GUEST_X25; + case 26: + return VM_REG_GUEST_X26; + case 27: + return VM_REG_GUEST_X27; + case 28: + return VM_REG_GUEST_X28; + case 29: + return VM_REG_GUEST_X29; + case 30: + return VM_REG_GUEST_LR; + case 31: + return VM_REG_GUEST_SP; + case 32: + return VM_REG_GUEST_ELR; + case 33: + return VM_REG_GUEST_SPSR; + case 34: + return VM_REG_ELR_EL2; + default: + break; + } + + return (VM_REG_LAST); +} + +static inline void +arm64_print_hyp_regs(struct vm_exit *vme) +{ + printf("esr_el2: 0x%08x\n", vme->u.hyp.esr_el2); + printf("far_el2: 0x%016lx\n", vme->u.hyp.far_el2); + printf("hpfar_el2: 0x%016lx\n", vme->u.hyp.hpfar_el2); +} + +static void +arm64_gen_inst_emul_data(struct hypctx *hypctx, uint32_t esr_iss, + struct vm_exit *vme_ret) +{ + struct vm_guest_paging *paging; + struct vie *vie; + uint32_t esr_sas, reg_num; + uint64_t page_off; + + /* + * Get the page address from HPFAR_EL2. + */ + vme_ret->u.inst_emul.gpa = + HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2); + /* Bits [11:0] are the same as bits [11:0] from the virtual address. */ + page_off = FAR_EL2_PAGE_OFFSET(hypctx->exit_info.far_el2); + vme_ret->u.inst_emul.gpa += page_off; + + esr_sas = (esr_iss & ISS_DATA_SAS_MASK) >> ISS_DATA_SAS_SHIFT; + reg_num = (esr_iss & ISS_DATA_SRT_MASK) >> ISS_DATA_SRT_SHIFT; + + vie = &vme_ret->u.inst_emul.vie; + vie->access_size = 1 << esr_sas; + vie->sign_extend = (esr_iss & ISS_DATA_SSE) ? 1 : 0; + vie->dir = (esr_iss & ISS_DATA_WnR) ? VM_DIR_WRITE : VM_DIR_READ; + vie->reg = get_vm_reg_name(reg_num, UNUSED); + + paging = &vme_ret->u.inst_emul.paging; + paging->far = hypctx->exit_info.far_el2; + paging->ttbr0_el1 = hypctx->ttbr0_el1; + paging->ttbr1_el1 = hypctx->ttbr1_el1; + paging->flags = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32); + if ((hypctx->sctlr_el1 & SCTLR_M) != 0) + paging->flags |= VM_GP_MMU_ENABLED; +} + +static void +arm64_gen_reg_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret) +{ + uint32_t reg_num; + struct vre *vre; + + /* u.hyp member will be replaced by u.reg_emul */ + vre = &vme_ret->u.reg_emul.vre; + + vre->inst_syndrome = esr_iss; + /* ARMv8 Architecture Manual, p. D7-2273: 1 means read */ + vre->dir = (esr_iss & ISS_MSR_DIR) ? 
VM_DIR_READ : VM_DIR_WRITE; + reg_num = ISS_MSR_Rt(esr_iss); + vre->reg = get_vm_reg_name(reg_num, UNUSED); +} + +static int +handle_el1_sync_excp(struct hyp *hyp, int vcpu, struct vm_exit *vme_ret, + pmap_t pmap) +{ + struct hypctx *hypctx; + uint64_t gpa; + uint32_t esr_ec, esr_iss; + + hypctx = &hyp->ctx[vcpu]; + esr_ec = ESR_ELx_EXCEPTION(hypctx->tf.tf_esr); + esr_iss = hypctx->tf.tf_esr & ESR_ELx_ISS_MASK; + + switch(esr_ec) { + case EXCP_UNKNOWN: + eprintf("Unknown exception from guest\n"); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + case EXCP_TRAP_WFI_WFE: + if ((hypctx->tf.tf_esr & 0x3) == 0) /* WFI */ + vme_ret->exitcode = VM_EXITCODE_WFI; + else + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + case EXCP_HVC: + vme_ret->exitcode = VM_EXITCODE_HVC; + break; + case EXCP_MSR: + arm64_gen_reg_emul_data(esr_iss, vme_ret); + vme_ret->exitcode = VM_EXITCODE_REG_EMUL; + break; + + case EXCP_INSN_ABORT_L: + case EXCP_DATA_ABORT_L: + switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) { + case ISS_DATA_DFSC_TF_L0: + case ISS_DATA_DFSC_TF_L1: + case ISS_DATA_DFSC_TF_L2: + case ISS_DATA_DFSC_TF_L3: + case ISS_DATA_DFSC_AFF_L1: + case ISS_DATA_DFSC_AFF_L2: + case ISS_DATA_DFSC_AFF_L3: + case ISS_DATA_DFSC_PF_L1: + case ISS_DATA_DFSC_PF_L2: + case ISS_DATA_DFSC_PF_L3: + hypctx = &hyp->ctx[vcpu]; + gpa = HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2); + if (vm_mem_allocated(hyp->vm, vcpu, gpa)) { + vme_ret->exitcode = VM_EXITCODE_PAGING; + vme_ret->inst_length = 0; + vme_ret->u.paging.esr = hypctx->tf.tf_esr; + vme_ret->u.paging.gpa = gpa; + } else if (esr_ec == EXCP_DATA_ABORT_L) { + arm64_gen_inst_emul_data(&hyp->ctx[vcpu], + esr_iss, vme_ret); + vme_ret->exitcode = VM_EXITCODE_INST_EMUL; + } else { + eprintf( + "Unsupported instruction fault from guest\n"); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + } + break; + default: + eprintf( + "Unsupported data/instruction fault from guest\n"); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + break; + + default: + eprintf("Unsupported synchronous exception from guest: 0x%x\n", + esr_ec); + arm64_print_hyp_regs(vme_ret); + vme_ret->exitcode = VM_EXITCODE_HYP; + break; + } + + /* We don't don't do any instruction emulation here */ + return (UNHANDLED); +} + +static int +arm64_handle_world_switch(struct hyp *hyp, int vcpu, int excp_type, + struct vm_exit *vme, pmap_t pmap) +{ + int handled; + + switch (excp_type) { + case EXCP_TYPE_EL1_SYNC: + /* The exit code will be set by handle_el1_sync_excp(). */ + handled = handle_el1_sync_excp(hyp, vcpu, vme, pmap); + break; + + case EXCP_TYPE_EL1_IRQ: + case EXCP_TYPE_EL1_FIQ: + /* The host kernel will handle IRQs and FIQs. 
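 * No interrupt state needs to be recorded here: the exit is reported as
 * VM_EXITCODE_BOGUS and left UNHANDLED, so arm_vmrun() drops out of its run
 * loop and the vcpu can simply be re-entered once the host's own interrupt
 * handler has run.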
*/ + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + + case EXCP_TYPE_EL1_ERROR: + case EXCP_TYPE_EL2_SYNC: + case EXCP_TYPE_EL2_IRQ: + case EXCP_TYPE_EL2_FIQ: + case EXCP_TYPE_EL2_ERROR: + eprintf("Unhandled exception type: %s\n", __STRING(excp_type)); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + + default: + eprintf("Unknown exception type: %d\n", excp_type); + vme->exitcode = VM_EXITCODE_BOGUS; + handled = UNHANDLED; + break; + } + + return (handled); +} + +static int +arm_vmrun(void *arg, int vcpu, register_t pc, pmap_t pmap, + struct vm_eventinfo *evinfo) +{ + uint64_t excp_type; + int handled; + register_t daif; + struct hyp *hyp; + struct hypctx *hypctx; + struct vm *vm; + struct vm_exit *vme; + + hyp = (struct hyp *)arg; + vm = hyp->vm; + vme = vm_exitinfo(vm, vcpu); + + hypctx = &hyp->ctx[vcpu]; + hypctx->tf.tf_elr = (uint64_t)pc; + + for (;;) { + daif = intr_disable(); + + /* Check if the vcpu is suspended */ + if (vcpu_suspended(evinfo)) { + intr_restore(daif); + vm_exit_suspended(vm, vcpu, pc); + break; + } + + /* Activate the stage2 pmap so the vmid is valid */ + pmap_activate_vm(pmap); + hyp->vttbr_el2 = pmap_to_ttbr0(pmap); + + /* + * TODO: What happens if a timer interrupt is asserted exactly + * here, but for the previous VM? + */ + arm64_set_active_vcpu(hypctx); + vgic_v3_flush_hwstate(hypctx); + + /* Call into EL2 to switch to the guest */ + excp_type = vmm_call_hyp(HYP_ENTER_GUEST, + hyp->el2_addr, vcpu); + + vgic_v3_sync_hwstate(hypctx); + + /* + * Deactivate the stage2 pmap. vmm_pmap_clean_stage2_tlbi + * depends on this meaning we activate the VM before entering + * the vm again + */ + PCPU_SET(curvmpmap, NULL); + intr_restore(daif); + + if (excp_type == EXCP_TYPE_MAINT_IRQ) + continue; + + vme->pc = hypctx->tf.tf_elr; + vme->inst_length = INSN_SIZE; + vme->u.hyp.exception_nr = excp_type; + vme->u.hyp.esr_el2 = hypctx->tf.tf_esr; + vme->u.hyp.far_el2 = hypctx->exit_info.far_el2; + vme->u.hyp.hpfar_el2 = hypctx->exit_info.hpfar_el2; + + handled = arm64_handle_world_switch(hyp, vcpu, excp_type, vme, + pmap); + if (handled == UNHANDLED) + /* Exit loop to emulate instruction. */ + break; + else + /* Resume guest execution from the next instruction. */ + hypctx->tf.tf_elr += vme->inst_length; + } + + return (0); +} + +static void +arm_pcpu_vmcleanup(void *arg) +{ + struct hyp *hyp; + int i, maxcpus; + + hyp = arg; + maxcpus = vm_get_maxcpus(hyp->vm); + for (i = 0; i < maxcpus; i++) { + if (arm64_get_active_vcpu() == &hyp->ctx[i]) { + arm64_set_active_vcpu(NULL); + break; + } + } +} + +static void +arm_vmcleanup(void *arg) +{ + struct hyp *hyp = arg; + struct hypctx *hypctx; + int i; + + for (i = 0; i < VM_MAXCPU; i++) { + hypctx = &hyp->ctx[i]; + vtimer_cpucleanup(hypctx); + vgic_v3_cpucleanup(hypctx); + } + + vtimer_vmcleanup(hyp); + vgic_v3_vmcleanup(hyp); + + smp_rendezvous(NULL, arm_pcpu_vmcleanup, NULL, hyp); + + /* Unmap the VM hyp struct from the hyp mode translation table */ + vmmpmap_remove(hyp->el2_addr, roundup2(sizeof(*hyp), PAGE_SIZE), + true); + + free(hyp, M_HYP); +} + +/* + * Return register value. Registers have different sizes and an explicit cast + * must be made to ensure proper conversion. 
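 * For example, VM_REG_GUEST_SPSR lives in the 64-bit tf_spsr field but only
 * its low 32 bits are of interest, so arm_getreg() and arm_setreg() below go
 * through a uint32_t cast for that register and use uint64_t everywhere else.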
+ */ +static void * +hypctx_regptr(struct hypctx *hypctx, int reg) +{ + switch (reg) { + case VM_REG_GUEST_X0: + return (&hypctx->tf.tf_x[0]); + case VM_REG_GUEST_X1: + return (&hypctx->tf.tf_x[1]); + case VM_REG_GUEST_X2: + return (&hypctx->tf.tf_x[2]); + case VM_REG_GUEST_X3: + return (&hypctx->tf.tf_x[3]); + case VM_REG_GUEST_X4: + return (&hypctx->tf.tf_x[4]); + case VM_REG_GUEST_X5: + return (&hypctx->tf.tf_x[5]); + case VM_REG_GUEST_X6: + return (&hypctx->tf.tf_x[6]); + case VM_REG_GUEST_X7: + return (&hypctx->tf.tf_x[7]); + case VM_REG_GUEST_X8: + return (&hypctx->tf.tf_x[8]); + case VM_REG_GUEST_X9: + return (&hypctx->tf.tf_x[9]); + case VM_REG_GUEST_X10: + return (&hypctx->tf.tf_x[10]); + case VM_REG_GUEST_X11: + return (&hypctx->tf.tf_x[11]); + case VM_REG_GUEST_X12: + return (&hypctx->tf.tf_x[12]); + case VM_REG_GUEST_X13: + return (&hypctx->tf.tf_x[13]); + case VM_REG_GUEST_X14: + return (&hypctx->tf.tf_x[14]); + case VM_REG_GUEST_X15: + return (&hypctx->tf.tf_x[15]); + case VM_REG_GUEST_X16: + return (&hypctx->tf.tf_x[16]); + case VM_REG_GUEST_X17: + return (&hypctx->tf.tf_x[17]); + case VM_REG_GUEST_X18: + return (&hypctx->tf.tf_x[18]); + case VM_REG_GUEST_X19: + return (&hypctx->tf.tf_x[19]); + case VM_REG_GUEST_X20: + return (&hypctx->tf.tf_x[20]); + case VM_REG_GUEST_X21: + return (&hypctx->tf.tf_x[21]); + case VM_REG_GUEST_X22: + return (&hypctx->tf.tf_x[22]); + case VM_REG_GUEST_X23: + return (&hypctx->tf.tf_x[23]); + case VM_REG_GUEST_X24: + return (&hypctx->tf.tf_x[24]); + case VM_REG_GUEST_X25: + return (&hypctx->tf.tf_x[25]); + case VM_REG_GUEST_X26: + return (&hypctx->tf.tf_x[26]); + case VM_REG_GUEST_X27: + return (&hypctx->tf.tf_x[27]); + case VM_REG_GUEST_X28: + return (&hypctx->tf.tf_x[28]); + case VM_REG_GUEST_X29: + return (&hypctx->tf.tf_x[29]); + case VM_REG_GUEST_LR: + return (&hypctx->tf.tf_lr); + case VM_REG_GUEST_SP: + return (&hypctx->tf.tf_sp); + case VM_REG_GUEST_ELR: /* This is bogus */ + return (&hypctx->tf.tf_elr); + case VM_REG_GUEST_SPSR: /* This is bogus */ + return (&hypctx->tf.tf_spsr); + case VM_REG_ELR_EL2: + return (&hypctx->tf.tf_elr); + default: + break; + } + return (NULL); +} + +static int +arm_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + void *regp; + int running, hostcpu; + struct hyp *hyp = arg; + + running = vcpu_is_running(hyp->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("arm_getreg: %s%d is running", vm_name(hyp->vm), vcpu); + + if ((regp = hypctx_regptr(&hyp->ctx[vcpu], reg)) != NULL) { + if (reg == VM_REG_GUEST_SPSR) + *retval = *(uint32_t *)regp; + else + *retval = *(uint64_t *)regp; + return (0); + } else { + return (EINVAL); + } +} + +static int +arm_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + void *regp; + struct hyp *hyp = arg; + int running, hostcpu; + + running = vcpu_is_running(hyp->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("hyp_setreg: %s%d is running", vm_name(hyp->vm), vcpu); + + if ((regp = hypctx_regptr(&hyp->ctx[vcpu], reg)) != NULL) { + if (reg == VM_REG_GUEST_SPSR) + *(uint32_t *)regp = (uint32_t)val; + else + *(uint64_t *)regp = val; + return (0); + } else { + return (EINVAL); + } +} + +static int +arm_getcap(void *arg, int vcpu, int type, int *retval) +{ + int ret; + + ret = ENOENT; + + switch (type) { + case VM_CAP_UNRESTRICTED_GUEST: + *retval = 1; + ret = 0; + break; + default: + break; + } + + return (ret); +} + +static int +arm_setcap(void *arg, int vcpu, int type, int val) +{ + + return (ENOENT); +} + +static +void arm_restore(void) +{ 
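	/* The resume hook (vmm_ops_arm.resume) is currently a no-op. */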
+ ; +} + +struct vmm_ops vmm_ops_arm = { + .init = arm_init, + .cleanup = arm_cleanup, + .resume = arm_restore, + .vminit = arm_vminit, + .vmrun = arm_vmrun, + .vmcleanup = arm_vmcleanup, + .vmgetreg = arm_getreg, + .vmsetreg = arm_setreg, + .vmgetcap = arm_getcap, + .vmsetcap = arm_setcap, + .vmspace_alloc = arm_vmspace_alloc, + .vmspace_free = arm_vmspace_free, +}; diff --git a/sys/arm64/vmm/vmm_call.S b/sys/arm64/vmm/vmm_call.S new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_call.S @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2017 Alexandru Elisei + * All rights reserved. + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include + + .text + +ENTRY(vmm_call_hyp) + hvc #0 + ret +END(vmm_call_hyp) diff --git a/sys/arm64/vmm/vmm_dev.c b/sys/arm64/vmm/vmm_dev.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_dev.c @@ -0,0 +1,970 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "vmm_stat.h" + +struct devmem_softc { + int segid; + char *name; + struct cdev *cdev; + struct vmmdev_softc *sc; + SLIST_ENTRY(devmem_softc) link; +}; + +struct vmmdev_softc { + struct vm *vm; /* vm instance cookie */ + struct cdev *cdev; + SLIST_ENTRY(vmmdev_softc) link; + SLIST_HEAD(, devmem_softc) devmem; + int flags; +}; +#define VSC_LINKED 0x01 + +static SLIST_HEAD(, vmmdev_softc) head; + +static unsigned pr_allow_flag; +static struct mtx vmmdev_mtx; + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); + +SYSCTL_DECL(_hw_vmm); + +static int vmm_priv_check(struct ucred *ucred); +static int devmem_create_cdev(const char *vmname, int id, char *devmem); +static void devmem_destroy(void *arg); + +static int +vmm_priv_check(struct ucred *ucred) +{ + + if (jailed(ucred) && + !(ucred->cr_prison->pr_allow & pr_allow_flag)) + return (EPERM); + + return (0); +} + +static int +vcpu_lock_one(struct vmmdev_softc *sc, int vcpu) +{ + int error; + + if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm)) + return (EINVAL); + + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + return (error); +} + +static void +vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu) +{ + enum vcpu_state state; + + state = vcpu_get_state(sc->vm, vcpu, NULL); + if (state != VCPU_FROZEN) { + panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm), + vcpu, state); + } + + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); +} + +static int +vcpu_lock_all(struct vmmdev_softc *sc) +{ + int error, vcpu; + uint16_t maxcpus; + + maxcpus = vm_get_maxcpus(sc->vm); + for (vcpu = 0; vcpu < maxcpus; vcpu++) { + error = vcpu_lock_one(sc, vcpu); + if (error) + break; + } + + if (error) { + while (--vcpu >= 0) + vcpu_unlock_one(sc, vcpu); + } + + return (error); +} + +static void +vcpu_unlock_all(struct vmmdev_softc *sc) +{ + int vcpu; + uint16_t maxcpus; + + maxcpus = vm_get_maxcpus(sc->vm); + for (vcpu = 0; vcpu < maxcpus; vcpu++) + vcpu_unlock_one(sc, vcpu); +} + +static struct vmmdev_softc * +vmmdev_lookup(const char *name) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(name, vm_name(sc->vm)) == 0) + break; + } + + return (sc); +} + +static struct vmmdev_softc * +vmmdev_lookup2(struct cdev *cdev) +{ + + return (cdev->si_drv1); +} + +static int +vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) +{ + int error, off, c, prot; + vm_paddr_t gpa, maxaddr; + void *hpa, *cookie; + struct vmmdev_softc *sc; + uint16_t lastcpu; + + error = vmm_priv_check(curthread->td_ucred); + if (error) + return (error); + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) + return (ENXIO); + + /* + * Get a read lock on the 
guest memory map by freezing any vcpu. + */ + lastcpu = vm_get_maxcpus(sc->vm) - 1; + error = vcpu_lock_one(sc, lastcpu); + if (error) + return (error); + + prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); + maxaddr = vmm_sysmem_maxaddr(sc->vm); + while (uio->uio_resid > 0 && error == 0) { + gpa = uio->uio_offset; + off = gpa & PAGE_MASK; + c = min(uio->uio_resid, PAGE_SIZE - off); + + /* + * The VM has a hole in its physical memory map. If we want to + * use 'dd' to inspect memory beyond the hole we need to + * provide bogus data for memory that lies in the hole. + * + * Since this device does not support lseek(2), dd(1) will + * read(2) blocks of data to simulate the lseek(2). + */ + hpa = vm_gpa_hold(sc->vm, lastcpu, gpa, c, + prot, &cookie); + if (hpa == NULL) { + if (uio->uio_rw == UIO_READ && gpa < maxaddr) + error = uiomove(__DECONST(void *, zero_region), + c, uio); + else + error = EFAULT; + } else { + error = uiomove(hpa, c, uio); + vm_gpa_release(cookie); + } + } + vcpu_unlock_one(sc, lastcpu); + return (error); +} + +static int +get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) +{ + struct devmem_softc *dsc; + int error; + bool sysmem; + + error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); + if (error || mseg->len == 0) + return (error); + + if (!sysmem) { + SLIST_FOREACH(dsc, &sc->devmem, link) { + if (dsc->segid == mseg->segid) + break; + } + KASSERT(dsc != NULL, ("%s: devmem segment %d not found", + __func__, mseg->segid)); + error = copystr(dsc->name, mseg->name, sizeof(mseg->name), + NULL); + } else { + bzero(mseg->name, sizeof(mseg->name)); + } + + return (error); +} + +static int +alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) +{ + char *name; + int error; + bool sysmem; + + error = 0; + name = NULL; + sysmem = true; + + /* + * The allocation is lengthened by 1 to hold a terminating NUL. It'll + * by stripped off when devfs processes the full string. 
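 * On success the buffer is handed to devmem_create_cdev() and lives on as
 * dsc->name until vmmdev_destroy() tears the devmem list down; on any error
 * it is freed below.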
+ */ + if (VM_MEMSEG_NAME(mseg)) { + sysmem = false; + name = malloc(sizeof(mseg->name), M_VMMDEV, M_WAITOK); + error = copystr(mseg->name, name, sizeof(mseg->name), NULL); + if (error) + goto done; + } + + error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); + if (error) + goto done; + + if (VM_MEMSEG_NAME(mseg)) { + error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); + if (error) + vm_free_memseg(sc->vm, mseg->segid); + else + name = NULL; /* freed when 'cdev' is destroyed */ + } +done: + free(name, M_VMMDEV); + return (error); +} + +static int +vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum, + uint64_t *regval) +{ + int error, i; + + error = 0; + for (i = 0; i < count; i++) { + error = vm_get_register(vm, vcpu, regnum[i], ®val[i]); + if (error) + break; + } + return (error); +} + +static int +vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum, + uint64_t *regval) +{ + int error, i; + + error = 0; + for (i = 0; i < count; i++) { + error = vm_set_register(vm, vcpu, regnum[i], regval[i]); + if (error) + break; + } + return (error); +} + +static int +vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + int error, vcpu, state_changed, size; + cpuset_t *cpuset; + struct vmmdev_softc *sc; + struct vm_register *vmreg; + struct vm_register_set *vmregset; + struct vm_run *vmrun; + struct vm_activate_cpu *vac; + struct vm_attach_vgic *vav; + struct vm_cpuset *vm_cpuset; + struct vm_irq *vi; + struct vm_capability *vmcap; + struct vm_stats *vmstats; + struct vm_stat_desc *statdesc; + struct vm_suspend *vmsuspend; + struct vm_memmap *mm; + struct vm_msi *vmsi; + struct vm_cpu_topology *topology; + uint64_t *regvals; + int *regnums; + + error = vmm_priv_check(curthread->td_ucred); + if (error) + return (error); + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) + return (ENXIO); + + error = 0; + vcpu = -1; + state_changed = 0; + + /* + * Some VMM ioctls can operate only on vcpus that are not running. + */ + switch (cmd) { + case VM_RUN: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_REGISTER_SET: + case VM_SET_REGISTER_SET: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_ACTIVATE_CPU: + /* + * XXX fragile, handle with care + * Assumes that the first field of the ioctl data is the vcpu. + */ + vcpu = *(int *)data; + error = vcpu_lock_one(sc, vcpu); + if (error) + goto done; + state_changed = 1; + break; + + case VM_ALLOC_MEMSEG: + case VM_MMAP_MEMSEG: + case VM_REINIT: + case VM_ATTACH_VGIC: + /* + * ioctls that operate on the entire virtual machine must + * prevent all vcpus from running. + */ + error = vcpu_lock_all(sc); + if (error) + goto done; + state_changed = 2; + break; + case VM_GET_MEMSEG: + case VM_MMAP_GETNEXT: + /* + * Lock a vcpu to make sure that the memory map cannot be + * modified while it is being inspected. 
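 * Freezing a single vcpu is enough because the ioctls that do modify the
 * memory map (VM_ALLOC_MEMSEG, VM_MMAP_MEMSEG) freeze all vcpus above; the
 * last vcpu is chosen purely by convention, matching vmmdev_rw() and
 * vmmdev_mmap_single().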
+ */ + vcpu = vm_get_maxcpus(sc->vm) - 1; + error = vcpu_lock_one(sc, vcpu); + if (error) + goto done; + state_changed = 1; + break; + case VM_ASSERT_IRQ: + vi =(struct vm_irq *)data; + error = vm_assert_irq(sc->vm, vi->irq); + break; + case VM_DEASSERT_IRQ: + vi = (struct vm_irq *)data; + error = vm_deassert_irq(sc->vm, vi->irq); + break; + default: + break; + } + + switch(cmd) { + case VM_RUN: + vmrun = (struct vm_run *)data; + error = vm_run(sc->vm, vmrun); + break; + case VM_SUSPEND: + vmsuspend = (struct vm_suspend *)data; + error = vm_suspend(sc->vm, vmsuspend->how); + break; + case VM_REINIT: + error = vm_reinit(sc->vm); + break; + case VM_STAT_DESC: { + statdesc = (struct vm_stat_desc *)data; + error = vmm_stat_desc_copy(statdesc->index, + statdesc->desc, sizeof(statdesc->desc)); + break; + } + case VM_STATS: { + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); + vmstats = (struct vm_stats *)data; + getmicrotime(&vmstats->tv); + error = vmm_stat_copy(sc->vm, vmstats->cpuid, vmstats->index, + nitems(vmstats->statbuf), + &vmstats->num_entries, vmstats->statbuf); + break; + } + case VM_MMAP_GETNEXT: + mm = (struct vm_memmap *)data; + error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, + &mm->segoff, &mm->len, &mm->prot, &mm->flags); + break; + case VM_MMAP_MEMSEG: + mm = (struct vm_memmap *)data; + error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, + mm->len, mm->prot, mm->flags); + break; + case VM_ALLOC_MEMSEG: + error = alloc_memseg(sc, (struct vm_memseg *)data); + break; + case VM_GET_MEMSEG: + error = get_memseg(sc, (struct vm_memseg *)data); + break; + case VM_GET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, + &vmreg->regval); + break; + case VM_SET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, + vmreg->regval); + break; + case VM_GET_REGISTER_SET: + vmregset = (struct vm_register_set *)data; + if (vmregset->count > VM_REG_LAST) { + error = EINVAL; + break; + } + regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * + vmregset->count); + if (error == 0) + error = vm_get_register_set(sc->vm, vmregset->cpuid, + vmregset->count, regnums, regvals); + if (error == 0) + error = copyout(regvals, vmregset->regvals, + sizeof(regvals[0]) * vmregset->count); + free(regvals, M_VMMDEV); + free(regnums, M_VMMDEV); + break; + case VM_SET_REGISTER_SET: + vmregset = (struct vm_register_set *)data; + if (vmregset->count > VM_REG_LAST) { + error = EINVAL; + break; + } + regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV, + M_WAITOK); + error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) * + vmregset->count); + if (error == 0) + error = copyin(vmregset->regvals, regvals, + sizeof(regvals[0]) * vmregset->count); + if (error == 0) + error = vm_set_register_set(sc->vm, vmregset->cpuid, + vmregset->count, regnums, regvals); + free(regvals, M_VMMDEV); + free(regnums, M_VMMDEV); + break; + case VM_GET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_get_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + &vmcap->capval); + break; + case VM_SET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_set_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + vmcap->capval); + 
break; + case VM_ACTIVATE_CPU: + vac = (struct vm_activate_cpu *)data; + error = vm_activate_cpu(sc->vm, vac->vcpuid); + break; + case VM_GET_CPUS: + error = 0; + vm_cpuset = (struct vm_cpuset *)data; + size = vm_cpuset->cpusetsize; + if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { + error = ERANGE; + break; + } + cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + if (vm_cpuset->which == VM_ACTIVE_CPUS) + *cpuset = vm_active_cpus(sc->vm); + else if (vm_cpuset->which == VM_SUSPENDED_CPUS) + *cpuset = vm_suspended_cpus(sc->vm); + else if (vm_cpuset->which == VM_DEBUG_CPUS) + *cpuset = vm_debug_cpus(sc->vm); + else + error = EINVAL; + if (error == 0) + error = copyout(cpuset, vm_cpuset->cpus, size); + free(cpuset, M_TEMP); + break; + case VM_SUSPEND_CPU: + vac = (struct vm_activate_cpu *)data; + error = vm_suspend_cpu(sc->vm, vac->vcpuid); + break; + case VM_RESUME_CPU: + vac = (struct vm_activate_cpu *)data; + error = vm_resume_cpu(sc->vm, vac->vcpuid); + break; + case VM_ATTACH_VGIC: + vav = (struct vm_attach_vgic *)data; + error = vm_attach_vgic(sc->vm, vav->dist_start, vav->dist_size, + vav->redist_start, vav->redist_size); + break; + case VM_RAISE_MSI: + vmsi = (struct vm_msi *)data; + error = vm_raise_msi(sc->vm, vmsi->msg, vmsi->addr, vmsi->bus, + vmsi->slot, vmsi->func); + break; + case VM_SET_TOPOLOGY: + topology = (struct vm_cpu_topology *)data; + error = vm_set_topology(sc->vm, topology->sockets, + topology->cores, topology->threads, topology->maxcpus); + break; + case VM_GET_TOPOLOGY: + topology = (struct vm_cpu_topology *)data; + vm_get_topology(sc->vm, &topology->sockets, &topology->cores, + &topology->threads, &topology->maxcpus); + error = 0; + break; + default: + error = ENOTTY; + break; + } + + if (state_changed == 1) + vcpu_unlock_one(sc, vcpu); + else if (state_changed == 2) + vcpu_unlock_all(sc); + +done: + /* + * Make sure that no handler returns a kernel-internal + * error value to userspace. + */ + KASSERT(error == ERESTART || error >= 0, + ("vmmdev_ioctl: invalid error return %d", error)); + return (error); +} + +static int +vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, + struct vm_object **objp, int nprot) +{ + struct vmmdev_softc *sc; + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff, first, last; + int error, found, segid; + uint16_t lastcpu; + bool sysmem; + + error = vmm_priv_check(curthread->td_ucred); + if (error) + return (error); + + first = *offset; + last = first + mapsize; + if ((nprot & PROT_EXEC) || first < 0 || first >= last) + return (EINVAL); + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) { + /* virtual machine is in the process of being created */ + return (EINVAL); + } + + /* + * Get a read lock on the guest memory map by freezing any vcpu. 
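 * As in vmmdev_rw() and the VM_GET_MEMSEG/VM_MMAP_GETNEXT ioctls, one frozen
 * vcpu is sufficient since the map is only changed with every vcpu frozen.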
+ */ + lastcpu = vm_get_maxcpus(sc->vm) - 1; + error = vcpu_lock_one(sc, lastcpu); + if (error) + return (error); + + gpa = 0; + found = 0; + while (!found) { + error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, + NULL, NULL); + if (error) + break; + + if (first >= gpa && last <= gpa + len) + found = 1; + else + gpa += len; + } + + if (found) { + error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); + KASSERT(error == 0 && *objp != NULL, + ("%s: invalid memory segment %d", __func__, segid)); + if (sysmem) { + vm_object_reference(*objp); + *offset = segoff + (first - gpa); + } else { + error = EINVAL; + } + } + vcpu_unlock_one(sc, lastcpu); + return (error); +} + +static void +vmmdev_destroy(void *arg) +{ + struct vmmdev_softc *sc = arg; + struct devmem_softc *dsc; + int error __diagused; + + error = vcpu_lock_all(sc); + KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); + + while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { + KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); + SLIST_REMOVE_HEAD(&sc->devmem, link); + free(dsc->name, M_VMMDEV); + free(dsc, M_VMMDEV); + } + + if (sc->cdev != NULL) + destroy_dev(sc->cdev); + + if (sc->vm != NULL) + vm_destroy(sc->vm); + + if ((sc->flags & VSC_LINKED) != 0) { + mtx_lock(&vmmdev_mtx); + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + mtx_unlock(&vmmdev_mtx); + } + + free(sc, M_VMMDEV); +} + +static int +sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) +{ + struct devmem_softc *dsc; + struct vmmdev_softc *sc; + struct cdev *cdev; + char *buf; + int error, buflen; + + error = vmm_priv_check(req->td->td_ucred); + if (error) + return (error); + + buflen = VM_MAX_NAMELEN + 1; + buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); + strlcpy(buf, "beavis", buflen); + error = sysctl_handle_string(oidp, buf, buflen, req); + if (error != 0 || req->newptr == NULL) + goto out; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + if (sc == NULL || sc->cdev == NULL) { + mtx_unlock(&vmmdev_mtx); + error = EINVAL; + goto out; + } + + /* + * The 'cdev' will be destroyed asynchronously when 'si_threadcount' + * goes down to 0 so we should not do it again in the callback. + * + * Setting 'sc->cdev' to NULL is also used to indicate that the VM + * is scheduled for destruction. + */ + cdev = sc->cdev; + sc->cdev = NULL; + mtx_unlock(&vmmdev_mtx); + + /* + * Schedule all cdevs to be destroyed: + * + * - any new operations on the 'cdev' will return an error (ENXIO). + * + * - when the 'si_threadcount' dwindles down to zero the 'cdev' will + * be destroyed and the callback will be invoked in a taskqueue + * context. 
+ * + * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' + */ + SLIST_FOREACH(dsc, &sc->devmem, link) { + KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); + destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc); + } + destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); + error = 0; + +out: + free(buf, M_VMMDEV); + return (error); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, + NULL, 0, sysctl_vmm_destroy, "A", + NULL); + +static struct cdevsw vmmdevsw = { + .d_name = "vmmdev", + .d_version = D_VERSION, + .d_ioctl = vmmdev_ioctl, + .d_mmap_single = vmmdev_mmap_single, + .d_read = vmmdev_rw, + .d_write = vmmdev_rw, +}; + +static int +sysctl_vmm_create(SYSCTL_HANDLER_ARGS) +{ + struct vm *vm; + struct cdev *cdev; + struct vmmdev_softc *sc, *sc2; + char *buf; + int error, buflen; + + error = vmm_priv_check(req->td->td_ucred); + if (error) + return (error); + + buflen = VM_MAX_NAMELEN + 1; + buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO); + strlcpy(buf, "beavis", buflen); + error = sysctl_handle_string(oidp, buf, buflen, req); + if (error != 0 || req->newptr == NULL) + goto out; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + mtx_unlock(&vmmdev_mtx); + if (sc != NULL) { + error = EEXIST; + goto out; + } + + error = vm_create(buf, &vm); + if (error != 0) + goto out; + + sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); + sc->vm = vm; + SLIST_INIT(&sc->devmem); + + /* + * Lookup the name again just in case somebody sneaked in when we + * dropped the lock. + */ + mtx_lock(&vmmdev_mtx); + sc2 = vmmdev_lookup(buf); + if (sc2 == NULL) { + SLIST_INSERT_HEAD(&head, sc, link); + sc->flags |= VSC_LINKED; + } + mtx_unlock(&vmmdev_mtx); + + if (sc2 != NULL) { + vmmdev_destroy(sc); + error = EEXIST; + goto out; + } + + error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, + UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); + if (error != 0) { + vmmdev_destroy(sc); + goto out; + } + + mtx_lock(&vmmdev_mtx); + sc->cdev = cdev; + sc->cdev->si_drv1 = sc; + mtx_unlock(&vmmdev_mtx); + +out: + free(buf, M_VMMDEV); + return (error); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, create, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, + NULL, 0, sysctl_vmm_create, "A", + NULL); + +void +vmmdev_init(void) +{ + mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); + pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, + "Allow use of vmm in a jail."); +} + +int +vmmdev_cleanup(void) +{ + int error; + + if (SLIST_EMPTY(&head)) + error = 0; + else + error = EBUSY; + + return (error); +} + +static int +devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, + struct vm_object **objp, int nprot) +{ + struct devmem_softc *dsc; + vm_ooffset_t first, last; + size_t seglen; + int error; + uint16_t lastcpu; + bool sysmem; + + dsc = cdev->si_drv1; + if (dsc == NULL) { + /* 'cdev' has been created but is not ready for use */ + return (ENXIO); + } + + first = *offset; + last = *offset + len; + if ((nprot & PROT_EXEC) || first < 0 || first >= last) + return (EINVAL); + + lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1; + error = vcpu_lock_one(dsc->sc, lastcpu); + if (error) + return (error); + + error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); + KASSERT(error == 0 && !sysmem && *objp != NULL, + ("%s: invalid devmem segment %d", __func__, dsc->segid)); + + vcpu_unlock_one(dsc->sc, lastcpu); + + if (seglen >= last) { + vm_object_reference(*objp); + return (0); + } 
else { + return (EINVAL); + } +} + +static struct cdevsw devmemsw = { + .d_name = "devmem", + .d_version = D_VERSION, + .d_mmap_single = devmem_mmap_single, +}; + +static int +devmem_create_cdev(const char *vmname, int segid, char *devname) +{ + struct devmem_softc *dsc; + struct vmmdev_softc *sc; + struct cdev *cdev; + int error; + + error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, + UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); + if (error) + return (error); + + dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(vmname); + KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); + if (sc->cdev == NULL) { + /* virtual machine is being created or destroyed */ + mtx_unlock(&vmmdev_mtx); + free(dsc, M_VMMDEV); + destroy_dev_sched_cb(cdev, NULL, 0); + return (ENODEV); + } + + dsc->segid = segid; + dsc->name = devname; + dsc->cdev = cdev; + dsc->sc = sc; + SLIST_INSERT_HEAD(&sc->devmem, dsc, link); + mtx_unlock(&vmmdev_mtx); + + /* The 'cdev' is ready for use after 'si_drv1' is initialized */ + cdev->si_drv1 = dsc; + return (0); +} + +static void +devmem_destroy(void *arg) +{ + struct devmem_softc *dsc = arg; + + KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); + dsc->cdev = NULL; + dsc->sc = NULL; +} diff --git a/sys/arm64/vmm/vmm_hyp.c b/sys/arm64/vmm/vmm_hyp.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_hyp.c @@ -0,0 +1,822 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Andrew Turner + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include + +#include + +#include "arm64.h" +#include "hyp.h" + +struct hypctx; + +uint64_t vmm_hyp_enter(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, + uint64_t, uint64_t, uint64_t); +uint64_t vmm_enter_guest(struct hypctx *); + +/* TODO: Make this common between this & vfp.h */ +static void +vfp_store(struct vfpstate *state) +{ + __uint128_t *vfp_state; + uint64_t fpcr, fpsr; + + vfp_state = state->vfp_regs; + __asm __volatile( + "mrs %0, fpcr \n" + "mrs %1, fpsr \n" + "stp q0, q1, [%2, #16 * 0]\n" + "stp q2, q3, [%2, #16 * 2]\n" + "stp q4, q5, [%2, #16 * 4]\n" + "stp q6, q7, [%2, #16 * 6]\n" + "stp q8, q9, [%2, #16 * 8]\n" + "stp q10, q11, [%2, #16 * 10]\n" + "stp q12, q13, [%2, #16 * 12]\n" + "stp q14, q15, [%2, #16 * 14]\n" + "stp q16, q17, [%2, #16 * 16]\n" + "stp q18, q19, [%2, #16 * 18]\n" + "stp q20, q21, [%2, #16 * 20]\n" + "stp q22, q23, [%2, #16 * 22]\n" + "stp q24, q25, [%2, #16 * 24]\n" + "stp q26, q27, [%2, #16 * 26]\n" + "stp q28, q29, [%2, #16 * 28]\n" + "stp q30, q31, [%2, #16 * 30]\n" + : "=&r"(fpcr), "=&r"(fpsr) : "r"(vfp_state)); + + state->vfp_fpcr = fpcr; + state->vfp_fpsr = fpsr; +} + +static void +vfp_restore(struct vfpstate *state) +{ + __uint128_t *vfp_state; + uint64_t fpcr, fpsr; + + vfp_state = state->vfp_regs; + fpcr = state->vfp_fpcr; + fpsr = state->vfp_fpsr; + + __asm __volatile( + "ldp q0, q1, [%2, #16 * 0]\n" + "ldp q2, q3, [%2, #16 * 2]\n" + "ldp q4, q5, [%2, #16 * 4]\n" + "ldp q6, q7, [%2, #16 * 6]\n" + "ldp q8, q9, [%2, #16 * 8]\n" + "ldp q10, q11, [%2, #16 * 10]\n" + "ldp q12, q13, [%2, #16 * 12]\n" + "ldp q14, q15, [%2, #16 * 14]\n" + "ldp q16, q17, [%2, #16 * 16]\n" + "ldp q18, q19, [%2, #16 * 18]\n" + "ldp q20, q21, [%2, #16 * 20]\n" + "ldp q22, q23, [%2, #16 * 22]\n" + "ldp q24, q25, [%2, #16 * 24]\n" + "ldp q26, q27, [%2, #16 * 26]\n" + "ldp q28, q29, [%2, #16 * 28]\n" + "ldp q30, q31, [%2, #16 * 30]\n" + "msr fpcr, %0 \n" + "msr fpsr, %1 \n" + : : "r"(fpcr), "r"(fpsr), "r"(vfp_state)); +} + +static void +vmm_hyp_reg_store(struct hypctx *hypctx, struct hyp *hyp, bool guest) +{ + uint64_t dfr0; + + /* Store the guest VFP registers */ + if (guest) { + vfp_store(&hypctx->vfpstate); + + /* Store the timer registers */ + hypctx->vtimer_cpu.cntkctl_el1 = READ_SPECIALREG(cntkctl_el1); + hypctx->vtimer_cpu.virt_timer.cntx_cval_el0 = + READ_SPECIALREG(cntv_cval_el0); + hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0 = + READ_SPECIALREG(cntv_ctl_el0); + + /* Store the GICv3 registers */ + hypctx->vgic_cpu_if.ich_eisr_el2 = + READ_SPECIALREG(ich_eisr_el2); + hypctx->vgic_cpu_if.ich_elrsr_el2 = + READ_SPECIALREG(ich_elrsr_el2); + hypctx->vgic_cpu_if.ich_hcr_el2 = READ_SPECIALREG(ich_hcr_el2); + hypctx->vgic_cpu_if.ich_misr_el2 = + READ_SPECIALREG(ich_misr_el2); + hypctx->vgic_cpu_if.ich_vmcr_el2 = + READ_SPECIALREG(ich_vmcr_el2); + switch(hypctx->vgic_cpu_if.ich_lr_num - 1) { +#define STORE_LR(x) \ + case x: \ + hypctx->vgic_cpu_if.ich_lr_el2[x] = \ + READ_SPECIALREG(ich_lr ## x ##_el2) + STORE_LR(15); + STORE_LR(14); + STORE_LR(13); + STORE_LR(12); + STORE_LR(11); + STORE_LR(10); + STORE_LR(9); + STORE_LR(8); + STORE_LR(7); + STORE_LR(6); + STORE_LR(5); + STORE_LR(4); + STORE_LR(3); + STORE_LR(2); + STORE_LR(1); + default: + STORE_LR(0); +#undef STORE_LR + } + + switch(hypctx->vgic_cpu_if.ich_apr_num - 1) { +#define STORE_APR(x) \ + case x: \ + hypctx->vgic_cpu_if.ich_ap0r_el2[x] = \ + READ_SPECIALREG(ich_ap0r ## x ##_el2); \ + hypctx->vgic_cpu_if.ich_ap1r_el2[x] = \ + READ_SPECIALREG(ich_ap1r ## x ##_el2) + STORE_APR(3); + 
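	/*
	 * Editorial note (not part of the original change): the STORE_*
	 * and LOAD_* switches in this file rely on deliberate fall-through.
	 * Switching on (count - 1) enters at the highest implemented entry
	 * and falls through every lower case down to "default", so for
	 * ich_apr_num == 2 this switch expands roughly to:
	 *
	 *	case 1:
	 *		hypctx->vgic_cpu_if.ich_ap0r_el2[1] =
	 *		    READ_SPECIALREG(ich_ap0r1_el2);
	 *		hypctx->vgic_cpu_if.ich_ap1r_el2[1] =
	 *		    READ_SPECIALREG(ich_ap1r1_el2);
	 *	default:
	 *		hypctx->vgic_cpu_if.ich_ap0r_el2[0] =
	 *		    READ_SPECIALREG(ich_ap0r0_el2);
	 *		hypctx->vgic_cpu_if.ich_ap1r_el2[0] =
	 *		    READ_SPECIALREG(ich_ap1r0_el2);
	 *
	 * A loop cannot be used because READ_SPECIALREG() needs the system
	 * register name as a compile-time token.
	 */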
STORE_APR(2); + STORE_APR(1); + default: + STORE_APR(0); +#undef STORE_APR + } + } + + dfr0 = READ_SPECIALREG(id_aa64dfr0_el1); + switch(ID_AA64DFR0_BRPs_VAL(dfr0) - 1) { +#define STORE_DBG_BRP(x) \ + case x: \ + hypctx->dbgbcr_el1[x] = \ + READ_SPECIALREG(dbgbcr ## x ## _el1); \ + hypctx->dbgbvr_el1[x] = \ + READ_SPECIALREG(dbgbvr ## x ## _el1) + STORE_DBG_BRP(15); + STORE_DBG_BRP(14); + STORE_DBG_BRP(13); + STORE_DBG_BRP(12); + STORE_DBG_BRP(11); + STORE_DBG_BRP(10); + STORE_DBG_BRP(9); + STORE_DBG_BRP(8); + STORE_DBG_BRP(7); + STORE_DBG_BRP(6); + STORE_DBG_BRP(5); + STORE_DBG_BRP(4); + STORE_DBG_BRP(3); + STORE_DBG_BRP(2); + STORE_DBG_BRP(1); + default: + STORE_DBG_BRP(0); +#undef STORE_DBG_BRP + } + + switch(ID_AA64DFR0_WRPs_VAL(dfr0) - 1) { +#define STORE_DBG_WRP(x) \ + case x: \ + hypctx->dbgwcr_el1[x] = \ + READ_SPECIALREG(dbgwcr ## x ## _el1); \ + hypctx->dbgwvr_el1[x] = \ + READ_SPECIALREG(dbgwvr ## x ## _el1) + STORE_DBG_WRP(15); + STORE_DBG_WRP(14); + STORE_DBG_WRP(13); + STORE_DBG_WRP(12); + STORE_DBG_WRP(11); + STORE_DBG_WRP(10); + STORE_DBG_WRP(9); + STORE_DBG_WRP(8); + STORE_DBG_WRP(7); + STORE_DBG_WRP(6); + STORE_DBG_WRP(5); + STORE_DBG_WRP(4); + STORE_DBG_WRP(3); + STORE_DBG_WRP(2); + STORE_DBG_WRP(1); + default: + STORE_DBG_WRP(0); +#undef STORE_DBG_WRP + } + + /* Store the PMU registers */ + hypctx->pmcr_el0 = READ_SPECIALREG(pmcr_el0); + hypctx->pmccntr_el0 = READ_SPECIALREG(pmccntr_el0); + hypctx->pmccfiltr_el0 = READ_SPECIALREG(pmccfiltr_el0); + hypctx->pmcntenset_el0 = READ_SPECIALREG(pmcntenset_el0); + hypctx->pmintenset_el1 = READ_SPECIALREG(pmintenset_el1); + hypctx->pmovsset_el0 = READ_SPECIALREG(pmovsset_el0); + hypctx->pmuserenr_el0 = READ_SPECIALREG(pmuserenr_el0); + switch ((hypctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT) { +#define STORE_PMU(x) \ + case (x + 1): \ + hypctx->pmevcntr_el0[x] = \ + READ_SPECIALREG(pmevcntr ## x ## _el0); \ + hypctx->pmevtyper_el0[x] = \ + READ_SPECIALREG(pmevtyper ## x ## _el0) + STORE_PMU(30); + STORE_PMU(29); + STORE_PMU(28); + STORE_PMU(27); + STORE_PMU(26); + STORE_PMU(25); + STORE_PMU(24); + STORE_PMU(23); + STORE_PMU(22); + STORE_PMU(21); + STORE_PMU(20); + STORE_PMU(19); + STORE_PMU(18); + STORE_PMU(17); + STORE_PMU(16); + STORE_PMU(15); + STORE_PMU(14); + STORE_PMU(13); + STORE_PMU(12); + STORE_PMU(11); + STORE_PMU(10); + STORE_PMU(9); + STORE_PMU(8); + STORE_PMU(7); + STORE_PMU(6); + STORE_PMU(5); + STORE_PMU(4); + STORE_PMU(3); + STORE_PMU(2); + STORE_PMU(1); + STORE_PMU(0); + default: /* N == 0 when only PMCCNTR_EL0 is available */ + break; +#undef STORE_PMU + } + + /* Store the special to from the trapframe */ + hypctx->tf.tf_sp = READ_SPECIALREG(sp_el1); + hypctx->tf.tf_elr = READ_SPECIALREG(elr_el2); + hypctx->tf.tf_spsr = READ_SPECIALREG(spsr_el2); + if (guest) { + hypctx->tf.tf_esr = READ_SPECIALREG(esr_el2); + } + + /* Store the guest special registers */ + hypctx->elr_el1 = READ_SPECIALREG(elr_el1); + hypctx->sp_el0 = READ_SPECIALREG(sp_el0); + hypctx->tpidr_el0 = READ_SPECIALREG(tpidr_el0); + hypctx->tpidrro_el0 = READ_SPECIALREG(tpidrro_el0); + hypctx->tpidr_el1 = READ_SPECIALREG(tpidr_el1); + hypctx->vbar_el1 = READ_SPECIALREG(vbar_el1); + + hypctx->actlr_el1 = READ_SPECIALREG(actlr_el1); + hypctx->afsr0_el1 = READ_SPECIALREG(afsr0_el1); + hypctx->afsr1_el1 = READ_SPECIALREG(afsr1_el1); + hypctx->amair_el1 = READ_SPECIALREG(amair_el1); + hypctx->contextidr_el1 = READ_SPECIALREG(contextidr_el1); + hypctx->cpacr_el1 = READ_SPECIALREG(cpacr_el1); + hypctx->csselr_el1 = READ_SPECIALREG(csselr_el1); + 
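	/*
	 * Editorial note: the EL1 system register context captured here is
	 * used for both the host and the guest.  vmm_hyp_call_guest() first
	 * calls this routine with guest == false to stash the host's EL1
	 * state in a stack-local hypctx, and again with guest == true after
	 * vmm_enter_guest() returns to capture the guest's view.
	 */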
hypctx->esr_el1 = READ_SPECIALREG(esr_el1); + hypctx->far_el1 = READ_SPECIALREG(far_el1); + hypctx->mair_el1 = READ_SPECIALREG(mair_el1); + hypctx->mdccint_el1 = READ_SPECIALREG(mdccint_el1); + hypctx->mdscr_el1 = READ_SPECIALREG(mdscr_el1); + hypctx->par_el1 = READ_SPECIALREG(par_el1); + hypctx->sctlr_el1 = READ_SPECIALREG(sctlr_el1); + hypctx->spsr_el1 = READ_SPECIALREG(spsr_el1); + hypctx->tcr_el1 = READ_SPECIALREG(tcr_el1); + hypctx->ttbr0_el1 = READ_SPECIALREG(ttbr0_el1); + hypctx->ttbr1_el1 = READ_SPECIALREG(ttbr1_el1); + + hypctx->cptr_el2 = READ_SPECIALREG(cptr_el2); + hypctx->hcr_el2 = READ_SPECIALREG(hcr_el2); + hypctx->vpidr_el2 = READ_SPECIALREG(vpidr_el2); + hypctx->vmpidr_el2 = READ_SPECIALREG(vmpidr_el2); +} + +static void +vmm_hyp_reg_restore(struct hypctx *hypctx, struct hyp *hyp, bool guest) +{ + uint64_t dfr0; + + /* Restore the special registers */ + WRITE_SPECIALREG(elr_el1, hypctx->elr_el1); + WRITE_SPECIALREG(sp_el0, hypctx->sp_el0); + WRITE_SPECIALREG(tpidr_el0, hypctx->tpidr_el0); + WRITE_SPECIALREG(tpidrro_el0, hypctx->tpidrro_el0); + WRITE_SPECIALREG(tpidr_el1, hypctx->tpidr_el1); + WRITE_SPECIALREG(vbar_el1, hypctx->vbar_el1); + + WRITE_SPECIALREG(actlr_el1, hypctx->actlr_el1); + WRITE_SPECIALREG(afsr0_el1, hypctx->afsr0_el1); + WRITE_SPECIALREG(afsr1_el1, hypctx->afsr1_el1); + WRITE_SPECIALREG(amair_el1, hypctx->amair_el1); + WRITE_SPECIALREG(contextidr_el1, hypctx->contextidr_el1); + WRITE_SPECIALREG(cpacr_el1, hypctx->cpacr_el1); + WRITE_SPECIALREG(csselr_el1, hypctx->csselr_el1); + WRITE_SPECIALREG(esr_el1, hypctx->esr_el1); + WRITE_SPECIALREG(far_el1, hypctx->far_el1); + WRITE_SPECIALREG(mdccint_el1, hypctx->mdccint_el1); + WRITE_SPECIALREG(mdscr_el1, hypctx->mdscr_el1); + WRITE_SPECIALREG(mair_el1, hypctx->mair_el1); + WRITE_SPECIALREG(par_el1, hypctx->par_el1); + WRITE_SPECIALREG(sctlr_el1, hypctx->sctlr_el1); + WRITE_SPECIALREG(tcr_el1, hypctx->tcr_el1); + WRITE_SPECIALREG(ttbr0_el1, hypctx->ttbr0_el1); + WRITE_SPECIALREG(ttbr1_el1, hypctx->ttbr1_el1); + WRITE_SPECIALREG(spsr_el1, hypctx->spsr_el1); + + WRITE_SPECIALREG(cptr_el2, hypctx->cptr_el2); + WRITE_SPECIALREG(hcr_el2, hypctx->hcr_el2); + WRITE_SPECIALREG(vpidr_el2, hypctx->vpidr_el2); + WRITE_SPECIALREG(vmpidr_el2, hypctx->vmpidr_el2); + + /* Load the special regs from the trapframe */ + WRITE_SPECIALREG(sp_el1, hypctx->tf.tf_sp); + WRITE_SPECIALREG(elr_el2, hypctx->tf.tf_elr); + WRITE_SPECIALREG(spsr_el2, hypctx->tf.tf_spsr); + + /* Restore the PMU registers */ + WRITE_SPECIALREG(pmcr_el0, hypctx->pmcr_el0); + WRITE_SPECIALREG(pmccntr_el0, hypctx->pmccntr_el0); + WRITE_SPECIALREG(pmccfiltr_el0, hypctx->pmccfiltr_el0); + /* Clear all events/interrupts then enable them */ + WRITE_SPECIALREG(pmcntenclr_el0, 0xfffffffful); + WRITE_SPECIALREG(pmcntenset_el0, hypctx->pmcntenset_el0); + WRITE_SPECIALREG(pmintenclr_el1, 0xfffffffful); + WRITE_SPECIALREG(pmintenset_el1, hypctx->pmintenset_el1); + WRITE_SPECIALREG(pmovsclr_el0, 0xfffffffful); + WRITE_SPECIALREG(pmovsset_el0, hypctx->pmovsset_el0); + + switch ((hypctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT) { +#define LOAD_PMU(x) \ + case (x + 1): \ + WRITE_SPECIALREG(pmevcntr ## x ## _el0, \ + hypctx->pmevcntr_el0[x]); \ + WRITE_SPECIALREG(pmevtyper ## x ## _el0, \ + hypctx->pmevtyper_el0[x]) + LOAD_PMU(30); + LOAD_PMU(29); + LOAD_PMU(28); + LOAD_PMU(27); + LOAD_PMU(26); + LOAD_PMU(25); + LOAD_PMU(24); + LOAD_PMU(23); + LOAD_PMU(22); + LOAD_PMU(21); + LOAD_PMU(20); + LOAD_PMU(19); + LOAD_PMU(18); + LOAD_PMU(17); + LOAD_PMU(16); + LOAD_PMU(15); + 
LOAD_PMU(14); + LOAD_PMU(13); + LOAD_PMU(12); + LOAD_PMU(11); + LOAD_PMU(10); + LOAD_PMU(9); + LOAD_PMU(8); + LOAD_PMU(7); + LOAD_PMU(6); + LOAD_PMU(5); + LOAD_PMU(4); + LOAD_PMU(3); + LOAD_PMU(2); + LOAD_PMU(1); + LOAD_PMU(0); + default: /* N == 0 when only PMCCNTR_EL0 is available */ + break; +#undef LOAD_PMU + } + + dfr0 = READ_SPECIALREG(id_aa64dfr0_el1); + switch(ID_AA64DFR0_BRPs_VAL(dfr0) - 1) { +#define LOAD_DBG_BRP(x) \ + case x: \ + WRITE_SPECIALREG(dbgbcr ## x ## _el1, \ + hypctx->dbgbcr_el1[x]); \ + WRITE_SPECIALREG(dbgbvr ## x ## _el1, \ + hypctx->dbgbvr_el1[x]) + LOAD_DBG_BRP(15); + LOAD_DBG_BRP(14); + LOAD_DBG_BRP(13); + LOAD_DBG_BRP(12); + LOAD_DBG_BRP(11); + LOAD_DBG_BRP(10); + LOAD_DBG_BRP(9); + LOAD_DBG_BRP(8); + LOAD_DBG_BRP(7); + LOAD_DBG_BRP(6); + LOAD_DBG_BRP(5); + LOAD_DBG_BRP(4); + LOAD_DBG_BRP(3); + LOAD_DBG_BRP(2); + LOAD_DBG_BRP(1); + default: + LOAD_DBG_BRP(0); +#undef LOAD_DBG_BRP + } + + switch(ID_AA64DFR0_WRPs_VAL(dfr0) - 1) { +#define LOAD_DBG_WRP(x) \ + case x: \ + WRITE_SPECIALREG(dbgwcr ## x ## _el1, \ + hypctx->dbgwcr_el1[x]); \ + WRITE_SPECIALREG(dbgwvr ## x ## _el1, \ + hypctx->dbgwvr_el1[x]) + LOAD_DBG_WRP(15); + LOAD_DBG_WRP(14); + LOAD_DBG_WRP(13); + LOAD_DBG_WRP(12); + LOAD_DBG_WRP(11); + LOAD_DBG_WRP(10); + LOAD_DBG_WRP(9); + LOAD_DBG_WRP(8); + LOAD_DBG_WRP(7); + LOAD_DBG_WRP(6); + LOAD_DBG_WRP(5); + LOAD_DBG_WRP(4); + LOAD_DBG_WRP(3); + LOAD_DBG_WRP(2); + LOAD_DBG_WRP(1); + default: + LOAD_DBG_WRP(0); +#undef LOAD_DBG_WRP + } + + if (guest) { + /* Load the timer registers */ + WRITE_SPECIALREG(cntkctl_el1, hypctx->vtimer_cpu.cntkctl_el1); + WRITE_SPECIALREG(cntv_cval_el0, + hypctx->vtimer_cpu.virt_timer.cntx_cval_el0); + WRITE_SPECIALREG(cntv_ctl_el0, + hypctx->vtimer_cpu.virt_timer.cntx_ctl_el0); + WRITE_SPECIALREG(cnthctl_el2, hyp->vtimer.cnthctl_el2); + WRITE_SPECIALREG(cntvoff_el2, hyp->vtimer.cntvoff_el2); + + /* Load the GICv3 registers */ + WRITE_SPECIALREG(ich_hcr_el2, hypctx->vgic_cpu_if.ich_hcr_el2); + WRITE_SPECIALREG(ich_vmcr_el2, + hypctx->vgic_cpu_if.ich_vmcr_el2); + switch(hypctx->vgic_cpu_if.ich_lr_num - 1) { +#define LOAD_LR(x) \ + case x: \ + WRITE_SPECIALREG(ich_lr ## x ##_el2, \ + hypctx->vgic_cpu_if.ich_lr_el2[x]) + LOAD_LR(15); + LOAD_LR(14); + LOAD_LR(13); + LOAD_LR(12); + LOAD_LR(11); + LOAD_LR(10); + LOAD_LR(9); + LOAD_LR(8); + LOAD_LR(7); + LOAD_LR(6); + LOAD_LR(5); + LOAD_LR(4); + LOAD_LR(3); + LOAD_LR(2); + LOAD_LR(1); + default: + LOAD_LR(0); +#undef LOAD_LR + } + + switch(hypctx->vgic_cpu_if.ich_apr_num - 1) { +#define LOAD_APR(x) \ + case x: \ + WRITE_SPECIALREG(ich_ap0r ## x ##_el2, \ + hypctx->vgic_cpu_if.ich_ap0r_el2[x]); \ + WRITE_SPECIALREG(ich_ap1r ## x ##_el2, \ + hypctx->vgic_cpu_if.ich_ap1r_el2[x]) + LOAD_APR(3); + LOAD_APR(2); + LOAD_APR(1); + default: + LOAD_APR(0); +#undef LOAD_APR + } + + /* Load the guest VFP registers */ + vfp_restore(&hypctx->vfpstate); + } +} + +static uint64_t +vmm_hyp_call_guest(struct hyp *hyp, int vcpu) +{ + struct hypctx host_hypctx; + struct hypctx *hypctx; + uint64_t cntvoff_el2; + uint64_t ich_hcr_el2, ich_vmcr_el2, cnthctl_el2, cntkctl_el1; + uint64_t ret; + uint64_t s1e1r, hpfar_el2; + bool hpfar_valid; + + vmm_hyp_reg_store(&host_hypctx, NULL, false); + + /* TODO: Check cpuid is valid */ + hypctx = &hyp->ctx[vcpu]; + + /* Save the host special registers */ + cnthctl_el2 = READ_SPECIALREG(cnthctl_el2); + cntkctl_el1 = READ_SPECIALREG(cntkctl_el1); + cntvoff_el2 = READ_SPECIALREG(cntvoff_el2); + + ich_hcr_el2 = READ_SPECIALREG(ich_hcr_el2); + ich_vmcr_el2 = 
READ_SPECIALREG(ich_vmcr_el2); + + vmm_hyp_reg_restore(hypctx, hyp, true); + + /* Load the common hypervisor registers */ + WRITE_SPECIALREG(vttbr_el2, hyp->vttbr_el2); + + host_hypctx.mdcr_el2 = READ_SPECIALREG(mdcr_el2); + WRITE_SPECIALREG(mdcr_el2, hypctx->mdcr_el2); + + /* Call into the guest */ + ret = vmm_enter_guest(hypctx); + + WRITE_SPECIALREG(mdcr_el2, host_hypctx.mdcr_el2); + isb(); + + /* Store the exit info */ + hypctx->exit_info.far_el2 = READ_SPECIALREG(far_el2); + hpfar_valid = true; + if (ret == EXCP_TYPE_EL1_SYNC) { + switch(ESR_ELx_EXCEPTION(hypctx->tf.tf_esr)) { + case EXCP_INSN_ABORT_L: + case EXCP_DATA_ABORT_L: + /* + * The hpfar_el2 register is valid for: + * - Translaation and Access faults. + * - Translaation, Access, and permission faults on + * the translation table walk on the stage 1 tables. + * - A stage 2 Address size fault. + * + * As we only need it in the first 2 cases we can just + * exclude it on permission faults that are not from + * the stage 1 table walk. + * + * TODO: Add a case for Arm erratum 834220. + */ + if ((hypctx->tf.tf_esr & ISS_DATA_S1PTW) != 0) + break; + switch(hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) { + case ISS_DATA_DFSC_PF_L1: + case ISS_DATA_DFSC_PF_L2: + case ISS_DATA_DFSC_PF_L3: + hpfar_valid = false; + break; + } + break; + } + } + if (hpfar_valid) { + hypctx->exit_info.hpfar_el2 = READ_SPECIALREG(hpfar_el2); + } else { + /* + * TODO: There is a risk the at instruction could cause an + * exception here. We should handle it & return a failure. + */ + s1e1r = + arm64_address_translate_s1e1r(hypctx->exit_info.far_el2); + if (PAR_SUCCESS(s1e1r)) { + hpfar_el2 = (s1e1r & PAR_PA_MASK) >> PAR_PA_SHIFT; + hpfar_el2 <<= HPFAR_EL2_FIPA_SHIFT; + hypctx->exit_info.hpfar_el2 = hpfar_el2; + } else { + ret = EXCP_TYPE_REENTER; + } + } + + vmm_hyp_reg_store(hypctx, hyp, true); + + vmm_hyp_reg_restore(&host_hypctx, NULL, false); + + /* Restore the host special registers */ + WRITE_SPECIALREG(ich_hcr_el2, ich_hcr_el2); + WRITE_SPECIALREG(ich_vmcr_el2, ich_vmcr_el2); + + WRITE_SPECIALREG(cnthctl_el2, cnthctl_el2); + WRITE_SPECIALREG(cntkctl_el1, cntkctl_el1); + WRITE_SPECIALREG(cntvoff_el2, cntvoff_el2); + + return (ret); +} + +static uint64_t +vmm_hyp_read_reg(uint64_t reg) +{ + switch(reg) { + case HYP_REG_ICH_VTR: + return (READ_SPECIALREG(ich_vtr_el2)); + case HYP_REG_CNTHCTL: + return (READ_SPECIALREG(cnthctl_el2)); + } + + return (0); +} + +static bool +vmm_is_vpipt_cache(void) +{ + /* TODO: Implement */ + return (0); +} + +static int +vmm_clean_s2_tlbi(void) +{ + dsb(ishst); + __asm __volatile("tlbi alle1is"); + + /* + * If we have a VPIPT icache it will use the VMID to tag cachelines. + * As we are changing the allocated VMIDs we need to invalidate the + * icache lines containing all old values. 
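 * As vmm_is_vpipt_cache() above currently just returns false (see its TODO),
 * the "ic ialluis" below is never issued today; the trailing dsb(ish) is
 * still required to complete the TLB invalidation itself.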
+ */ + if (vmm_is_vpipt_cache()) + __asm __volatile("ic ialluis"); + dsb(ish); + + return (0); +} + +static int +vm_s2_tlbi_range(uint64_t vttbr, vm_offset_t sva, vm_size_t eva, + bool final_only) +{ + uint64_t end, r, start; + uint64_t host_vttbr; + +#define TLBI_VA_SHIFT 12 +#define TLBI_VA_MASK ((1ul << 44) - 1) +#define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK) +#define TLBI_VA_L3_INCR (L3_SIZE >> TLBI_VA_SHIFT) + + /* Switch to the guest vttbr */ + /* TODO: Handle Cortex-A57/A72 erratum 131936 */ + host_vttbr = READ_SPECIALREG(vttbr_el2); + WRITE_SPECIALREG(vttbr_el2, vttbr); + isb(); + + /* + * The CPU can cache the stage 1 + 2 combination so we need to ensure + * the stage 2 is invalidated first, then when this has completed we + * invalidate the stage 1 TLB. As we don't know which stage 1 virtual + * addresses point at the stage 2 IPA we need to invalidate the entire + * stage 1 TLB. + */ + + start = TLBI_VA(sva); + end = TLBI_VA(eva); + for (r = start; r < end; r += TLBI_VA_L3_INCR) { + /* Invalidate the stage 2 TLB entry */ + if (final_only) + __asm __volatile("tlbi ipas2le1is, %0" : : "r"(r)); + else + __asm __volatile("tlbi ipas2e1is, %0" : : "r"(r)); + } + /* Ensure the entry has been invalidated */ + dsb(ish); + /* Invalidate the stage 1 TLB. */ + __asm __volatile("tlbi vmalle1is"); + dsb(ish); + isb(); + + /* Switch back t othe host vttbr */ + WRITE_SPECIALREG(vttbr_el2, host_vttbr); + isb(); + + return (0); +} + +static int +vm_s2_tlbi_all(uint64_t vttbr) +{ + uint64_t host_vttbr; + + /* Switch to the guest vttbr */ + /* TODO: Handle Cortex-A57/A72 erratum 131936 */ + host_vttbr = READ_SPECIALREG(vttbr_el2); + WRITE_SPECIALREG(vttbr_el2, vttbr); + isb(); + + __asm __volatile("tlbi vmalls12e1is"); + dsb(ish); + isb(); + + /* Switch back t othe host vttbr */ + WRITE_SPECIALREG(vttbr_el2, host_vttbr); + isb(); + + return (0); +} + +static int +vmm_dc_civac(uint64_t start, uint64_t len) +{ + size_t line_size, end; + uint64_t ctr; + + ctr = READ_SPECIALREG(ctr_el0); + line_size = sizeof(int) << CTR_DLINE_SIZE(ctr); + end = start + len; + dsb(ishst); + /* Clean and Invalidate the D-cache */ + for (; start < end; start += line_size) + __asm __volatile("dc civac, %0" :: "r" (start) : "memory"); + dsb(ish); + return (0); +} + +static int +vmm_el2_tlbi(uint64_t type, uint64_t start, uint64_t len) +{ + uint64_t end, r; + + dsb(ishst); + switch (type) { + default: + case HYP_EL2_TLBI_ALL: + __asm __volatile("tlbi alle2" ::: "memory"); + break; + case HYP_EL2_TLBI_VA: + end = (start + len) >> 12; + start >>= 12; + while (start < end) { + /* TODO: Use new macros when merged past them */ + r = start & 0xffffffffffful; + __asm __volatile("tlbi vae2is, %0" :: "r"(r)); + start += PAGE_SIZE; + } + break; + } + dsb(ish); + + return (0); +} + +uint64_t +vmm_hyp_enter(uint64_t handle, uint64_t x1, uint64_t x2, uint64_t x3, + uint64_t x4, uint64_t x5, uint64_t x6, uint64_t x7) +{ + uint64_t ret; + + switch (handle) { + case HYP_ENTER_GUEST: + do { + ret = vmm_hyp_call_guest((struct hyp *)x1, x2); + } while (ret == EXCP_TYPE_REENTER); + return (ret); + case HYP_READ_REGISTER: + return (vmm_hyp_read_reg(x1)); + case HYP_CLEAN_S2_TLBI: + return (vmm_clean_s2_tlbi()); + case HYP_DC_CIVAC: + return (vmm_dc_civac(x1, x2)); + case HYP_EL2_TLBI: + return (vmm_el2_tlbi(x1, x2, x3)); + case HYP_S2_TLBI_RANGE: + return (vm_s2_tlbi_range(x1, x2, x3, x4)); + case HYP_S2_TLBI_ALL: + return (vm_s2_tlbi_all(x1)); + case HYP_CLEANUP: /* Handled in vmm_hyp_exception.S */ + default: + break; + } + + 
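	/*
	 * Editorial sketch: this is the EL2 end of the hypercall ABI.  The
	 * host places the operation in x0 and up to seven arguments in
	 * x1-x7 and issues "hvc #0" through vmm_call_hyp(), which the EL2
	 * vectors route here, e.g.:
	 *
	 *	ich_vtr_el2 = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_ICH_VTR);
	 *	excp_type = vmm_call_hyp(HYP_ENTER_GUEST, hyp->el2_addr, vcpu);
	 *
	 * Unrecognised handles (and HYP_CLEANUP, which is intercepted in
	 * vmm_hyp_exception.S) fall through to the return below.
	 */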
return (0); +} diff --git a/sys/arm64/vmm/vmm_hyp_el2.S b/sys/arm64/vmm/vmm_hyp_el2.S new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_hyp_el2.S @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Andrew Turner + * + * This work was supported by Innovate UK project 105694, "Digital Security + * by Design (DSbD) Technology Platform Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + + .rodata + .align PAGE_SHIFT + .globl vmm_hyp_code +vmm_hyp_code: + .incbin "vmm_hyp_blob.bin" + .globl vmm_hyp_code_end +vmm_hyp_code_end: diff --git a/sys/arm64/vmm/vmm_hyp_exception.S b/sys/arm64/vmm/vmm_hyp_exception.S new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_hyp_exception.S @@ -0,0 +1,383 @@ +/* + * Copyright (C) 2017 Alexandru Elisei + * All rights reserved. + * Copyright (c) 2021 Andrew Turner + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + + +#include +#include + +#include "assym.inc" +#include "hyp.h" + +.macro save_host_registers + /* TODO: Only store callee saved registers */ + sub sp, sp, #(32 * 8) + str x30, [sp, #(30 * 8)] + stp x28, x29, [sp, #(28 * 8)] + stp x26, x27, [sp, #(26 * 8)] + stp x24, x25, [sp, #(24 * 8)] + stp x22, x23, [sp, #(22 * 8)] + stp x20, x21, [sp, #(20 * 8)] + stp x18, x19, [sp, #(18 * 8)] + stp x16, x17, [sp, #(16 * 8)] + stp x14, x15, [sp, #(14 * 8)] + stp x12, x13, [sp, #(12 * 8)] + stp x10, x11, [sp, #(10 * 8)] + stp x8, x9, [sp, #(8 * 8)] + stp x6, x7, [sp, #(6 * 8)] + stp x4, x5, [sp, #(4 * 8)] + stp x2, x3, [sp, #(2 * 8)] + stp x0, x1, [sp, #(0 * 8)] +.endm + +.macro restore_host_registers + /* TODO: Only restore callee saved registers */ + ldp x0, x1, [sp, #(0 * 8)] + ldp x2, x3, [sp, #(2 * 8)] + ldp x4, x5, [sp, #(4 * 8)] + ldp x6, x7, [sp, #(6 * 8)] + ldp x8, x9, [sp, #(8 * 8)] + ldp x10, x11, [sp, #(10 * 8)] + ldp x12, x13, [sp, #(12 * 8)] + ldp x14, x15, [sp, #(14 * 8)] + ldp x16, x17, [sp, #(16 * 8)] + ldp x18, x19, [sp, #(18 * 8)] + ldp x20, x21, [sp, #(20 * 8)] + ldp x22, x23, [sp, #(22 * 8)] + ldp x24, x25, [sp, #(24 * 8)] + ldp x26, x27, [sp, #(26 * 8)] + ldp x28, x29, [sp, #(28 * 8)] + ldr x30, [sp, #(30 * 8)] + add sp, sp, #(32 * 8) +.endm + +.macro save_guest_registers + /* Back up x0 so we can use it as a temporary register */ + stp x0, x1, [sp, #-(2 * 8)]! + + /* Restore the hypctx pointer */ + mrs x0, tpidr_el2 + + stp x2, x3, [x0, #(TF_X + 2 * 8)] + stp x4, x5, [x0, #(TF_X + 4 * 8)] + stp x6, x7, [x0, #(TF_X + 6 * 8)] + stp x8, x9, [x0, #(TF_X + 8 * 8)] + stp x10, x11, [x0, #(TF_X + 10 * 8)] + stp x12, x13, [x0, #(TF_X + 12 * 8)] + stp x14, x15, [x0, #(TF_X + 14 * 8)] + stp x16, x17, [x0, #(TF_X + 16 * 8)] + stp x18, x19, [x0, #(TF_X + 18 * 8)] + stp x20, x21, [x0, #(TF_X + 20 * 8)] + stp x22, x23, [x0, #(TF_X + 22 * 8)] + stp x24, x25, [x0, #(TF_X + 24 * 8)] + stp x26, x27, [x0, #(TF_X + 26 * 8)] + stp x28, x29, [x0, #(TF_X + 28 * 8)] + + str lr, [x0, #(TF_LR)] + + /* Restore the saved x0 & x1 and save them */ + ldp x2, x3, [sp], #(2 * 8) + stp x2, x3, [x0, #(TF_X + 0 * 8)] +.endm + +.macro restore_guest_registers + /* + * Copy the guest x0 and x1 to the stack so we can restore them + * after loading the other registers. + */ + ldp x2, x3, [x0, #(TF_X + 0 * 8)] + stp x2, x3, [sp, #-(2 * 8)]! 
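+ /* Restore the guest lr and x2-x29 from the trapframe pointed to by x0 */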
+ + ldr lr, [x0, #(TF_LR)] + + ldp x28, x29, [x0, #(TF_X + 28 * 8)] + ldp x26, x27, [x0, #(TF_X + 26 * 8)] + ldp x24, x25, [x0, #(TF_X + 24 * 8)] + ldp x22, x23, [x0, #(TF_X + 22 * 8)] + ldp x20, x21, [x0, #(TF_X + 20 * 8)] + ldp x18, x19, [x0, #(TF_X + 18 * 8)] + ldp x16, x17, [x0, #(TF_X + 16 * 8)] + ldp x14, x15, [x0, #(TF_X + 14 * 8)] + ldp x12, x13, [x0, #(TF_X + 12 * 8)] + ldp x10, x11, [x0, #(TF_X + 10 * 8)] + ldp x8, x9, [x0, #(TF_X + 8 * 8)] + ldp x6, x7, [x0, #(TF_X + 6 * 8)] + ldp x4, x5, [x0, #(TF_X + 4 * 8)] + ldp x2, x3, [x0, #(TF_X + 2 * 8)] + + ldp x0, x1, [sp], #(2 * 8) +.endm + +.macro vempty + .align 7 + 1: b 1b +.endm + +.macro vector name + .align 7 + b handle_\name +.endm + + .section ".vmm_vectors","ax" + .align 11 +hyp_init_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* Error EL2t */ + + vempty /* Synchronous EL2h */ + vempty /* IRQ EL2h */ + vempty /* FIQ EL2h */ + vempty /* Error EL2h */ + + vector hyp_init /* Synchronous 64-bit EL1 */ + vempty /* IRQ 64-bit EL1 */ + vempty /* FIQ 64-bit EL1 */ + vempty /* Error 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* Error 32-bit EL1 */ + + .text + .align 11 +hyp_vectors: + vempty /* Synchronous EL2t */ + vempty /* IRQ EL2t */ + vempty /* FIQ EL2t */ + vempty /* Error EL2t */ + + vector el2_el2h_sync /* Synchronous EL2h */ + vector el2_el2h_irq /* IRQ EL2h */ + vector el2_el2h_fiq /* FIQ EL2h */ + vector el2_el2h_error /* Error EL2h */ + + vector el2_el1_sync64 /* Synchronous 64-bit EL1 */ + vector el2_el1_irq64 /* IRQ 64-bit EL1 */ + vector el2_el1_fiq64 /* FIQ 64-bit EL1 */ + vector el2_el1_error64 /* Error 64-bit EL1 */ + + vempty /* Synchronous 32-bit EL1 */ + vempty /* IRQ 32-bit EL1 */ + vempty /* FIQ 32-bit EL1 */ + vempty /* Error 32-bit EL1 */ + +/* + * Initialize the hypervisor mode with a new exception vector table, translation + * table and stack. + * + * Expecting: + * x0 - translation tables physical address + * x1 - stack top virtual address + * x2 - TCR_EL2 value + * x3 - SCTLR_EL2 value + * x4 - VTCR_EL2 value + */ +LENTRY(handle_hyp_init) + /* Install the new exception vectors */ + adrp x6, hyp_vectors + add x6, x6, :lo12:hyp_vectors + msr vbar_el2, x6 + /* Set the stack top address */ + mov sp, x1 + /* Use the host VTTBR_EL2 to tell the host and the guests apart */ + mov x9, #VTTBR_HOST + msr vttbr_el2, x9 + /* Load the base address for the translation tables */ + msr ttbr0_el2, x0 + /* Invalidate the TLB */ + tlbi alle2 + /* Use the same memory attributes as EL1 */ + mrs x9, mair_el1 + msr mair_el2, x9 + /* Configure address translation */ + msr tcr_el2, x2 + isb + /* Set the system control register for EL2 */ + msr sctlr_el2, x3 + /* Set the Stage 2 translation control register */ + msr vtcr_el2, x4 + /* Return success */ + mov x0, #0 + /* MMU is up and running */ + ERET +LEND(handle_hyp_init) + +.macro do_world_switch_to_host + save_guest_registers + restore_host_registers + + /* Restore host VTTBR */ + mov x9, #VTTBR_HOST + msr vttbr_el2, x9 +.endm + + +.macro handle_el2_excp type + /* Save registers before modifying so we can restore them */ + str x9, [sp, #-16]! 
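+ /* x9 is only used as scratch to read VTTBR_EL2 and is restored on both paths */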
+ + /* Test if the exception happened when the host was running */ + mrs x9, vttbr_el2 + cmp x9, #VTTBR_HOST + beq 1f + + /* We got the exception while the guest was running */ + ldr x9, [sp], #16 + do_world_switch_to_host + mov x0, \type + ret + +1: + /* We got the exception while the host was running */ + ldr x9, [sp], #16 + mov x0, \type + eret +.endm + + +LENTRY(handle_el2_el2h_sync) + handle_el2_excp #EXCP_TYPE_EL2_SYNC +LEND(handle_el2_el2h_sync) + +LENTRY(handle_el2_el2h_irq) + handle_el2_excp #EXCP_TYPE_EL2_IRQ +LEND(handle_el2_el2h_irq) + +LENTRY(handle_el2_el2h_fiq) + handle_el2_excp #EXCP_TYPE_EL2_FIQ +LEND(handle_el2_el2h_fiq) + +LENTRY(handle_el2_el2h_error) + handle_el2_excp #EXCP_TYPE_EL2_ERROR +LEND(handle_el2_el2h_error) + + +LENTRY(handle_el2_el1_sync64) + /* Save registers before modifying so we can restore them */ + str x9, [sp, #-16]! + + /* Check for host hypervisor call */ + mrs x9, vttbr_el2 + cmp x9, #VTTBR_HOST + ldr x9, [sp], #16 /* Restore the temp register */ + bne 1f + + /* + * Called from the host + */ + + /* Check if this is a cleanup call and handle in a controlled state */ + cmp x0, #(HYP_CLEANUP) + b.eq vmm_cleanup + + str lr, [sp, #-16]! + bl vmm_hyp_enter + ldr lr, [sp], #16 + ERET + +1: /* Guest exception taken to EL2 */ + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_SYNC + ret +LEND(handle_el2_el1_sync64) + +/* + * We only trap IRQ, FIQ and SError exceptions when a guest is running. Do a + * world switch to host to handle these exceptions. + */ + +LENTRY(handle_el2_el1_irq64) + do_world_switch_to_host + str x9, [sp, #-16]! + mrs x9, ich_misr_el2 + cmp x9, xzr + beq 1f + mov x0, #EXCP_TYPE_MAINT_IRQ + b 2f +1: + mov x0, #EXCP_TYPE_EL1_IRQ +2: + ldr x9, [sp], #16 + ret +LEND(handle_el2_el1_irq64) + +LENTRY(handle_el2_el1_fiq64) + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_FIQ + ret +LEND(handle_el2_el1_fiq64) + +LENTRY(handle_el2_el1_error64) + do_world_switch_to_host + mov x0, #EXCP_TYPE_EL1_ERROR + ret +LEND(handle_el2_el1_error64) + + +/* + * Usage: + * uint64_t vmm_enter_guest(struct hypctx *hypctx) + * + * Expecting: + * x0 - hypctx address + */ +ENTRY(vmm_enter_guest) + /* Save hypctx address */ + msr tpidr_el2, x0 + + save_host_registers + restore_guest_registers + + /* Enter guest */ + eret +END(vmm_enter_guest) + +/* + * Usage: + * void vmm_cleanup(uint64_t handle, void *hyp_stub_vectors) + * + * Expecting: + * x1 - physical address of hyp_stub_vectors + */ +LENTRY(vmm_cleanup) + /* Restore the stub vectors */ + msr vbar_el2, x1 + + /* Disable the MMU */ + dsb sy + mrs x2, sctlr_el2 + bic x2, x2, #SCTLR_EL2_M + msr sctlr_el2, x2 + isb + + ERET +LEND(vmm_cleanup) diff --git a/sys/arm64/vmm/vmm_instruction_emul.c b/sys/arm64/vmm/vmm_instruction_emul.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_instruction_emul.c @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef _KERNEL +#include +#include +#include +#include + +#include + +#include +#include + +#else +#include +#include +#include +#include +#include + +#include + +#include +#include +#endif + +#include + +int +vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + uint64_t val; + int error; + + if (vie->dir == VM_DIR_READ) { + error = memread(vm, vcpuid, gpa, &val, vie->access_size, memarg); + if (error) + goto out; + error = vm_set_register(vm, vcpuid, vie->reg, val); + } else { + error = vm_get_register(vm, vcpuid, vie->reg, &val); + if (error) + goto out; + error = memwrite(vm, vcpuid, gpa, val, vie->access_size, memarg); + } + +out: + return (error); +} + +int +vmm_emulate_register(void *vm, int vcpuid, struct vre *vre, reg_read_t regread, + reg_write_t regwrite, void *regarg) +{ + uint64_t val; + int error; + + if (vre->dir == VM_DIR_READ) { + error = regread(vm, vcpuid, &val, regarg); + if (error) + goto out; + error = vm_set_register(vm, vcpuid, vre->reg, val); + } else { + error = vm_get_register(vm, vcpuid, vre->reg, &val); + if (error) + goto out; + error = regwrite(vm, vcpuid, val, regarg); + } + +out: + return (error); +} diff --git a/sys/arm64/vmm/vmm_ktr.h b/sys/arm64/vmm/vmm_ktr.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_ktr.h @@ -0,0 +1,71 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include +#include + +#ifndef KTR_VMM +#define KTR_VMM KTR_GEN +#endif + +#define VCPU_CTR0(vm, vcpuid, format) \ +CTR2(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid)) + +#define VCPU_CTR1(vm, vcpuid, format, p1) \ +CTR3(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1)) + +#define VCPU_CTR2(vm, vcpuid, format, p1, p2) \ +CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2)) + +#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2), (p3)) + +#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \ +CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3), (p4)) + +#define VM_CTR0(vm, format) \ +CTR1(KTR_VMM, "vm %s: " format, vm_name((vm))) + +#define VM_CTR1(vm, format, p1) \ +CTR2(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1)) + +#define VM_CTR2(vm, format, p1, p2) \ +CTR3(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2)) + +#define VM_CTR3(vm, format, p1, p2, p3) \ +CTR4(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3)) + +#define VM_CTR4(vm, format, p1, p2, p3, p4) \ +CTR5(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3), (p4)) +#endif diff --git a/sys/arm64/vmm/vmm_mem.h b/sys/arm64/vmm/vmm_mem.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_mem.h @@ -0,0 +1,43 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +struct vmspace; +struct vm_object; + +int vmm_mem_init(void); +struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); + +#endif diff --git a/sys/arm64/vmm/vmm_mem.c b/sys/arm64/vmm/vmm_mem.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_mem.c @@ -0,0 +1,124 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vmm_mem.h" + +int +vmm_mem_init(void) +{ + + return (0); +} + +vm_object_t +vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + int error; + vm_object_t obj; + struct sglist *sg; + + sg = sglist_alloc(1, M_WAITOK); + error = sglist_append_phys(sg, hpa, len); + KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); + + obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL); + if (obj != NULL) { + /* + * VT-x ignores the MTRR settings when figuring out the + * memory type for translations obtained through EPT. + * + * Therefore we explicitly force the pages provided by + * this object to be mapped as uncacheable. + */ + VM_OBJECT_WLOCK(obj); + error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); + VM_OBJECT_WUNLOCK(obj); + if (error != KERN_SUCCESS) { + panic("vmm_mmio_alloc: vm_object_set_memattr error %d", + error); + } + error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, + VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + obj = NULL; + } + } + + /* + * Drop the reference on the sglist. + * + * If the scatter/gather object was successfully allocated then it + * has incremented the reference count on the sglist. Dropping the + * initial reference count ensures that the sglist will be freed + * when the object is deallocated. 
+ * + * If the object could not be allocated then we end up freeing the + * sglist. + */ + sglist_free(sg); + + return (obj); +} + +void +vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ + + vm_map_remove(&vmspace->vm_map, gpa, gpa + len); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (ptoa(Maxmem)); +} diff --git a/sys/arm64/vmm/vmm_mmu.c b/sys/arm64/vmm/vmm_mmu.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_mmu.c @@ -0,0 +1,432 @@ +/* + * Copyright (C) 2017 Alexandru Elisei + * All rights reserved. + * + * This software was developed by Alexandru Elisei under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mmu.h" +#include "arm64.h" + +MALLOC_DECLARE(M_HYP); + +static struct mtx vmmpmap_mtx; +static pt_entry_t *l0; +static vm_paddr_t l0_paddr; + +bool +vmmpmap_init(void) +{ + vm_page_t m; + + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (false); + + l0_paddr = VM_PAGE_TO_PHYS(m); + l0 = (pd_entry_t *)PHYS_TO_DMAP(l0_paddr); + memset(l0, 0, PAGE_SIZE); + + mtx_init(&vmmpmap_mtx, "vmm pmap", NULL, MTX_DEF); + + return (true); +} + +static void +vmmpmap_release_l3(pd_entry_t l2e) +{ + pt_entry_t *l3 __diagused; + vm_page_t m; + int i; + + l3 = (pd_entry_t *)PHYS_TO_DMAP(l2e & ~ATTR_MASK); + for (i = 0; i < Ln_ENTRIES; i++) { + KASSERT(l3[i] == 0, ("%s: l3 still mapped: %p %lx", __func__, + &l3[i], l3[i])); + } + + m = PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +static void +vmmpmap_release_l2(pd_entry_t l1e) +{ + pt_entry_t *l2; + vm_page_t m; + int i; + + l2 = (pd_entry_t *)PHYS_TO_DMAP(l1e & ~ATTR_MASK); + for (i = 0; i < Ln_ENTRIES; i++) { + if (l2[i] != 0) { + vmmpmap_release_l3(l2[i]); + } + } + + m = PHYS_TO_VM_PAGE(l1e & ~ATTR_MASK); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +static void +vmmpmap_release_l1(pd_entry_t l0e) +{ + pt_entry_t *l1; + vm_page_t m; + int i; + + l1 = (pd_entry_t *)PHYS_TO_DMAP(l0e & ~ATTR_MASK); + for (i = 0; i < Ln_ENTRIES; i++) { + if (l1[i] != 0) { + vmmpmap_release_l2(l1[i]); + } + } + + m = PHYS_TO_VM_PAGE(l0e & ~ATTR_MASK); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +void +vmmpmap_fini(void) +{ + vm_page_t m; + int i; + + /* Remove the remaining entries */ + for (i = 0; i < L0_ENTRIES; i++) { + if (l0[i] != 0) { + vmmpmap_release_l1(l0[i]); + } + } + + m = PHYS_TO_VM_PAGE(l0_paddr); + vm_page_unwire_noq(m); + vm_page_free(m); + + mtx_destroy(&vmmpmap_mtx); +} + +uint64_t +vmmpmap_to_ttbr0(void) +{ + + return (l0_paddr); +} + +/* Returns a pointer to the level 1 table, allocating if needed. 
*/ +static pt_entry_t * +vmmpmap_l1_table(vm_offset_t va) +{ + pt_entry_t new_l0e, l0e, *l1; + vm_page_t m; + int rv; + + m = NULL; +again: + l0e = atomic_load_64(&l0[pmap_l0_index(va)]); + if ((l0e & ATTR_DESCR_VALID) == 0) { + /* Allocate a page for the level 1 table */ + if (m == NULL) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (NULL); + } + + new_l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE; + + mtx_lock(&vmmpmap_mtx); + rv = atomic_cmpset_64(&l0[pmap_l0_index(va)], l0e, new_l0e); + mtx_unlock(&vmmpmap_mtx); + /* We may have raced another thread, try again */ + if (rv == 0) + goto again; + + /* The cmpset succeeded */ + l0e = new_l0e; + } else if (m != NULL) { + /* We allocated a page that wasn't used */ + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } + + l1 = (pd_entry_t *)PHYS_TO_DMAP(l0e & ~ATTR_MASK); + return (l1); +} + +static pt_entry_t * +vmmpmap_l2_table(vm_offset_t va) +{ + pt_entry_t new_l1e, l1e, *l1, *l2; + vm_page_t m; + int rv; + + l1 = vmmpmap_l1_table(va); + if (l1 == NULL) + return (NULL); + + m = NULL; +again: + l1e = atomic_load_64(&l1[pmap_l1_index(va)]); + if ((l1e & ATTR_DESCR_VALID) == 0) { + /* Allocate a page for the level 2 table */ + if (m == NULL) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (NULL); + } + + new_l1e = VM_PAGE_TO_PHYS(m) | L1_TABLE; + + mtx_lock(&vmmpmap_mtx); + rv = atomic_cmpset_64(&l1[pmap_l1_index(va)], l1e, new_l1e); + mtx_unlock(&vmmpmap_mtx); + /* We may have raced another thread, try again */ + if (rv == 0) + goto again; + + /* The cmpset succeeded */ + l1e = new_l1e; + } else if (m != NULL) { + /* We allocated a page that wasn't used */ + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } + + l2 = (pd_entry_t *)PHYS_TO_DMAP(l1e & ~ATTR_MASK); + return (l2); +} + +static pd_entry_t * +vmmpmap_l3_table(vm_offset_t va) +{ + pt_entry_t new_l2e, l2e, *l2, *l3; + vm_page_t m; + int rv; + + l2 = vmmpmap_l2_table(va); + if (l2 == NULL) + return (NULL); + + m = NULL; +again: + l2e = atomic_load_64(&l2[pmap_l2_index(va)]); + if ((l2e & ATTR_DESCR_VALID) == 0) { + /* Allocate a page for the level 3 table */ + if (m == NULL) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + return (NULL); + } + + new_l2e = VM_PAGE_TO_PHYS(m) | L2_TABLE; + + mtx_lock(&vmmpmap_mtx); + rv = atomic_cmpset_64(&l2[pmap_l2_index(va)], l2e, new_l2e); + mtx_unlock(&vmmpmap_mtx); + /* We may have raced another thread, try again */ + if (rv == 0) + goto again; + + /* The cmpset succeeded */ + l2e = new_l2e; + } else if (m != NULL) { + /* We allocated a page that wasn't used */ + vm_page_unwire_noq(m); + vm_page_free_zero(m); + } + + l3 = (pt_entry_t *)PHYS_TO_DMAP(l2e & ~ATTR_MASK); + return (l3); +} + +/* + * Creates an EL2 entry in the hyp_pmap. Similar to pmap_kenter. + */ +bool +vmmpmap_enter(vm_offset_t va, vm_size_t size, vm_paddr_t pa, vm_prot_t prot) +{ + pd_entry_t l3e, *l3; + + KASSERT((pa & L3_OFFSET) == 0, + ("%s: Invalid physical address", __func__)); + KASSERT((va & L3_OFFSET) == 0, + ("%s: Invalid virtual address", __func__)); + KASSERT((size & PAGE_MASK) == 0, + ("%s: Mapping is not page-sized", __func__)); + + l3e = ATTR_DEFAULT | L3_PAGE; + /* This bit is res1 at EL2 */ + l3e |= ATTR_S1_AP(ATTR_S1_AP_USER); + /* Only normal memory is used at EL2 */ + l3e |= ATTR_S1_IDX(VM_MEMATTR_DEFAULT); + + if ((prot & VM_PROT_EXECUTE) == 0) { + /* PXN is res0 at EL2. 
UXN is XN */ + l3e |= ATTR_S1_UXN; + } + if ((prot & VM_PROT_WRITE) == 0) { + l3e |= ATTR_S1_AP(ATTR_S1_AP_RO); + } + + while (size > 0) { + l3 = vmmpmap_l3_table(va); + if (l3 == NULL) + return (false); + +#ifdef INVARIANTS + /* + * Ensure no other threads can write to l3 between the KASSERT + * and store. + */ + mtx_lock(&vmmpmap_mtx); +#endif + KASSERT(atomic_load_64(&l3[pmap_l3_index(va)]) == 0, + ("%s: VA already mapped", __func__)); + + atomic_store_64(&l3[pmap_l3_index(va)], l3e | pa); +#ifdef INVARIANTS + mtx_unlock(&vmmpmap_mtx); +#endif + + size -= PAGE_SIZE; + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + + return (true); +} + +void +vmmpmap_remove(vm_offset_t va, vm_size_t size, bool invalidate) +{ + pt_entry_t l0e, *l1, l1e, *l2, l2e; + pd_entry_t *l3, l3e, **l3_list; + vm_offset_t eva, va_next, sva; + size_t i; + + KASSERT((va & L3_OFFSET) == 0, + ("%s: Invalid virtual address", __func__)); + KASSERT((size & PAGE_MASK) == 0, + ("%s: Mapping is not page-sized", __func__)); + + if (invalidate) { + l3_list = malloc((size / PAGE_SIZE) * sizeof(l3_list[0]), + M_TEMP, M_WAITOK | M_ZERO); + } + + sva = va; + eva = va + size; + mtx_lock(&vmmpmap_mtx); + for (i = 0; va < eva; va = va_next) { + l0e = atomic_load_64(&l0[pmap_l0_index(va)]); + if (l0e == 0) { + va_next = (va + L0_SIZE) & ~L0_OFFSET; + if (va_next < va) + va_next = eva; + continue; + } + MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE); + + l1 = (pd_entry_t *)PHYS_TO_DMAP(l0e & ~ATTR_MASK); + l1e = atomic_load_64(&l1[pmap_l1_index(va)]); + if (l1e == 0) { + va_next = (va + L1_SIZE) & ~L1_OFFSET; + if (va_next < va) + va_next = eva; + continue; + } + MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE); + + l2 = (pd_entry_t *)PHYS_TO_DMAP(l1e & ~ATTR_MASK); + l2e = atomic_load_64(&l2[pmap_l2_index(va)]); + if (l2e == 0) { + va_next = (va + L2_SIZE) & ~L2_OFFSET; + if (va_next < va) + va_next = eva; + continue; + } + MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE); + + l3 = (pd_entry_t *)PHYS_TO_DMAP(l2e & ~ATTR_MASK); + if (invalidate) { + l3e = atomic_load_64(&l3[pmap_l3_index(va)]); + MPASS(l3e != 0); + /* + * Mark memory as read-only so we can invalidate + * the cache. + */ + l3e &= ~ATTR_S1_AP_MASK; + l3e |= ATTR_S1_AP(ATTR_S1_AP_RO); + atomic_store_64(&l3[pmap_l3_index(va)], l3e); + + l3_list[i] = l3; + i++; + } else { + /* + * The caller is responsible for clearing the cache & + * handling the TLB + */ + atomic_store_64(&l3[pmap_l3_index(va)], 0); + } + + va_next = (va + L3_SIZE) & ~L3_OFFSET; + if (va_next < va) + va_next = eva; + } + mtx_unlock(&vmmpmap_mtx); + + if (invalidate) { + /* Invalidate the memory from the D-cache */ + vmm_call_hyp(HYP_DC_CIVAC, sva, size); + + for (i = 0; i < (size / PAGE_SIZE); i++) { + atomic_store_64(l3_list[i], 0); + } + + vmm_call_hyp(HYP_EL2_TLBI, HYP_EL2_TLBI_VA, sva, size); + + free(l3_list, M_TEMP); + } +} diff --git a/sys/arm64/vmm/vmm_psci.c b/sys/arm64/vmm/vmm_psci.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_psci.c @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include + +#include "arm64.h" +#include "psci.h" + +#define PSCI_VERSION_0_2 0x2 + +static int +psci_version(struct hypctx *hypctx, bool *retu) +{ + + hypctx->tf.tf_x[0] = PSCI_VERSION_0_2; + + *retu = false; + return (0); +} + +static int +psci_system_off(struct vm *vm) +{ + return (vm_suspend(vm, VM_SUSPEND_POWEROFF)); +} + +static int +psci_system_reset(struct vm *vm) +{ + return (vm_suspend(vm, VM_SUSPEND_RESET)); +} + +int +psci_handle_call(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu) +{ + struct hyp *hyp; + struct hypctx *hypctx; + uint64_t func_id; + uint32_t esr_el2, esr_iss; + int error, i; + + hyp = vm_get_cookie(vm); + hypctx = &hyp->ctx[vcpuid]; + + esr_el2 = hypctx->tf.tf_esr; + esr_iss = esr_el2 & ESR_ELx_ISS_MASK; + + if (esr_iss != 0) { + eprintf("Malformed HVC instruction with immediate: 0x%x\n", + esr_iss); + error = 1; + goto out; + } + + func_id = hypctx->tf.tf_x[0]; + switch (func_id) { + case PSCI_FNID_VERSION: + error = psci_version(hypctx, retu); + break; + case PSCI_FNID_SYSTEM_OFF: + error = psci_system_off(vm); + break; + case PSCI_FNID_SYSTEM_RESET: + error = psci_system_reset(vm); + break; + default: + vme->exitcode = VM_EXITCODE_SMCCC; + vme->u.smccc_call.func_id = func_id; + for (i = 0; i < nitems(vme->u.smccc_call.args); i++) + vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1]; + *retu = true; + error = 0; + break; + } + +out: + return (error); +} diff --git a/sys/arm64/vmm/vmm_reset.c b/sys/arm64/vmm/vmm_reset.c new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_reset.c @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2018 Alexandru Elisei + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include + +#include "arm64.h" +#include "reset.h" + +/* + * Make the architecturally UNKNOWN value 0. As a bonus, we don't have to + * manually set all those RES0 fields. + */ +#define ARCH_UNKNOWN 0 +#define set_arch_unknown(reg) (memset(&(reg), ARCH_UNKNOWN, sizeof(reg))) + +void +reset_vm_el01_regs(void *vcpu) +{ + struct hypctx *el2ctx; + + el2ctx = vcpu; + + set_arch_unknown(el2ctx->tf); + + set_arch_unknown(el2ctx->actlr_el1); + set_arch_unknown(el2ctx->afsr0_el1); + set_arch_unknown(el2ctx->afsr1_el1); + set_arch_unknown(el2ctx->amair_el1); + set_arch_unknown(el2ctx->contextidr_el1); + set_arch_unknown(el2ctx->cpacr_el1); + set_arch_unknown(el2ctx->csselr_el1); + set_arch_unknown(el2ctx->elr_el1); + set_arch_unknown(el2ctx->esr_el1); + set_arch_unknown(el2ctx->far_el1); + set_arch_unknown(el2ctx->mair_el1); + set_arch_unknown(el2ctx->mdccint_el1); + set_arch_unknown(el2ctx->mdscr_el1); + set_arch_unknown(el2ctx->par_el1); + + /* + * Guest starts with: + * ~SCTLR_M: MMU off + * ~SCTLR_C: data cache off + * SCTLR_CP15BEN: memory barrier instruction enable from EL0; RAO/WI + * ~SCTLR_I: instruction cache off + */ + el2ctx->sctlr_el1 = SCTLR_RES1; + el2ctx->sctlr_el1 &= ~SCTLR_M & ~SCTLR_C & ~SCTLR_I; + el2ctx->sctlr_el1 |= SCTLR_CP15BEN; + + set_arch_unknown(el2ctx->sp_el0); + set_arch_unknown(el2ctx->tcr_el1); + set_arch_unknown(el2ctx->tpidr_el0); + set_arch_unknown(el2ctx->tpidr_el1); + set_arch_unknown(el2ctx->tpidrro_el0); + set_arch_unknown(el2ctx->ttbr0_el1); + set_arch_unknown(el2ctx->ttbr1_el1); + set_arch_unknown(el2ctx->vbar_el1); + set_arch_unknown(el2ctx->spsr_el1); + + set_arch_unknown(el2ctx->dbgbcr_el1); + set_arch_unknown(el2ctx->dbgbvr_el1); + set_arch_unknown(el2ctx->dbgwcr_el1); + set_arch_unknown(el2ctx->dbgwvr_el1); + + el2ctx->pmcr_el0 = READ_SPECIALREG(pmcr_el0) & PMCR_N_MASK; + /* PMCR_LC is unknown when AArch32 is supported or RES1 otherwise */ + el2ctx->pmcr_el0 |= PMCR_LC; + set_arch_unknown(el2ctx->pmccntr_el0); + set_arch_unknown(el2ctx->pmccfiltr_el0); + set_arch_unknown(el2ctx->pmcntenset_el0); + set_arch_unknown(el2ctx->pmintenset_el1); + set_arch_unknown(el2ctx->pmovsset_el0); + set_arch_unknown(el2ctx->pmuserenr_el0); + memset(el2ctx->pmevcntr_el0, 0, sizeof(el2ctx->pmevcntr_el0)); + memset(el2ctx->pmevtyper_el0, 0, sizeof(el2ctx->pmevtyper_el0)); +} + +void +reset_vm_el2_regs(void *vcpu) +{ + struct hypctx *el2ctx; + uint64_t cpu_aff; + + el2ctx = vcpu; + + /* + * Set the Hypervisor Configuration Register: + * + * HCR_RW: use AArch64 for EL1 + * HCR_TID3: handle ID registers in the vmm to provide a common + * set of features on all vcpus + * HCR_TWI: Trap WFI to the hypervisor + * HCR_BSU_IS: barrier instructions apply to the inner shareable + * domain + * HCR_FB: broadcast maintenance operations + * HCR_AMO: route physical SError interrupts to EL2 + * HCR_IMO: route physical IRQ interrupts to EL2 + * HCR_FMO: route physical FIQ interrupts to EL2 + * HCR_SWIO: turn set/way
invalidate into set/way clean and + * invalidate + * HCR_VM: use stage 2 translation + */ + el2ctx->hcr_el2 = HCR_RW | HCR_TID3 | HCR_TWI | HCR_BSU_IS | HCR_FB | + HCR_AMO | HCR_IMO | HCR_FMO | HCR_SWIO | HCR_VM; + + /* TODO: Trap all extensions we don't support */ + el2ctx->mdcr_el2 = 0; + /* PMCR_EL0.N is read from MDCR_EL2.HPMN */ + el2ctx->mdcr_el2 |= (el2ctx->pmcr_el0 & PMCR_N_MASK) >> PMCR_N_SHIFT; + + el2ctx->vmpidr_el2 = VMPIDR_EL2_RES1; + /* The guest will detect a multi-core, single-threaded CPU */ + el2ctx->vmpidr_el2 &= ~VMPIDR_EL2_U & ~VMPIDR_EL2_MT; + /* Only 24 bits of affinity, for a grand total of 16,777,216 cores. */ + cpu_aff = el2ctx->vcpu & (CPU_AFF0_MASK | CPU_AFF1_MASK | CPU_AFF2_MASK); + el2ctx->vmpidr_el2 |= cpu_aff; + + /* Use the same CPU identification information as the host */ + el2ctx->vpidr_el2 = CPU_IMPL_TO_MIDR(CPU_IMPL_ARM); + el2ctx->vpidr_el2 |= CPU_VAR_TO_MIDR(0); + el2ctx->vpidr_el2 |= CPU_ARCH_TO_MIDR(0xf); + el2ctx->vpidr_el2 |= CPU_PART_TO_MIDR(CPU_PART_FOUNDATION); + el2ctx->vpidr_el2 |= CPU_REV_TO_MIDR(0); + + /* + * Don't trap accesses to CPACR_EL1, trace, SVE, Advanced SIMD + * and floating point functionality to EL2. + */ + el2ctx->cptr_el2 = CPTR_RES1; + /* + * Disable interrupts in the guest. The guest OS will re-enable + * them. + */ + el2ctx->tf.tf_spsr = PSR_D | PSR_A | PSR_I | PSR_F; + /* Use the EL1 stack when taking exceptions to EL1 */ + el2ctx->tf.tf_spsr |= PSR_M_EL1h; +} diff --git a/sys/arm64/vmm/vmm_stat.h b/sys/arm64/vmm/vmm_stat.h new file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_stat.h @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ + +enum vmm_stat_scope { + VMM_STAT_SCOPE_ANY, + VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */ + VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ +}; + +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu, + struct vmm_stat_type *stat); + +struct vmm_stat_type { + int index; /* position in the stats buffer */ + int nelems; /* standalone or array */ + const char *desc; /* description of statistic */ + vmm_stat_func_t func; + enum vmm_stat_scope scope; +}; + +void vmm_stat_register(void *arg); + +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ + struct vmm_stat_type type[1] = { \ + { -1, nelems, desc, func, scope } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) + +#define VMM_STAT_DECLARE(type) \ + extern struct vmm_stat_type type[1] + +#define VMM_STAT(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) +#define VMM_STAT_INTEL(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL) +#define VMM_STAT_AMD(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) + +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + +#define VMM_STAT_ARRAY(type, nelems, desc) \ + VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) + +void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); +void vmm_stat_free(void *vp); + +int vmm_stat_copy(struct vm *vm, int vcpu, int index, int count, + int *num_stats, uint64_t *buf); +int vmm_stat_desc_copy(int index, char *buf, int buflen); + +static void __inline +vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] += x; +#endif +} + +static void __inline +vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t val) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] = val; +#endif +} + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_incr(vm, vcpu, vst, 0, x); +#endif +} + +static void __inline +vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_set(vm, vcpu, vst, 0, val); +#endif +} + +VMM_STAT_DECLARE(VCPU_MIGRATIONS); +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_EXTINT); +VMM_STAT_DECLARE(VMEXIT_HLT); +VMM_STAT_DECLARE(VMEXIT_CR_ACCESS); +VMM_STAT_DECLARE(VMEXIT_RDMSR); +VMM_STAT_DECLARE(VMEXIT_WRMSR); +VMM_STAT_DECLARE(VMEXIT_MTRAP); +VMM_STAT_DECLARE(VMEXIT_PAUSE); +VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); +VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); +VMM_STAT_DECLARE(VMEXIT_INOUT); +VMM_STAT_DECLARE(VMEXIT_CPUID); +VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); +VMM_STAT_DECLARE(VMEXIT_INST_EMUL); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_ASTPENDING); +VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); +VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +#endif diff --git a/sys/arm64/vmm/vmm_stat.c b/sys/arm64/vmm/vmm_stat.c new 
file mode 100644 --- /dev/null +++ b/sys/arm64/vmm/vmm_stat.c @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2015 Mihai Carabas + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include "vmm_stat.h" + +/* + * 'vst_num_elems' is the total number of addressable statistic elements + * 'vst_num_types' is the number of unique statistic types + * + * It is always true that 'vst_num_elems' is greater than or equal to + * 'vst_num_types'. This is because a stat type may represent more than + * one element (for e.g. VMM_STAT_ARRAY). 
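+ * For example, a statistic defined with VMM_STAT_ARRAY and nelems of 4 adds
+ * four elements to 'vst_num_elems' but only one entry to 'vst_num_types'.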
+ */ +static int vst_num_elems, vst_num_types; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t)) + +void +vmm_stat_register(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { + printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vst_num_elems; + vst_num_elems += vst->nelems; + + vsttab[vst_num_types++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int index, int count, int *num_stats, + uint64_t *buf) +{ + struct vmm_stat_type *vst; + uint64_t *stats; + int i, tocopy; + + if (vcpu < 0 || vcpu >= vm_get_maxcpus(vm)) + return (EINVAL); + + if (index < 0 || count < 0) + return (EINVAL); + + if (index > vst_num_elems) + return (ENOENT); + + if (index == vst_num_elems) { + *num_stats = 0; + return (0); + } + + tocopy = min(vst_num_elems - index, count); + + /* Let stats functions update their counters */ + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (vst->func != NULL) + (*vst->func)(vm, vcpu, vst); + } + + /* Copy over the stats */ + stats = vcpu_stats(vm, vcpu); + memcpy(buf, stats + index, tocopy * sizeof(stats[0])); + *num_stats = tocopy; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + + return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); +} + +void +vmm_stat_init(void *vp) +{ + + bzero(vp, vst_size); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +int +vmm_stat_desc_copy(int index, char *buf, int bufsize) +{ + int i; + struct vmm_stat_type *vst; + + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (index >= vst->index && index < vst->index + vst->nelems) { + if (vst->nelems > 1) { + snprintf(buf, bufsize, "%s[%d]", + vst->desc, index - vst->index); + } else { + strlcpy(buf, vst->desc, bufsize); + } + return (0); /* found it */ + } + } + + return (EINVAL); +} + +/* global statistics */ +VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); +VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); +VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); +VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); +VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); +VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); +VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); +VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); +VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); +VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); +VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); +VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); +VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); +VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); +VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 --- a/sys/conf/files.arm64 +++ 
b/sys/conf/files.arm64 @@ -113,6 +113,39 @@ dev/iommu/busdma_iommu.c optional iommu dev/iommu/iommu_gas.c optional iommu +arm64/vmm/vmm.c optional vmm +arm64/vmm/vmm_dev.c optional vmm +arm64/vmm/vmm_instruction_emul.c optional vmm +arm64/vmm/vmm_mem.c optional vmm +arm64/vmm/vmm_stat.c optional vmm +arm64/vmm/vmm_arm64.c optional vmm +arm64/vmm/vmm_psci.c optional vmm +arm64/vmm/vmm_reset.c optional vmm +arm64/vmm/vmm_call.S optional vmm +arm64/vmm/vmm_hyp_exception.S optional vmm \ + compile-with "${NORMAL_C} -fpie" \ + no-obj +arm64/vmm/vmm_hyp.c optional vmm \ + compile-with "${NORMAL_C} -fpie" \ + no-obj +vmm_hyp_blob.elf.full optional vmm \ + dependency "vmm_hyp.o vmm_hyp_exception.o" \ + compile-with "${CC} -o ${.TARGET} ${.ALLSRC} -fPIE -nostdlib -T ${LDSCRIPT} -Wl,--defsym=text_start='0x0'" \ + no-obj no-implicit-rule +vmm_hyp_blob.elf optional vmm \ + dependency "vmm_hyp_blob.elf.full" \ + compile-with "${OBJCOPY} --strip-debug ${.ALLSRC} ${.TARGET}" \ + no-obj no-implicit-rule +vmm_hyp_blob.bin optional vmm \ + dependency vmm_hyp_blob.elf \ + compile-with "${OBJCOPY} --output-target=binary ${.ALLSRC} ${.TARGET}" \ + no-obj no-implicit-rule +arm64/vmm/vmm_hyp_el2.S optional vmm \ + dependency vmm_hyp_blob.bin +arm64/vmm/vmm_mmu.c optional vmm +arm64/vmm/io/vgic_v3.c optional vmm +arm64/vmm/io/vtimer.c optional vmm + crypto/armv8/armv8_crypto.c optional armv8crypto armv8_crypto_wrap.o optional armv8crypto \ dependency "$S/crypto/armv8/armv8_crypto_wrap.c" \ diff --git a/sys/conf/ldscript.arm64 b/sys/conf/ldscript.arm64 --- a/sys/conf/ldscript.arm64 +++ b/sys/conf/ldscript.arm64 @@ -7,6 +7,7 @@ { /* Read-only sections, merged into text segment: */ . = text_start; /* This is set using --defsym= on the command line. */ + .vmm_vectors : { (*.vmm_vectors); } .text : { *(.text) @@ -17,6 +18,7 @@ } =0x9090 _etext = .; PROVIDE (etext = .); + .fini : { *(.fini) } =0x9090 .rodata : { *(.rodata*) *(.gnu.linkonce.r*) } .rodata1 : { *(.rodata1) } diff --git a/sys/conf/options.arm64 b/sys/conf/options.arm64 --- a/sys/conf/options.arm64 +++ b/sys/conf/options.arm64 @@ -18,6 +18,9 @@ # EFI Runtime services support EFIRT opt_efirt.h +# Bhyve +VMM opt_global.h + # SoC Support SOC_ALLWINNER_A64 opt_soc.h SOC_ALLWINNER_H5 opt_soc.h diff --git a/sys/modules/Makefile b/sys/modules/Makefile --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -798,7 +798,9 @@ _sgx_linux= sgx_linux _smartpqi= smartpqi _p2sb= p2sb +.endif +.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" .if ${MK_BHYVE} != "no" || defined(ALL_MODULES) .if ${KERN_OPTS:MSMP} _vmm= vmm diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile --- a/sys/modules/vmm/Makefile +++ b/sys/modules/vmm/Makefile @@ -4,31 +4,68 @@ KMOD= vmm -SRCS= opt_acpi.h opt_bhyve_snapshot.h opt_ddb.h -SRCS+= device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h vnode_if.h -DPSRCS+= vmx_assym.h svm_assym.h -DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc +SRCS= opt_acpi.h opt_ddb.h device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h CFLAGS+= -DVMM_KEEP_STATS -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/io -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel -CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd +CFLAGS+= -I${SRCTOP}/sys/${MACHINE}/vmm +CFLAGS+= -I${SRCTOP}/sys/${MACHINE}/vmm/io # generic vmm support -.PATH: ${SRCTOP}/sys/amd64/vmm +.PATH: ${SRCTOP}/sys/${MACHINE}/vmm SRCS+= vmm.c \ vmm_dev.c \ - vmm_host.c \ vmm_instruction_emul.c \ + vmm_mem.c \ + vmm_stat.c + +.if ${MACHINE_CPUARCH} == "aarch64" +# TODO: Add the new 
EL2 code +SRCS+= vmm_arm64.c \ + vmm_psci.c \ + vmm_reset.c \ + vmm_call.S \ + vmm_mmu.c \ + vmm_hyp_el2.S + +.PATH: ${SRCTOP}/sys/${MACHINE}/vmm/io +SRCS+= vgic_v3.c \ + vtimer.c + +CLEANFILES+= vmm_hyp_exception.o vmm_hyp.o vmm_hyp_blob.elf.full +CLEANFILES+= vmm_hyp_blob.elf vmm_hyp_blob.bin + +CFLAGS.vmm_hyp_exception.S += -fpie +CFLAGS.vmm_hyp.c += -fpie +vmm_hyp_exception.o: vmm_hyp_exception.S +vmm_hyp.o: vmm_hyp.c + +vmm_hyp_blob.elf.full: vmm_hyp_exception.o vmm_hyp.o + ${CC} -o ${.TARGET} ${.ALLSRC} -fPIE -nostdlib \ + -T ${SYSDIR}/conf/ldscript.arm64 \ + -Wl,--defsym=text_start='0x0' + +vmm_hyp_blob.elf: vmm_hyp_blob.elf.full + ${OBJCOPY} --strip-debug ${.ALLSRC} ${.TARGET} + +vmm_hyp_blob.bin: vmm_hyp_blob.elf + ${OBJCOPY} --output-target=binary ${.ALLSRC} ${.TARGET} + +vmm_hyp_el2.o: vmm_hyp_blob.bin + +.elif ${MACHINE_CPUARCH} == "amd64" +DPSRCS+= vmx_assym.h svm_assym.h +DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc + +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/intel +CFLAGS+= -I${SRCTOP}/sys/amd64/vmm/amd + +SRCS+= vmm_host.c \ vmm_ioport.c \ vmm_lapic.c \ - vmm_mem.c \ - vmm_stat.c \ vmm_util.c \ x86.c -.PATH: ${SRCTOP}/sys/amd64/vmm/io +.PATH: ${SRCTOP}/sys/${MACHINE}/vmm/io SRCS+= iommu.c \ ppt.c \ vatpic.c \ @@ -65,10 +102,11 @@ SRCS+= vmm_snapshot.c .endif -CLEANFILES= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o +CLEANFILES+= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h OBJS_DEPEND_GUESS.svm_support.o+= svm_assym.h +.endif vmx_assym.h: vmx_genassym.o sh ${SYSDIR}/kern/genassym.sh vmx_genassym.o > ${.TARGET} @@ -84,6 +122,9 @@ ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} +hyp_genassym.o: offset.inc + ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} + vmx_genassym.o: offset.inc ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC}