diff --git a/sys/arm64/arm64/exception.S b/sys/arm64/arm64/exception.S
--- a/sys/arm64/arm64/exception.S
+++ b/sys/arm64/arm64/exception.S
@@ -67,6 +67,7 @@
 	stp	x18, lr, [sp, #(TF_SP - TF_X)]!
 	stp	x10, x11, [sp, #(TF_ELR)]
 	stp	x12, x13, [sp, #(TF_ESR)]
+	str	xzr, [sp, #(TF_FLAGS)]
 
 	mrs	x18, tpidr_el1
 .endm
diff --git a/sys/arm64/arm64/exec_machdep.c b/sys/arm64/arm64/exec_machdep.c
--- a/sys/arm64/arm64/exec_machdep.c
+++ b/sys/arm64/arm64/exec_machdep.c
@@ -606,6 +606,8 @@
 	if (copyin(uap->sigcntxp, &uc, sizeof(uc)))
 		return (EFAULT);
 
+	/* Tell the thread switcher it needs to preserve the SVE registers */
+	td->td_frame->tf_flags &= ~TF_FLAG_CAN_DROP_SVE;
 	error = set_mcontext(td, &uc.uc_mcontext);
 	if (error != 0)
 		return (error);
diff --git a/sys/arm64/arm64/genassym.c b/sys/arm64/arm64/genassym.c
--- a/sys/arm64/arm64/genassym.c
+++ b/sys/arm64/arm64/genassym.c
@@ -69,10 +69,11 @@
 ASSYM(TD_LOCK, offsetof(struct thread, td_lock));
 ASSYM(TD_MD_CANARY, offsetof(struct thread, td_md.md_canary));
 
-ASSYM(TF_SIZE, sizeof(struct trapframe));
+ASSYM(TF_SIZE, roundup2(sizeof(struct trapframe), STACKALIGNBYTES + 1));
 ASSYM(TF_SP, offsetof(struct trapframe, tf_sp));
 ASSYM(TF_LR, offsetof(struct trapframe, tf_lr));
 ASSYM(TF_ELR, offsetof(struct trapframe, tf_elr));
 ASSYM(TF_SPSR, offsetof(struct trapframe, tf_spsr));
 ASSYM(TF_ESR, offsetof(struct trapframe, tf_esr));
 ASSYM(TF_X, offsetof(struct trapframe, tf_x));
+ASSYM(TF_FLAGS, offsetof(struct trapframe, tf_flags));
diff --git a/sys/arm64/arm64/trap.c b/sys/arm64/arm64/trap.c
--- a/sys/arm64/arm64/trap.c
+++ b/sys/arm64/arm64/trap.c
@@ -195,6 +195,7 @@
 {
 
 	if ((frame->tf_esr & ESR_ELx_ISS_MASK) == 0) {
+		frame->tf_flags |= TF_FLAG_CAN_DROP_SVE;
 		syscallenter(td);
 		syscallret(td);
 	} else {
@@ -734,7 +735,8 @@
 		break;
 	}
 
-	KASSERT((td->td_pcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0,
+	KASSERT(
+	    (td->td_pcb->pcb_fpflags & ~(PCB_FP_USERMASK|PCB_FP_SVEVALID)) == 0,
 	    ("Kernel VFP flags set while entering userspace"));
 	KASSERT(
 	    td->td_pcb->pcb_fpusaved == &td->td_pcb->pcb_fpustate,
diff --git a/sys/arm64/arm64/vfp.c b/sys/arm64/arm64/vfp.c
--- a/sys/arm64/arm64/vfp.c
+++ b/sys/arm64/arm64/vfp.c
@@ -30,11 +30,13 @@
 #ifdef VFP
 #include
 #include
+#include <sys/eventhandler.h>
 #include
 #include
 #include
 #include
 #include
+#include <sys/smp.h>
 #include
@@ -60,6 +62,63 @@
 static uma_zone_t fpu_save_area_zone;
 static struct vfpstate *fpu_initialstate;
 
+static u_int sve_max_vector_len;
+
+static size_t
+_sve_buf_size(u_int sve_len)
+{
+	size_t len;
+
+	/* 32 vector registers */
+	len = (size_t)sve_len * 32;
+	/*
+	 * 16 predicate registers and the First Fault Register (FFR),
+	 * each 1/8th the size of a vector register.
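+	 * For example, a 32 byte (256 bit) vector length needs
+	 * 32 * 32 + (32 * 17) / 8 + 2 * 8 = 1108 bytes in total.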
+	 */
+	len += ((size_t)sve_len * 17) / 8;
+	/*
+	 * FPSR and FPCR
+	 */
+	len += sizeof(uint64_t) * 2;
+
+	return (len);
+}
+
+size_t
+sve_max_buf_size(void)
+{
+	MPASS(sve_max_vector_len > 0);
+	return (_sve_buf_size(sve_max_vector_len));
+}
+
+size_t
+sve_buf_size(struct thread *td)
+{
+	struct pcb *pcb;
+
+	pcb = td->td_pcb;
+	MPASS(pcb->pcb_svesaved != NULL);
+	MPASS(pcb->pcb_sve_len > 0);
+
+	return (_sve_buf_size(pcb->pcb_sve_len));
+}
+
+static void *
+sve_alloc(void)
+{
+	void *buf;
+
+	buf = malloc(sve_max_buf_size(), M_FPUKERN_CTX, M_WAITOK | M_ZERO);
+
+	return (buf);
+}
+
+static void
+sve_free(void *buf)
+{
+	free(buf, M_FPUKERN_CTX);
+}
+
 void
 vfp_enable(void)
 {
@@ -71,13 +130,30 @@
 	isb();
 }
 
+static void
+sve_enable(void)
+{
+	uint32_t cpacr;
+
+	cpacr = READ_SPECIALREG(cpacr_el1);
+	/* Enable FP */
+	cpacr = (cpacr & ~CPACR_FPEN_MASK) | CPACR_FPEN_TRAP_NONE;
+	/* Enable SVE */
+	cpacr = (cpacr & ~CPACR_ZEN_MASK) | CPACR_ZEN_TRAP_NONE;
+	WRITE_SPECIALREG(cpacr_el1, cpacr);
+	isb();
+}
+
 void
 vfp_disable(void)
 {
 	uint32_t cpacr;
 
 	cpacr = READ_SPECIALREG(cpacr_el1);
+	/* Disable FP */
 	cpacr = (cpacr & ~CPACR_FPEN_MASK) | CPACR_FPEN_TRAP_ALL1;
+	/* Disable SVE */
+	cpacr = (cpacr & ~CPACR_ZEN_MASK) | CPACR_ZEN_TRAP_ALL1;
 	WRITE_SPECIALREG(cpacr_el1, cpacr);
 	isb();
 }
@@ -171,9 +247,266 @@
 }
 
 static void
-vfp_save_state_common(struct thread *td, struct pcb *pcb)
+sve_store(void *state, u_int sve_len)
+{
+	vm_offset_t f_start, p_start, z_start;
+	uint64_t fpcr, fpsr;
+
+	/*
+	 * Calculate the start of each register group. There are three
+	 * groups depending on size, with the First Fault Register (FFR)
+	 * stored with the predicate registers as we use one of them to
+	 * temporarily hold it.
+	 *
+	 *             +-------------------------+-------------------+
+	 *             | Contents                | Register size     |
+	 *  z_start -> +-------------------------+-------------------+
+	 *             |                         |                   |
+	 *             | 32 Z regs               | sve_len           |
+	 *             |                         |                   |
+	 *  p_start -> +-------------------------+-------------------+
+	 *             |                         |                   |
+	 *             | 16 Predicate registers  | 1/8 size of Z reg |
+	 *             |  1 First Fault register |                   |
+	 *             |                         |                   |
+	 *  f_start -> +-------------------------+-------------------+
+	 *             |                         |                   |
+	 *             | FPSR/FPCR               | 32 bit            |
+	 *             |                         |                   |
+	 *             +-------------------------+-------------------+
+	 */
+	z_start = (vm_offset_t)state;
+	p_start = z_start + sve_len * 32;
+	f_start = p_start + (sve_len / 8) * 17;
+
+	__asm __volatile(
+	    ".arch_extension sve \n"
+	    "str z0, [%0, #0, MUL VL] \n"
+	    "str z1, [%0, #1, MUL VL] \n"
+	    "str z2, [%0, #2, MUL VL] \n"
+	    "str z3, [%0, #3, MUL VL] \n"
+	    "str z4, [%0, #4, MUL VL] \n"
+	    "str z5, [%0, #5, MUL VL] \n"
+	    "str z6, [%0, #6, MUL VL] \n"
+	    "str z7, [%0, #7, MUL VL] \n"
+	    "str z8, [%0, #8, MUL VL] \n"
+	    "str z9, [%0, #9, MUL VL] \n"
+	    "str z10, [%0, #10, MUL VL] \n"
+	    "str z11, [%0, #11, MUL VL] \n"
+	    "str z12, [%0, #12, MUL VL] \n"
+	    "str z13, [%0, #13, MUL VL] \n"
+	    "str z14, [%0, #14, MUL VL] \n"
+	    "str z15, [%0, #15, MUL VL] \n"
+	    "str z16, [%0, #16, MUL VL] \n"
+	    "str z17, [%0, #17, MUL VL] \n"
+	    "str z18, [%0, #18, MUL VL] \n"
+	    "str z19, [%0, #19, MUL VL] \n"
+	    "str z20, [%0, #20, MUL VL] \n"
+	    "str z21, [%0, #21, MUL VL] \n"
+	    "str z22, [%0, #22, MUL VL] \n"
+	    "str z23, [%0, #23, MUL VL] \n"
+	    "str z24, [%0, #24, MUL VL] \n"
+	    "str z25, [%0, #25, MUL VL] \n"
+	    "str z26, [%0, #26, MUL VL] \n"
+	    "str z27, [%0, #27, MUL VL] \n"
+	    "str z28, [%0, #28, MUL VL] \n"
+	    "str z29, [%0, #29, MUL VL] \n"
+	    "str z30, [%0, #30, MUL VL] \n"
+	    "str z31, [%0, #31, MUL VL] \n"
+	    /* Store the predicate registers */
+	    "str p0, [%1, #0, MUL VL] \n"
+	    "str p1, [%1, #1, MUL VL] \n"
+	    "str p2, [%1, #2, MUL VL] \n"
+	    "str p3, [%1, #3, MUL VL] \n"
+	    "str p4, [%1, #4, MUL VL] \n"
+	    "str p5, [%1, #5, MUL VL] \n"
+	    "str p6, [%1, #6, MUL VL] \n"
+	    "str p7, [%1, #7, MUL VL] \n"
+	    "str p8, [%1, #8, MUL VL] \n"
+	    "str p9, [%1, #9, MUL VL] \n"
+	    "str p10, [%1, #10, MUL VL] \n"
+	    "str p11, [%1, #11, MUL VL] \n"
+	    "str p12, [%1, #12, MUL VL] \n"
+	    "str p13, [%1, #13, MUL VL] \n"
+	    "str p14, [%1, #14, MUL VL] \n"
+	    "str p15, [%1, #15, MUL VL] \n"
+	    ".arch_extension nosve \n"
+	    : : "r"(z_start), "r"(p_start));
+
+	/* Save the FFR if needed */
+	/* TODO: Skip if in SME streaming mode (when supported) */
+	__asm __volatile(
+	    ".arch_extension sve \n"
+	    "rdffr p0.b \n"
+	    "str p0, [%0, #16, MUL VL] \n"
+	    /*
+	     * Load the old p0 value to ensure it is consistent if we enable
+	     * without calling sve_restore, e.g. switch to a kernel thread and
+	     * back.
+	     */
+	    "ldr p0, [%0, #0, MUL VL] \n"
+	    ".arch_extension nosve \n"
+	    : : "r"(p_start));
+
+	__asm __volatile(
+	    ".arch_extension fp \n"
+	    "mrs %0, fpsr \n"
+	    "mrs %1, fpcr \n"
+	    "stp %w0, %w1, [%2] \n"
+	    ".arch_extension nofp \n"
+	    : "=&r"(fpsr), "=&r"(fpcr) : "r"(f_start));
+}
+
+static void
+sve_restore(void *state, u_int sve_len)
+{
+	vm_offset_t f_start, p_start, z_start;
+	uint64_t fpcr, fpsr;
+
+	/* See sve_store for the layout of the state buffer */
+	z_start = (vm_offset_t)state;
+	p_start = z_start + sve_len * 32;
+	f_start = p_start + (sve_len / 8) * 17;
+
+	__asm __volatile(
+	    ".arch_extension sve \n"
+	    "ldr p0, [%0, #16, MUL VL] \n"
+	    "wrffr p0.b \n"
+	    ".arch_extension nosve \n"
+	    : : "r"(p_start));
+
+	__asm __volatile(
+	    ".arch_extension sve \n"
+	    "ldr z0, [%0, #0, MUL VL] \n"
+	    "ldr z1, [%0, #1, MUL VL] \n"
+	    "ldr z2, [%0, #2, MUL VL] \n"
+	    "ldr z3, [%0, #3, MUL VL] \n"
+	    "ldr z4, [%0, #4, MUL VL] \n"
+	    "ldr z5, [%0, #5, MUL VL] \n"
+	    "ldr z6, [%0, #6, MUL VL] \n"
+	    "ldr z7, [%0, #7, MUL VL] \n"
+	    "ldr z8, [%0, #8, MUL VL] \n"
+	    "ldr z9, [%0, #9, MUL VL] \n"
+	    "ldr z10, [%0, #10, MUL VL] \n"
+	    "ldr z11, [%0, #11, MUL VL] \n"
+	    "ldr z12, [%0, #12, MUL VL] \n"
+	    "ldr z13, [%0, #13, MUL VL] \n"
+	    "ldr z14, [%0, #14, MUL VL] \n"
+	    "ldr z15, [%0, #15, MUL VL] \n"
+	    "ldr z16, [%0, #16, MUL VL] \n"
+	    "ldr z17, [%0, #17, MUL VL] \n"
+	    "ldr z18, [%0, #18, MUL VL] \n"
+	    "ldr z19, [%0, #19, MUL VL] \n"
+	    "ldr z20, [%0, #20, MUL VL] \n"
+	    "ldr z21, [%0, #21, MUL VL] \n"
+	    "ldr z22, [%0, #22, MUL VL] \n"
+	    "ldr z23, [%0, #23, MUL VL] \n"
+	    "ldr z24, [%0, #24, MUL VL] \n"
+	    "ldr z25, [%0, #25, MUL VL] \n"
+	    "ldr z26, [%0, #26, MUL VL] \n"
+	    "ldr z27, [%0, #27, MUL VL] \n"
+	    "ldr z28, [%0, #28, MUL VL] \n"
+	    "ldr z29, [%0, #29, MUL VL] \n"
+	    "ldr z30, [%0, #30, MUL VL] \n"
+	    "ldr z31, [%0, #31, MUL VL] \n"
+	    /* Load the predicate registers */
+	    "ldr p0, [%1, #0, MUL VL] \n"
+	    "ldr p1, [%1, #1, MUL VL] \n"
+	    "ldr p2, [%1, #2, MUL VL] \n"
+	    "ldr p3, [%1, #3, MUL VL] \n"
+	    "ldr p4, [%1, #4, MUL VL] \n"
+	    "ldr p5, [%1, #5, MUL VL] \n"
+	    "ldr p6, [%1, #6, MUL VL] \n"
+	    "ldr p7, [%1, #7, MUL VL] \n"
+	    "ldr p8, [%1, #8, MUL VL] \n"
+	    "ldr p9, [%1, #9, MUL VL] \n"
+	    "ldr p10, [%1, #10, MUL VL] \n"
+	    "ldr p11, [%1, #11, MUL VL] \n"
+	    "ldr p12, [%1, #12, MUL VL] \n"
+	    "ldr p13, [%1, #13, MUL VL] \n"
+	    "ldr p14, [%1, #14, MUL VL] \n"
+	    "ldr p15, [%1, #15, MUL VL] \n"
+	    ".arch_extension nosve \n"
+	    : : "r"(z_start), "r"(p_start));
+
+	__asm __volatile(
+	    ".arch_extension fp \n"
+	    "ldp %w0, %w1, [%2] \n"
+	    "msr fpsr, %0 \n"
+	    "msr fpcr, %1 \n"
+	    ".arch_extension nofp \n"
+	    : "=&r"(fpsr), "=&r"(fpcr) : "r"(f_start));
+}
+
+/*
+ * Sync the VFP registers to the SVE register state, e.g. in signal return
+ * when userspace may have changed the VFP register values and expects them
+ * to be used when the signal handler returns.
+ */
+void
+vfp_to_sve_sync(struct thread *td)
+{
+	struct pcb *pcb;
+	uint32_t *fpxr;
+
+	pcb = td->td_pcb;
+	if (pcb->pcb_svesaved == NULL)
+		return;
+
+	MPASS(pcb->pcb_fpusaved != NULL);
+
+	/* Copy the VFP registers to the SVE region */
+	for (int i = 0; i < nitems(pcb->pcb_fpusaved->vfp_regs); i++) {
+		__uint128_t *sve_reg;
+
+		sve_reg = (__uint128_t *)((uintptr_t)pcb->pcb_svesaved +
+		    i * pcb->pcb_sve_len);
+		*sve_reg = pcb->pcb_fpusaved->vfp_regs[i];
+	}
+
+	fpxr = (uint32_t *)((uintptr_t)pcb->pcb_svesaved +
+	    (32 * pcb->pcb_sve_len) + (17 * pcb->pcb_sve_len / 8));
+	fpxr[0] = pcb->pcb_fpusaved->vfp_fpsr;
+	fpxr[1] = pcb->pcb_fpusaved->vfp_fpcr;
+}
+
+/*
+ * Sync the SVE registers to the VFP register state.
+ */
+void
+sve_to_vfp_sync(struct thread *td)
+{
+	struct pcb *pcb;
+	uint32_t *fpxr;
+
+	pcb = td->td_pcb;
+	if (pcb->pcb_svesaved == NULL)
+		return;
+
+	MPASS(pcb->pcb_fpusaved == &pcb->pcb_fpustate);
+
+	/* Copy the SVE registers to the VFP saved state */
+	for (int i = 0; i < nitems(pcb->pcb_fpusaved->vfp_regs); i++) {
+		__uint128_t *sve_reg;
+
+		sve_reg = (__uint128_t *)((uintptr_t)pcb->pcb_svesaved +
+		    i * pcb->pcb_sve_len);
+		pcb->pcb_fpusaved->vfp_regs[i] = *sve_reg;
+	}
+
+	fpxr = (uint32_t *)((uintptr_t)pcb->pcb_svesaved +
+	    (32 * pcb->pcb_sve_len) + (17 * pcb->pcb_sve_len / 8));
+	pcb->pcb_fpusaved->vfp_fpsr = fpxr[0];
+	pcb->pcb_fpusaved->vfp_fpcr = fpxr[1];
+}
+
+static void
+vfp_save_state_common(struct thread *td, struct pcb *pcb, bool full_save)
 {
 	uint32_t cpacr;
+	bool save_sve;
+
+	save_sve = false;
 
 	critical_enter();
 	/*
@@ -181,14 +514,48 @@
 	 * i.e. return if we are trapping on FP access.
 	 */
 	cpacr = READ_SPECIALREG(cpacr_el1);
-	if ((cpacr & CPACR_FPEN_MASK) == CPACR_FPEN_TRAP_NONE) {
-		KASSERT(PCPU_GET(fpcurthread) == td,
-		    ("Storing an invalid VFP state"));
+	if ((cpacr & CPACR_FPEN_MASK) != CPACR_FPEN_TRAP_NONE)
+		goto done;
+	KASSERT(PCPU_GET(fpcurthread) == td,
+	    ("Storing an invalid VFP state"));
+
+	/*
+	 * Also save the SVE state. As SVE depends on the VFP being
+	 * enabled, we only need to check for SVE when the VFP unit
+	 * is enabled.
+	 */
+	if ((cpacr & CPACR_ZEN_MASK) == CPACR_ZEN_TRAP_NONE) {
+		/* If SVE is enabled it should be valid */
+		MPASS((pcb->pcb_fpflags & PCB_FP_SVEVALID) != 0);
+
+		/*
+		 * If we are switching while in a system call, skip saving
+		 * the SVE registers. The ABI allows us to drop them over
+		 * any system call, but doing so unconditionally would mean
+		 * disabling SVE on every system call and trapping the next
+		 * use, which is expensive in SVE-heavy userspace code. As
+		 * an optimisation, only disable SVE on context switch.
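+		 * (TF_FLAG_CAN_DROP_SVE is zeroed on exception entry, set
+		 * on system call entry in the SVC handler, and cleared
+		 * again by sys_sigreturn(), so the state is only droppable
+		 * across a plain system call.)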
+		 */
+		if (td->td_frame == NULL ||
+		    (td->td_frame->tf_flags & TF_FLAG_CAN_DROP_SVE) == 0)
+			save_sve = true;
+	}
+
+	if (save_sve) {
+		KASSERT(pcb->pcb_svesaved != NULL,
+		    ("Storing to a NULL SVE state"));
+		sve_store(pcb->pcb_svesaved, pcb->pcb_sve_len);
+		if (full_save)
+			sve_to_vfp_sync(td);
+	} else {
+		pcb->pcb_fpflags &= ~PCB_FP_SVEVALID;
 		vfp_store(pcb->pcb_fpusaved);
-		dsb(ish);
-		vfp_disable();
 	}
+	dsb(ish);
+	vfp_disable();
+
+done:
 	critical_exit();
 }
 
@@ -199,7 +566,7 @@
 	KASSERT(pcb != NULL, ("NULL vfp pcb"));
 	KASSERT(td->td_pcb == pcb, ("Invalid vfp pcb"));
 
-	vfp_save_state_common(td, pcb);
+	vfp_save_state_common(td, pcb, true);
 }
 
 void
@@ -213,7 +580,7 @@
 	MPASS(pcb->pcb_fpusaved == NULL);
 	pcb->pcb_fpusaved = &pcb->pcb_fpustate;
 
-	vfp_save_state_common(curthread, pcb);
+	vfp_save_state_common(curthread, pcb, true);
 }
 
 void
@@ -221,7 +588,7 @@
 {
 	KASSERT(td != NULL, ("NULL vfp thread"));
 
-	vfp_save_state_common(td, td->td_pcb);
+	vfp_save_state_common(td, td->td_pcb, false);
 }
 
 /*
@@ -231,21 +598,40 @@
 void
 vfp_new_thread(struct thread *newtd, struct thread *oldtd, bool fork)
 {
-	struct pcb *newpcb;
+	struct pcb *newpcb, *oldpcb;
 
 	newpcb = newtd->td_pcb;
+	oldpcb = oldtd->td_pcb;
 
 	/* Kernel threads start with clean VFP */
 	if ((oldtd->td_pflags & TDP_KTHREAD) != 0) {
 		newpcb->pcb_fpflags &=
-		    ~(PCB_FP_STARTED | PCB_FP_KERN | PCB_FP_NOSAVE);
+		    ~(PCB_FP_STARTED | PCB_FP_SVEVALID | PCB_FP_KERN |
+		    PCB_FP_NOSAVE);
 	} else {
 		MPASS((newpcb->pcb_fpflags & (PCB_FP_KERN|PCB_FP_NOSAVE)) == 0);
+
+		/*
+		 * The only SVE register state guaranteed to be preserved
+		 * across a system call is the lower bits of the Z registers,
+		 * as these are aliased with the existing FP registers.
+		 * Because we can only create a new thread or fork through a
+		 * system call it is safe to drop the SVE state in the new
+		 * thread.
+		 */
+		newpcb->pcb_fpflags &= ~PCB_FP_SVEVALID;
 		if (!fork) {
 			newpcb->pcb_fpflags &= ~PCB_FP_STARTED;
 		}
 	}
 
+	newpcb->pcb_svesaved = NULL;
+	if (oldpcb->pcb_svesaved == NULL)
+		newpcb->pcb_sve_len = sve_max_vector_len;
+	else
+		KASSERT(newpcb->pcb_sve_len == oldpcb->pcb_sve_len,
+		    ("%s: pcb sve vector length differs: %x != %x", __func__,
+		    newpcb->pcb_sve_len, oldpcb->pcb_sve_len));
+
 	newpcb->pcb_fpusaved = &newpcb->pcb_fpustate;
 	newpcb->pcb_vfpcpu = UINT_MAX;
 }
@@ -272,23 +658,48 @@
 	    ("pcb_fpusaved should point to pcb_fpustate."));
 	pcb->pcb_fpustate.vfp_fpcr = VFPCR_INIT;
 	pcb->pcb_fpustate.vfp_fpsr = 0;
+	/* XXX: Memory leak when using SVE between fork & exec? */
+	pcb->pcb_svesaved = NULL;
 	pcb->pcb_vfpcpu = UINT_MAX;
 	pcb->pcb_fpflags = 0;
 }
 
-void
-vfp_restore_state(void)
+static void
+vfp_restore_state_common(struct thread *td, int flags)
 {
 	struct pcb *curpcb;
 	u_int cpu;
+	bool restore_sve;
+
+	KASSERT(td == curthread, ("%s: Called with non-current thread",
+	    __func__));
 
 	critical_enter();
 
 	cpu = PCPU_GET(cpuid);
-	curpcb = curthread->td_pcb;
-	curpcb->pcb_fpflags |= PCB_FP_STARTED;
 
-	vfp_enable();
+	curpcb = td->td_pcb;
+
+	/*
+	 * If SVE has been used and the base VFP state is in use then
+	 * restore the SVE registers. A non-base VFP state should only
+	 * be used by the kernel and SVE should only be used by userspace.
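+	 * (A non-base pcb_fpusaved is only installed by the fpu_kern_*
+	 * code, so pcb_fpusaved == &pcb_fpustate identifies userspace
+	 * state.)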
+	 */
+	restore_sve = false;
+	if ((curpcb->pcb_fpflags & PCB_FP_SVEVALID) != 0 &&
+	    curpcb->pcb_fpusaved == &curpcb->pcb_fpustate) {
+		MPASS(curpcb->pcb_svesaved != NULL);
+		/* SVE shouldn't be enabled in the kernel */
+		MPASS((flags & PCB_FP_KERN) == 0);
+		restore_sve = true;
+	}
+
+	if (restore_sve) {
+		MPASS((curpcb->pcb_fpflags & PCB_FP_SVEVALID) != 0);
+		sve_enable();
+	} else {
+		curpcb->pcb_fpflags |= PCB_FP_STARTED;
+		vfp_enable();
+	}
 
 	/*
 	 * If the previous thread on this cpu to use the VFP was not the
@@ -296,14 +707,104 @@
 	 * cpu we need to restore the old state.
 	 */
 	if (PCPU_GET(fpcurthread) != curthread || cpu != curpcb->pcb_vfpcpu) {
-		vfp_restore(curthread->td_pcb->pcb_fpusaved);
-		PCPU_SET(fpcurthread, curthread);
+		/*
+		 * The VFP registers are the lower 128 bits of the SVE
+		 * registers. Use the SVE saved state if it was previously
+		 * enabled.
+		 */
+		if (restore_sve) {
+			MPASS(td->td_pcb->pcb_svesaved != NULL);
+			sve_restore(td->td_pcb->pcb_svesaved,
+			    td->td_pcb->pcb_sve_len);
+		} else {
+			vfp_restore(td->td_pcb->pcb_fpusaved);
+		}
+		PCPU_SET(fpcurthread, td);
 		curpcb->pcb_vfpcpu = cpu;
 	}
 
 	critical_exit();
 }
 
+void
+vfp_restore_state(void)
+{
+	struct thread *td;
+
+	td = curthread;
+	vfp_restore_state_common(td, td->td_pcb->pcb_fpflags);
+}
+
+bool
+sve_restore_state(struct thread *td)
+{
+	struct pcb *curpcb;
+	void *svesaved;
+	uint64_t cpacr;
+
+	KASSERT(td == curthread, ("%s: Called with non-current thread",
+	    __func__));
+
+	curpcb = td->td_pcb;
+
+	/* The SVE state should alias the base VFP state */
+	MPASS(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate);
+
+	/* SVE not enabled, tell the caller to raise a fault */
+	if (curpcb->pcb_sve_len == 0) {
+		/*
+		 * The init pcb is created before we read the vector length.
+		 * Set it to the default length.
+		 */
+		if (sve_max_vector_len == 0)
+			return (false);
+
+		MPASS(curpcb->pcb_svesaved == NULL);
+		curpcb->pcb_sve_len = sve_max_vector_len;
+	}
+
+	if (curpcb->pcb_svesaved == NULL) {
+		/* SVE should be disabled so the saved state will be invalid */
+		MPASS((curpcb->pcb_fpflags & PCB_FP_SVEVALID) == 0);
+
+		/*
+		 * Allocate the SVE buffer for this thread before entering
+		 * the critical section so the allocation can sleep.
+		 */
+		svesaved = sve_alloc();
+
+		critical_enter();
+
+		/* Restore the VFP state if needed */
+		cpacr = READ_SPECIALREG(cpacr_el1);
+		if ((cpacr & CPACR_FPEN_MASK) != CPACR_FPEN_TRAP_NONE) {
+			vfp_restore_state_common(td, curpcb->pcb_fpflags);
+		}
+
+		/*
+		 * Set the flags after enabling the VFP as the SVE saved
+		 * state will be invalid.
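+		 * vfp_restore_state_common() above consults
+		 * PCB_FP_SVEVALID, and the newly allocated (zeroed) buffer
+		 * does not yet hold this thread's register state.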
+		 */
+		curpcb->pcb_svesaved = svesaved;
+		curpcb->pcb_fpflags |= PCB_FP_SVEVALID;
+		sve_enable();
+
+		critical_exit();
+	} else {
+		vfp_restore_state_common(td, curpcb->pcb_fpflags);
+
+		/* Enable SVE if it wasn't previously enabled */
+		if ((curpcb->pcb_fpflags & PCB_FP_SVEVALID) == 0) {
+			critical_enter();
+			sve_enable();
+			curpcb->pcb_fpflags |= PCB_FP_SVEVALID;
+			critical_exit();
+		}
+	}
+
+	return (true);
+}
+
 void
 vfp_init_secondary(void)
 {
@@ -348,6 +849,74 @@
 
 SYSINIT(vfp, SI_SUB_CPU, SI_ORDER_ANY, vfp_init, NULL);
 
+static void
+sve_thread_dtor(void *arg __unused, struct thread *td)
+{
+	sve_free(td->td_pcb->pcb_svesaved);
+}
+
+static void
+sve_pcpu_read(void *arg)
+{
+	u_int *len;
+	uint64_t vl;
+
+	len = arg;
+
+	/* Enable SVE to read zcr_el1 and VFP for rdvl */
+	sve_enable();
+
+	/* Set the longest vector length */
+	WRITE_SPECIALREG(ZCR_EL1_REG, ZCR_LEN_MASK);
+	isb();
+
+	/* Read the real vector length */
+	__asm __volatile(
+	    ".arch_extension sve \n"
+	    "rdvl %0, #1 \n"
+	    ".arch_extension nosve \n"
+	    : "=&r"(vl));
+
+	vfp_disable();
+
+	len[PCPU_GET(cpuid)] = vl;
+}
+
+static void
+sve_init(const void *dummy __unused)
+{
+	u_int *len_list;
+	uint64_t reg;
+	int i;
+
+	if (!get_kernel_reg(ID_AA64PFR0_EL1, &reg))
+		return;
+
+	if (ID_AA64PFR0_SVE_VAL(reg) == ID_AA64PFR0_SVE_NONE)
+		return;
+
+	len_list = malloc(sizeof(*len_list) * (mp_maxid + 1), M_TEMP,
+	    M_WAITOK | M_ZERO);
+	smp_rendezvous(NULL, sve_pcpu_read, NULL, len_list);
+
+	sve_max_vector_len = ZCR_LEN_BYTES(ZCR_LEN_MASK);
+	CPU_FOREACH(i) {
+		if (bootverbose)
+			printf("CPU%d SVE vector length: %u\n", i,
+			    len_list[i]);
+		sve_max_vector_len = MIN(sve_max_vector_len, len_list[i]);
+	}
+	free(len_list, M_TEMP);
+
+	if (bootverbose)
+		printf("SVE with %u byte vectors\n", sve_max_vector_len);
+
+	if (sve_max_vector_len > 0) {
+		EVENTHANDLER_REGISTER(thread_dtor, sve_thread_dtor, NULL,
+		    EVENTHANDLER_PRI_ANY);
+	}
+}
+SYSINIT(sve, SI_SUB_SMP, SI_ORDER_ANY, sve_init, NULL);
+
 struct fpu_kern_ctx *
 fpu_kern_alloc_ctx(u_int flags)
 {
diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h
--- a/sys/arm64/include/armreg.h
+++ b/sys/arm64/include/armreg.h
@@ -2225,6 +2225,13 @@
 #define	TTBR_CnP		(1ul << TTBR_CnP_SHIFT)
 
 /* ZCR_EL1 - SVE Control Register */
+#define	ZCR_EL1			MRS_REG(ZCR_EL1)
+#define	ZCR_EL1_REG		MRS_REG_ALT_NAME(ZCR_EL1_REG)
+#define	ZCR_EL1_REG_op0		3
+#define	ZCR_EL1_REG_op1		0
+#define	ZCR_EL1_REG_CRn		1
+#define	ZCR_EL1_REG_CRm		2
+#define	ZCR_EL1_REG_op2		0
 #define	ZCR_LEN_SHIFT		0
 #define	ZCR_LEN_MASK		(0xf << ZCR_LEN_SHIFT)
 #define	ZCR_LEN_BYTES(x)	((((x) & ZCR_LEN_MASK) + 1) * 16)
diff --git a/sys/arm64/include/frame.h b/sys/arm64/include/frame.h
--- a/sys/arm64/include/frame.h
+++ b/sys/arm64/include/frame.h
@@ -51,6 +51,8 @@
 	uint64_t tf_esr;
 	uint64_t tf_far;
 	uint64_t tf_x[30];
+#define	TF_FLAG_CAN_DROP_SVE	(1ul << 0)
+	uint64_t tf_flags;
 };
 
 /*
diff --git a/sys/arm64/include/pcb.h b/sys/arm64/include/pcb.h
--- a/sys/arm64/include/pcb.h
+++ b/sys/arm64/include/pcb.h
@@ -59,17 +59,19 @@
 	u_int	pcb_flags;
 #define	PCB_SINGLE_STEP_SHIFT	0
 #define	PCB_SINGLE_STEP		(1 << PCB_SINGLE_STEP_SHIFT)
-	uint32_t pcb_pad1;
+	u_int	pcb_sve_len;		/* The SVE vector length */
 
 	struct vfpstate *pcb_fpusaved;
 	int	pcb_fpflags;
 #define	PCB_FP_STARTED	0x00000001
+#define	PCB_FP_SVEVALID	0x00000002
 #define	PCB_FP_KERN	0x40000000
 #define	PCB_FP_NOSAVE	0x80000000
 /* The bits passed to userspace in get_fpcontext */
-#define	PCB_FP_USERMASK	(PCB_FP_STARTED)
+#define	PCB_FP_USERMASK	(PCB_FP_STARTED | PCB_FP_SVEVALID)
 	u_int	pcb_vfpcpu;	/* Last cpu this thread ran VFP code */
-	uint64_t pcb_reserved[5];
+	void	*pcb_svesaved;
+	uint64_t pcb_reserved[4];
 
 	/*
 	 * The userspace VFP state. The pcb_fpusaved pointer will point to
diff --git a/sys/arm64/include/vfp.h b/sys/arm64/include/vfp.h
--- a/sys/arm64/include/vfp.h
+++ b/sys/arm64/include/vfp.h
@@ -80,6 +80,12 @@
 void	vfp_save_state(struct thread *, struct pcb *);
 void	vfp_save_state_savectx(struct pcb *);
 void	vfp_save_state_switch(struct thread *);
+void	vfp_to_sve_sync(struct thread *);
+void	sve_to_vfp_sync(struct thread *);
+
+size_t	sve_max_buf_size(void);
+size_t	sve_buf_size(struct thread *);
+bool	sve_restore_state(struct thread *);
 
 struct fpu_kern_ctx;
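Note for reviewers: the buffer layout shared by _sve_buf_size(), sve_store() and
sve_restore() can be sanity-checked outside the kernel. The sketch below
recomputes the Z, predicate/FFR and FPSR/FPCR offsets for a range of vector
lengths; it is a standalone userland program, and the names in it (sve_layout,
compute_layout) are local to the example rather than part of this patch.

/*
 * Standalone sketch of the SVE save-area layout used by sve_store()
 * and sve_restore() above. Names here are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

struct sve_layout {
	size_t p_off;	/* 16 predicate registers followed by the FFR */
	size_t f_off;	/* FPSR and FPCR */
	size_t size;	/* total buffer size, as in _sve_buf_size() */
};

static struct sve_layout
compute_layout(size_t sve_len)
{
	struct sve_layout l;

	/* 32 Z registers of sve_len bytes each, starting at offset 0 */
	l.p_off = sve_len * 32;
	/* 17 predicate-sized slots, each 1/8th of a vector register */
	l.f_off = l.p_off + (sve_len / 8) * 17;
	/* FPSR and FPCR stored as two 32-bit values in a 16 byte slot */
	l.size = l.f_off + sizeof(uint64_t) * 2;
	return (l);
}

int
main(void)
{
	/*
	 * Sample power-of-two vector lengths; the architecture allows any
	 * multiple of 16 bytes up to 256 (ZCR_LEN_BYTES(ZCR_LEN_MASK)).
	 */
	for (size_t vl = 16; vl <= 256; vl *= 2) {
		struct sve_layout l = compute_layout(vl);

		printf("vl=%3zu p_off=%5zu f_off=%5zu size=%5zu\n",
		    vl, l.p_off, l.f_off, l.size);
	}
	return (0);
}

For instance, a 16 byte (VL128) vector gives p_off=512, f_off=546 and a total
of 562 bytes, matching the arithmetic in _sve_buf_size().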