diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index f8c67319fe12..7325b54d9716 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -1,52 +1,52 @@
 #
 # String handling from the Arm Optimized Routines
 # https://github.com/ARM-software/optimized-routines
 #
 AARCH64_STRING_FUNCS= \
 	memchr \
 	memcmp \
 	memcpy \
 	memmove \
 	memrchr \
 	memset \
 	stpcpy \
 	strchr \
 	strchrnul \
 	strcpy \
-	strlen \
 	strnlen \
 	strrchr
 
 # SIMD-enhanced routines not derived from Arm's code
 MDSRCS+= \
 	strcmp.S \
 	strspn.S \
 	strcspn.S \
 	strpbrk.c \
 	strsep.c \
 	strcat.c \
 	strlcpy.S \
 	strncmp.S \
 	memccpy.S \
 	strncat.c \
-	strlcat.c
+	strlcat.c \
+	strlen.S
 
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
 # Some file need multiple macros defined or a weak symbol added we can
 # override the generated file in these cases.
 #
 .for FUNC in ${AARCH64_STRING_FUNCS}
 .if !exists(${FUNC}.S)
 ${FUNC}.S:
 	printf '/* %sgenerated by libc/aarch64/string/Makefile.inc */\n' @ > ${.TARGET}
 	printf '#define __%s_aarch64 %s\n' ${FUNC} ${FUNC} >> ${.TARGET}
 	printf '#include "aarch64/%s.S"\n' ${FUNC} >> ${.TARGET}
 CLEANFILES+=	${FUNC}.S
 .endif
 MDSRCS+=	${FUNC}.S
 CFLAGS.${FUNC}.S+=-I${SRCTOP}/contrib/arm-optimized-routines/string
 .endfor
diff --git a/lib/libc/aarch64/string/strlen.S b/lib/libc/aarch64/string/strlen.S
new file mode 100644
index 000000000000..7bfac7f4b1e1
--- /dev/null
+++ b/lib/libc/aarch64/string/strlen.S
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen
+*/
+
+#include <machine/asm.h>
+
+	.weak	strlen
+	.set	strlen, __strlen
+	.text
+
+ENTRY(__strlen)
+	bic	x10, x0, #0xf		// aligned src
+	and	x9, x0, #0xf
+	ldr	q0, [x10]
+	cmeq	v0.16b, v0.16b, #0
+	shrn	v0.8b, v0.8h, #4
+	fmov	x1, d0
+	cbz	x9, .Laligned
+
+	lsl	x2, x0, #2		// get the byte offset
+	lsr	x1, x1, x2		// shift by offset index
+	cbz	x1, .Lloop
+	rbit	x1, x1
+	clz	x0, x1
+	lsr	x0, x0, #2
+	ret
+
+.Laligned:
+	cbnz	x1, .Ldone
+
+.Lloop:
+	ldr	q0, [x10, #16]!
+	cmeq	v0.16b, v0.16b, #0
+	shrn	v0.8b, v0.8h, #4	// reduce to fit mask in GPR
+	fcmp	d0, #0.0
+	b.eq	.Lloop
+	fmov	x1, d0
+.Ldone:
+	sub	x0, x10, x0
+	rbit	x1, x1			// reverse bits as NEON has no ctz
+	clz	x3, x1
+	lsr	x3, x3, #2
+	add	x0, x0, x3
+	ret
+END(__strlen)
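
For review context: the new strlen.S compares 16 bytes at a time against zero
(cmeq gives 0xff in every lane holding a NUL), then uses shrn #4 to narrow the
compare result into a 64-bit scalar carrying 4 mask bits per input byte, so a
single rbit+clz (NEON has no ctz instruction) locates the NUL after dividing by
4. The C sketch below mirrors that data flow with NEON intrinsics to make it
easier to follow; it is illustrative only, strlen_neon_sketch is a hypothetical
name, and the aligned over-read before the start of the string is safe on real
hardware (an aligned 16-byte load cannot cross a page boundary) but is not
strictly conforming C.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static size_t
strlen_neon_sketch(const char *s)
{
	/* Round down to a 16-byte boundary, as the asm does with bic. */
	const char *p = (const char *)((uintptr_t)s & ~(uintptr_t)0xf);
	unsigned off = (uintptr_t)s & 0xf;

	/* cmeq against zero: 0xff in every lane that holds a NUL byte. */
	uint8x16_t zeros = vceqzq_u8(vld1q_u8((const uint8_t *)p));
	/*
	 * shrn #4: shift each 16-bit lane right by 4 and narrow to 8 bits,
	 * packing 4 mask bits per input byte into one 64-bit scalar.
	 */
	uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(
	    vshrn_n_u16(vreinterpretq_u16_u8(zeros), 4)), 0);

	/* Discard mask bits for the bytes before the start of the string. */
	mask >>= off * 4;
	if (mask != 0)		/* NUL found in the first chunk */
		return (__builtin_ctzll(mask) / 4);

	for (;;) {
		p += 16;
		zeros = vceqzq_u8(vld1q_u8((const uint8_t *)p));
		mask = vget_lane_u64(vreinterpret_u64_u8(
		    vshrn_n_u16(vreinterpretq_u16_u8(zeros), 4)), 0);
		if (mask != 0)	/* chunk base minus s, plus byte index */
			return ((size_t)(p - s) + __builtin_ctzll(mask) / 4);
	}
}

The first-chunk early return corresponds to the asm's pre-loop ret path: after
shifting the mask by the misalignment, the trailing-zero count divided by 4 is
already the string length, while the loop exit (.Ldone) must add the distance
from the original pointer to the current chunk, which is what the asm computes
with sub x0, x10, x0.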