diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -15,11 +15,13 @@
 	strchrnul \
 	strcmp \
 	strcpy \
-	strlen \
 	strncmp \
 	strnlen \
 	strrchr
 
+MDSRCS+= \
+	strlen.S
+
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/strlen.S b/lib/libc/aarch64/string/strlen.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/strlen.S
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2024 Getz Mikalsen
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <machine/asm.h>
+
+	.text
+
+ENTRY(strlen)
+	bic	x10,x0,#0xf		// align src pointer down to 16 bytes
+	and	x9,x0,#0xf		// offset of src within the block
+	ldr	q0,[x10]
+	cmeq	v0.16b,v0.16b,#0
+	shrn	v0.8b,v0.8h,#4		// 4 mask bits per byte
+	cbz	x9,.Laligned		// skip unaligned handling for aligned strings
+	fmov	x1,d0
+	lsl	x2,x0,#2		// bit offset into mask (byte offset * 4)
+	lsr	x1,x1,x2		// shift out bytes before the string
+	cbz	x1,.Lloop
+	rbit	x1,x1
+	clz	x0,x1
+	lsr	x0,x0,#2		// nibble index == byte index
+	ret
+
+	/*
+	 * If a string is already aligned we can skip a few cycles by
+	 * handling it as a special case.
+	 */
+
+.Laligned:
+	fmov	x1,d0
+	cbnz	x1,.Lfix
+
+	.p2align 4
+.Lloop:
+	ldr	q1,[x10,#16]
+	ldr	q2,[x10,#32]!
+	uminp	v0.16b,v1.16b,v2.16b	// Find smallest byte in both buffers
+	uminp	v0.16b,v0.16b,v0.16b
+	cmeq	v0.8b,v0.8b,#0		// 0xFF if 0 in either buffer
+
+	fmov	x1,d0			// GPR move for loop check
+	cbz	x1,.Lloop
+
+	cmeq	v0.16b,v1.16b,#0	// Check if it matched in buf 1
+	sub	x0,x10,x0
+	cbnz	w1,1f			// If matched, jump and find offset
+	cmeq	v0.16b,v2.16b,#0	// Check where it matched in buf 2
+	add	x0,x0,#16
+
+1:
+	shrn	v0.8b,v0.8h,#4		// Find offset of match
+	fmov	x1,d0
+	sub	x0,x0,#16
+	rbit	x1,x1			// reverse bits as NEON has no ctz
+	clz	x3,x1
+	lsr	x3,x3,#2		// get offset index
+	add	x0,x0,x3
+	ret
+
+.Lfix:
+	rbit	x1,x1
+	clz	x3,x1
+	lsr	x0,x3,#2
+	ret
+END(strlen)
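
Note for review: the C sketch below (not part of the patch) shows the same nibble-mask
technique with NEON intrinsics, for readers who prefer intrinsics to raw assembly. The
names zero_nibble_mask and strlen_sketch are illustrative only. Like the assembly, it
relies on the fact that an aligned 16-byte load never crosses a page boundary, so reading
a few bytes before the string is safe; the first mask is shifted to discard them. The
assembly additionally unrolls to 32 bytes per iteration and merges the zero test for both
halves with uminp; the sketch keeps one 16-byte block per iteration for clarity.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/*
 * 64-bit mask with one nibble per byte of v: 0xF where the byte is NUL.
 * The shrn-by-4 narrows each 16-bit lane to 8 bits, keeping 4 bits per
 * input byte -- the same cmeq/shrn trick as the assembly above.
 */
static inline uint64_t
zero_nibble_mask(uint8x16_t v)
{
	uint8x16_t eq = vceqzq_u8(v);		/* 0xFF where byte == 0 */
	uint8x8_t nib = vshrn_n_u16(vreinterpretq_u16_u8(eq), 4);

	return (vget_lane_u64(vreinterpret_u64_u8(nib), 0));
}

static size_t
strlen_sketch(const char *s)
{
	const uint8_t *base = (const uint8_t *)((uintptr_t)s & ~(uintptr_t)0xf);
	/* First, possibly partial block: shift out bytes before s. */
	uint64_t mask = zero_nibble_mask(vld1q_u8(base)) >>
	    (((uintptr_t)s & 0xf) * 4);

	if (mask != 0)
		return ((size_t)__builtin_ctzll(mask) >> 2);

	/* Aligned 16-byte strides; rbit+clz in the asm is ctz here. */
	for (const uint8_t *p = base + 16;; p += 16) {
		mask = zero_nibble_mask(vld1q_u8(p));
		if (mask != 0)
			return ((size_t)(p - (const uint8_t *)s) +
			    ((size_t)__builtin_ctzll(mask) >> 2));
	}
}

Dividing the count of leading zeros (after rbit) by four converts a nibble index in the
mask back to a byte index, which is why both the assembly and the sketch end with a
shift right by 2.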