diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -20,6 +20,9 @@
 	strnlen \
 	strrchr
 
+MDSRCS+= \
+	strlcpy.S
+
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/strlcpy.S b/lib/libc/aarch64/string/strlcpy.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/strlcpy.S
@@ -0,0 +1,316 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+*/
+
+#include <machine/asm.h>
+
+	.weak strlcpy
+	.set strlcpy, __strlcpy
+	.text
+
+/*
+ * size_t strlcpy(char *dst (x0), const char *src (x1), size_t dstsize (x2))
+ * Copies up to dstsize - 1 bytes, always NUL-terminates when dstsize > 0,
+ * and returns strlen(src).  Uses the cmeq + shrn trick to reduce a 16-byte
+ * NUL-match to a 64-bit nibble mask in a GPR.
+ */
+ENTRY(__strlcpy)
+	subs	x2, x2, #1		// room for payload bytes (excl. NUL)
+	b.lo	.L0			// dstsize == 0: just return strlen(src)
+
+	mov	x9, x0			// stash copy of dst pointer
+	bic	x10, x1, #0xf		// src aligned
+	and	x11, x1, #0xf		// src offset
+
+	ldr	q1, [x10]
+	cmeq	v1.16b, v1.16b, #0	// NUL found in head?
+
+	mov	x8, #-1			// fill register with 0xfff..fff
+	lsl	x12, x11, #2		// 4 mask bits per byte
+	lsl	x8, x8, x12		// mask of bytes in the string
+
+	shrn	v1.8b, v1.8h, #4	// compress match vector to nibble mask
+	fmov	x5, d1
+
+	ands	x5, x5, x8		// ignore matches before src itself
+	b.ne	.Lhead_nul
+
+	ldr	q3, [x10, #16]		// load second string chunk
+	ldr	q2, [x1]		// load true head
+	mov	x8, #32
+	sub	x8, x8, x11		// bytes of src covered by first two chunks
+
+	cmeq	v1.16b, v3.16b, #0	// NUL found in second chunk?
+
+	subs	x2, x2, x8
+	b.ls	.Lhead_buf_end		// buffer ends within the first 32 bytes
+
+	/* process second chunk */
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+	cbnz	x5, .Lsecond_nul
+
+	/* string didn't end in second chunk and neither did buffer */
+	ldr	q1, [x10, #32]		// load next string chunk
+	str	q2, [x0]		// deposit head into buffer
+	sub	x0, x0, x11		// adjust x0 to track aligned src
+	str	q3, [x0, #16]		// deposit second chunk
+	add	x10, x10, #32		// advance src
+	add	x0, x0, #32		// advance dst
+	subs	x2, x2, #16		// enough left for another round?
+	b.ls	1f
+
+	/* main loop unrolled twice */
+	.p2align 4
+0:
+	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+
+	cbnz	x5, 3f
+
+	str	q1, [x0]
+	ldr	q1, [x10, #16]		// load next chunk
+
+	cmp	x2, #16			// more than a full chunk left?
+	b.ls	2f
+
+	add	x10, x10, #32		// advance pointers
+	add	x0, x0, #32
+
+	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+	cbnz	x5, 4f			// process chunk if match
+
+	str	q1, [x0, #-16]
+	ldr	q1, [x10]		// load next chunk
+
+	subs	x2, x2, #32
+	b.hi	0b
+
+1:
+	sub	x10, x10, #16		// undo second advancement
+	add	x2, x2, #16
+	sub	x0, x0, #16
+
+	/* 1--16 bytes left in the buffer but string has not ended yet */
+2:
+	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x4, d2
+
+	mov	x6, #0xf
+	mov	x7, x4			// keep real NUL mask for strlen below
+
+	lsl	x5, x2, #2		// shift 0xf to the limits position
+	lsl	x5, x6, x5
+	cmp	x2, #16			// dont induce match if limit >= 16
+	csel	x5, x5, xzr, lo
+	orr	x8, x4, x5		// treat limit as if terminator present
+
+	rbit	x8, x8			// simulate x86 tzcnt
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	add	x0, x0, x8
+
+	ldr	q1, [x10, x8]		// load tail
+	str	q1, [x0]		// store tail
+	strb	wzr, [x0, #16]		// NUL-terminate
+
+	/* continue to find the end of the string */
+	cbnz	x7, 1f			// real NUL already seen?
+
+	/* we opt for a simpler strlen than the one in libc as the
+	 * cmeq, shrn approach is faster for shorter strings.
+	 */
+	.p2align 4
+0:
+	ldr	q1, [x10, #32]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	cbnz	x7, 2f
+
+	ldr	q1, [x10, #48]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	add	x10, x10, #32
+	cbz	x7, 0b
+
+1:	sub	x10, x10, #16
+2:	rbit	x8, x7
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	sub	x10, x10, x1		// progress past src so far
+	add	x0, x10, #32
+	add	x0, x0, x8		// return strlen(src)
+
+	ret
+
+4:
+	sub	x10, x10, #16		// undo second advancement
+	sub	x0, x0, #16		// undo second advancement
+
+	/* string has ended but buffer has not */
+3:
+	rbit	x8, x5
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	add	x0, x0, x8		// restore dst pointer
+	add	x10, x10, x8		// x10 now points at the NUL byte
+
+	ldr	q1, [x10, #-15]		// copy last 16 bytes incl. NUL
+	str	q1, [x0, #-15]
+	add	x0, x0, #1		// NOTE(review): dead? overwritten below
+	sub	x0, x10, x1		// return strlen(src)
+
+	ret
+
+.Lhead_buf_end:
+	shrn	v1.8b, v1.8h, #4
+	fmov	x8, d1
+
+	add	x2, x2, #32		// restore limit
+
+	mov	x7, x8			// keep NUL mask of second chunk
+	mov	x6, #0xf
+
+	cmp	x2, #16			// should we induce a match or not
+	b.lo	0f
+
+	rbit	x8, x8
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+	add	x8, x8, #16
+
+	cmp	x8, x2
+	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
+	b	1f
+0:
+
+	rbit	x8, x8			// NOTE(review): result unused —
+	clz	x8, x8			// overwritten by mov x8, x2 below;
+	lsr	x8, x8, #2		// looks like dead code, confirm
+
+	mov	x8, x2
+1:
+
+	sub	x8, x8, x11		// bytes to copy relative to src
+	strb	wzr, [x9, x8]		// NUL-terminate
+
+	/* continue to find the end of the string */
+	cbnz	x7, 1f
+
+	/* we opt for a simpler strlen than the one in libc as the
+	 * cmeq, shrn approach is faster for shorter strings.
+	 */
+	.p2align 4
+0:
+	ldr	q1, [x10, #32]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	cbnz	x7, 2f
+
+	ldr	q1, [x10, #48]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	add	x10, x10, #32
+	cbz	x7, 0b
+
+1:	sub	x10, x10, #16
+2:	rbit	x6, x7
+	clz	x6, x6			// index of mismatch
+	lsr	x6, x6, #2
+
+	sub	x10, x10, x1
+	add	x0, x10, #32
+	add	x0, x0, x6		// return value: strlen(src)
+
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+
+	b	.L1732
+
+.Lsecond_nul:
+	add	x2, x2, x8		// restore limit
+
+	rbit	x8, x5
+	clz	x8, x8			// index of mismatch
+	lsr	x5, x8, #2
+
+	sub	x8, x11, #16
+	sub	x0, x5, x8		// string length
+
+	cmp	x0, x2			// did we match or hit limit first?
+	csel	x8, x2, x0, hi		// copy min(strlen, limit) bytes
+
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+
+	strb	wzr, [x4]		// NUL-terminate
+
+	/* copy 17-32 bytes */
+.L1732:
+	cmp	x8, #16
+	b.lo	.L0816
+	ldp	x16, x17, [x1]		// first 16 bytes
+	ldp	x12, x1, [x5, #-16]	// last 16 bytes (may overlap)
+	stp	x16, x17, [x9]
+	stp	x12, x1, [x4, #-16]
+	ret
+
+.Lhead_nul:
+	rbit	x8, x5
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	sub	x0, x8, x11		// string length
+	cmp	x0, x2			// did we match or hit limit first?
+	csel	x8, x2, x0, hi
+
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+	strb	wzr, [x4]		// NUL-terminate
+
+	/* Copy 8-16 bytes */
+.L0816:
+	tbz	x8, #3, .L0407
+	ldr	x16, [x1]
+	ldr	x17, [x5, #-8]
+	str	x16, [x9]
+	str	x17, [x4, #-8]
+	ret
+
+	/* Copy 4-7 bytes */
+	.p2align 4
+.L0407:
+	cmp	x8, #3
+	b.ls	.L0203
+	ldr	w16, [x1]
+	ldr	w17, [x5, #-4]		// w17, not w18: x18 is AAPCS64 platform reg
+	str	w16, [x9]
+	str	w17, [x4, #-4]
+	ret
+
+	/* Copy 2-3 bytes */
+.L0203:
+	tbz	x8, 1, .L0001
+	ldrh	w16, [x1]
+	ldrh	w17, [x5, #-2]
+	strh	w16, [x9]
+	strh	w17, [x4, #-2]
+	ret
+
+	/* Copy 0-1 bytes */
+.L0001:
+	ldrb	w16, [x1]
+	strb	w16, [x9]
+	strb	wzr, [x4]
+	ret
+
+	/* dstsize == 0: no bytes copied, return strlen(src) */
+.L0:
+	mov	x0, x1
+	b	strlen			// tail call; the ret below is unreachable
+	ret
+END(__strlcpy)