diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -13,13 +13,15 @@
 	stpcpy \
 	strchr \
 	strchrnul \
-	strcmp \
 	strcpy \
 	strlen \
 	strncmp \
 	strnlen \
 	strrchr
+
+MDSRCS+= \
+	strcmp.S
 
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/strcmp.S b/lib/libc/aarch64/string/strcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/strcmp.S
@@ -0,0 +1,317 @@
+/*-
+ * Copyright (c) 2024 Getz Mikalsen
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+	.text
+
+ENTRY(_strcmp)
+
+	bic	x8, x0, #0xf		// x8 is x0 but aligned to the boundary
+	and	x9, x0, #0xf		// x9 is the offset
+	bic	x10, x1, #0xf		// x10 is x1 but aligned to the boundary
+	and	x11, x1, #0xf		// x11 is the offset
+
+	/*
+	 * Check if either string is located at end of page to avoid crossing
+	 * into unmapped page. If so, we load 16 bytes from the nearest
+	 * alignment boundary and shift based on the offset.
+	 */
+
+	add	x3, x0, #16		// end of head
+	add	x4, x1, #16
+	eor	x3, x3, x0
+	eor	x4, x4, x1		// bits that changed
+	orr	x3, x3, x4		// in either str1 or str2
+	tbz	w3, #PAGE_SHIFT, .Lbegin
+
+	ldr	q0, [x8]		// load aligned head
+	ldr	q2, [x10]
+
+	adrp	x2, shift_data
+	add	x2, x2, :lo12:shift_data
+
+	/* heads may cross page boundary, avoid unmapped loads */
+	ldr	q4, [x2, x9]		// load permutation table
+	tbl	v0.16b, {v0.16b}, v4.16b
+	ldr	q4, [x2, x11]
+	tbl	v4.16b, {v2.16b}, v4.16b
+
+	b	.Lcomparison		// heads are ready, skip the plain loads
+
+	.p2align 4
+.Lbegin:
+	ldr	q0, [x0]		// load true heads
+	ldr	q4, [x1]
+
+.Lcomparison:
+	cmeq	v2.16b, v0.16b, #0	// NUL byte present?
+	cmeq	v4.16b, v0.16b, v4.16b	// which bytes match?
+
+	orn	v2.16b, v2.16b, v4.16b	// mismatch or NUL byte?
+
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+
+	cbnz	x5, .Lhead_mismatch
+
+	ldr	q2, [x8, #16]		// load second chunk
+	ldr	q3, [x10, #16]
+	subs	x9, x9, x11		// is a&0xf >= b&0xf?
+	b.lt	.Lswapped		// if not, swap operands
+	sub	x12, x10, x9
+	ldr	q0, [x12, #16]!
+	sub	x10, x10, x8
+	sub	x11, x10, x9		// point x11 to offset in second string
+
+	cmeq	v1.16b, v3.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+	add	x8, x8, #16
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound
+	mvn	x5, x5
+	cbnz	x5, .Lmismatch
+	add	x8, x8, #16		// advance aligned pointers
+
+	/*
+	 * During the main loop, the layout of the two strings is something like:
+	 *
+	 *          v ------1------ v ------2------ v
+	 *     X0:  AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+	 *     X1:  AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+	 *
+	 * where v indicates the alignment boundaries and corresponding chunks
+	 * of the strings have the same letters.  Chunk A has been checked in
+	 * the previous iteration.  This iteration, we first check that string
+	 * X1 doesn't end within region 2, then we compare chunk B between the
+	 * two strings.  As X1 is known not to hold a NUL byte in regions 1
+	 * and 2 at this point, this also ensures that X0 has not ended yet.
+	 */
+	.p2align 4
+0:
+	ldr	q0, [x8, x11]
+	ldr	q1, [x8, x9]
+	ldr	q2, [x8]
+
+	cmeq	v1.16b, v1.16b, #0	// end of string?
+	cmeq	v0.16b, v0.16b, v2.16b	// do the chunks match?
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound
+	mvn	x5, x5			// any mismatches?
+	cbnz	x5, .Lmismatch
+
+	add	x8, x8, #16
+
+	ldr	q0, [x8, x11]
+	ldr	q1, [x8, x9]
+	ldr	q2, [x8]
+
+	add	x8, x8, #16
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound2
+	mvn	x5, x5
+	cbz	x5, 0b
+
+	sub	x8, x8, #16		// roll back second increment
+.Lmismatch:
+	rbit	x2, x5
+	clz	x2, x2			// index of mismatch
+	lsr	x2, x2, #2
+	add	x11, x8, x11
+
+	ldrb	w4, [x8, x2]
+	ldrb	w5, [x11, x2]
+	sub	w0, w4, w5		// difference of the mismatching chars
+	ret
+
+	.p2align 4
+.Lnulfound2:
+	sub	x8, x8, #16
+
+.Lnulfound:
+	mov	x7, x9
+	mov	x4, x6
+
+	ubfiz	x7, x7, #2, #4		// x7 = (x7 & 0xf) << 2
+	lsl	x6, x6, x7		// adjust NUL mask to indices
+	orn	x5, x6, x5
+	cbnz	x5, .Lmismatch
+
+	/*
+	 * (x0) == (x1) and NUL is past the string.
+	 * Compare (x1) with the corresponding part
+	 * of the other string until the NUL byte.
+	 */
+	ldr	q0, [x8, x9]
+	ldr	q1, [x8, x10]
+
+	cmeq	v1.16b, v0.16b, v1.16b
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	orn	x5, x4, x5
+
+	rbit	x2, x5
+	clz	x2, x2
+	lsr	x5, x2, #2
+
+	add	x10, x10, x8		// restore x10 pointer
+	add	x8, x8, x9		// point to corresponding chunk in x0
+
+	ldrb	w4, [x8, x5]
+	ldrb	w5, [x10, x5]
+	sub	w0, w4, w5
+	ret
+
+	.p2align 4
+.Lhead_mismatch:
+	rbit	x2, x5
+	clz	x2, x2			// index of mismatch
+	lsr	x2, x2, #2
+	ldrb	w4, [x0, x2]
+	ldrb	w5, [x1, x2]
+	sub	w0, w4, w5
+	ret
+
+	/*
+	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
+	 * operands.  I found that this performs slightly better than
+	 * using conditional moves to do the swap branchlessly.
+	 */
+	.p2align 4
+.Lswapped:
+	add	x12, x8, x9
+	ldr	q0, [x12, #16]!
+	sub	x8, x8, x10
+	add	x11, x8, x9
+	neg	x9, x9
+
+	cmeq	v1.16b, v2.16b, #0
+	cmeq	v0.16b, v0.16b, v3.16b
+	add	x10, x10, #16
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfounds
+	mvn	x5, x5
+	cbnz	x5, .Lmismatchs
+	add	x10, x10, #16
+
+	/*
+	 * During the main loop, the layout of the two strings is something like:
+	 *
+	 *          v ------1------ v ------2------ v
+	 *     X1:  AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+	 *     X0:  AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+	 *
+	 * where v indicates the alignment boundaries and corresponding chunks
+	 * of the strings have the same letters.  Chunk A has been checked in
+	 * the previous iteration.  This iteration, we first check that string
+	 * X0 doesn't end within region 2, then we compare chunk B between the
+	 * two strings.  As X0 is known not to hold a NUL byte in regions 1
+	 * and 2 at this point, this also ensures that X1 has not ended yet.
+	 */
+	.p2align 4
+0:
+	ldr	q0, [x10, x11]
+	ldr	q1, [x10, x8]
+	ldr	q2, [x10]
+
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfounds
+	mvn	x5, x5
+	cbnz	x5, .Lmismatchs
+
+	add	x10, x10, #16
+
+	ldr	q0, [x10, x11]
+	ldr	q1, [x10, x8]
+	ldr	q2, [x10]
+
+	add	x10, x10, #16
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound2s
+	mvn	x5, x5
+	cbz	x5, 0b
+
+	sub	x10, x10, #16
+
+.Lmismatchs:
+	rbit	x2, x5
+	clz	x2, x2
+	lsr	x2, x2, #2
+	add	x11, x10, x11
+
+	ldrb	w4, [x10, x2]
+	ldrb	w5, [x11, x2]
+	sub	w0, w5, w4
+	ret
+
+	.p2align 4
+.Lnulfound2s:
+	sub	x10, x10, #16
+.Lnulfounds:
+	mov	x7, x9
+	mov	x4, x6
+
+	ubfiz	x7, x7, #2, #4
+	lsl	x6, x6, x7
+	orn	x5, x6, x5
+	cbnz	x5, .Lmismatchs
+
+	ldr	q0, [x10, x9]
+	ldr	q1, [x10, x8]
+
+	cmeq	v1.16b, v0.16b, v1.16b
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	orn	x5, x4, x5
+
+	rbit	x2, x5
+	clz	x2, x2
+	lsr	x5, x2, #2
+
+	add	x11, x10, x8
+	add	x10, x10, x9
+
+	ldrb	w4, [x10, x5]
+	ldrb	w5, [x11, x5]
+	sub	w0, w5, w4
+	ret
+
+END(_strcmp)
+
+	.section .rodata
+	.p2align 4
+shift_data:
+	.byte	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+	.fill	16, 1, -1
+	.size	shift_data, .-shift_data
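
Note for reviewers unfamiliar with the idiom: each cmeq/shrn/fmov sequence above condenses a 16-byte vector comparison into a 64-bit scalar mask holding one nibble per byte, and rbit+clz (a count of trailing zeroes) shifted right by 2 recovers the index of the first mismatching or NUL byte. The C translation below, using NEON intrinsics, is only an illustration of that trick; the helper name chunk_mismatch_or_nul is invented for the example and is not part of the patch.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Return the index (0..15) of the first byte of the 16-byte chunk at
 * "a" that either differs from "b" or is a NUL byte, or 16 if the
 * whole chunk matches and contains no NUL.
 */
static int
chunk_mismatch_or_nul(const uint8_t *a, const uint8_t *b)
{
	uint8x16_t va = vld1q_u8(a);
	uint8x16_t vb = vld1q_u8(b);

	uint8x16_t is_nul = vceqq_u8(va, vdupq_n_u8(0));	/* cmeq v, v, #0 */
	uint8x16_t is_eq  = vceqq_u8(va, vb);			/* cmeq v, v, v */
	uint8x16_t bad    = vornq_u8(is_nul, is_eq);		/* orn: mismatch or NUL */

	/*
	 * shrn #4 narrows each 16-bit lane to 8 bits, turning the 16
	 * byte lanes (each 0x00 or 0xff) into a 64-bit mask with one
	 * nibble per input byte.
	 */
	uint8x8_t nib = vshrn_n_u16(vreinterpretq_u16_u8(bad), 4);
	uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(nib), 0);

	if (mask == 0)
		return (16);

	/* rbit + clz in the assembly == count trailing zeroes */
	return (__builtin_ctzll(mask) >> 2);
}

int
main(void)
{
	char a[16] = "hello, worlds!!";
	char b[16] = "hello, world!!!";

	printf("%d\n", chunk_mismatch_or_nul((const uint8_t *)a,
	    (const uint8_t *)b));
	return (0);
}

Built natively on an AArch64 host (e.g. cc -O2), this prints 12, the offset of the first differing byte.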
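The shift_data table exists for the page-crossing head case: instead of an unaligned load that might touch the next, possibly unmapped page, the routine loads the enclosing aligned 16-byte block and uses tbl to slide the wanted bytes to the front; the 0xff filler indices make tbl produce zero bytes, which the comparison code treats like a NUL terminator, so the aligned main loop takes over at the boundary. A rough C sketch of the same trick, with the invented helper name load_head:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* 16 identity indices followed by 16 out-of-range (0xff) indices */
static const uint8_t shift_data[32] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/*
 * Load the 16 bytes starting at p without reading past the next
 * 16-byte boundary: load the aligned block containing p, then permute
 * it so its tail moves to the front.  Indices >= 16 yield 0x00.
 */
static uint8x16_t
load_head(const char *p)
{
	uintptr_t off = (uintptr_t)p & 0xf;
	const uint8_t *aligned = (const uint8_t *)((uintptr_t)p - off);

	uint8x16_t head = vld1q_u8(aligned);		/* ldr q0, [x8] */
	uint8x16_t idx = vld1q_u8(shift_data + off);	/* ldr q4, [x2, x9] */

	return (vqtbl1q_u8(head, idx));			/* tbl v0.16b, {v0.16b}, v4.16b */
}

int
main(void)
{
	static const char buf[32] __attribute__((aligned(16))) =
	    "0123456789abcdefXYZ";
	uint8_t out[16];

	vst1q_u8(out, load_head(buf + 13));
	printf("%s\n", (const char *)out);	/* "def": bytes past the boundary read as 0 */
	return (0);
}

Reading only from the aligned block is what lets the head comparison run before any alignment fix-up without risking a fault on an unmapped page.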