diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -13,13 +13,15 @@
 	stpcpy \
 	strchr \
 	strchrnul \
-	strcmp \
 	strcpy \
 	strlen \
 	strncmp \
 	strnlen \
 	strrchr
+
+MDSRCS+= \
+	strcmp.S
 
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/strcmp.S b/lib/libc/aarch64/string/strcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/strcmp.S
@@ -0,0 +1,355 @@
+/*-
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+*/
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+	.text
+
+ENTRY(strcmp)
+
+	bic	x8, x0, #0xf		// x8 is x0 but aligned to the boundary
+	and	x9, x0, #0xf		// x9 is the offset
+	bic	x10, x1, #0xf		// x10 is x1 but aligned to the boundary
+	and	x11, x1, #0xf		// x11 is the offset
+
+	/*
+	 * Check if either string is located at end of page to avoid crossing
+	 * into unmapped page. If so, we load 16 bytes from the nearest
+	 * alignment boundary and shift based on the offset.
+	 */
+
+	add	x3, x0, #16		// end of head
+	add	x4, x1, #16
+	eor	x3, x3, x0
+	eor	x4, x4, x1		// bits that changed
+	orr	x3, x3, x4		// in either str1 or str2
+	tst	w3, #PAGE_SIZE
+	b.eq	.Lbegin
+
+	ldr	q0, [x8]		// load aligned head
+	ldr	q2, [x10]
+
+	mov	x4, #0xffffffffffffffff
+	lsl	x14, x9, #2
+	lsl	x3, x4, x14		// string head
+	lsl	x15, x11, #2
+	lsl	x4, x4, x15
+
+	cmeq	v0.16b, v0.16b, #0
+	cmeq	v2.16b, v2.16b, #0
+
+	shrn	v0.8b, v0.8h, #4
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d0
+	fmov	x6, d2
+
+	/* heads may cross page boundary, avoid unmapped loads */
+	tst	x5, x3
+	b.eq	0f
+	adrp	x2, shift_data
+	add	x2, x2, :lo12:shift_data
+	add	x2, x2, x9
+	ldr	q0, [x8]
+	ldr	q4, [x2]		// load permutation table
+	tbl	v0.16b, {v0.16b}, v4.16b
+	b	1f
+	.p2align 4
+0:
+	ldr	q0, [x0]		// load true head
+1:
+	tst	x6, x4
+	b.eq	0f
+
+	adrp	x2, shift_data
+	add	x2, x2, :lo12:shift_data
+	add	x2, x2, x11
+	ldr	q1, [x10]
+	ldr	q4, [x2]
+	tbl	v4.16b, {v1.16b}, v4.16b
+
+	b	1f
+
+	.p2align 4
+.Lbegin:
+	ldr	q0, [x0]		// load true heads
+0:
+	ldr	q4, [x1]
+1:
+
+	cmeq	v2.16b, v0.16b, #0	// NUL byte present?
+	cmeq	v4.16b, v0.16b, v4.16b	// which bytes match?
+
+	not	v2.16b, v2.16b
+	and	v2.16b, v2.16b, v4.16b	// match and not NUL byte
+
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+	mvn	x5, x5			// mismatch or NUL byte?
+
+	cbnz	x5, .Lhead_mismatch
+
+	ldr	q2, [x8, #16]		// load second chunk
+	ldr	q3, [x10, #16]
+	subs	x9, x9, x11		// is a&0xf >= b&0xf
+	b.lt	.Lswapped		// if not swap operands
+	neg	x9, x9
+	add	x12, x10, x9
+	ldr	q0, [x12, #16]!
+	sub	x10, x10, x8
+	add	x11, x10, x9		// point x11 to offset in second string
+	neg	x9, x9
+
+	cmeq	v1.16b, v3.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+	add	x8, x8, #16
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound
+	mvn	x5, x5
+	cbnz	x5, .Lmismatch
+	add	x8, x8, #16		// advance aligned pointers
+
+	/*
+	 * During the main loop, the layout of the two strings is something like:
+	 *
+	 *          v ------1------ v ------2------ v
+	 *     X0:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+	 *     X1:  AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+	 *
+	 * where v indicates the alignment boundaries and corresponding chunks
+	 * of the strings have the same letters. Chunk A has been checked in
+	 * the previous iteration. This iteration, we first check that string
+	 * X1 doesn't end within region 2, then we compare chunk B between the
+	 * two strings. As X1 is known not to hold a NUL byte in regions 1
+	 * and 2 at this point, this also ensures that x0 has not ended yet.
+	 */
+	.p2align 4
+0:
+	ldr	q0, [x8, x11]
+	ldr	q1, [x8, x9]
+	ldr	q2, [x8]
+
+	cmeq	v1.16b, v1.16b, #0	// end of string?
+	cmeq	v0.16b, v0.16b, v2.16b	// do the chunks match?
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound
+	mvn	x5, x5			// any mismatches?
+	cbnz	x5, .Lmismatch
+
+	add	x8, x8, #16
+
+	ldr	q0, [x8, x11]
+	ldr	q1, [x8, x9]
+	ldr	q2, [x8]
+
+	add	x8, x8, #16
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound2
+	mvn	x5, x5
+	cbz	x5, 0b
+
+	sub	x8, x8, #16		// roll back second increment
+.Lmismatch:
+	rbit	x2, x5
+	clz	x2, x2			// index of mismatch
+	lsr	x2, x2, #2
+	add	x11, x8, x11
+
+	ldrb	w4, [x8, x2]
+	ldrb	w5, [x11, x2]
+	sub	w0, w4, w5		// difference of the mismatching chars
+	ret
+
+.Lnulfound2:
+	sub	x8, x8, #16
+
+.Lnulfound:
+	mov	x7, x9
+	mov	x4, x6
+
+	and	x7, x7, #0xf
+	lsl	x7, x7, #2
+	lsl	x6, x6, x7		// adjust NUL mask to indices
+	mvn	x5, x5
+	orr	x5, x5, x6		// NUL bytes also count as mismatches
+	cbnz	x5, .Lmismatch
+
+	/*
+	 * (x0) == (x1) and NUL is past the string.
+	 * Compare (x1) with the corresponding part
+	 * of the other string until the NUL byte.
+	 */
+	ldr	q0, [x8, x9]
+	ldr	q1, [x8, x10]
+
+	cmeq	v1.16b, v0.16b, v1.16b
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	mvn	x5, x5
+	orr	x5, x5, x4
+
+	rbit	x2, x5
+	clz	x2, x2
+	lsr	x5, x2, #2
+
+	add	x10, x10, x8		// restore x10 pointer
+	add	x8, x8, x9		// point to corresponding chunk in x0
+
+	ldrb	w4, [x8, x5]
+	ldrb	w5, [x10, x5]
+	sub	w0, w4, w5
+	ret
+
+.Lhead_mismatch:
+	rbit	x2, x5
+	clz	x2, x2			// index of mismatch
+	lsr	x2, x2, #2
+	ldrb	w4, [x0, x2]
+	ldrb	w5, [x1, x2]
+	sub	w0, w4, w5
+	ret
+
+	/*
+	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
+	 * operands. I found that this performs slightly better than
+	 * using conditional moves to do the swap branchless.
+	 */
+.Lswapped:
+	add	x12, x8, x9
+	ldr	q0, [x12, #16]!
+	sub	x8, x8, x10
+	add	x11, x8, x9
+	neg	x9, x9
+
+	cmeq	v1.16b, v2.16b, #0
+	cmeq	v0.16b, v0.16b, v3.16b
+	add	x10, x10, #16
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfounds
+	mvn	x5, x5
+	cbnz	x5, .Lmismatchs
+	add	x10, x10, #16
+
+	/*
+	 * During the main loop, the layout of the two strings is something like:
+	 *
+	 *          v ------1------ v ------2------ v
+	 *     X1:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+	 *     X0:  AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+	 *
+	 * where v indicates the alignment boundaries and corresponding chunks
+	 * of the strings have the same letters. Chunk A has been checked in
+	 * the previous iteration. This iteration, we first check that string
+	 * X0 doesn't end within region 2, then we compare chunk B between the
+	 * two strings. As X0 is known not to hold a NUL byte in regions 1
+	 * and 2 at this point, this also ensures that X1 has not ended yet.
+	 */
+	.p2align 4
+0:
+	ldr	q0, [x10, x11]
+	ldr	q1, [x10, x8]
+	ldr	q2, [x10]
+
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfounds
+	mvn	x5, x5
+	cbnz	x5, .Lmismatchs
+
+	add	x10, x10, #16
+
+	ldr	q0, [x10, x11]
+	ldr	q1, [x10, x8]
+	ldr	q2, [x10]
+
+	add	x10, x10, #16
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x6, d1
+	shrn	v0.8b, v0.8h, #4
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound2s
+	mvn	x5, x5
+	cbz	x5, 0b
+
+	sub	x10, x10, #16
+
+.Lmismatchs:
+	rbit	x2, x5
+	clz	x2, x2
+	lsr	x2, x2, #2
+	add	x11, x10, x11
+
+	ldrb	w4, [x10, x2]
+	ldrb	w5, [x11, x2]
+	sub	w0, w5, w4
+	ret
+
+.Lnulfound2s:
+	sub	x10, x10, #16
+.Lnulfounds:
+	mov	x7, x9
+	mov	x4, x6
+
+	and	x7, x7, #0xf
+	lsl	x7, x7, #2
+	lsl	x6, x6, x7
+	mvn	x5, x5
+	orr	x5, x5, x6
+	cbnz	x5, .Lmismatchs
+
+	ldr	q0, [x10, x9]
+	ldr	q1, [x10, x8]
+
+	cmeq	v1.16b, v0.16b, v1.16b
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	mvn	x5, x5
+	orr	x5, x5, x4
+
+	rbit	x2, x5
+	clz	x2, x2
+	lsr	x5, x2, #2
+
+	add	x11, x10, x8
+	add	x10, x10, x9
+
+	ldrb	w4, [x10, x5]
+	ldrb	w5, [x11, x5]
+	sub	w0, w5, w4
+	ret
+
+END(strcmp)
+
+	.section .rodata
+	.p2align 4
+shift_data:
+	.byte	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15