diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -16,10 +16,12 @@ strcmp \ strcpy \ strlen \ - strncmp \ strnlen \ strrchr + +MDSRCS+= \ + strncmp.S # # Add the above functions. Generate an asm file that includes the needed # Arm Optimized Routines file defining the function name to the libc name. diff --git a/lib/libc/aarch64/string/strncmp.S b/lib/libc/aarch64/string/strncmp.S new file mode 100644 --- /dev/null +++ b/lib/libc/aarch64/string/strncmp.S @@ -0,0 +1,567 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Getz Mikalsen +*/ + +#include +#include + + .text + +ENTRY(strncmp) + + bic x8, x0, #0xf // x8 is x0 but aligned to the boundary + and x9, x0, #0xf // x9 is the offset + bic x10, x1, #0xf // x10 is x1 but aligned to the boundary + and x11, x1, #0xf // x11 is the offset + + subs x2, x2, #1 + b.mi .Lempty + + mov x13, #-1 // save constants for later + mov x16, #0xf + + /* + * Check if either string is located at end of page to avoid crossing + * into unmapped page. If so, we load 16 bytes from the nearest + * alignment boundary and shift based on the offset. + */ + + add x3, x0, #16 // end of head + add x4, x1, #16 + eor x3, x3, x0 + eor x4, x4, x1 // bits that changed + orr x3, x3, x4 // in either str1 or str2 + cmp x2,#16 + b.lo .Llt16 + tbz w3, #PAGE_SHIFT, .Lbegin + + ldr q0, [x8] // load aligned head + ldr q1, [x10] + + lsl x14, x9, #2 + lsl x15, x11, #2 + lsl x3, x13, x14 // string head + lsl x4, x13, x15 + + cmeq v5.16b, v0.16b, #0 + cmeq v6.16b, v1.16b, #0 + + shrn v5.8b, v5.8h, #4 + shrn v6.8b, v6.8h, #4 + fmov x5, d5 + fmov x6, d6 + + adrp x14, shift_data + add x14, x14, :lo12:shift_data + + /* heads may cross page boundary, avoid unmapped loads */ + tst x5, x3 + b.eq 0f + + ldr q4, [x14, x9] // load permutation table + tbl v0.16b, {v0.16b}, v4.16b + + b 1f + .p2align 4 +0: + ldr q0, [x0] // load true head +1: + tst x6, x4 + b.eq 0f + + ldr q4, [x14, x11] + tbl v4.16b, {v1.16b}, v4.16b + + b 1f + + .p2align 4 +.Lbegin: + ldr q0, [x0] // load true heads +0: + ldr q4, [x1] +1: + cmeq v2.16b, v0.16b, #0 // NUL byte present? + cmeq v4.16b, v0.16b, v4.16b // which bytes match? + + orn v2.16b, v2.16b, v4.16b // mismatch or NUL byte? + + shrn v2.8b, v2.8h, #4 + fmov x5, d2 + + cbnz x5, .Lhead_mismatch + /* load head and second chunk */ + ldr q2, [x8, #16] // load second chunk + ldr q3, [x10, #16] + + add x2, x2, x11 + sub x2, x2, #16 // account for length of RSI chunk? + + subs x9, x9, x11 // is a&0xf >= b&0xf + b.lo .Lswapped // if not swap operands + b .Lnormal + + .p2align 4 +.Llt16: + /* + * Check if either string is located at end of page to avoid crossing + * into unmapped page. If so, we load 16 bytes from the nearest + * alignment boundary and shift based on the offset. + */ + tbz w3, #PAGE_SHIFT, 2f + + ldr q0, [x8] // load aligned head + ldr q1, [x10] + + lsl x14, x9, #2 + lsl x15, x11, #2 + lsl x3, x13, x14 // string head + lsl x4, x13, x15 + + /* Introduce a null byte match if the limit is within the aligned chunk */ + add x14, x2, x9 + add x15, x2, x11 + lsl x14, x14, #2 + lsl x15, x15, #2 + lsl x14, x16, x14 + lsl x15, x16, x15 + + cmeq v5.16b, v0.16b, #0 + cmeq v6.16b, v1.16b, #0 + + shrn v5.8b, v5.8h, #4 + shrn v6.8b, v6.8h, #4 + fmov x5, d5 + fmov x6, d6 + + orr x5, x5, x14 // insert null match if limit reached + orr x6, x6, x15 + + adrp x14, shift_data + add x14, x14, :lo12:shift_data + + /* heads may cross page boundary, avoid unmapped loads */ + tst x5, x3 + b.eq 0f + + ldr q4, [x14, x9] // load permutation table + tbl v0.16b, {v0.16b}, v4.16b + + b 1f + .p2align 4 +0: + ldr q0, [x0] // load true head +1: + tst x6, x4 + b.eq 0f + + ldr q4, [x14, x11] + tbl v4.16b, {v1.16b}, v4.16b + + b 1f + + .p2align 4 +2: + ldr q0, [x0] // load true heads +0: + ldr q4, [x1] +1: + + cmeq v2.16b, v0.16b, #0 // NUL byte present? + cmeq v4.16b, v0.16b, v4.16b // which bytes match? + + bic v2.16b, v4.16b, v2.16b // match and not NUL byte + + shrn v2.8b, v2.8h, #4 + fmov x5, d2 + lsl x4, x2, #2 + lsl x4, x13, x4 + orn x5, x4, x5 // mismatch or NUL byte? + +.Lhead_mismatch: + rbit x3, x5 + clz x3, x3 // index of mismatch + lsr x3, x3, #2 + ldrb w4, [x0, x3] + ldrb w5, [x1, x3] + sub w0, w4, w5 + ret + + .p2align 4 +.Lnormal: + sub x12, x10, x9 + ldr q0, [x12, #16]! + sub x10, x10, x8 + sub x11, x10, x9 + + cmeq v1.16b, v3.16b, #0 // NUL present? + cmeq v0.16b, v0.16b, v2.16b // Mismatch between chunks? + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + add x8, x8, #32 // advance to next iteration + + lsl x4, x2, #2 + lsl x4, x13, x4 + orr x3, x6, x4 // introduce a null byte match + cmp x2, #16 // does the buffer end within x2 + csel x6, x3, x6, lo + cbnz x6, .Lnulfound2 // NUL or end of buffer found? + mvn x5, x5 + cbnz x5, .Lmismatch2 + sub x2, x2, #16 + cmp x2, #32 // end of buffer within first main loop iteration? + b.lo .Ltail + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * X1 doesn't end within region 2, then we compare chunk B between the + * two strings. As X1 is known not to hold a NUL byte in regions 1 + * and 2 at this point, this also ensures that x0 has not ended yet. + */ + .p2align 4 +0: + ldr q0, [x8, x11] + ldr q1, [x8, x10] + ldr q2, [x8] + + cmeq v1.16b, v1.16b, #0 // end of string? + cmeq v0.16b, v0.16b, v2.16b // do the chunks match? + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + cbnz x6, .Lnulfound + mvn x5, x5 // any mismatches? + cbnz x5, .Lmismatch + + add x8, x8, #16 + + /* main loop unrolled twice */ + ldr q0, [x8, x11] + ldr q1, [x8, x10] + ldr q2, [x8] + + add x8, x8, #16 + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + cbnz x6, .Lnulfound2 + mvn x5, x5 + cbnz x5, .Lmismatch2 + sub x2, x2, #32 + cmp x2, #32 // end of buffer within next iteration + b.hs 0b // if yes, process tail + + /* end of buffer will occur in next 32 bytes */ +.Ltail: + ldr q0, [x8, x11] + ldr q1, [x8, x10] + ldr q2, [x8] + + cmeq v1.16b, v1.16b, #0 // end of string? + cmeq v0.16b, v0.16b, v2.16b // do the chunks match? + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + /* + * If x2 <= 16 then we introduce a NUL byte in the + * result from CMEQ to avoid comparing further! + */ + + lsl x4, x2, #2 + lsl x4, x13, x4 + orr x3, x6, x4 // introduce a null byte match + cmp x2, #16 // does the buffer end within x2 + csel x6, x3, x6, lo + + cbnz x6, .Lnulfound // NUL or end of string found + mvn x5, x5 + cbnz x5, .Lmismatch + + add x8, x8, #16 + + /* main loop unrolled twice */ + ldr q0, [x8, x11] + ldr q1, [x8, x10] + ldr q2, [x8] + + add x8, x8, #16 + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + ubfiz x4, x2, #2, #4 // (x2 - 16) << 2 + lsl x4, x13, x4 // take first half into account + orr x6, x6, x4 // introduce a null byte match + +.Lnulfound2: + sub x8, x8, #16 + +.Lnulfound: + mov x4, x6 + + ubfiz x7, x9, #2, #4 + lsl x6, x6, x7 // adjust NUL mask to indices + + orn x5, x6, x5 + cbnz x5, .Lmismatch + + /* + * (x0) == (x1) and NUL is past the string. + * Compare (x1) with the corresponding part + * of the other string until the NUL byte. + */ + ldr q0, [x8, x9] + ldr q1, [x8, x10] + + cmeq v1.16b, v0.16b, v1.16b + shrn v1.8b, v1.8h, #4 + fmov x5, d1 + + orn x5, x4, x5 + + rbit x3, x5 + clz x3, x3 + lsr x5, x3, #2 + + add x10, x10, x8 // restore x10 pointer + add x8, x8, x9 // point to corresponding chunk in x0 + + ldrb w4, [x8, x5] + ldrb w5, [x10, x5] + sub w0, w4, w5 + ret + + .p2align 4 +.Lmismatch2: + sub x8, x8, #16 // roll back second increment +.Lmismatch: + rbit x3, x5 + clz x3, x3 // index of mismatch + lsr x3, x3, #2 + add x11, x8, x11 + + ldrb w4, [x8, x3] + ldrb w5, [x11, x3] + sub w0, w4, w5 // difference of the mismatching chars + ret + + /* + * If (a&0xf) < (b&0xf), we do the same thing but with swapped + * operands. I found that this performs slightly better than + * using conditional moves to do the swap branchless. + */ + .p2align 4 +.Lswapped: + add x12, x8, x9 + ldr q0, [x12, #16]! + sub x8, x8, x10 + add x11, x8, x9 + add x2,x2,x9 + neg x9, x9 + + cmeq v1.16b, v2.16b, #0 + cmeq v0.16b, v0.16b, v3.16b + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + add x10, x10, #32 + + lsl x4, x2, #2 + lsl x4, x13, x4 + orr x3,x6,x4 // introduce a null byte match + cmp x2,#16 + csel x6, x3, x6, lo + cbnz x6, .Lnulfound2s + mvn x5, x5 + cbnz x5, .Lmismatch2s + + sub x2, x2, #16 + cmp x2, #32 + b.lo .Ltails + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * X0 doesn't end within region 2, then we compare chunk B between the + * two strings. As X0 is known not to hold a NUL byte in regions 1 + * and 2 at this point, this also ensures that X1 has not ended yet. + */ + .p2align 4 +0: + ldr q0, [x10, x11] + ldr q1, [x10, x8] + ldr q2, [x10] + + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + cbnz x6, .Lnulfounds + mvn x5, x5 + cbnz x5, .Lmismatchs + + add x10, x10, #16 + + /* main loop unrolled twice */ + ldr q0, [x10, x11] + ldr q1, [x10, x8] + ldr q2, [x10] + + add x10, x10, #16 + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + cbnz x6, .Lnulfound2s + mvn x5, x5 + cbnz x5, .Lmismatch2s + sub x2, x2, #32 + cmp x2, #32 + b.hs 0b + +.Ltails: + ldr q0, [x10, x11] + ldr q1, [x10, x8] + ldr q2, [x10] + + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + /* + * If x2 <= 16 then we introduce a NUL byte in the + * result from CMEQ to avoid comparing further! + */ + + lsl x4, x2, #2 + lsl x4, x13, x4 + orr x3, x6, x4 // introduce a null byte match + cmp x2, #16 + csel x6, x3, x6, lo + + cbnz x6, .Lnulfounds + mvn x5, x5 + cbnz x5, .Lmismatchs + + add x10, x10, #16 + + ldr q0, [x10, x11] + ldr q1, [x10, x8] + ldr q2, [x10] + + add x10, x10, #16 + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + ubfiz x4, x2, #2, #4 + lsl x4, x13, x4 + orr x6, x6, x4 // introduce a null byte match + +.Lnulfound2s: + sub x10, x10, #16 +.Lnulfounds: + mov x4, x6 + + ubfiz x7, x9, #2, #4 + lsl x6, x6, x7 + + orn x5, x6, x5 + + cbnz x5, .Lmismatchs + + ldr q0, [x10, x9] + ldr q1, [x10, x8] + + cmeq v1.16b, v0.16b, v1.16b + shrn v1.8b, v1.8h, #4 + fmov x5, d1 + + orn x5, x4, x5 + + rbit x3, x5 + clz x3, x3 + lsr x5, x3, #2 + + add x11, x10, x8 + add x10, x10, x9 + + ldrb w4, [x10, x5] + ldrb w5, [x11, x5] + sub w0, w5, w4 + ret + + .p2align 4 +.Lmismatch2s: + sub x10, x10, #16 +.Lmismatchs: + rbit x3, x5 + clz x3, x3 + lsr x3, x3, #2 + add x11, x10, x11 + + ldrb w4, [x10, x3] + ldrb w5, [x11, x3] + sub w0, w5, w4 + ret + + .p2align 4 +.Lempty: + eor x0, x0, x0 + ret + +END(strncmp) + + .section .rodata + .p2align 4 +shift_data: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + .fill 16, 1, -1 + .size shift_data, .-shift_data