diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -5,7 +5,6 @@
 AARCH64_STRING_FUNCS= \
 	memchr \
-	memcmp \
 	memcpy \
 	memmove \
 	memrchr \
@@ -20,6 +19,10 @@
 	strnlen \
 	strrchr
+
+MDSRCS+= \
+	memcmp.S
+
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/memcmp.S b/lib/libc/aarch64/string/memcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/memcmp.S
@@ -0,0 +1,189 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+	.text
+
+ENTRY(memcmp)
+
+	mov	x8, x0			// store base addresses
+	mov	x9, x1
+	cbz	x2, .Lnone		// 0 length
+
+	/*
+	 * Check if the buffer is located at the end of a page, to avoid
+	 * crossing into an unmapped page.  If so, we load the 32 bytes
+	 * ending at the limit instead, and then check the other buffer
+	 * the same way.
+	 */
+
+	cmp	x2, #32
+	b.hi	.Lbegin
+	add	x3, x8, #32
+	add	x4, x9, #32
+	eor	x3, x3, x8
+	eor	x4, x4, x9
+
+	tst	w3, #PAGE_SIZE
+	b.eq	0f
+
+	mov	x3, #32
+	sub	x3, x3, x2
+	sub	x8, x8, x3
+
+	/*
+	 * We perform a variable shift in the vector registers using TBL;
+	 * a suitable permutation is generated by loading a table of bytes
+	 * at the desired offset.
+	 */
+
+	adrp	x0, shift_table
+	add	x0, x0, :lo12:shift_table
+	add	x0, x0, x3
+	ldp	q0, q1, [x8]
+	ldp	q4, q5, [x0]		// load permutation table
+	tbl	v0.16b, {v0.16b, v1.16b}, v4.16b
+	tbl	v1.16b, {v0.16b, v1.16b}, v5.16b
+	add	x8, x8, x3		// reset pointer to beginning of src
+	b	1f
+
+0:
+	ldp	q0, q1, [x8]
+
+1:
+	tst	w4, #PAGE_SIZE
+	b.eq	0f
+
+	mov	x3, #32
+	sub	x3, x3, x2
+	sub	x9, x9, x3
+
+	ldp	q2, q3, [x9]
+	adrp	x0, shift_table
+	add	x0, x0, :lo12:shift_table
+	add	x0, x0, x3
+	ldp	q4, q5, [x0]
+	tbl	v2.16b, {v2.16b, v3.16b}, v4.16b
+	tbl	v3.16b, {v2.16b, v3.16b}, v5.16b
+	add	x9, x9, x3
+	b	1f
+
+	/*
+	 * Compare strings of 1--32 bytes.  We do this by loading each buffer
+	 * into two vector registers and then doing a quick check with XOR
+	 * and UMAXP to determine whether the first 32 bytes all match.
+	 */
+.Lbegin:
+	ldp	q0, q1, [x8]
+0:
+	ldp	q2, q3, [x9]
+1:
+
+	/* quick check for any mismatch in the first 32 bytes */
+	eor	v4.16b, v0.16b, v2.16b	// v4 = b1(0-15) XOR b2(0-15)
+	eor	v5.16b, v1.16b, v3.16b
+	umaxp	v4.16b, v4.16b, v5.16b
+	umaxp	v4.16b, v4.16b, v4.16b	// fill v4 with max value
+	fmov	x6, d4
+	cbz	x6, .Lloop		// if d4 is 0 then all matched
+
+	cmeq	v0.16b, v0.16b, v2.16b	// compare bytes 0-15 of b1 vs b2
+	cmeq	v1.16b, v1.16b, v3.16b	// compare bytes 16-31 of b1 vs b2
+	shrn	v0.8b, v0.8h, #4	// shift right to fit in 64 bits
+	shrn	v1.8b, v1.8h, #4
+
+	fmov	x1, d0
+	fmov	x3, d1
+
+	mvn	x0, x1			// invert for clz
+	mvn	x3, x3
+	rbit	x1, x0
+	rbit	x3, x3
+	clz	x1, x1
+	clz	x3, x3
+	add	x3, x3, #64
+	cmn	x0, #0			// any mismatch in first 16 bytes?
+	csel	x0, x3, x1, eq		// take x3 if none, else x1
+
+	lsr	x0, x0, #2
+	cmp	x0, x2
+	b.hs	.Lnone
+	ldrb	w4, [x8, x0]
+	ldrb	w5, [x9, x0]
+	sub	w0, w4, w5		// get the byte difference
+	ret
+
+	/*
+	 * Compare strings of 32+ bytes.  We introduce special handling when
+	 * there are fewer than 32 bytes left before the limit.
+	 */
+	.p2align 4
+.Lloop:
+	sub	x2, x2, #32
+	cmp	x2, #32
+	b.le	.Llast32
+	ldp	q0, q1, [x8, #32]!
+	ldp	q2, q3, [x9, #32]!
+
+	eor	v4.16b, v0.16b, v2.16b
+	eor	v5.16b, v1.16b, v3.16b
+	umaxp	v4.16b, v4.16b, v5.16b
+	umaxp	v4.16b, v4.16b, v4.16b
+	fmov	x6, d4
+	cbz	x6, .Lloop
+	b	.Lmatch
+
+	/* If at most 32 bytes are left to compare, load only the 32 bytes
+	 * ending at the limit to avoid overreading */
+.Llast32:
+	cmp	x2, #0
+	b.le	.Lnone
+	add	x8, x8, x2
+	add	x9, x9, x2
+	mov	x2, #32
+	ldp	q0, q1, [x8]
+	ldp	q2, q3, [x9]
+.Lmatch:
+	cmeq	v0.16b, v0.16b, v2.16b
+	cmeq	v1.16b, v1.16b, v3.16b
+
+	shrn	v0.8b, v0.8h, #4
+	shrn	v1.8b, v1.8h, #4
+	fmov	x1, d0
+	fmov	x3, d1
+
+	mvn	x0, x1
+	mvn	x3, x3
+	rbit	x1, x0
+	rbit	x3, x3
+	clz	x1, x1
+	clz	x3, x3
+	add	x3, x3, #64
+	cmn	x0, #0
+	csel	x0, x3, x1, eq
+
+	lsr	x0, x0, #2
+	cmp	x0, x2
+	b.hs	.Lnone
+	ldrb	w4, [x8, x0]
+	ldrb	w5, [x9, x0]
+	sub	w0, w4, w5
+	ret
+
+.Lnone:
+	mov	x0, #0
+	ret
+
+END(memcmp)
+
+	.section .rodata
+	.p2align 4
+shift_table:
+	.byte	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+	.byte	16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+	.fill	16, 1, -1
+	.size	shift_table, .-shift_table
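
For readers who do not want to trace the assembly, the rough C sketch below
(not part of the patch; memcmp_sketch and cmp_block are made-up names) shows
the strategy the routine follows: compare in 32-byte blocks, then for the tail
redo the 32 bytes that end exactly at the limit, so a few bytes may be compared
twice but nothing past either buffer is ever read.  The page-end and TBL
shift-table handling in the assembly exists only because the vector code always
loads full 32-byte windows; the scalar sketch needs no equivalent.

#include <stddef.h>

/* Compare up to n bytes; the first differing byte decides, as in memcmp. */
static int
cmp_block(const unsigned char *a, const unsigned char *b, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (a[i] != b[i])
			return (a[i] - b[i]);
	return (0);
}

int
memcmp_sketch(const void *s1, const void *s2, size_t len)
{
	const unsigned char *a = s1, *b = s2;

	if (len <= 32)			/* short case: one bounded compare */
		return (cmp_block(a, b, len));

	/* Main loop: full 32-byte blocks while more than 32 bytes remain. */
	while (len > 32) {
		int r = cmp_block(a, b, 32);
		if (r != 0)
			return (r);
		a += 32;
		b += 32;
		len -= 32;
	}

	/*
	 * Tail: back up so the final compare covers exactly the last 32 bytes
	 * of each buffer (mirroring .Llast32); some bytes are re-compared, but
	 * the reads never extend past the limit.
	 */
	return (cmp_block(a + len - 32, b + len - 32, 32));
}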