diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -5,7 +5,6 @@
 
 AARCH64_STRING_FUNCS= \
 	memchr \
-	memcmp \
 	memcpy \
 	memmove \
 	memrchr \
@@ -20,6 +19,10 @@
 	strnlen \
 	strrchr
 
+
+MDSRCS+= \
+	memcmp.S
+
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/memcmp.S b/lib/libc/aarch64/string/memcmp.S
new file
--- /dev/null
+++ b/lib/libc/aarch64/string/memcmp.S
@@ -0,0 +1,138 @@
+/*-
+ * Copyright (c) 2024 Getz Mikalsen
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+*/
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+	.text
+
+ENTRY(memcmp)
+
+	mov	x8, x0			// store base address for later
+	mov	x9, x1
+	cbz	x2, .Lnone		// 0 length
+
+	/*
+	 * TODO: Check if buffer is located at end of page to avoid crossing
+	 * into unmapped page.
+	 */
+
+//	cmp	x2, #32
+//	b.hi	.Lbegin
+//	add	x3, x8, #32
+//	add	x4, x9, #32
+//	eor	x3, x3, x8
+//	eor	x4, x4, x9
+//	tst	w3, #PAGE_SIZE
+//	b.eq	.Lbegin
+//	tst	w4, #PAGE_SIZE
+//	b.eq	.Lbegin
+
+	/*
+	 * Compare strings of 1--32 bytes. We do this by loading into two
+	 * vector registers and then doing a quick compare with XOR, UMAXP
+	 * to determine if the first 32 bytes all match.
+	 */
+
+.Lbegin:
+	ldp	q0, q1, [x8]		// load 32 bytes into vector registers
+	ldp	q2, q3, [x9]
+
+	/* quick check for any mismatch in the first 32 bytes */
+	eor	v4.16b, v0.16b, v2.16b	// v4 = b1(0-15) XOR b2(0-15)
+	eor	v5.16b, v1.16b, v3.16b	// v5 = b1(16-31) XOR b2(16-31)
+	umaxp	v4.16b, v4.16b, v5.16b
+	umaxp	v4.16b, v4.16b, v4.16b	// fill v4 with max value
+	fmov	x6, d4
+	cbz	x6, .Lloop		// if d4 is 0 then all matched
+
+	cmeq	v0.16b, v0.16b, v2.16b	// compare bytes 0-15 of b1 vs b2
+	cmeq	v1.16b, v1.16b, v3.16b	// compare bytes 16-31 of b1 vs b2
+	shrn	v0.8b, v0.8h, #4	// shift right to fit in x1
+	shrn	v1.8b, v1.8h, #4
+
+	fmov	x1, d0
+	fmov	x3, d1
+
+	mvn	x0, x1			// invert for clz
+	mvn	x3, x3
+	rbit	x1, x0
+	rbit	x3, x3
+	clz	x1, x1
+	clz	x3, x3
+	add	x3, x3, #64
+	cmn	x0, #0			// any mismatch in first 16 bytes?
+	csel	x0, x3, x1, eq		// take x3 if none, else x1
+
+	lsr	x0, x0, #2
+	cmp	x0, x2
+	b.ge	.Lnone
+	ldrb	w4, [x8, x0]
+	ldrb	w5, [x9, x0]
+	sub	w0, w4, w5		// get the byte difference
+	ret
+
+	/*
+	 * Compare strings of 32+ bytes. We introduce special handling if
+	 * there are fewer than 32 bytes left of the limit.
+	 */
+	.p2align 4
+.Lloop:
+	subs	x2, x2, #32
+	b.le	.Lnone
+	cmp	x2, #32
+	b.le	.Llast32
+	ldp	q0, q1, [x8,#32]!
+	ldp	q2, q3, [x9,#32]!
+
+	eor	v4.16b, v0.16b, v2.16b
+	eor	v5.16b, v1.16b, v3.16b
+	umaxp	v4.16b, v4.16b, v5.16b
+	umaxp	v4.16b, v4.16b, v4.16b
+	fmov	x6, d4
+	cbz	x6, .Lloop
+	b	.Lmatch
+
+	/* If at most 32 bytes are left to compare, only load the last 32 bytes
+	 * up to the limit from x8, x9 to avoid overread. */
+.Llast32:
+	add	x8, x8, x2
+	add	x9, x9, x2
+	mov	x2, #32
+	ldp	q0, q1, [x8]
+	ldp	q2, q3, [x9]
+.Lmatch:
+	cmeq	v0.16b, v0.16b, v2.16b
+	cmeq	v1.16b, v1.16b, v3.16b
+
+	shrn	v0.8b, v0.8h, #4
+	shrn	v1.8b, v1.8h, #4
+	fmov	x1, d0
+	fmov	x3, d1
+
+	mvn	x0, x1			// invert for clz
+	mvn	x3, x3
+	rbit	x1, x0
+	rbit	x3, x3
+	clz	x1, x1
+	clz	x3, x3
+	add	x3, x3, #64
+	cmn	x0, #0
+	csel	x0, x3, x1, eq
+
+	lsr	x0, x0, #2
+	cmp	x0, x2
+	b.ge	.Lnone
+	ldrb	w4, [x8, x0]
+	ldrb	w5, [x9, x0]
+	sub	w0, w4, w5		// get the byte difference
+	ret
+
+.Lnone:
+	mov	x0, #0
+	ret
+
+END(memcmp)
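
For reference only (not part of the patch), the mismatch-detection and index-extraction technique used above can be sketched in C with NEON intrinsics. This is a minimal illustration under stated assumptions: memcmp16() is a hypothetical helper that compares exactly 16 bytes, it uses a single UMAXV reduction (vmaxvq_u8) instead of the two UMAXP reductions in the assembly, and it does not show the 32-byte loop or the tail handling that reloads the last 32 bytes of each buffer.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper: compare exactly 16 bytes, memcmp-style result. */
static int
memcmp16(const void *s1, const void *s2)
{
	uint8x16_t a = vld1q_u8(s1);
	uint8x16_t b = vld1q_u8(s2);

	/* Early out: the max byte of a XOR b is zero iff all 16 bytes match. */
	if (vmaxvq_u8(veorq_u8(a, b)) == 0)
		return (0);

	/*
	 * CMEQ yields 0xff per matching byte; SHRN #4 packs that into a
	 * 64-bit mask with one nibble per byte.  Inverting and counting
	 * trailing zero bits (the assembly uses RBIT+CLZ) gives four times
	 * the index of the first mismatching byte.
	 */
	uint8x16_t eq = vceqq_u8(a, b);
	uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(
	    vshrn_n_u16(vreinterpretq_u16_u8(eq), 4)), 0);
	size_t i = (size_t)__builtin_ctzll(~mask) >> 2;

	return ((int)((const unsigned char *)s1)[i] -
	    (int)((const unsigned char *)s2)[i]);
}

The ~mask/ctz/>>2 sequence corresponds to the MVN, RBIT, CLZ, and LSR #2 instructions in the assembly; the 32-byte version additionally uses CSEL to pick between the two 16-byte halves.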