diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -5,7 +5,6 @@
 AARCH64_STRING_FUNCS= \
 	memchr \
-	memcmp \
 	memcpy \
 	memmove \
 	memrchr \
@@ -20,6 +19,10 @@
 	strnlen \
 	strrchr
 
+
+MDSRCS+= \
+	memcmp.S
+
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/memcmp.S b/lib/libc/aarch64/string/memcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/memcmp.S
@@ -0,0 +1,130 @@
+/*-
+ * Copyright (c) 2024 Getz Mikalsen
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+	.text
+
+ENTRY(memcmp)
+
+	mov	x8,x0			// store base address for later
+	mov	x9,x1
+
+	cbz	x2,.Lnone		// 0 length
+
+
+	/*
+	 * TODO: Check if buffer is located at end of page to avoid crossing
+	 * into unmapped page.
+	 */
+
+//	cmp	x2,#32
+//	b.hi	.Lbegin
+//	add	x3,x8,#32
+//	add	x4,x9,#32
+//	eor	x3,x3,x8
+//	eor	x4,x4,x9
+//	tst	w3,#PAGE_SIZE
+//	b.ne	.Lbegin
+//	tst	w4,#PAGE_SIZE
+//	b.ne	.Lbegin
+
+	/*
+	 * Compare buffers of 1--32 bytes by loading each into two
+	 * vector registers and comparing.
+	 */
+
+.Lbegin:
+	ldp	q0,q1,[x0]		// load 32 bytes into vector registers
+	ldp	q2,q3,[x1]
+
+	/* quick check for any mismatch in the first 32 bytes */
+	eor	v4.16b,v0.16b,v2.16b	// v4 = b1(0-15) XOR b2(0-15)
+	eor	v5.16b,v1.16b,v3.16b	// v5 = b1(16-31) XOR b2(16-31)
+	umaxp	v4.16b,v4.16b,v5.16b
+	umaxp	v4.16b,v4.16b,v4.16b	// reduce to one 64-bit value in d4
+	fmov	x6,d4
+	cbz	x6,.Lloop		// if d4 is 0 then all matched
+
+	cmeq	v0.16b,v0.16b,v2.16b	// compare bytes 0-15 of b1 vs b2
+	shrn	v0.8b,v0.8h,#4		// narrow the mask to fit in x1
+	cmeq	v1.16b,v1.16b,v3.16b	// compare bytes 16-31 of b1 vs b2
+	shrn	v1.8b,v1.8h,#4
+
+	fmov	x1,d0
+	fmov	x3,d1
+
+	mvn	x0,x1			// invert to use clz
+	cbz	x0,0f
+	rbit	x0,x0
+	clz	x0,x0			// if this is zero check bytes 16..31
+	b	1f
+
+0:
+	rbit	x1,x3
+	mvn	x1,x1
+	clz	x0,x1
+	add	x0,x0,#64		// mismatch is in bytes 16..31
+1:
+	lsr	x0,x0,#2		// bit index / 4 = byte index
+	cmp	x0,x2			// offending byte past the limit?
+	b.ge	.Lnone
+	cmp	x0,#32			// x0 == 32 if no hit (32 0's)
+	b.eq	.Lloop
+2:
+	ldrb	w4,[x8,x0]
+	ldrb	w5,[x9,x0]
+	subs	w0,w4,w5		// get the byte difference
+	ret
+
+
+	.p2align 4
+.Lloop:
+	subs	x2,x2,#32
+	b.le	.Lnone
+	ldp	q0,q1,[x8,#32]!		// load next 32 bytes, pre-incrementing
+	ldp	q2,q3,[x9,#32]!
+
+	eor	v4.16b,v0.16b,v2.16b
+	eor	v5.16b,v1.16b,v3.16b
+	umaxp	v4.16b,v4.16b,v5.16b
+	umaxp	v4.16b,v4.16b,v4.16b
+	fmov	x6,d4
+	cbz	x6,.Lloop
+
+	cmeq	v0.16b,v0.16b,v2.16b	// compare bytes 0-15 of b1 vs b2
+	cmeq	v1.16b,v1.16b,v3.16b	// compare bytes 16-31 of b1 vs b2
+
+	shrn	v0.8b,v0.8h,#4
+	fmov	x1,d0
+	mvn	x0,x1			// invert to use clz
+	cbz	x0,0f
+	rbit	x0,x0
+	clz	x0,x0			// if this is zero check bytes 16..31
+	b	1f
+
+0:
+	shrn	v1.8b,v1.8h,#4
+	fmov	x3,d1
+	rbit	x1,x3
+	mvn	x1,x1
+	clz	x0,x1
+	add	x0,x0,#64		// mismatch is in bytes 16..31
+1:
+	lsr	x0,x0,#2
+	cmp	x0,x2			// offending byte is past the limit
+	b.ge	.Lnone
+	ldrb	w4,[x8,x0]
+	ldrb	w5,[x9,x0]
+	subs	w0,w4,w5
+	ret
+
+.Lnone:
+	mov	x0,#0
+	ret
+
+END(memcmp)
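
A minimal C model of the mismatch-locating kernel may help when reading the asm above; the helper names are made up for illustration and nothing below is part of the diff. cmeq sets each byte of a 128-bit mask to 0xff where the inputs match, shrn #4 narrows that mask to 64 bits (four bits per byte) so it fits a general-purpose register, and the mvn/rbit/clz sequence is a count-trailing-zeros of the inverted mask; the final lsr #2 turns a bit index into a byte index.

#include <stdint.h>
#include <stdio.h>

/* Mask as produced by cmeq + shrn #4 over 16 bytes: four 1-bits per
 * matching byte position, four 0-bits per mismatching one. */
static uint64_t
eq_nibble_mask(const unsigned char *a, const unsigned char *b)
{
	uint64_t mask = 0;

	for (int i = 0; i < 16; i++)
		if (a[i] == b[i])
			mask |= 0xfULL << (4 * i);
	return (mask);
}

/* The mvn + rbit + clz + lsr #2 sequence: index of the first
 * mismatching byte, or 16 if all sixteen bytes matched. */
static int
first_mismatch(uint64_t mask)
{
	if (~mask == 0)
		return (16);
	return (__builtin_ctzll(~mask) / 4);
}

int
main(void)
{
	unsigned char a[16] = "0123456789abcdef";
	unsigned char b[16] = "0123456789abcXef";

	printf("%d\n", first_mismatch(eq_nibble_mask(a, b)));	/* prints 13 */
	return (0);
}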
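Two design notes, as I read them: the eor/umaxp/umaxp/fmov sequence ahead of the index computation is a cheap "does any byte differ in these 32 bytes?" reduction, collapsing both XOR vectors into a single 64-bit value testable with one cbz, so the more expensive cmeq/shrn/clz path only runs once a mismatch is known to exist. And the shrn-by-4 narrowing is the usual NEON substitute for x86's pmovmskb, which has no direct AArch64 equivalent; it costs four mask bits per lane instead of one, hence the lsr #2 to convert bit positions back to byte offsets.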