Page MenuHomeFreeBSD

D45839.diff
No One · Temporary

D45839.diff

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -13,13 +13,15 @@
stpcpy \
strchr \
strchrnul \
- strcmp \
strcpy \
strlen \
strncmp \
strnlen \
strrchr
+
+MDSRCS+= \
+ strcmp.S
#
# Add the above functions. Generate an asm file that includes the needed
# Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/strcmp.S b/lib/libc/aarch64/string/strcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/strcmp.S
@@ -0,0 +1,350 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+ .weak strcmp // allow a static libc override of the name
+ .set strcmp, __strcmp
+ .text
+
+ENTRY(__strcmp) // int strcmp(const char *x0, const char *x1); result in w0
+
+ bic x8, x0, #0xf // x0 aligned to the boundary
+ and x9, x0, #0xf // x9 is the offset
+ bic x10, x1, #0xf // x1 aligned to the boundary
+ and x11, x1, #0xf // x11 is the offset
+
+ mov x13, #-1 // all-ones, source for the offset masks below
+
+ /*
+ * Check if either string is located at end of page to avoid crossing
+ * into unmapped page. If so, we load 16 bytes from the nearest
+ * alignment boundary and shift based on the offset.
+ */
+
+ add x3, x0, #16 // end of head
+ add x4, x1, #16
+ eor x3, x3, x0 // bits that changed between x0 and x0+16
+ eor x4, x4, x1 // bits that changed
+ orr x3, x3, x4 // in either str1 or str2
+ tbz w3, #PAGE_SHIFT, .Lbegin // neither head crosses a page: plain loads
+
+ ldr q0, [x8] // load aligned head
+ ldr q2, [x10] // aligned head of str2
+
+ lsl x14, x9, #2 // 4 mask bits per byte (shrn nibble masks)
+ lsl x15, x11, #2
+ lsl x3, x13, x14 // string head
+ lsl x4, x13, x15 // mask of bytes at/after each string's start
+
+ cmeq v5.16b, v0.16b, #0 // NUL bytes in aligned head of str1?
+ cmeq v6.16b, v2.16b, #0 // NUL bytes in aligned head of str2?
+
+ shrn v5.8b, v5.8h, #4 // pack lanes into a 64-bit nibble mask
+ shrn v6.8b, v6.8h, #4
+ fmov x5, d5 // move masks to GPRs
+ fmov x6, d6
+
+ adrp x2, shift_data // address of the tbl permutation table
+ add x2, x2, :lo12:shift_data
+
+ /* heads may cross page boundary, avoid unmapped loads */
+ tst x5, x3 // NUL within str1's head bytes?
+ b.eq 0f // no NUL: unaligned load cannot fault
+
+ ldr q4, [x2, x9] // load permutation table
+ tbl v0.16b, {v0.16b}, v4.16b // shift head down, zero-fill past chunk end
+
+ b 1f
+ .p2align 4
+0:
+ ldr q0, [x0] // load true head
+1:
+ tst x6, x4 // NUL within str2's head bytes?
+ b.eq 0f // no NUL: unaligned load cannot fault
+
+ ldr q4, [x2, x11] // permutation for str2's offset
+ tbl v4.16b, {v2.16b}, v4.16b // emulated unaligned load of str2's head
+
+ b 1f
+
+ .p2align 4
+.Lbegin:
+ ldr q0, [x0] // load true heads
+0:
+ ldr q4, [x1] // load true head of str2
+1:
+
+ cmeq v2.16b, v0.16b, #0 // NUL byte present?
+ cmeq v4.16b, v0.16b, v4.16b // which bytes match?
+
+ orn v2.16b, v2.16b, v4.16b // mismatch or NUL byte?
+
+ shrn v2.8b, v2.8h, #4 // pack to a 4-bit-per-byte mask
+ fmov x5, d2
+
+ cbnz x5, .Lhead_mismatch // difference within the first 16 bytes
+
+ ldr q2, [x8, #16] // load second chunk
+ ldr q3, [x10, #16] // second aligned chunk of str2
+ subs x9, x9, x11 // is a&0xf >= b&0xf
+ b.lo .Lswapped // if not swap operands
+ sub x12, x10, x9 // str2 rebased to str1's phase
+ ldr q0, [x12, #16]! // unaligned load, pre-increments x12
+ sub x10, x10, x8 // x10 = delta between aligned pointers
+ sub x11, x10, x9 // x11 = delta minus the offset difference
+
+ cmeq v1.16b, v3.16b, #0 // NUL in str2's chunk?
+ cmeq v0.16b, v0.16b, v2.16b // do the chunks match?
+ add x8, x8, #16 // advance to the second chunk
+ shrn v1.8b, v1.8h, #4
+ fmov x6, d1 // NUL mask
+ shrn v0.8b, v0.8h, #4
+ fmov x5, d0 // match mask
+ cbnz x6, .Lnulfound
+ mvn x5, x5 // any mismatching bytes?
+ cbnz x5, .Lmismatch
+ add x8, x8, #16 // advance aligned pointers
+
+ /*
+ * During the main loop, the layout of the two strings is something like:
+ *
+ * v ------1------ v ------2------ v
+ * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+ * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+ *
+ * where v indicates the alignment boundaries and corresponding chunks
+ * of the strings have the same letters. Chunk A has been checked in
+ * the previous iteration. This iteration, we first check that string
+ * X1 doesn't end within region 2, then we compare chunk B between the
+ * two strings. As X1 is known not to hold a NUL byte in regions 1
+ * and 2 at this point, this also ensures that x0 has not ended yet.
+ */
+ .p2align 4
+0:
+ ldr q0, [x8, x11] // str2 data phase-matched to str1's chunk
+ ldr q1, [x8, x10] // aligned str2 chunk (for NUL check)
+ ldr q2, [x8] // str1 chunk
+
+ cmeq v1.16b, v1.16b, #0 // end of string?
+ cmeq v0.16b, v0.16b, v2.16b // do the chunks match?
+
+ shrn v1.8b, v1.8h, #4
+ fmov x6, d1 // NUL mask
+ shrn v0.8b, v0.8h, #4
+ fmov x5, d0 // match mask
+ cbnz x6, .Lnulfound
+ mvn x5, x5 // any mismatches?
+ cbnz x5, .Lmismatch
+
+ add x8, x8, #16
+
+ ldr q0, [x8, x11] // unrolled second iteration, same pattern
+ ldr q1, [x8, x10]
+ ldr q2, [x8]
+
+ add x8, x8, #16
+ cmeq v1.16b, v1.16b, #0 // end of string?
+ cmeq v0.16b, v0.16b, v2.16b // do the chunks match?
+
+ shrn v1.8b, v1.8h, #4
+ fmov x6, d1
+ shrn v0.8b, v0.8h, #4
+ fmov x5, d0
+ cbnz x6, .Lnulfound2 // NUL: undo the extra advance first
+ mvn x5, x5
+ cbz x5, 0b // all equal: keep looping
+
+ sub x8, x8, #16 // roll back second increment
+.Lmismatch:
+ rbit x2, x5 // reverse so clz finds the lowest set bit
+ clz x2, x2 // index of mismatch
+ lsr x2, x2, #2 // nibble index -> byte index
+ add x11, x8, x11 // absolute address into str2
+
+ ldrb w4, [x8, x2] // differing byte of str1
+ ldrb w5, [x11, x2] // differing byte of str2
+ sub w0, w4, w5 // byte difference
+ ret
+
+ .p2align 4
+.Lnulfound2:
+ sub x8, x8, #16 // roll back second increment
+
+.Lnulfound:
+ mov x7, x9 // offset difference between the strings
+ mov x4, x6 // preserve the NUL mask
+
+ ubfiz x7, x7, #2, #4 // x7 = (x7 & 0xf) << 2
+ lsl x6, x6, x7 // adjust NUL mask to indices
+ orn x5, x6, x5 // mismatch at or before the NUL?
+ cbnz x5, .Lmismatch
+
+ /*
+ * (x0) == (x1) and NUL is past the string.
+ * Compare (x1) with the corresponding part
+ * of the other string until the NUL byte.
+ */
+ ldr q0, [x8, x9] // str1 data aligned to str2's final chunk
+ ldr q1, [x8, x10] // str2 chunk holding the NUL
+
+ cmeq v1.16b, v0.16b, v1.16b // which bytes match?
+ shrn v1.8b, v1.8h, #4
+ fmov x5, d1
+
+ orn x5, x4, x5 // first NUL or mismatch
+
+ rbit x2, x5
+ clz x2, x2
+ lsr x5, x2, #2 // byte index of first difference/NUL
+
+ add x10, x10, x8 // restore x10 pointer
+ add x8, x8, x9 // point to corresponding chunk
+
+ ldrb w4, [x8, x5] // byte of str1
+ ldrb w5, [x10, x5] // byte of str2
+ sub w0, w4, w5
+ ret
+
+ .p2align 4
+.Lhead_mismatch:
+ rbit x2, x5
+ clz x2, x2 // index of mismatch
+ lsr x2, x2, #2 // nibble index -> byte index
+ ldrb w4, [x0, x2] // differing byte of str1
+ ldrb w5, [x1, x2] // differing byte of str2
+ sub w0, w4, w5
+ ret
+
+ /*
+ * If (a&0xf) < (b&0xf), we do the same thing but with swapped
+ * operands. I found that this performs slightly better than
+ * using conditional moves to do the swap branchless.
+ */
+ .p2align 4
+.Lswapped:
+ add x12, x8, x9 // x9 < 0 here: str1 rebased to str2's phase
+ ldr q0, [x12, #16]! // unaligned load, pre-increments x12
+ sub x8, x8, x10 // x8 = delta between aligned pointers
+ add x11, x8, x9 // x11 = delta plus the offset difference
+ neg x9, x9 // make the offset difference positive
+
+ cmeq v1.16b, v2.16b, #0 // NUL in str1's chunk?
+ cmeq v0.16b, v0.16b, v3.16b // do the chunks match?
+ add x10, x10, #16 // advance to the second chunk
+ shrn v1.8b, v1.8h, #4
+ fmov x6, d1 // NUL mask
+ shrn v0.8b, v0.8h, #4
+ fmov x5, d0 // match mask
+ cbnz x6, .Lnulfounds
+ mvn x5, x5 // any mismatching bytes?
+ cbnz x5, .Lmismatchs
+ add x10, x10, #16 // advance aligned pointers
+
+ /*
+ * During the main loop, the layout of the two strings is something like:
+ *
+ * v ------1------ v ------2------ v
+ * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+ * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+ *
+ * where v indicates the alignment boundaries and corresponding chunks
+ * of the strings have the same letters. Chunk A has been checked in
+ * the previous iteration. This iteration, we first check that string
+ * X0 doesn't end within region 2, then we compare chunk B between the
+ * two strings. As X0 is known not to hold a NUL byte in regions 1
+ * and 2 at this point, this also ensures that X1 has not ended yet.
+ */
+ .p2align 4
+0:
+ ldr q0, [x10, x11] // str1 data phase-matched to str2's chunk
+ ldr q1, [x10, x8] // aligned str1 chunk (for NUL check)
+ ldr q2, [x10] // str2 chunk
+
+ cmeq v1.16b, v1.16b, #0 // end of string?
+ cmeq v0.16b, v0.16b, v2.16b // do the chunks match?
+
+ shrn v1.8b, v1.8h, #4
+ fmov x6, d1 // NUL mask
+ shrn v0.8b, v0.8h, #4
+ fmov x5, d0 // match mask
+ cbnz x6, .Lnulfounds
+ mvn x5, x5 // any mismatches?
+ cbnz x5, .Lmismatchs
+
+ add x10, x10, #16
+
+ ldr q0, [x10, x11] // unrolled second iteration, same pattern
+ ldr q1, [x10, x8]
+ ldr q2, [x10]
+
+ add x10, x10, #16
+ cmeq v1.16b, v1.16b, #0 // end of string?
+ cmeq v0.16b, v0.16b, v2.16b // do the chunks match?
+
+ shrn v1.8b, v1.8h, #4
+ fmov x6, d1
+ shrn v0.8b, v0.8h, #4
+ fmov x5, d0
+ cbnz x6, .Lnulfound2s // NUL: undo the extra advance first
+ mvn x5, x5
+ cbz x5, 0b // all equal: keep looping
+
+ sub x10, x10, #16 // roll back second increment
+
+.Lmismatchs:
+ rbit x2, x5 // reverse so clz finds the lowest set bit
+ clz x2, x2 // index of mismatch
+ lsr x2, x2, #2 // nibble index -> byte index
+ add x11, x10, x11 // absolute address into str1
+
+ ldrb w4, [x10, x2] // differing byte of str2
+ ldrb w5, [x11, x2] // differing byte of str1
+ sub w0, w5, w4 // operands were swapped: subtract in reverse
+ ret
+
+ .p2align 4
+.Lnulfound2s:
+ sub x10, x10, #16 // roll back second increment
+.Lnulfounds:
+ mov x7, x9 // offset difference between the strings
+ mov x4, x6 // preserve the NUL mask
+
+ ubfiz x7, x7, #2, #4 // x7 = (x7 & 0xf) << 2
+ lsl x6, x6, x7 // adjust NUL mask to indices
+ orn x5, x6, x5 // mismatch at or before the NUL?
+ cbnz x5, .Lmismatchs
+
+ ldr q0, [x10, x9] // str2 data aligned to str1's final chunk
+ ldr q1, [x10, x8] // str1 chunk holding the NUL
+
+ cmeq v1.16b, v0.16b, v1.16b // which bytes match?
+ shrn v1.8b, v1.8h, #4
+ fmov x5, d1
+
+ orn x5, x4, x5 // first NUL or mismatch
+
+ rbit x2, x5
+ clz x2, x2
+ lsr x5, x2, #2 // byte index of first difference/NUL
+
+ add x11, x10, x8 // restore pointer into str1
+ add x10, x10, x9 // point to corresponding chunk
+
+ ldrb w4, [x10, x5] // byte of str2
+ ldrb w5, [x11, x5] // byte of str1
+ sub w0, w5, w4 // operands were swapped: subtract in reverse
+ ret
+
+END(__strcmp)
+
+ .section .rodata
+ .p2align 4
+shift_data: // tbl indices: row at offset i selects bytes i..i+15
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .fill 16, 1, -1 // out-of-range indices make tbl produce 0
+ .size shift_data, .-shift_data

File Metadata

Mime Type
text/plain
Expires
Thu, Dec 12, 4:21 AM (4 h, 35 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15308549
Default Alt Text
D45839.diff (7 KB)

Event Timeline