diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -5,7 +5,6 @@
 AARCH64_STRING_FUNCS= \
 	memchr \
-	memcmp \
 	memcpy \
 	memmove \
 	memrchr \
@@ -20,6 +19,10 @@
 	strnlen \
 	strrchr
+
+MDSRCS+= \
+	memcmp.S
+
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/memcmp.S b/lib/libc/aarch64/string/memcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/memcmp.S
@@ -0,0 +1,189 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+	.text
+
+ENTRY(memcmp)
+
+	mov	x8, x0			// store base addresses
+	mov	x9, x1
+	cbz	x2, .Lnone		// 0 length
+
+	/*
+	 * Check if the buffer is located at the end of a page, to avoid
+	 * crossing into an unmapped page.  If so, we load the 32 bytes
+	 * ending at the limit instead, and then check the other buffer
+	 * the same way.
+	 */
+
+	cmp	x2, #32
+	b.hi	.Lbegin
+	add	x3, x8, #32
+	add	x4, x9, #32
+	eor	x3, x3, x8
+	eor	x4, x4, x9
+
+	tst	w3, #PAGE_SIZE
+	b.eq	0f
+
+	mov	x3, #32
+	sub	x3, x3, x2
+	sub	x8, x8, x3
+
+	/*
+	 * We perform a variable shift in the vector registers using TBL;
+	 * a suitable permutation is generated by loading a table of bytes
+	 * at the desired offset.
+	 */
+
+	adrp	x0, shift_table
+	add	x0, x0, :lo12:shift_table
+	add	x0, x0, x3
+	ldp	q0, q1, [x8]
+	ldp	q4, q5, [x0]		// load permutation table
+	tbl	v0.16b, {v0.16b, v1.16b}, v4.16b
+	tbl	v1.16b, {v0.16b, v1.16b}, v5.16b
+	add	x8, x8, x3		// reset pointer to beginning of src
+	b	1f
+
+0:
+	ldp	q0, q1, [x8]
+
+1:
+	tst	w4, #PAGE_SIZE
+	b.eq	0f
+
+	mov	x3, #32
+	sub	x3, x3, x2
+	sub	x9, x9, x3
+
+	ldp	q2, q3, [x9]
+	adrp	x0, shift_table
+	add	x0, x0, :lo12:shift_table
+	add	x0, x0, x3
+	ldp	q4, q5, [x0]
+	tbl	v2.16b, {v2.16b, v3.16b}, v4.16b
+	tbl	v3.16b, {v2.16b, v3.16b}, v5.16b
+	add	x9, x9, x3
+	b	1f
+
+	/*
+	 * Compare strings of 1--32 bytes.  We do this by loading each buffer
+	 * into two vector registers and then doing a quick check with XOR
+	 * and UMAXP to determine whether the first 32 bytes all match.
+	 */
+.Lbegin:
+	ldp	q0, q1, [x8]
+0:
+	ldp	q2, q3, [x9]
+1:
+
+	/* quick check for any mismatch in the first 32 bytes */
+	eor	v4.16b, v0.16b, v2.16b	// v4 = b1(0-15) XOR b2(0-15)
+	eor	v5.16b, v1.16b, v3.16b
+	umaxp	v4.16b, v4.16b, v5.16b
+	umaxp	v4.16b, v4.16b, v4.16b	// fill v4 with max value
+	fmov	x6, d4
+	cbz	x6, .Lloop		// if d4 is 0 then all matched
+
+	cmeq	v0.16b, v0.16b, v2.16b	// compare bytes 0-15 of b1 vs b2
+	cmeq	v1.16b, v1.16b, v3.16b	// compare bytes 16-31 of b1 vs b2
+	shrn	v0.8b, v0.8h, #4	// shift right to fit in 64 bits
+	shrn	v1.8b, v1.8h, #4
+
+	fmov	x1, d0
+	fmov	x3, d1
+
+	mvn	x0, x1			// invert for clz
+	mvn	x3, x3
+	rbit	x1, x0
+	rbit	x3, x3
+	clz	x1, x1
+	clz	x3, x3
+	add	x3, x3, #64
+	cmn	x0, #0			// any mismatch in first 16 bytes?
+	csel	x0, x3, x1, eq		// take x3 if none, else x1
+
+	lsr	x0, x0, #2
+	cmp	x0, x2
+	b.hs	.Lnone
+	ldrb	w4, [x8, x0]
+	ldrb	w5, [x9, x0]
+	sub	w0, w4, w5		// get the byte difference
+	ret
+
+	/*
+	 * Compare strings of 32+ bytes.  We introduce special handling when
+	 * there are fewer than 32 bytes left before the limit.
+	 */
+	.p2align 4
+.Lloop:
+	sub	x2, x2, #32
+	cmp	x2, #32
+	b.le	.Llast32
+	ldp	q0, q1, [x8, #32]!
+	ldp	q2, q3, [x9, #32]!
+
+	eor	v4.16b, v0.16b, v2.16b
+	eor	v5.16b, v1.16b, v3.16b
+	umaxp	v4.16b, v4.16b, v5.16b
+	umaxp	v4.16b, v4.16b, v4.16b
+	fmov	x6, d4
+	cbz	x6, .Lloop
+	b	.Lmatch
+
+	/* If at most 32 bytes are left to compare, load only the 32 bytes
+	 * ending at the limit to avoid overreading */
+.Llast32:
+	cmp	x2, #0
+	b.le	.Lnone
+	add	x8, x8, x2
+	add	x9, x9, x2
+	mov	x2, #32
+	ldp	q0, q1, [x8]
+	ldp	q2, q3, [x9]
+.Lmatch:
+	cmeq	v0.16b, v0.16b, v2.16b
+	cmeq	v1.16b, v1.16b, v3.16b
+
+	shrn	v0.8b, v0.8h, #4
+	shrn	v1.8b, v1.8h, #4
+	fmov	x1, d0
+	fmov	x3, d1
+
+	mvn	x0, x1
+	mvn	x3, x3
+	rbit	x1, x0
+	rbit	x3, x3
+	clz	x1, x1
+	clz	x3, x3
+	add	x3, x3, #64
+	cmn	x0, #0
+	csel	x0, x3, x1, eq
+
+	lsr	x0, x0, #2
+	cmp	x0, x2
+	b.hs	.Lnone
+	ldrb	w4, [x8, x0]
+	ldrb	w5, [x9, x0]
+	sub	w0, w4, w5
+	ret
+
+.Lnone:
+	mov	x0, #0
+	ret
+
+END(memcmp)
+
+	.section .rodata
+	.p2align 4
+shift_table:
+	.byte	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+	.byte	16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+	.fill	16, 1, -1
+	.size	shift_table, .-shift_table
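
For readers who do not want to trace the assembly, the rough C sketch below
(not part of the patch; memcmp_sketch and cmp_block are made-up names) shows
the strategy the routine follows: compare in 32-byte blocks, then for the tail
redo the 32 bytes that end exactly at the limit, so a few bytes may be compared
twice but nothing past either buffer is ever read.  The page-end and TBL
shift-table handling in the assembly exists only because the vector code always
loads full 32-byte windows; the scalar sketch needs no equivalent.

#include <stddef.h>

/* Compare up to n bytes; the first differing byte decides, as in memcmp. */
static int
cmp_block(const unsigned char *a, const unsigned char *b, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (a[i] != b[i])
			return (a[i] - b[i]);
	return (0);
}

int
memcmp_sketch(const void *s1, const void *s2, size_t len)
{
	const unsigned char *a = s1, *b = s2;

	if (len <= 32)			/* short case: one bounded compare */
		return (cmp_block(a, b, len));

	/* Main loop: full 32-byte blocks while more than 32 bytes remain. */
	while (len > 32) {
		int r = cmp_block(a, b, 32);
		if (r != 0)
			return (r);
		a += 32;
		b += 32;
		len -= 32;
	}

	/*
	 * Tail: back up so the final compare covers exactly the last 32 bytes
	 * of each buffer (mirroring .Llast32); some bytes are re-compared, but
	 * the reads never extend past the limit.
	 */
	return (cmp_block(a + len - 32, b + len - 32, 32));
}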