diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -5,7 +5,6 @@
 AARCH64_STRING_FUNCS= \
 	memchr \
-	memcmp \
 	memcpy \
 	memmove \
 	memrchr \
@@ -20,6 +19,10 @@
 	strnlen \
 	strrchr
 
+
+MDSRCS+= \
+	memcmp.S
+
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/memcmp.S b/lib/libc/aarch64/string/memcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/memcmp.S
@@ -0,0 +1,130 @@
+/*-
+ * Copyright (c) 2024 Getz Mikalsen
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+	.text
+
+ENTRY(memcmp)
+
+	mov	x8,x0			// store base address for later
+	mov	x9,x1
+
+	cbz	x2,.Lnone		// 0 length
+
+
+	/*
+	 * TODO: Check if buffer is located at end of page to avoid crossing
+	 * into unmapped page.
+	 */
+
+//	cmp	x2,#32
+//	b.hi	.Lbegin
+//	add	x3,x8,#32
+//	add	x4,x9,#32
+//	eor	x3,x3,x8
+//	eor	x4,x4,x9
+//	tst	w3,#PAGE_SIZE
+//	b.ne	.Lbegin
+//	tst	w4,#PAGE_SIZE
+//	b.ne	.Lbegin
+
+	/*
+	 * Compare buffers of 1--32 bytes by loading each into two
+	 * vector registers and comparing.
+	 */
+
+.Lbegin:
+	ldp	q0,q1,[x0]		// load 32 bytes into vector registers
+	ldp	q2,q3,[x1]
+
+	/* quick check for any mismatch in the first 32 bytes */
+	eor	v4.16b,v0.16b,v2.16b	// v4 = b1(0-15) XOR b2(0-15)
+	eor	v5.16b,v1.16b,v3.16b	// v5 = b1(16-31) XOR b2(16-31)
+	umaxp	v4.16b,v4.16b,v5.16b
+	umaxp	v4.16b,v4.16b,v4.16b	// reduce to one 64-bit value in d4
+	fmov	x6,d4
+	cbz	x6,.Lloop		// if d4 is 0 then all matched
+
+	cmeq	v0.16b,v0.16b,v2.16b	// compare bytes 0-15 of b1 vs b2
+	shrn	v0.8b,v0.8h,#4		// narrow the mask to fit in x1
+	cmeq	v1.16b,v1.16b,v3.16b	// compare bytes 16-31 of b1 vs b2
+	shrn	v1.8b,v1.8h,#4
+
+	fmov	x1,d0
+	fmov	x3,d1
+
+	mvn	x0,x1			// invert to use clz
+	cbz	x0,0f
+	rbit	x0,x0
+	clz	x0,x0			// if this is zero check bytes 16..31
+	b	1f
+
+0:
+	rbit	x1,x3
+	mvn	x1,x1
+	clz	x0,x1
+	add	x0,x0,#64		// mismatch is in bytes 16..31
+1:
+	lsr	x0,x0,#2		// bit index / 4 = byte index
+	cmp	x0,x2			// offending byte past the limit?
+	b.ge	.Lnone
+	cmp	x0,#32			// x0 == 32 if no hit (32 0's)
+	b.eq	.Lloop
+2:
+	ldrb	w4,[x8,x0]
+	ldrb	w5,[x9,x0]
+	subs	w0,w4,w5		// get the byte difference
+	ret
+
+
+	.p2align 4
+.Lloop:
+	subs	x2,x2,#32
+	b.le	.Lnone
+	ldp	q0,q1,[x8,#32]!		// load next 32 bytes, pre-incrementing
+	ldp	q2,q3,[x9,#32]!
+
+	eor	v4.16b,v0.16b,v2.16b
+	eor	v5.16b,v1.16b,v3.16b
+	umaxp	v4.16b,v4.16b,v5.16b
+	umaxp	v4.16b,v4.16b,v4.16b
+	fmov	x6,d4
+	cbz	x6,.Lloop
+
+	cmeq	v0.16b,v0.16b,v2.16b	// compare bytes 0-15 of b1 vs b2
+	cmeq	v1.16b,v1.16b,v3.16b	// compare bytes 16-31 of b1 vs b2
+
+	shrn	v0.8b,v0.8h,#4
+	fmov	x1,d0
+	mvn	x0,x1			// invert to use clz
+	cbz	x0,0f
+	rbit	x0,x0
+	clz	x0,x0			// if this is zero check bytes 16..31
+	b	1f
+
+0:
+	shrn	v1.8b,v1.8h,#4
+	fmov	x3,d1
+	rbit	x1,x3
+	mvn	x1,x1
+	clz	x0,x1
+	add	x0,x0,#64		// mismatch is in bytes 16..31
+1:
+	lsr	x0,x0,#2
+	cmp	x0,x2			// offending byte is past the limit
+	b.ge	.Lnone
+	ldrb	w4,[x8,x0]
+	ldrb	w5,[x9,x0]
+	subs	w0,w4,w5
+	ret
+
+.Lnone:
+	mov	x0,#0
+	ret
+
+END(memcmp)
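
A minimal C model of the mismatch-locating kernel may help when reading the asm above; the helper names are made up for illustration and nothing below is part of the diff. cmeq sets each byte of a 128-bit mask to 0xff where the inputs match, shrn #4 narrows that mask to 64 bits (four bits per byte) so it fits a general-purpose register, and the mvn/rbit/clz sequence is a count-trailing-zeros of the inverted mask; the final lsr #2 turns a bit index into a byte index.

#include <stdint.h>
#include <stdio.h>

/* Mask as produced by cmeq + shrn #4 over 16 bytes: four 1-bits per
 * matching byte position, four 0-bits per mismatching one. */
static uint64_t
eq_nibble_mask(const unsigned char *a, const unsigned char *b)
{
	uint64_t mask = 0;

	for (int i = 0; i < 16; i++)
		if (a[i] == b[i])
			mask |= 0xfULL << (4 * i);
	return (mask);
}

/* The mvn + rbit + clz + lsr #2 sequence: index of the first
 * mismatching byte, or 16 if all sixteen bytes matched. */
static int
first_mismatch(uint64_t mask)
{
	if (~mask == 0)
		return (16);
	return (__builtin_ctzll(~mask) / 4);
}

int
main(void)
{
	unsigned char a[16] = "0123456789abcdef";
	unsigned char b[16] = "0123456789abcXef";

	printf("%d\n", first_mismatch(eq_nibble_mask(a, b)));	/* prints 13 */
	return (0);
}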
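Two design notes, as I read them: the eor/umaxp/umaxp/fmov sequence ahead of the index computation is a cheap "does any byte differ in these 32 bytes?" reduction, collapsing both XOR vectors into a single 64-bit value testable with one cbz, so the more expensive cmeq/shrn/clz path only runs once a mismatch is known to exist. And the shrn-by-4 narrowing is the usual NEON substitute for x86's pmovmskb, which has no direct AArch64 equivalent; it costs four mask bits per lane instead of one, hence the lsr #2 to convert bit positions back to byte offsets.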