diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -16,10 +16,12 @@
 	strcmp \
 	strcpy \
 	strlen \
-	strncmp \
 	strnlen \
 	strrchr
+
+MDSRCS+= \
+	strncmp.S
 
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/strncmp.S b/lib/libc/aarch64/string/strncmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/strncmp.S
@@ -0,0 +1,542 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen
+*/
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+	.text
+
+ENTRY(strncmp)
+
+	bic	x8, x0, #0xf		// x8 is x0 but aligned to the boundary
+	and	x9, x0, #0xf		// x9 is the offset
+	bic	x10, x1, #0xf		// x10 is x1 but aligned to the boundary
+	and	x11, x1, #0xf		// x11 is the offset
+
+	subs	x2, x2, #1
+	b.mi	.Lempty
+
+	/*
+	 * Check if either string is located at the end of a page to avoid
+	 * crossing into an unmapped page.  If so, we load 16 bytes from the
+	 * nearest alignment boundary and shift based on the offset.
+	 */
+	cmp	x2, #16
+	b.lt	.Llt16
+
+	add	x3, x0, #16		// end of head
+	add	x4, x1, #16
+	eor	x3, x3, x0
+	eor	x4, x4, x1		// bits that changed
+	orr	x3, x3, x4		// in either str1 or str2
+	tbz	w3, #PAGE_SHIFT, .Lbegin
+
+	ldr	q0, [x8]		// load aligned head
+	ldr	q1, [x10]
+
+	mov	x4, #0xffffffffffffffff
+	lsl	x14, x9, #2
+	lsl	x3, x4, x14		// string head
+	lsl	x15, x11, #2
+	lsl	x4, x4, x15
+
+	cmeq	v5.16b, v0.16b, #0
+	cmeq	v6.16b, v1.16b, #0
+
+	shrn	v5.8b, v5.8h, #4
+	shrn	v6.8b, v6.8h, #4
+	fmov	x5, d5
+	fmov	x6, d6
+
+	adrp	x14, shift_data
+	add	x14, x14, :lo12:shift_data
+
+	/* heads may cross page boundary, avoid unmapped loads */
+	tst	x5, x3
+	b.eq	0f
+
+	add	x15, x14, x9
+	ldr	q4, [x15]		// load permutation table
+	tbl	v0.16b, {v0.16b}, v4.16b
+
+	b	1f
+	.p2align 4
+0:
+	ldr	q0, [x0]		// load true head
+1:
+	tst	x6, x4
+	b.eq	0f
+
+	add	x14, x14, x11
+	ldr	q4, [x14]
+	tbl	v4.16b, {v1.16b}, v4.16b
+
+	b	1f
+
+	.p2align 4
+.Lbegin:
+	ldr	q0, [x0]		// load true heads
+0:
+	ldr	q4, [x1]
+1:
+	cmeq	v2.16b, v0.16b, #0	// NUL byte present?
+	cmeq	v4.16b, v0.16b, v4.16b	// which bytes match?
+
+	orn	v2.16b, v2.16b, v4.16b	// mismatch or NUL byte?
+
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+
+	cbnz	x5, .Lhead_mismatch
+	/* load second chunks */
+	ldr	q2, [x8, #16]		// load second chunk
+	ldr	q3, [x10, #16]
+
+	add	x2, x2, x11
+	sub	x2, x2, #16		// account for the length of the x1 chunk
+
+	subs	x9, x9, x11		// is (x0 & 0xf) >= (x1 & 0xf)?
+	b.lt	.Lswapped		// if not, swap the operands
+	b	.Lnormal
+
+	.p2align 4
+.Llt16:
+	add	x3, x0, #16		// end of head
+	add	x4, x1, #16
+	eor	x3, x3, x0
+	eor	x4, x4, x1		// bits that changed
+	orr	x3, x3, x4		// in either str1 or str2
+	tbz	w3, #PAGE_SHIFT, 0f
+
+	ldr	q0, [x8]		// load aligned head
+	ldr	q1, [x10]
+
+	adrp	x14, shift_data
+	add	x14, x14, :lo12:shift_data
+	add	x15, x14, x9
+	add	x14, x14, x11
+
+	/* heads may cross page boundary, avoid unmapped loads */
+	ldr	q4, [x15]		// load permutation table
+	tbl	v0.16b, {v0.16b}, v4.16b
+	ldr	q4, [x14]
+	tbl	v4.16b, {v1.16b}, v4.16b
+	b	1f
+
+	.p2align 4
+0:
+	ldr	q0, [x0]		// load true heads
+	ldr	q4, [x1]
+1:
+
+	cmeq	v2.16b, v0.16b, #0	// NUL byte present?
+	cmeq	v4.16b, v0.16b, v4.16b	// which bytes match?
+
+	not	v2.16b, v2.16b
+	and	v2.16b, v2.16b, v4.16b	// match and not NUL byte
+
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+	mov	x3, #0xf
+	lsl	x4, x2, #2
+	lsl	x4, x3, x4
+	bic	x5, x5, x4
+	mvn	x5, x5			// mismatch or NUL byte?
+
+.Lhead_mismatch:
+	rbit	x3, x5
+	clz	x3, x3			// index of mismatch
+	lsr	x3, x3, #2
+	ldrb	w4, [x0, x3]
+	ldrb	w5, [x1, x3]
+	sub	w0, w4, w5
+	ret
+
+	.p2align 4
+.Lnormal:
+	sub	x12, x10, x9
+	ldr	q0, [x12, #16]!
+	sub	x10, x10, x8
+	sub	x11, x10, x9
+
+	cmeq	v1.16b, v3.16b, #0	// NUL present?
+	cmeq	v0.16b, v0.16b, v2.16b	// mismatch between chunks?
+	shrn	v1.8b, v1.8h, #4
+	shrn	v0.8b, v0.8h, #4
+	fmov	x6, d1
+	fmov	x5, d0
+
+	add	x8, x8, #32		// advance to next iteration
+
+	mov	x3, #0xf
+	lsl	x4, x2, #2
+	lsl	x4, x3, x4
+	orr	x3, x6, x4		// introduce a NUL byte match
+	cmp	x2, #16			// does the buffer end within this chunk?
+	csel	x6, x3, x6, lt
+	cbnz	x6, .Lnulfound2		// NUL or end of buffer found?
+	mvn	x5, x5
+	cbnz	x5, .Lmismatch2
+	sub	x2, x2, #16
+	cmp	x2, #32			// end of buffer within first main loop iteration?
+	b.lt	.Ltail
+	/*
+	 * During the main loop, the layout of the two strings is something like:
+	 *
+	 *      v ------1------ v ------2------ v
+	 * x0:  AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+	 * x1:  AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+	 *
+	 * where v indicates the alignment boundaries and corresponding chunks
+	 * of the strings have the same letters.  Chunk A has been checked in
+	 * the previous iteration.  This iteration, we first check that string
+	 * x1 doesn't end within region 2, then we compare chunk B between the
+	 * two strings.  As x1 is known not to hold a NUL byte in regions 1
+	 * and 2 at this point, this also ensures that x0 has not ended yet.
+	 */
+	.p2align 4
+0:
+	ldr	q0, [x8, x11]
+	ldr	q1, [x8, x10]
+	ldr	q2, [x8]
+
+	cmeq	v1.16b, v1.16b, #0	// end of string?
+	cmeq	v0.16b, v0.16b, v2.16b	// do the chunks match?
+
+	shrn	v1.8b, v1.8h, #4
+	shrn	v0.8b, v0.8h, #4
+	fmov	x6, d1
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound
+	mvn	x5, x5			// any mismatches?
+	cbnz	x5, .Lmismatch
+
+	add	x8, x8, #16
+
+	/* main loop unrolled twice */
+	ldr	q0, [x8, x11]
+	ldr	q1, [x8, x10]
+	ldr	q2, [x8]
+
+	add	x8, x8, #16
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	shrn	v0.8b, v0.8h, #4
+	fmov	x6, d1
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound2
+	mvn	x5, x5
+	cbnz	x5, .Lmismatch2
+	sub	x2, x2, #32
+	cmp	x2, #32			// end of buffer within next iteration?
+	b.ge	0b			// if not, continue the main loop
+
+	/* end of buffer will occur within the next 32 bytes */
+.Ltail:
+	ldr	q0, [x8, x11]
+	ldr	q1, [x8, x10]
+	ldr	q2, [x8]
+
+	cmeq	v1.16b, v1.16b, #0	// end of string?
+	cmeq	v0.16b, v0.16b, v2.16b	// do the chunks match?
+
+	shrn	v1.8b, v1.8h, #4
+	shrn	v0.8b, v0.8h, #4
+	fmov	x6, d1
+	fmov	x5, d0
+
+	/*
+	 * If the buffer ends within this chunk (x2 < 16), introduce a NUL
+	 * byte into the result from CMEQ so we do not compare past the end
+	 * of the buffer.
+	 */
+
+	mov	x3, #0xf
+	lsl	x4, x2, #2
+	lsl	x4, x3, x4
+	orr	x3, x6, x4		// introduce a NUL byte match
+	cmp	x2, #16			// does the buffer end within this chunk?
+	csel	x6, x3, x6, lt
+
+	cbnz	x6, .Lnulfound		// NUL or end of string found?
+	mvn	x5, x5
+	cbnz	x5, .Lmismatch
+
+	add	x8, x8, #16
+
+	/* main loop unrolled twice */
+	ldr	q0, [x8, x11]
+	ldr	q1, [x8, x10]
+	ldr	q2, [x8]
+
+	add	x8, x8, #16
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	shrn	v0.8b, v0.8h, #4
+	fmov	x6, d1
+	fmov	x5, d0
+
+	sub	x2, x2, #16		// take first half into account
+	mov	x3, #0xf
+	lsl	x4, x2, #2
+	lsl	x4, x3, x4
+	orr	x6, x6, x4		// introduce a NUL byte match
+
+.Lnulfound2:
+	sub	x8, x8, #16
+
+.Lnulfound:
+	mov	x4, x6
+
+	ubfiz	x7, x9, #2, #4
+	lsl	x6, x6, x7		// adjust NUL mask to indices
+
+	orn	x5, x6, x5
+	cbnz	x5, .Lmismatch
+
+	/*
+	 * The chunks of x0 and x1 compared equal and the NUL byte (or the
+	 * end of the buffer) lies within the current chunk of x1.  Compare
+	 * the chunk of x1 with the corresponding bytes of x0 up to the NUL
+	 * byte.
+	 */
+	ldr	q0, [x8, x9]
+	ldr	q1, [x8, x10]
+
+	cmeq	v1.16b, v0.16b, v1.16b
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	orn	x5, x4, x5
+
+	rbit	x3, x5
+	clz	x3, x3
+	lsr	x5, x3, #2
+
+	add	x10, x10, x8		// restore the x10 pointer
+	add	x8, x8, x9		// point to corresponding chunk in x0
+
+	ldrb	w4, [x8, x5]
+	ldrb	w5, [x10, x5]
+	sub	w0, w4, w5
+	ret
+
+	.p2align 4
+.Lmismatch2:
+	sub	x8, x8, #16		// roll back second increment
+.Lmismatch:
+	rbit	x3, x5
+	clz	x3, x3			// index of mismatch
+	lsr	x3, x3, #2
+	add	x11, x8, x11
+
+	ldrb	w4, [x8, x3]
+	ldrb	w5, [x11, x3]
+	sub	w0, w4, w5		// difference of the mismatching chars
+	ret
+
+	/*
+	 * If (x0 & 0xf) < (x1 & 0xf), we do the same thing but with swapped
+	 * operands.  I found that this performs slightly better than using
+	 * conditional moves to do the swap branchlessly.
+	 */
+	.p2align 4
+.Lswapped:
+	add	x12, x8, x9
+	ldr	q0, [x12, #16]!
+	sub	x8, x8, x10
+	add	x11, x8, x9
+	add	x2, x2, x9
+	neg	x9, x9
+
+	cmeq	v1.16b, v2.16b, #0
+	cmeq	v0.16b, v0.16b, v3.16b
+	shrn	v1.8b, v1.8h, #4
+	shrn	v0.8b, v0.8h, #4
+	fmov	x6, d1
+	fmov	x5, d0
+
+	add	x10, x10, #32
+
+	mov	x3, #0xf
+	lsl	x4, x2, #2
+	lsl	x4, x3, x4
+	orr	x3, x6, x4		// introduce a NUL byte match
+	cmp	x2, #16
+	csel	x6, x3, x6, lt
+	cbnz	x6, .Lnulfound2s
+	mvn	x5, x5
+	cbnz	x5, .Lmismatch2s
+
+	sub	x2, x2, #16
+	cmp	x2, #32
+	b.lt	.Ltails
+
+	/*
+	 * During the main loop, the layout of the two strings is something like:
+	 *
+	 *      v ------1------ v ------2------ v
+	 * x1:  AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+	 * x0:  AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+	 *
+	 * where v indicates the alignment boundaries and corresponding chunks
+	 * of the strings have the same letters.  Chunk A has been checked in
+	 * the previous iteration.  This iteration, we first check that string
+	 * x0 doesn't end within region 2, then we compare chunk B between the
+	 * two strings.  As x0 is known not to hold a NUL byte in regions 1
+	 * and 2 at this point, this also ensures that x1 has not ended yet.
+	 */
+	.p2align 4
+0:
+	ldr	q0, [x10, x11]
+	ldr	q1, [x10, x8]
+	ldr	q2, [x10]
+
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	shrn	v0.8b, v0.8h, #4
+	fmov	x6, d1
+	fmov	x5, d0
+	cbnz	x6, .Lnulfounds
+	mvn	x5, x5
+	cbnz	x5, .Lmismatchs
+
+	add	x10, x10, #16
+
+	/* main loop unrolled twice */
+	ldr	q0, [x10, x11]
+	ldr	q1, [x10, x8]
+	ldr	q2, [x10]
+
+	add	x10, x10, #16
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	shrn	v0.8b, v0.8h, #4
+	fmov	x6, d1
+	fmov	x5, d0
+	cbnz	x6, .Lnulfound2s
+	mvn	x5, x5
+	cbnz	x5, .Lmismatch2s
+	sub	x2, x2, #32
+	cmp	x2, #32
+	b.ge	0b
+
+.Ltails:
+	ldr	q0, [x10, x11]
+	ldr	q1, [x10, x8]
+	ldr	q2, [x10]
+
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	shrn	v0.8b, v0.8h, #4
+	fmov	x6, d1
+	fmov	x5, d0
+
+	/*
+	 * If the buffer ends within this chunk (x2 < 16), introduce a NUL
+	 * byte into the result from CMEQ so we do not compare past the end
+	 * of the buffer.
+	 */
+
+	mov	x3, #0xf
+	lsl	x4, x2, #2
+	lsl	x4, x3, x4
+	orr	x3, x6, x4		// introduce a NUL byte match
+	cmp	x2, #16
+	csel	x6, x3, x6, lt
+
+	cbnz	x6, .Lnulfounds
+	mvn	x5, x5
+	cbnz	x5, .Lmismatchs
+
+	add	x10, x10, #16
+
+	ldr	q0, [x10, x11]
+	ldr	q1, [x10, x8]
+	ldr	q2, [x10]
+
+	add	x10, x10, #16
+	cmeq	v1.16b, v1.16b, #0
+	cmeq	v0.16b, v0.16b, v2.16b
+
+	shrn	v1.8b, v1.8h, #4
+	shrn	v0.8b, v0.8h, #4
+	fmov	x6, d1
+	fmov	x5, d0
+
+	sub	x2, x2, #16
+	mov	x3, #0xf
+	lsl	x4, x2, #2
+	lsl	x4, x3, x4
+	orr	x6, x6, x4		// introduce a NUL byte match
+
+.Lnulfound2s:
+	sub	x10, x10, #16
+.Lnulfounds:
+	mov	x4, x6
+
+	ubfiz	x7, x9, #2, #4
+	lsl	x6, x6, x7
+
+	orn	x5, x6, x5
+
+	cbnz	x5, .Lmismatchs
+
+	ldr	q0, [x10, x9]
+	ldr	q1, [x10, x8]
+
+	cmeq	v1.16b, v0.16b, v1.16b
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	orn	x5, x4, x5
+
+	rbit	x3, x5
+	clz	x3, x3
+	lsr	x5, x3, #2
+
+	add	x11, x10, x8
+	add	x10, x10, x9
+
+	ldrb	w4, [x10, x5]
+	ldrb	w5, [x11, x5]
+	sub	w0, w5, w4
+	ret
+
+	.p2align 4
+.Lmismatch2s:
+	sub	x10, x10, #16
+.Lmismatchs:
+	rbit	x3, x5
+	clz	x3, x3
+	lsr	x3, x3, #2
+	add	x11, x10, x11
+
+	ldrb	w4, [x10, x3]
+	ldrb	w5, [x11, x3]
+	sub	w0, w5, w4
+	ret
+
+	.p2align 4
+.Lempty:
+	eor	x0, x0, x0
+	ret
+
+END(strncmp)
+
+	.section .rodata
+	.p2align 4
+shift_data:
+	.byte	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+	.fill	16, 1, -1
+	.size	shift_data, .-shift_data
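
Two notes on the implementation may help review.  First, the mask
representation: every CMEQ result is narrowed with "shrn #4" and moved to a
general-purpose register, which yields a 64-bit mask carrying one nibble (4
bits) per byte of the 16-byte chunk, and "rbit; clz; lsr #2" recovers the byte
index of the first set nibble.  This is also why lengths and offsets are
scaled by four ("lsl ..., #2") before being used as shift amounts.  A minimal
C sketch of the same trick, assuming <arm_neon.h> and the GCC/Clang
__builtin_ctzll builtin; the helper names are made up for this note and do not
appear in the patch:

#include <arm_neon.h>
#include <stdint.h>

/* cmeq; shrn #4; fmov: one nibble per byte of the comparison result */
static inline uint64_t
chunk_mask(uint8x16_t cmp)
{
	uint8x8_t nib = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);

	return (vget_lane_u64(vreinterpret_u64_u8(nib), 0));
}

/* rbit; clz; lsr #2: byte index of the first set nibble (mask must be != 0) */
static inline int
first_set_index(uint64_t mask)
{
	return ((int)(__builtin_ctzll(mask) >> 2));
}

/* index of the first NUL byte in a 16-byte chunk, or -1 if there is none */
static inline int
first_nul_index(const unsigned char *p)
{
	uint8x16_t chunk = vld1q_u8(p);
	uint64_t nul = chunk_mask(vceqq_u8(chunk, vdupq_n_u8(0)));

	return (nul == 0 ? -1 : first_set_index(nul));
}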
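
Second, the buffer-length handling: the repeated "mov x3, #0xf; lsl x4, x2,
#2; lsl x4, x3, x4; orr ...; cmp x2, #16; csel ..., lt" sequences plant a
pseudo-NUL nibble at the last byte that may still be compared whenever the
buffer ends inside the current chunk, so the mismatch/NUL search can never run
past the limit and the planted nibble is handled by the same .Lnulfound /
.Lhead_mismatch paths as a real NUL.  A rough C sketch of the idea, not a
line-by-line model of the register usage; cap_nul_mask and remaining are
illustrative names only, with remaining standing in for x2 (bytes left to
compare, minus one):

#include <stdint.h>

/*
 * Plant a pseudo-NUL nibble at byte index "remaining" when the buffer ends
 * inside the current 16-byte chunk; otherwise leave the NUL mask untouched.
 */
static inline uint64_t
cap_nul_mask(uint64_t nul_mask, uint64_t remaining)
{
	if (remaining >= 16)	/* buffer does not end in this chunk */
		return (nul_mask);

	return (nul_mask | ((uint64_t)0xf << (remaining * 4)));
}

The C version guards the shift because shifting a 64-bit value by 64 or more
is undefined in C; the assembly needs no guard since LSL by a register uses
the shift amount modulo 64 and the CSEL discards the capped value when
x2 >= 16.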