diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index f8c67319fe12..7325b54d9716 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -1,52 +1,52 @@
 #
 # String handling from the Arm Optimized Routines
 # https://github.com/ARM-software/optimized-routines
 #
 AARCH64_STRING_FUNCS= \
 	memchr \
 	memcmp \
 	memcpy \
 	memmove \
 	memrchr \
 	memset \
 	stpcpy \
 	strchr \
 	strchrnul \
 	strcpy \
-	strlen \
 	strnlen \
 	strrchr
 
 # SIMD-enhanced routines not derived from Arm's code
 MDSRCS+= \
 	strcmp.S \
 	strspn.S \
 	strcspn.S \
 	strpbrk.c \
 	strsep.c \
 	strcat.c \
 	strlcpy.S \
 	strncmp.S \
 	memccpy.S \
 	strncat.c \
-	strlcat.c
+	strlcat.c \
+	strlen.S
 
 #
 # Add the above functions. Generate an asm file that includes the needed
 # Arm Optimized Routines file defining the function name to the libc name.
 # Some file need multiple macros defined or a weak symbol added we can
 # override the generated file in these cases.
 #
 .for FUNC in ${AARCH64_STRING_FUNCS}
 .if !exists(${FUNC}.S)
 ${FUNC}.S:
 	printf '/* %sgenerated by libc/aarch64/string/Makefile.inc */\n' @ > ${.TARGET}
 	printf '#define __%s_aarch64 %s\n' ${FUNC} ${FUNC} >> ${.TARGET}
 	printf '#include "aarch64/%s.S"\n' ${FUNC} >> ${.TARGET}
 CLEANFILES+=	${FUNC}.S
 .endif
 MDSRCS+=	${FUNC}.S
 CFLAGS.${FUNC}.S+=-I${SRCTOP}/contrib/arm-optimized-routines/string
 .endfor
diff --git a/lib/libc/aarch64/string/strlen.S b/lib/libc/aarch64/string/strlen.S
new file mode 100644
index 000000000000..7bfac7f4b1e1
--- /dev/null
+++ b/lib/libc/aarch64/string/strlen.S
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen
+*/
+
+#include <machine/asm.h>
+
+	.weak	strlen
+	.set	strlen, __strlen
+	.text
+
+ENTRY(__strlen)
+	bic	x10, x0, #0xf		// aligned src
+	and	x9, x0, #0xf
+	ldr	q0, [x10]
+	cmeq	v0.16b, v0.16b, #0
+	shrn	v0.8b, v0.8h, #4
+	fmov	x1, d0
+	cbz	x9, .Laligned
+
+	lsl	x2, x0, #2		// get the byte offset
+	lsr	x1, x1, x2		// shift by offset index
+	cbz	x1, .Lloop
+	rbit	x1, x1
+	clz	x0, x1
+	lsr	x0, x0, #2
+	ret
+
+.Laligned:
+	cbnz	x1, .Ldone
+
+.Lloop:
+	ldr	q0, [x10, #16]!
+	cmeq	v0.16b, v0.16b, #0
+	shrn	v0.8b, v0.8h, #4	// reduce to fit mask in GPR
+	fcmp	d0, #0.0
+	b.eq	.Lloop
+	fmov	x1, d0
+.Ldone:
+	sub	x0, x10, x0
+	rbit	x1, x1			// reverse bits as NEON has no ctz
+	clz	x3, x1
+	lsr	x3, x3, #2
+	add	x0, x0, x3
+	ret
+END(__strlen)
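
For review context: the new strlen.S compares 16 bytes at a time against zero
(cmeq gives 0xff in every lane holding a NUL), then uses shrn #4 to narrow the
compare result into a 64-bit scalar carrying 4 mask bits per input byte, so a
single rbit+clz (NEON has no ctz instruction) locates the NUL after dividing by
4. The C sketch below mirrors that data flow with NEON intrinsics to make it
easier to follow; it is illustrative only, strlen_neon_sketch is a hypothetical
name, and the aligned over-read before the start of the string is safe on real
hardware (an aligned 16-byte load cannot cross a page boundary) but is not
strictly conforming C.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static size_t
strlen_neon_sketch(const char *s)
{
	/* Round down to a 16-byte boundary, as the asm does with bic. */
	const char *p = (const char *)((uintptr_t)s & ~(uintptr_t)0xf);
	unsigned off = (uintptr_t)s & 0xf;

	/* cmeq against zero: 0xff in every lane that holds a NUL byte. */
	uint8x16_t zeros = vceqzq_u8(vld1q_u8((const uint8_t *)p));
	/*
	 * shrn #4: shift each 16-bit lane right by 4 and narrow to 8 bits,
	 * packing 4 mask bits per input byte into one 64-bit scalar.
	 */
	uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(
	    vshrn_n_u16(vreinterpretq_u16_u8(zeros), 4)), 0);

	/* Discard mask bits for the bytes before the start of the string. */
	mask >>= off * 4;
	if (mask != 0)		/* NUL found in the first chunk */
		return (__builtin_ctzll(mask) / 4);

	for (;;) {
		p += 16;
		zeros = vceqzq_u8(vld1q_u8((const uint8_t *)p));
		mask = vget_lane_u64(vreinterpret_u64_u8(
		    vshrn_n_u16(vreinterpretq_u16_u8(zeros), 4)), 0);
		if (mask != 0)	/* chunk base minus s, plus byte index */
			return ((size_t)(p - s) + __builtin_ctzll(mask) / 4);
	}
}

The first-chunk early return corresponds to the asm's pre-loop ret path: after
shifting the mask by the misalignment, the trailing-zero count divided by 4 is
already the string length, while the loop exit (.Ldone) must add the distance
from the original pointer to the current chunk, which is what the asm computes
with sub x0, x10, x0.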