D41349.id126287.diff

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -7,8 +7,9 @@
memcpy.S \
memmove.S \
memset.S \
+ stpcpy.S \
strcat.S \
strchrnul.S \
strcmp.S \
strlen.S \
- stpcpy.S
+ strcpy.c
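
The Makefile change retires the assembly strcpy.S and instead builds strcpy
from a new C file, strcpy.c, which is not part of this diff.  As a minimal
sketch (assuming the new file simply layers strcpy on top of the optimized
stpcpy), strcpy could be written as:

    #include <string.h>

    /*
     * Hypothetical sketch only; the actual strcpy.c added by this
     * change is not shown here.  With a fast stpcpy available, strcpy
     * can delegate to it and return the destination pointer.
     */
    char *
    strcpy(char * restrict dst, const char * restrict src)
    {
            (void)stpcpy(dst, src);
            return (dst);
    }
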
diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S
--- a/lib/libc/amd64/string/stpcpy.S
+++ b/lib/libc/amd64/string/stpcpy.S
@@ -1,12 +1,31 @@
-/*
- * Adapted by Guillaume Morin <guillaume@morinfr.org> from strcpy.S
- * written by J.T. Conklin <jtc@acorntoolworks.com>
- * Public domain.
+/*-
+ * Copyright (c) 2023, The FreeBSD Foundation
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
+ * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
+ * written by J.T. Conklin <jtc@acorntoolworks.com> and
+ * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy.
+ * The original code was dedicated to the public domain.
*/
#include <machine/asm.h>
__FBSDID("$FreeBSD$");
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4, 0x90
+
+ .weak stpcpy
+ .set stpcpy, __stpcpy
+ARCHFUNCS(__stpcpy)
+ ARCHFUNC(__stpcpy, scalar)
+ ARCHFUNC(__stpcpy, baseline)
+ENDARCHFUNCS(__stpcpy)
+
/*
* This stpcpy implementation copies a byte at a time until the
* source pointer is aligned to a word boundary, it then copies by
@@ -20,9 +39,7 @@
* requirements.
*/
- .globl stpcpy,__stpcpy
-ENTRY(stpcpy)
-__stpcpy:
+ARCHENTRY(__stpcpy, scalar)
movabsq $0x0101010101010101,%r8
movabsq $0x8080808080808080,%r9
@@ -43,7 +60,7 @@
dec %rax
ret
- .p2align 4
+ ALIGN_TEXT
.Lloop:
movq %rdx,(%rdi)
addq $8,%rdi
@@ -111,6 +128,111 @@
.Ldone:
movq %rdi,%rax
ret
-END(stpcpy)
-
+ARCHEND(__stpcpy, scalar)
+
+ARCHENTRY(__stpcpy, baseline)
+ mov %esi, %ecx
+ mov %rdi, %rdx
+ sub %rsi, %rdi # express destination as distance to source
+ and $~0xf, %rsi # align source to 16 bytes
+ movdqa (%rsi), %xmm0 # head of string with junk before
+ pxor %xmm1, %xmm1
+ and $0xf, %ecx # misalignment in bytes
+ pcmpeqb %xmm1, %xmm0 # NUL byte present?
+ pmovmskb %xmm0, %eax
+ shr %cl, %eax # clear out matches in junk bytes
+ bsf %eax, %eax # find match if any
+ jnz .Lrunt
+
+ /* first normal iteration: write head back if it succeeds */
+ movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration
+ movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string
+ pcmpeqb %xmm0, %xmm1 # NUL byte present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax # find match if any
+ jnz .Lshorty
+
+ movdqu %xmm2, (%rdx) # store beginning of string
+
+ /* main loop, unrolled twice */
+ ALIGN_TEXT
+0: movdqa 32(%rsi), %xmm2 # load current iteration
+ movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
+ pxor %xmm1, %xmm1
+ add $32, %rsi
+ pcmpeqb %xmm2, %xmm1 # NUL byte present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz 1f
+
+ movdqa 16(%rsi), %xmm0 # load current iteration
+ movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteration
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1 # NUL byte present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jz 0b
+
+ /* end of string after main loop has iterated */
+ add $16, %rsi # advance rsi to second unrolled half
+1: tzcnt %eax, %eax # find location of match
+ # (behaves as bsf on pre-x86-64-v3 CPUs)
+ add %rsi, %rax # point to NUL byte
+ movdqu -15(%rax), %xmm0 # last 16 bytes of string
+ movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination
+ add %rdi, %rax # point to destination's NUL byte
+ ret
+
+ /* NUL encountered in second iteration */
+.Lshorty:
+ tzcnt %eax, %eax
+ add $16, %eax # account for length of first iteration
+ sub %ecx, %eax # but not the parts before the string
+
+ /* NUL encountered in first iteration */
+.Lrunt: lea 1(%rax), %edi # string length including NUL byte
+ add %rcx, %rsi # point to beginning of string
+ add %rdx, %rax # point to NUL byte
+
+ /* transfer 16--32 bytes */
+.L1632: cmp $16, %edi
+ jb .L0815
+
+ movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
+ movdqu %xmm2, (%rdx) # store first 16 bytes
+ movdqu %xmm0, -15(%rax) # store last 16 bytes
+ ret
+
+ /* transfer 8--15 bytes */
+.L0815: cmp $8, %edi
+ jb .L0407
+
+ mov (%rsi), %rcx # load first 8 bytes
+ mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes
+ mov %rcx, (%rdx) # store to dst
+ mov %rdi, -7(%rax) # ditto
+ ret
+
+ /* transfer 4--7 bytes */
+.L0407: cmp $4, %edi
+ jb .L0203
+
+ mov (%rsi), %ecx
+ mov -4(%rsi, %rdi, 1), %edi
+ mov %ecx, (%rdx)
+ mov %edi, -3(%rax)
+ ret
+
+ /* transfer 2--3 bytes */
+.L0203: cmp $2, %edi
+ jb .L0101
+
+ movzwl (%rsi), %ecx
+ mov %cx, (%rdx) # store first two bytes
+
+ /* transfer 0 bytes (last byte is always NUL) */
+.L0101: movb $0, (%rax) # store terminating NUL byte
+ ret
+ARCHEND(__stpcpy, baseline)
+
.section .note.GNU-stack,"",%progbits
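
The scalar variant keeps the approach described in the comment above: copy a
byte at a time until the source is word aligned, then copy 64-bit words while
testing each word for an embedded NUL using the two constants loaded into %r8
and %r9.  The following C snippet illustrates that word-at-a-time test (an
illustration of the underlying trick, not code from this diff):

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * A 64-bit word contains a zero byte iff
     * (w - 0x0101..01) & ~w & 0x8080..80 is nonzero: the subtraction
     * turns on bit 7 of each byte that was zero, and masking with ~w
     * discards bytes whose high bit was already set in w.
     */
    static bool
    has_nul(uint64_t w)
    {
            return (((w - 0x0101010101010101ULL) & ~w &
                0x8080808080808080ULL) != 0);
    }
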
diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7
--- a/share/man/man7/simd.7
+++ b/share/man/man7/simd.7
@@ -24,7 +24,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE
.
-.Dd August 5, 2023
+.Dd August 7, 2023
.Dt SIMD 7
.Os
.Sh NAME
@@ -63,12 +63,12 @@
.It memmove Ta S Ta S Ta S Ta S Ta SV
.It memset Ta Ta S Ta S Ta S
.It rindex Ta S
-.It stpcpy Ta Ta Ta S
+.It stpcpy Ta Ta Ta S1
.It strcat Ta Ta Ta S Ta S
.It strchr Ta S Ta Ta S1 Ta S
.It strchrnul Ta Ta Ta S1
.It strcmp Ta Ta S Ta S Ta S
-.It strcpy Ta Ta Ta S Ta S Ta S2
+.It strcpy Ta Ta Ta S1 Ta S Ta S2
.It strlen Ta Ta S Ta S1
.It strncmp Ta Ta S Ta Ta S
.It strncpy Ta Ta Ta Ta Ta S2
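
The baseline variant of __stpcpy added above scans the source in 16-byte
blocks, locating the terminating NUL with pcmpeqb/pmovmskb and copying head
and tail with overlapping unaligned loads and stores.  A simplified sketch of
the same block-scan idea using SSE2 intrinsics (an illustration under relaxed
assumptions, not a transcription of the assembly, which additionally never
reads across a page boundary and copies as it scans):

    #include <emmintrin.h>
    #include <string.h>

    static char *
    stpcpy_sketch(char *restrict dst, const char *restrict src)
    {
            size_t len = 0;

            for (;;) {
                    /* compare 16 source bytes against zero */
                    __m128i chunk =
                        _mm_loadu_si128((const __m128i *)(src + len));
                    int mask = _mm_movemask_epi8(
                        _mm_cmpeq_epi8(chunk, _mm_setzero_si128()));

                    if (mask != 0) {
                            /* offset of the NUL within this block */
                            len += __builtin_ctz(mask);
                            break;
                    }
                    len += 16;
            }
            memcpy(dst, src, len + 1);      /* include the NUL */
            return (dst + len);
    }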
