diff --git a/contrib/netbsd-tests/lib/libc/string/t_strcpy.c b/contrib/netbsd-tests/lib/libc/string/t_strcpy.c --- a/contrib/netbsd-tests/lib/libc/string/t_strcpy.c +++ b/contrib/netbsd-tests/lib/libc/string/t_strcpy.c @@ -2,6 +2,10 @@ /* * Written by J.T. Conklin + * + * Portions of this software were developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * * Public domain. */ diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -7,8 +7,9 @@ memcpy.S \ memmove.S \ memset.S \ + stpcpy.S \ strcat.S \ strchrnul.S \ strcmp.S \ strlen.S \ - stpcpy.S + strcpy.c diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S --- a/lib/libc/amd64/string/stpcpy.S +++ b/lib/libc/amd64/string/stpcpy.S @@ -1,12 +1,31 @@ -/* - * Adapted by Guillaume Morin from strcpy.S - * written by J.T. Conklin - * Public domain. +/*- + * Copyright (c) 2023, The FreeBSD Foundation + * + * SPDX-License-Identifier: BSD-2-Clause + * + * Portions of this software were developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S + * written by J.T. Conklin and + * adapted by Guillaume Morin to implement stpcpy + * that was originally dedicated to the public domain */ #include <machine/asm.h> __FBSDID("$FreeBSD$"); +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak stpcpy + .set stpcpy, __stpcpy +ARCHFUNCS(__stpcpy) + ARCHFUNC(__stpcpy, scalar) + ARCHFUNC(__stpcpy, baseline) +ENDARCHFUNCS(__stpcpy) + /* * This stpcpy implementation copies a byte at a time until the * source pointer is aligned to a word boundary, it then copies by @@ -20,9 +39,7 @@ * requirements. 
*/ - .globl stpcpy,__stpcpy -ENTRY(stpcpy) -__stpcpy: +ARCHENTRY(__stpcpy, scalar) movabsq $0x0101010101010101,%r8 movabsq $0x8080808080808080,%r9 @@ -43,7 +60,7 @@ dec %rax ret - .p2align 4 + ALIGN_TEXT .Lloop: movq %rdx,(%rdi) addq $8,%rdi @@ -111,6 +128,111 @@ .Ldone: movq %rdi,%rax ret -END(stpcpy) - +ARCHEND(__stpcpy, scalar) + +ARCHENTRY(__stpcpy, baseline) + mov %esi, %ecx + mov %rdi, %rdx + sub %rsi, %rdi # express destination as distance to source + and $~0xf, %rsi # align source to 16 bytes + movdqa (%rsi), %xmm0 # head of string with junk before + pxor %xmm1, %xmm1 + and $0xf, %ecx # misalignment in bytes + pcmpeqb %xmm1, %xmm0 # NUL byte present? + pmovmskb %xmm0, %eax + shr %cl, %eax # clear out matches in junk bytes + bsf %eax, %eax # find match if any + jnz .Lrunt + + /* first normal iteration: write head back if it succeeds */ + movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration + movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string + pcmpeqb %xmm0, %xmm1 # NUL byte present? + pmovmskb %xmm1, %eax + test %eax, %eax # find match if any + jnz .Lshorty + + movdqu %xmm2, (%rdx) # store beginning of string + + /* main loop, unrolled twice */ + ALIGN_TEXT +0: movdqa 32(%rsi), %xmm2 # load current iteration + movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteration + pxor %xmm1, %xmm1 + add $32, %rsi + pcmpeqb %xmm2, %xmm1 # NUL byte present? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 1f + + movdqa 16(%rsi), %xmm0 # load current iteration + movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteration + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte present? 
+ pmovmskb %xmm1, %eax + test %eax, %eax + jz 0b + + /* end of string after main loop has iterated */ + add $16, %rsi # advance rsi to second unrolled half +1: tzcnt %eax, %eax # find location of match + # (behaves as bsf on pre-x86-64-v3 CPUs) + add %rsi, %rax # point to NUL byte + movdqu -15(%rax), %xmm0 # last 16 bytes of string + movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination + add %rdi, %rax # point to destination's NUL byte + ret + + /* NUL encountered in second iteration */ +.Lshorty: + tzcnt %eax, %eax + add $16, %eax # account for length of first iteration + sub %ecx, %eax # but not the parts before the string + + /* NUL encountered in first iteration */ +.Lrunt: lea 1(%rax), %edi # string length including NUL byte + add %rcx, %rsi # point to beginning of string + add %rdx, %rax # point to NUL byte + + /* transfer 16--32 bytes */ +.L1632: cmp $16, %edi + jb .L0815 + + movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes + movdqu %xmm2, (%rdx) # store first 16 bytes + movdqu %xmm0, -15(%rax) # store last 16 bytes + ret + + /* transfer 8--15 bytes */ +.L0815: cmp $8, %edi + jb .L0407 + + mov (%rsi), %rcx # load first 8 bytes + mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes + mov %rcx, (%rdx) # store to dst + mov %rdi, -7(%rax) # ditto + ret + + /* transfer 4--7 bytes */ +.L0407: cmp $4, %edi + jb .L0203 + + mov (%rsi), %ecx + mov -4(%rsi, %rdi, 1), %edi + mov %ecx, (%rdx) + mov %edi, -3(%rax) + ret + + /* transfer 2--3 bytes */ +.L0203: cmp $2, %edi + jb .L0101 + + movzwl (%rsi), %ecx + mov %cx, (%rdx) # store first two bytes + + /* transfer 0 bytes (last byte is always NUL) */ +.L0101: movb $0, (%rax) # store terminating NUL byte + ret +ARCHEND(__stpcpy, baseline) + .section .note.GNU-stack,"",%progbits diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . 
-.Dd August 5, 2023 +.Dd August 7, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -63,12 +63,12 @@ .It memmove Ta S Ta S Ta S Ta S Ta SV .It memset Ta Ta S Ta S Ta S .It rindex Ta S -.It stpcpy Ta Ta Ta S +.It stpcpy Ta Ta Ta S1 .It strcat Ta Ta Ta S Ta S .It strchr Ta S Ta Ta S1 Ta S .It strchrnul Ta Ta Ta S1 .It strcmp Ta Ta S Ta S Ta S -.It strcpy Ta Ta Ta S Ta S Ta S2 +.It strcpy Ta Ta Ta S1 Ta S Ta S2 .It strlen Ta Ta S Ta S1 .It strncmp Ta Ta S Ta Ta S .It strncpy Ta Ta Ta Ta Ta S2