diff --git a/contrib/netbsd-tests/lib/libc/string/t_strcpy.c b/contrib/netbsd-tests/lib/libc/string/t_strcpy.c --- a/contrib/netbsd-tests/lib/libc/string/t_strcpy.c +++ b/contrib/netbsd-tests/lib/libc/string/t_strcpy.c @@ -2,6 +2,10 @@ /* * Written by J.T. Conklin + * + * Portions of this software were developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * * Public domain. */ @@ -10,6 +14,9 @@ #include #include #include +#include + +char * (*volatile strcpy_fn)(char *restrict, const char *restrict); ATF_TC(strcpy_basic); ATF_TC_HEAD(strcpy_basic, tc) @@ -19,12 +26,10 @@ ATF_TC_BODY(strcpy_basic, tc) { - /* try to trick the compiler */ - char * (*f)(char *, const char *s) = strcpy; - + void *dl_handle; unsigned int a0, a1, t; - char buf0[64]; - char buf1[64]; + char buf0[128]; + char buf1[128]; char *ret; struct tab { @@ -38,30 +43,38 @@ * trailing unaligned characters (on a 64 bit processor) */ - { "", 0 }, - { "a", 1 }, - { "ab", 2 }, - { "abc", 3 }, - { "abcd", 4 }, - { "abcde", 5 }, - { "abcdef", 6 }, - { "abcdefg", 7 }, - { "abcdefgh", 8 }, - { "abcdefghi", 9 }, - { "abcdefghij", 10 }, - { "abcdefghijk", 11 }, - { "abcdefghijkl", 12 }, - { "abcdefghijklm", 13 }, - { "abcdefghijklmn", 14 }, - { "abcdefghijklmno", 15 }, - { "abcdefghijklmnop", 16 }, - { "abcdefghijklmnopq", 17 }, - { "abcdefghijklmnopqr", 18 }, - { "abcdefghijklmnopqrs", 19 }, - { "abcdefghijklmnopqrst", 20 }, - { "abcdefghijklmnopqrstu", 21 }, - { "abcdefghijklmnopqrstuv", 22 }, - { "abcdefghijklmnopqrstuvw", 23 }, + { "", 0 }, + { "a", 1 }, + { "ab", 2 }, + { "abc", 3 }, + { "abcd", 4 }, + { "abcde", 5 }, + { "abcdef", 6 }, + { "abcdefg", 7 }, + { "abcdefgh", 8 }, + { "abcdefghi", 9 }, + { "abcdefghij", 10 }, + { "abcdefghijk", 11 }, + { "abcdefghijkl", 12 }, + { "abcdefghijklm", 13 }, + { "abcdefghijklmn", 14 }, + { "abcdefghijklmno", 15 }, + { "abcdefghijklmnop", 16 }, + { "abcdefghijklmnopq", 17 }, + { "abcdefghijklmnopqr", 18 }, + { "abcdefghijklmnopqrs", 19 
}, + { "abcdefghijklmnopqrst", 20 }, + { "abcdefghijklmnopqrstu", 21 }, + { "abcdefghijklmnopqrstuv", 22 }, + { "abcdefghijklmnopqrstuvw", 23 }, + { "abcdefghijklmnopqrstuvwx", 24 }, + { "abcdefghijklmnopqrstuvwxy", 25 }, + { "abcdefghijklmnopqrstuvwxyz", 26 }, + { "abcdefghijklmnopqrstuvwxyz0", 27 }, + { "abcdefghijklmnopqrstuvwxyz01", 28 }, + { "abcdefghijklmnopqrstuvwxyz012", 29 }, + { "abcdefghijklmnopqrstuvwxyz0123", 30 }, + { "abcdefghijklmnopqrstuvwxyz01234", 31 }, /* * patterns that check for the cases where the expression: @@ -83,12 +96,17 @@ { "abcdefgh" "\xff\xff\xff\xff\xff\xff\xff\xff" "", 16 }, }; - for (a0 = 0; a0 < sizeof(long); ++a0) { + dl_handle = dlopen(NULL, RTLD_LAZY); + strcpy_fn = dlsym(dl_handle, "test_strcpy"); + if (!strcpy_fn) + strcpy_fn = strcpy; + + for (a0 = 0; a0 < 16; ++a0) { for (a1 = 0; a1 < sizeof(long); ++a1) { for (t = 0; t < (sizeof(tab) / sizeof(tab[0])); ++t) { memcpy(&buf1[a1], tab[t].val, tab[t].len + 1); - ret = f(&buf0[a0], &buf1[a1]); + ret = strcpy_fn(&buf0[a0], &buf1[a1]); /* * verify strcpy returns address of diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -7,8 +7,9 @@ memcpy.S \ memmove.S \ memset.S \ + stpcpy.S \ strcat.S \ strchrnul.S \ strcmp.S \ strlen.S \ - stpcpy.S + strcpy.c diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S --- a/lib/libc/amd64/string/stpcpy.S +++ b/lib/libc/amd64/string/stpcpy.S @@ -1,12 +1,28 @@ -/* +/*- * Adapted by Guillaume Morin from strcpy.S * written by J.T. Conklin + * Copyright (c) 2023 The FreeBSD Foundation + * + * Portions of this software were developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * * Public domain. 
*/ #include __FBSDID("$FreeBSD$"); +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak stpcpy + .set stpcpy, __stpcpy +ARCHFUNCS(__stpcpy) + ARCHFUNC(__stpcpy, scalar) + ARCHFUNC(__stpcpy, baseline) +ENDARCHFUNCS(__stpcpy) + /* * This stpcpy implementation copies a byte at a time until the * source pointer is aligned to a word boundary, it then copies by @@ -20,9 +36,7 @@ * requirements. */ - .globl stpcpy,__stpcpy -ENTRY(stpcpy) -__stpcpy: +ARCHENTRY(__stpcpy, scalar) movabsq $0x0101010101010101,%r8 movabsq $0x8080808080808080,%r9 @@ -43,7 +57,7 @@ dec %rax ret - .p2align 4 + ALIGN_TEXT .Lloop: movq %rdx,(%rdi) addq $8,%rdi @@ -111,6 +125,111 @@ .Ldone: movq %rdi,%rax ret -END(stpcpy) - +ARCHEND(__stpcpy, scalar) + +ARCHENTRY(__stpcpy, baseline) + mov %esi, %ecx + mov %rdi, %rdx + sub %rsi, %rdi # express destination as distance to source + and $~0xf, %rsi # align source to 16 bytes + movdqa (%rsi), %xmm0 # head of string with junk before + pxor %xmm1, %xmm1 + and $0xf, %ecx # misalignment in bytes + pcmpeqb %xmm1, %xmm0 # NUL byte present? + pmovmskb %xmm0, %eax + shr %cl, %eax # clear out matches in junk bytes + bsf %eax, %eax # find match if any + jnz .Lrunt + + /* first normal iteration: write head back if it succeeds */ + movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration + movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string + pcmpeqb %xmm0, %xmm1 # NUL byte present? + pmovmskb %xmm1, %eax + test %eax, %eax # find match if any + jnz .Lshorty + + movdqu %xmm2, (%rdx) # store beginning of string + + /* main loop, unrolled twice */ + ALIGN_TEXT +0: movdqa 32(%rsi), %xmm2 # load current iteration + movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteration + pxor %xmm1, %xmm1 + add $32, %rsi + pcmpeqb %xmm2, %xmm1 # NUL byte present?
+ pmovmskb %xmm1, %eax + test %eax, %eax + jnz 1f + + movdqa 16(%rsi), %xmm0 # load current iteration + movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteration + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte present? + pmovmskb %xmm1, %eax + test %eax, %eax + jz 0b + + /* end of string after main loop has iterated */ + add $16, %rsi # advance rsi to second unrolled half +1: tzcnt %eax, %eax # find location of match + # (behaves as bsf on pre-x86-64-v3 CPUs) + add %rsi, %rax # point to NUL byte + movdqu -15(%rax), %xmm0 # last 16 bytes of string + movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination + add %rdi, %rax # point to destination's NUL byte + ret + + /* NUL encountered in second iteration */ +.Lshorty: + tzcnt %eax, %eax + add $16, %eax # account for length of first iteration + sub %ecx, %eax # but not the parts before the string + + /* NUL encountered in first iteration */ +.Lrunt: lea 1(%rax), %edi # string length including NUL byte + add %rcx, %rsi # point to beginning of string + add %rdx, %rax # point to NUL byte + + /* transfer 16--32 bytes */ +.L1632: cmp $16, %edi + jb .L0815 + + movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes + movdqu %xmm2, (%rdx) # store first 16 bytes + movdqu %xmm0, -15(%rax) # store last 16 bytes + ret + + /* transfer 8--15 bytes */ +.L0815: cmp $8, %edi + jb .L0407 + + mov (%rsi), %rcx # load first 8 bytes + mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes + mov %rcx, (%rdx) # store to dst + mov %rdi, -7(%rax) # ditto + ret + + /* transfer 4--7 bytes */ +.L0407: cmp $4, %edi + jb .L0203 + + mov (%rsi), %ecx + mov -4(%rsi, %rdi, 1), %edi + mov %ecx, (%rdx) + mov %edi, -3(%rax) + ret + + /* transfer 2--3 bytes */ +.L0203: cmp $2, %edi + jb .L0101 + + movzwl (%rsi), %ecx + mov %cx, (%rdx) # store first two bytes + + /* transfer 0 bytes (last byte is always NUL) */ +.L0101: movb $0, (%rax) # store terminating NUL byte + ret +ARCHEND(__stpcpy, baseline) + .section .note.GNU-stack,"",%progbits diff --git
a/share/man/man7/simd.7 b/share/man/man7/simd.7 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd August 5, 2023 +.Dd August 7, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -63,12 +63,12 @@ .It memmove Ta S Ta S Ta S Ta S Ta SV .It memset Ta Ta S Ta S Ta S .It rindex Ta S -.It stpcpy Ta Ta Ta S +.It stpcpy Ta Ta Ta S1 .It strcat Ta Ta Ta S Ta S .It strchr Ta S Ta Ta S1 Ta S .It strchrnul Ta Ta Ta S1 .It strcmp Ta Ta S Ta S Ta S -.It strcpy Ta Ta Ta S Ta S Ta S2 +.It strcpy Ta Ta Ta S1 Ta S Ta S2 .It strlen Ta Ta S Ta S1 .It strncmp Ta Ta S Ta Ta S .It strncpy Ta Ta Ta Ta Ta S2