Page MenuHomeFreeBSD

D41349.id125669.diff
No OneTemporary

D41349.id125669.diff

diff --git a/contrib/netbsd-tests/lib/libc/string/t_strcpy.c b/contrib/netbsd-tests/lib/libc/string/t_strcpy.c
--- a/contrib/netbsd-tests/lib/libc/string/t_strcpy.c
+++ b/contrib/netbsd-tests/lib/libc/string/t_strcpy.c
@@ -2,6 +2,10 @@
/*
* Written by J.T. Conklin <jtc@acorntoolworks.com>
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
* Public domain.
*/
@@ -10,6 +14,9 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
+#include <dlfcn.h>
+
+char * (*volatile strcpy_fn)(char *restrict, const char *restrict);
ATF_TC(strcpy_basic);
ATF_TC_HEAD(strcpy_basic, tc)
@@ -19,12 +26,10 @@
ATF_TC_BODY(strcpy_basic, tc)
{
- /* try to trick the compiler */
- char * (*f)(char *, const char *s) = strcpy;
-
+ void *dl_handle;
unsigned int a0, a1, t;
- char buf0[64];
- char buf1[64];
+ char buf0[128];
+ char buf1[128];
char *ret;
struct tab {
@@ -38,30 +43,38 @@
* trailing unaligned characters (on a 64 bit processor)
*/
- { "", 0 },
- { "a", 1 },
- { "ab", 2 },
- { "abc", 3 },
- { "abcd", 4 },
- { "abcde", 5 },
- { "abcdef", 6 },
- { "abcdefg", 7 },
- { "abcdefgh", 8 },
- { "abcdefghi", 9 },
- { "abcdefghij", 10 },
- { "abcdefghijk", 11 },
- { "abcdefghijkl", 12 },
- { "abcdefghijklm", 13 },
- { "abcdefghijklmn", 14 },
- { "abcdefghijklmno", 15 },
- { "abcdefghijklmnop", 16 },
- { "abcdefghijklmnopq", 17 },
- { "abcdefghijklmnopqr", 18 },
- { "abcdefghijklmnopqrs", 19 },
- { "abcdefghijklmnopqrst", 20 },
- { "abcdefghijklmnopqrstu", 21 },
- { "abcdefghijklmnopqrstuv", 22 },
- { "abcdefghijklmnopqrstuvw", 23 },
+ { "", 0 },
+ { "a", 1 },
+ { "ab", 2 },
+ { "abc", 3 },
+ { "abcd", 4 },
+ { "abcde", 5 },
+ { "abcdef", 6 },
+ { "abcdefg", 7 },
+ { "abcdefgh", 8 },
+ { "abcdefghi", 9 },
+ { "abcdefghij", 10 },
+ { "abcdefghijk", 11 },
+ { "abcdefghijkl", 12 },
+ { "abcdefghijklm", 13 },
+ { "abcdefghijklmn", 14 },
+ { "abcdefghijklmno", 15 },
+ { "abcdefghijklmnop", 16 },
+ { "abcdefghijklmnopq", 17 },
+ { "abcdefghijklmnopqr", 18 },
+ { "abcdefghijklmnopqrs", 19 },
+ { "abcdefghijklmnopqrst", 20 },
+ { "abcdefghijklmnopqrstu", 21 },
+ { "abcdefghijklmnopqrstuv", 22 },
+ { "abcdefghijklmnopqrstuvw", 23 },
+ { "abcdefghijklmnopqrstuvwx", 24 },
+ { "abcdefghijklmnopqrstuvwxy", 25 },
+ { "abcdefghijklmnopqrstuvwxyz", 26 },
+ { "abcdefghijklmnopqrstuvwxyz0", 27 },
+ { "abcdefghijklmnopqrstuvwxyz01", 28 },
+ { "abcdefghijklmnopqrstuvwxyz012", 29 },
+ { "abcdefghijklmnopqrstuvwxyz0123", 30 },
+ { "abcdefghijklmnopqrstuvwxyz01234", 31 },
/*
* patterns that check for the cases where the expression:
@@ -83,12 +96,17 @@
{ "abcdefgh" "\xff\xff\xff\xff\xff\xff\xff\xff" "", 16 },
};
- for (a0 = 0; a0 < sizeof(long); ++a0) {
+ dl_handle = dlopen(NULL, RTLD_LAZY);
+ strcpy_fn = dlsym(dl_handle, "test_strcpy");
+ if (!strcpy_fn)
+ strcpy_fn = strcpy;
+
+ for (a0 = 0; a0 < 16; ++a0) {
for (a1 = 0; a1 < sizeof(long); ++a1) {
for (t = 0; t < (sizeof(tab) / sizeof(tab[0])); ++t) {
memcpy(&buf1[a1], tab[t].val, tab[t].len + 1);
- ret = f(&buf0[a0], &buf1[a1]);
+ ret = strcpy_fn(&buf0[a0], &buf1[a1]);
/*
* verify strcpy returns address of
diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -7,8 +7,9 @@
memcpy.S \
memmove.S \
memset.S \
+ stpcpy.S \
strcat.S \
strchrnul.S \
strcmp.S \
strlen.S \
- stpcpy.S
+ strcpy.c
diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S
--- a/lib/libc/amd64/string/stpcpy.S
+++ b/lib/libc/amd64/string/stpcpy.S
@@ -1,12 +1,28 @@
-/*
+/*-
* Adapted by Guillaume Morin <guillaume@morinfr.org> from strcpy.S
* written by J.T. Conklin <jtc@acorntoolworks.com>
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
* Public domain.
*/
#include <machine/asm.h>
__FBSDID("$FreeBSD$");
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4, 0x90
+
+ .weak stpcpy
+ .set stpcpy, __stpcpy
+ARCHFUNCS(__stpcpy)
+ ARCHFUNC(__stpcpy, scalar)
+ ARCHFUNC(__stpcpy, baseline)
+ENDARCHFUNCS(__stpcpy)
+
/*
* This stpcpy implementation copies a byte at a time until the
* source pointer is aligned to a word boundary, it then copies by
@@ -20,9 +36,7 @@
* requirements.
*/
- .globl stpcpy,__stpcpy
-ENTRY(stpcpy)
-__stpcpy:
+ARCHENTRY(__stpcpy, scalar)
movabsq $0x0101010101010101,%r8
movabsq $0x8080808080808080,%r9
@@ -43,7 +57,7 @@
dec %rax
ret
- .p2align 4
+ ALIGN_TEXT
.Lloop:
movq %rdx,(%rdi)
addq $8,%rdi
@@ -111,6 +125,111 @@
.Ldone:
movq %rdi,%rax
ret
-END(stpcpy)
-
+ARCHEND(__stpcpy, scalar)
+
+ARCHENTRY(__stpcpy, baseline)
+ mov %esi, %ecx # low source bits, masked to misalignment below
+ mov %rdi, %rdx # keep original destination pointer
+ sub %rsi, %rdi # express destination as distance to source
+ and $~0xf, %rsi # align source down to 16 bytes
+ movdqa (%rsi), %xmm0 # head of string with junk before
+ pxor %xmm1, %xmm1 # xmm1 = all zero for NUL comparison
+ and $0xf, %ecx # misalignment in bytes
+ pcmpeqb %xmm1, %xmm0 # NUL byte present?
+ pmovmskb %xmm0, %eax # one mask bit per byte
+ shr %cl, %eax # clear out matches in junk bytes
+ bsf %eax, %eax # find match if any (ZF set if none)
+ jnz .Lrunt # NUL found in first aligned chunk
+
+ /* first normal iteration: write head back if it succeeds */
+ movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration
+ movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string
+ pcmpeqb %xmm0, %xmm1 # NUL byte present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax # any match?
+ jnz .Lshorty
+
+ movdqu %xmm2, (%rdx) # store beginning of string
+
+ /* main loop, unrolled twice; each half stores the chunk loaded by the other */
+ ALIGN_TEXT
+0: movdqa 32(%rsi), %xmm2 # load current iteration
+ movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
+ pxor %xmm1, %xmm1
+ add $32, %rsi
+ pcmpeqb %xmm2, %xmm1 # NUL byte present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz 1f
+
+ movdqa 16(%rsi), %xmm0 # load current iteration
+ movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteration
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1 # NUL byte present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jz 0b
+
+ /* end of string after main loop has iterated */
+ add $16, %rsi # advance rsi to second unrolled half
+1: tzcnt %eax, %eax # find location of match
+ # (behaves as bsf on pre-x86-64-v3 CPUs)
+ add %rsi, %rax # point to NUL byte
+ movdqu -15(%rax), %xmm0 # last 16 bytes of string
+ movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination
+ add %rdi, %rax # point to destination's NUL byte
+ ret
+
+ /* NUL encountered in second iteration */
+.Lshorty:
+ tzcnt %eax, %eax
+ add $16, %eax # account for length of first iteration
+ sub %ecx, %eax # but not the parts before the string
+
+ /* NUL encountered in first iteration */
+.Lrunt: lea 1(%rax), %edi # string length including NUL byte
+ add %rcx, %rsi # point to beginning of string
+ add %rdx, %rax # point to NUL byte
+
+ /* transfer 16--32 bytes */
+.L1632: cmp $16, %edi
+ jb .L0815
+
+ movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
+ movdqu %xmm2, (%rdx) # store first 16 bytes (junk if via .Lrunt, fully overwritten below)
+ movdqu %xmm0, -15(%rax) # store last 16 bytes
+ ret
+
+ /* transfer 8--15 bytes */
+.L0815: cmp $8, %edi
+ jb .L0407
+
+ mov (%rsi), %rcx # load first 8 bytes
+ mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes
+ mov %rcx, (%rdx) # store to dst
+ mov %rdi, -7(%rax) # ditto
+ ret
+
+ /* transfer 4--7 bytes */
+.L0407: cmp $4, %edi
+ jb .L0203
+
+ mov (%rsi), %ecx # load first 4 bytes
+ mov -4(%rsi, %rdi, 1), %edi # load last 4 bytes
+ mov %ecx, (%rdx) # store to dst
+ mov %edi, -3(%rax) # ditto
+ ret
+
+ /* transfer 2--3 bytes */
+.L0203: cmp $2, %edi
+ jb .L0101
+
+ movzwl (%rsi), %ecx # load first two bytes
+ mov %cx, (%rdx) # store first two bytes
+
+ /* only the NUL byte remains (last byte is always NUL) */
+.L0101: movb $0, (%rax) # store terminating NUL byte
+ ret
+ARCHEND(__stpcpy, baseline)
+
.section .note.GNU-stack,"",%progbits
diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7
--- a/share/man/man7/simd.7
+++ b/share/man/man7/simd.7
@@ -24,7 +24,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE
.
-.Dd August 5, 2023
+.Dd August 7, 2023
.Dt SIMD 7
.Os
.Sh NAME
@@ -63,12 +63,12 @@
.It memmove Ta S Ta S Ta S Ta S Ta SV
.It memset Ta Ta S Ta S Ta S
.It rindex Ta S
-.It stpcpy Ta Ta Ta S
+.It stpcpy Ta Ta Ta S1
.It strcat Ta Ta Ta S Ta S
.It strchr Ta S Ta Ta S1 Ta S
.It strchrnul Ta Ta Ta S1
.It strcmp Ta Ta S Ta S Ta S
-.It strcpy Ta Ta Ta S Ta S Ta S2
+.It strcpy Ta Ta Ta S1 Ta S Ta S2
.It strlen Ta Ta S Ta S1
.It strncmp Ta Ta S Ta Ta S
.It strncpy Ta Ta Ta Ta Ta S2

File Metadata

Mime Type
text/plain
Expires
Tue, Jan 13, 11:03 PM (5 h, 4 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
27631596
Default Alt Text
D41349.id125669.diff (9 KB)

Event Timeline