diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -7,6 +7,7 @@ memmove.S \ memset.S \ stpcpy.S \ + stpncpy.S \ strcat.S \ strchrnul.S \ strcmp.S \ @@ -14,6 +15,7 @@ strcspn.S \ strlen.S \ strncmp.S \ + strncpy.c \ strnlen.c \ strpbrk.c \ strrchr.S \ diff --git a/lib/libc/amd64/string/stpncpy.S b/lib/libc/amd64/string/stpncpy.S new file mode 100644 --- /dev/null +++ b/lib/libc/amd64/string/stpncpy.S @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak stpncpy + .set stpncpy, __stpncpy +ARCHFUNCS(__stpncpy) + ARCHFUNC(__stpncpy, scalar) + ARCHFUNC(__stpncpy, baseline) +ENDARCHFUNCS(__stpncpy) + +ARCHENTRY(__stpncpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + + push %rdx # save len at -8(%rbp) + push %rdi # save dest at -16(%rbp) + push %rsi # save src + push %rax # dummy push for alignment + + mov %rsi, %rdi # first memchr argument: src + xor %esi, %esi # second memchr argument: NUL + call CNAME(__memchr) # memchr(src, '\0', len) + pop %rcx # dummy pop + pop %rsi # restore src + mov -16(%rbp), %rdi # reload dest + + test %rax, %rax # NUL found? + jz .Lfullcopy + + mov %rax, %rdx # address of the NUL byte in src + sub %rsi, %rdx # copy until the NUL byte + add %rdx, -16(%rbp) # advance destination by string length + sub %rdx, -8(%rbp) # and shorten buffer size by string length + call CNAME(memcpy) # memcpy(dest, src, string length) + + pop %rdi # dest advanced by string length + pop %rdx # buffer size left after the string + xor %esi, %esi # fill byte is NUL + pop %rbp # drop stack frame before the tail call + jmp CNAME(memset) # clear remaining buffer + +.Lfullcopy: + mov -8(%rbp), %rdx # no NUL within len bytes: copy len bytes + call CNAME(memcpy) # copy whole string + add -8(%rbp), %rax # point to dest[n] + leave + ret +ARCHEND(__stpncpy, scalar) + + /* + * this mask allows us to generate masks of 16-n 0xff bytes + * followed by n 0x00 bytes by loading from .Lmask+n. 
+ */ + .section .rodata +.Lmask: .quad 0xffffffffffffffff + .quad 0xffffffffffffffff + .quad 0x0000000000000000 + .quad 0x0000000000000000 + +/* stpncpy(char *restrict rdi, const char *rsi, size_t rdx); RAX returns the address of the NUL byte in the destination, or the end of the buffer if the string did not end within it */ +ARCHENTRY(__stpncpy, baseline) +#define bounce (-3*16-8) /* location of on-stack bounce buffer */ + + test %rdx, %rdx # no bytes to copy? + jz .L0 + + mov %esi, %ecx + and $~0xf, %rsi # align source to 16 bytes + movdqa (%rsi), %xmm0 # load head + and $0xf, %ecx # offset from alignment + mov $-1, %r9d + lea -32(%rcx), %rax # set up overflow-proof comparison rdx+rcx<32 + shl %cl, %r9d # mask of bytes belonging to the string + sub %rcx, %rdi # adjust RDI to correspond to RSI + pxor %xmm1, %xmm1 + movdqa %xmm0, bounce(%rsp) # stash copy of head on the stack + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %r8d + + lea (%rdx, %rcx, 1), %r10 # buffer length from alignment boundary + add %rdx, %rax # less than 2 chunks (32 bytes) to play with? + jnc .Lrunt # if yes, use special runt processing + + movdqu %xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination + and %r9d, %r8d # end of string within head? + jnz .Lheadnul + + movdqu (%rsi, %rcx, 1), %xmm2 # load head from source buffer + movdqu %xmm2, (%rdi, %rcx, 1) # and deposit + + add $16, %rsi + add $16, %rdi + sub $32, %r10 + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa (%rsi), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %r8d + test %r8d, %r8d + jnz 3f + + movdqu %xmm0, (%rdi) + cmp $16, %r10 # another full chunk left? + jb 1f + + movdqa 16(%rsi), %xmm0 + add $32, %rdi # advance pointers to next chunk + add $32, %rsi + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %r8d + test %r8d, %r8d + jnz 2f + + movdqu %xmm0, -16(%rdi) + sub $32, %r10 # another full chunk left? 
+ jae 0b + + sub $16, %rdi # undo second advancement + sub $16, %rsi + add $16, %r10d # restore number of remaining bytes + + /* buffer ends in the next 16 bytes but string has not ended yet */ +1: lea 16(%rsi), %rcx # pointer to the next string chunk + test %r10d, %r10d # no bytes remain? (ZF is stale after jb 1f) + cmovz %rsi, %rcx # if so, fake-load current chunk instead + movdqa (%rcx), %xmm0 # (fake) load source tail + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %r8d + bts %r10d, %r8d # treat end of buffer as NUL + tzcnt %r8d, %r8d # where is the NUL byte? + movdqu (%rsi, %r8, 1), %xmm0 # load source tail before NUL + lea 16(%rdi, %r8, 1), %rax # point return value to NUL byte + # or end of buffer + movdqu %xmm0, (%rdi, %r8, 1) # store tail into the buffer + ret + +2: sub $16, %rdi # undo second advancement + sub $16, %rsi + sub $16, %r10 + + /* string has ended and buffer has not */ +3: tzcnt %r8d, %r8d # where did the string end? + lea .Lmask+16(%rip), %rcx + lea (%rdi, %r8, 1), %rax # where the NUL byte will be + neg %r8 + movdqu (%rcx, %r8, 1), %xmm1 # mask with FF where the string is, + # 00 where it is not + pand %xmm1, %xmm0 # mask out bytes after the string + movdqu %xmm0, (%rdi) # store masked current chunk + pxor %xmm1, %xmm1 + sub $16, %r10 # another full chunk left? 
+ jb 1f + + /* clear remaining destination buffer (tail has been cleared earlier) */ + ALIGN_TEXT +0: movdqu %xmm1, 16(%rdi) + cmp $16, %r10 + jb 1f + + movdqu %xmm1, 32(%rdi) + add $32, %rdi + sub $32, %r10 + jae 0b + +1: ret + + /* at least two chunks to play with and NUL while processing head */ +.Lheadnul: + movdqu bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack + tzcnt %r8d, %r8d # find location of NUL byte + movdqu %xmm0, (%rdi, %rcx, 1) # deposit head in the destination + movdqu %xmm1, (%rdi, %r8, 1) # clear out following bytes + movdqu %xmm1, 16(%rdi) # clear out second chunk + lea (%rdi, %r8, 1), %rax # make RAX point to the NUL byte + + add $32, %rdi # advance past first two chunks + sub $32+16, %r10 # advance past first three chunks + jb 1f # did we pass the end of the buffer? + + /* clear remaining destination buffer (tail has been cleared earlier) */ + ALIGN_TEXT +0: movdqu %xmm1, (%rdi) # clear out buffer chunk + cmp $16, %r10 + jb 1f + + movdqu %xmm1, 16(%rdi) + add $32, %rdi + sub $32, %r10 + jae 0b + +1: ret + + /* 1--32 bytes to copy, bounce through the stack */ +.Lrunt: movdqa %xmm1, bounce+16(%rsp) # clear out rest of on-stack copy + bts %r10d, %r8d # treat end of buffer as end of string + and %r9w, %r8w # mask out head bytes before the string + test $0x1ffff, %r8d # end of string within first buffer or right after? + jnz 0f # if yes, do not inspect second buffer + + movdqa 16(%rsi), %xmm0 # load second chunk of input + movdqa %xmm0, bounce+16(%rsp) # stash copy on stack + pcmpeqb %xmm1, %xmm0 # NUL in second chunk? + pmovmskb %xmm0, %r9d + shl $16, %r9d + or %r9d, %r8d # merge found NUL bytes into NUL mask + + /* end of string after one buffer */ +0: tzcnt %r8d, %r8d # location of last char in string + movdqu %xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string + lea bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack + lea (%rdi, %r8, 1), %rax # return pointer to NUL byte + + cmp $16, %edx # at least 16 bytes to transfer? 
+ jae .L1631 + + mov (%rsi), %r8 # load string head + cmp $8, %edx # at least 8 bytes to transfer? + jae .L0815 + + cmp $4, %edx # at least 4 bytes to transfer? + jae .L0407 + + movzwl -2(%rsi, %rdx, 1), %esi # load last two bytes of string + mov %r8b, (%rdi, %rcx, 1) # store first byte + + cmp $2, %edx # at least 2 bytes to transfer? + jb .L1 + + mov %si, -2(%rdi, %r10, 1) # store last two bytes of string +.L1: ret + +.L1631: movdqu (%rsi), %xmm0 # load first 16 bytes of string + movdqu -16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string + movdqu %xmm0, (%rdi, %rcx, 1) + movdqu %xmm1, -16(%rdi, %r10, 1) + ret + +.L0815: mov -8(%rsi, %rdx, 1), %rdx # load last 8 bytes of string + mov %r8, (%rdi, %rcx, 1) + mov %rdx, -8(%rdi, %r10, 1) + ret + +.L0407: mov -4(%rsi, %rdx, 1), %edx # load last four bytes of string + mov %r8d, (%rdi, %rcx, 1) + mov %edx, -4(%rdi, %r10, 1) + ret + + /* length 0 buffer: just return dest */ +.L0: mov %rdi, %rax + ret +ARCHEND(__stpncpy, baseline) + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strncpy.c b/lib/libc/amd64/string/strncpy.c new file mode 100644 --- /dev/null +++ b/lib/libc/amd64/string/strncpy.c @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include +#include + +char *__stpncpy(char *restrict, const char *restrict, size_t); + +char * +strncpy(char *restrict dst, const char *restrict src, size_t len) +{ + + __stpncpy(dst, src, len); /* does the copy and the NUL padding; its end pointer is not needed */ + + return (dst); /* strncpy hands back the destination it was given */ +} diff --git a/lib/libc/tests/string/stpncpy_test.c b/lib/libc/tests/string/stpncpy_test.c --- a/lib/libc/tests/string/stpncpy_test.c +++ b/lib/libc/tests/string/stpncpy_test.c @@ -1,7 +1,11 @@ /*- * Copyright (c) 2009 David Schultz + * Copyright (c) 2023 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -28,12 +32,15 @@ #include #include #include +#include #include #include #include #include +static char *(*stpncpy_fn)(char *restrict, const char *restrict, size_t); + static char * makebuf(size_t len, int guard_at_end) { @@ -70,7 +77,7 @@ dst = makebuf(bufsize, j); memset(dst, 'X', bufsize); len = (bufsize < size) ? 
bufsize : size - 1; - assert(stpncpy(dst, src, bufsize) == dst+len); + assert(stpncpy_fn(dst, src, bufsize) == dst+len); assert(memcmp(src, dst, len) == 0); for (x = len; x < bufsize; x++) assert(dst[x] == '\0'); @@ -79,33 +86,97 @@ } } -ATF_TC_WITHOUT_HEAD(nul); -ATF_TC_BODY(nul, tc) +static void +test_sentinel(char *dest, char *src, size_t destlen, size_t srclen) /* fill src, run stpncpy_fn into a sentinel-guarded dest, report the first violation */ { + size_t i; + const char *res, *wantres; + const char *fail = NULL; /* description of the first violation found, if any */ + + for (i = 0; i < srclen; i++) + /* src will never include (){} */ + src[i] = '0' + i; + src[srclen] = '\0'; + + /* source sentinels: not to be copied */ + src[-1] = '('; + src[srclen+1] = ')'; + + memset(dest, 0xee, destlen); + + /* destination sentinels: not to be touched */ + dest[-1] = '{'; + dest[destlen] = '}'; + + wantres = dest + (srclen > destlen ? destlen : srclen); + res = stpncpy_fn(dest, src, destlen); + + if (dest[-1] != '{') + fail = "start sentinel overwritten"; + else if (dest[destlen] != '}') + fail = "end sentinel overwritten"; + else if (strncmp(src, dest, destlen) != 0) + fail = "string not copied correctly"; + else if (res != wantres) + fail = "incorrect return value"; + else for (i = srclen; i < destlen; i++) + if (dest[i] != '\0') { + fail = "incomplete NUL padding"; + break; + } - test_stpncpy(""); + if (fail) /* dest has no NUL when srclen >= destlen, so print at most destlen bytes of it */ + atf_tc_fail_nonfatal("%s\n" + "stpncpy(%p \"%.*s\", %p \"%s\", %zu) = %p (want %p)\n", + fail, dest, (int)destlen, dest, src, src, destlen, res, wantres); } -ATF_TC_WITHOUT_HEAD(foo); -ATF_TC_BODY(foo, tc) +ATF_TC_WITHOUT_HEAD(null); +ATF_TC_BODY(null, tc) { - - test_stpncpy("foo"); + ATF_CHECK_EQ(stpncpy_fn(NULL, NULL, 0), NULL); } -ATF_TC_WITHOUT_HEAD(glorp); -ATF_TC_BODY(glorp, tc) +ATF_TC_WITHOUT_HEAD(bounds); +ATF_TC_BODY(bounds, tc) { + size_t i; + char buf[64+1]; - test_stpncpy("glorp"); + for (i = 0; i < sizeof(buf) - 1; i++) { + buf[i] = ' ' + i; + buf[i+1] = '\0'; + test_stpncpy(buf); + } +} + +ATF_TC_WITHOUT_HEAD(alignments); +ATF_TC_BODY(alignments, tc) +{ + size_t srcalign, destalign, srclen, destlen; + 
char src[15+3+64]; /* 15 offsets + 64 max length + NUL + sentinels */ + char dest[15+2+64]; /* 15 offsets + 64 max length + sentinels */ + + for (srcalign = 0; srcalign < 16; srcalign++) + for (destalign = 0; destalign < 16; destalign++) + for (srclen = 0; srclen < 64; srclen++) + for (destlen = 0; destlen < 64; destlen++) + test_sentinel(dest+destalign+1, + src+srcalign+1, destlen, srclen); } ATF_TP_ADD_TCS(tp) { + void *dl_handle; + + dl_handle = dlopen(NULL, RTLD_LAZY); + stpncpy_fn = dlsym(dl_handle, "test_stpncpy"); /* use a symbol named test_stpncpy when the lookup finds one */ + if (stpncpy_fn == NULL) + stpncpy_fn = stpncpy; /* otherwise test the stpncpy this program is linked against */ - ATF_TP_ADD_TC(tp, nul); - ATF_TP_ADD_TC(tp, foo); - ATF_TP_ADD_TC(tp, glorp); + ATF_TP_ADD_TC(tp, null); + ATF_TP_ADD_TC(tp, bounds); + ATF_TP_ADD_TC(tp, alignments); return (atf_no_error()); } diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd October 23, 2023 +.Dd November 8, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -65,6 +65,7 @@ .It memset Ta Ta S Ta S Ta S .It rindex Ta S Ta Ta S1 Ta S .It stpcpy Ta Ta Ta S1 +.It stpncpy Ta Ta Ta S1 .It strcat Ta Ta Ta S Ta S .It strchr Ta S Ta Ta S1 Ta S .It strchrnul Ta Ta Ta S1 @@ -73,7 +74,7 @@ .It strcspn Ta Ta Ta S2 .It strlen Ta Ta S Ta S1 .It strncmp Ta Ta S Ta S1 Ta S -.It strncpy Ta Ta Ta Ta Ta S2 +.It strncpy Ta Ta Ta S1 Ta Ta S2 .It strnlen Ta Ta Ta S1 .It strrchr Ta S Ta Ta S1 Ta S .It strpbrk Ta Ta Ta S2 @@ -209,7 +210,7 @@ for .Cm powerpc64 and with -.Fx 14.0 +.Fx 14.1 for .Cm amd64 . .Pp