diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index d982061e080b..03bca498e116 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -1,25 +1,26 @@
 MDSRCS+= \
 	amd64_archlevel.c \
 	bcmp.S \
 	memchr.S \
 	memcmp.S \
 	memcpy.S \
 	memmove.S \
 	memset.S \
 	stpcpy.S \
 	stpncpy.S \
 	strcat.S \
 	strchrnul.S \
 	strcmp.S \
 	strcpy.c \
 	strcspn.S \
+	strlcpy.S \
 	strlen.S \
 	strncmp.S \
 	strncpy.c \
 	strnlen.c \
 	strpbrk.c \
 	strrchr.S \
 	strsep.c \
 	strspn.S \
 	timingsafe_bcmp.S \
 	timingsafe_memcmp.S
diff --git a/lib/libc/amd64/string/strlcpy.S b/lib/libc/amd64/string/strlcpy.S
new file mode 100644
index 000000000000..2b32c6c78047
--- /dev/null
+++ b/lib/libc/amd64/string/strlcpy.S
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4, 0x90
+
+/*
+ * size_t strlcpy(char *dst, const char *src, size_t dstsize)
+ *
+ * Copies at most dstsize - 1 bytes of the NUL-terminated string src to
+ * dst, NUL-terminates dst unless dstsize is 0, and returns strlen(src).
+ *
+ * In:	%rdi = dst, %rsi = src, %rdx = dstsize
+ * Out:	%rax = length of src
+ *
+ * The ARCHFUNCS block below lists two variants, "scalar" and "baseline";
+ * the macros come from amd64_archlevel.h, which is not part of this file.
+ */
+	.weak	strlcpy
+	.set	strlcpy, __strlcpy
+ARCHFUNCS(__strlcpy)
+	ARCHFUNC(__strlcpy, scalar)
+	ARCHFUNC(__strlcpy, baseline)
+ENDARCHFUNCS(__strlcpy)
+
+/*
+ * Scalar variant: calls strlen(src), then memcpy() with
+ * min(strlen(src), dstsize - 1) bytes.  %rbx carries the source length
+ * across the memcpy() call and src is reloaded from -8(%rbp).
+ */
+ARCHENTRY(__strlcpy, scalar)
+	push	%rbp			# establish stack frame
+	mov	%rsp, %rbp
+	push	%rsi			# src, reloaded from -8(%rbp) below
+	push	%rbx			# callee-saved, holds the return value
+	push	%rdi			# preserve dst across strlen()
+	push	%rdx			# preserve dstsize across strlen()
+	mov	%rsi, %rdi
+	call	CNAME(strlen)		# strlen(src)
+	pop	%rdx
+	pop	%rdi
+	mov	-8(%rbp), %rsi		# reload src
+	mov	%rax, %rbx		# remember string length for return value
+	sub	$1, %rdx		# do not copy into the final byte of the buffer
+	jc	0f			# skip copying altogether if buffer was empty
+	cmp	%rax, %rdx		# is the buffer longer than the input?
+	cmova	%rax, %rdx		# if yes, only copy the part that fits
+	movb	$0, (%rdi, %rdx, 1)	# NUL-terminate output buffer
+	call	CNAME(memcpy)		# copy string to output
+0:	mov	%rbx, %rax		# restore return value
+	pop	%rbx
+	leave
+	ret
+ARCHEND(__strlcpy, scalar)
+
+/*
+ * Baseline variant: scans src in aligned 16 byte chunks with
+ * pcmpeqb/pmovmskb while copying.  Register roles after the prologue:
+ *
+ *	%r9	original src pointer
+ *	%rsi	src rounded down to a 16 byte boundary, then the current chunk
+ *	%ecx	src & 0xf, i.e. the misalignment of src
+ *	%rdx	dstsize - 1, then the buffer space left
+ *	%rdi	dst; in the main loop the distance from %rsi to the destination
+ *	%xmm1	zero, or the result of comparing a chunk against zero
+ */
+ARCHENTRY(__strlcpy, baseline)
+	sub		$1, %rdx		# do not count NUL byte in buffer length
+	jb		.L0			# go to special code path if len was 0
+
+	mov		%esi, %ecx
+	pxor		%xmm1, %xmm1
+	mov		%rsi, %r9		# stash a copy of the source pointer for later
+	and		$~0xf, %rsi
+	pcmpeqb		(%rsi), %xmm1		# NUL found in head?
+	mov		$-1, %r8d
+	and		$0xf, %ecx
+	shl		%cl, %r8d		# mask of bytes in the string
+	pmovmskb	%xmm1, %eax
+	and		%r8d, %eax
+	jnz		.Lhead_nul
+
+	movdqa		16(%rsi), %xmm3		# load second string chunk
+	movdqu		(%r9), %xmm2		# load unaligned string head
+	mov		$32, %r8d
+	sub		%ecx, %r8d		# head length + length of second chunk
+	pxor		%xmm1, %xmm1
+	pcmpeqb		%xmm3, %xmm1		# NUL found in second chunk?
+
+	sub		%r8, %rdx		# enough space left for the second chunk?
+	jbe		.Lhead_buf_end
+
+	/* process second chunk */
+	pmovmskb	%xmm1, %eax
+	test		%eax, %eax
+	jnz		.Lsecond_nul
+
+	/* string didn't end in second chunk and neither did buffer -- not a runt! */
+	movdqa		32(%rsi), %xmm0		# load next string chunk
+	pxor		%xmm1, %xmm1
+	movdqu		%xmm2, (%rdi)		# deposit head into buffer
+	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
+	movdqu		%xmm3, 16(%rdi)		# deposit second chunk
+	sub		%rsi, %rdi		# express RDI as distance from RSI
+	add		$32, %rsi		# advance RSI past first two chunks
+	sub		$16, %rdx		# enough left for another round?
+	jbe		1f
+
+	/* main loop unrolled twice */
+	ALIGN_TEXT
+0:	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
+	pmovmskb	%xmm1, %eax
+	test		%eax, %eax
+	jnz		3f
+
+	movdqu		%xmm0, (%rsi, %rdi)
+	movdqa		16(%rsi), %xmm0		# load next string chunk
+	pxor		%xmm1, %xmm1
+	cmp		$16, %rdx		# more than a full chunk left?
+	jbe		2f
+
+	add		$32, %rsi		# advance pointers to next chunk
+	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
+	pmovmskb	%xmm1, %eax
+	test		%eax, %eax
+	jnz		4f
+
+	movdqu		%xmm0, -16(%rsi, %rdi)
+	movdqa		(%rsi), %xmm0		# load next string chunk
+	pxor		%xmm1, %xmm1
+	sub		$32, %rdx
+	ja		0b
+
+1:	sub		$16, %rsi		# undo second advancement
+	add		$16, %edx
+
+	/* 1--16 bytes left in the buffer but string has not ended yet */
+2:	pcmpeqb		%xmm1, %xmm0		# NUL byte encountered?
+	pmovmskb	%xmm0, %r8d
+	mov		%r8d, %eax
+	bts		%edx, %r8d		# treat end of buffer as end of string
+	tzcnt		%r8d, %r8d		# find tail length
+	add		%rsi, %rdi		# restore RDI
+	movdqu		(%rsi, %r8, 1), %xmm0	# load string tail
+	movdqu		%xmm0, (%rdi, %r8, 1)	# store string tail
+	movb		$0, 16(%rdi, %r8, 1)	# NUL terminate
+
+	/* continue to find the end of the string */
+	test		%eax, %eax		# end of string already reached?
+	jnz		1f
+
+	ALIGN_TEXT
+0:	pcmpeqb		32(%rsi), %xmm1
+	pmovmskb	%xmm1, %eax
+	pxor		%xmm1, %xmm1
+	test		%eax, %eax
+	jnz		2f
+
+	pcmpeqb		48(%rsi), %xmm1
+	pmovmskb	%xmm1, %eax
+	add		$32, %rsi
+	pxor		%xmm1, %xmm1
+	test		%eax, %eax
+	jz		0b
+
+1:	sub		$16, %rsi		# undo second advancement
+2:	tzcnt		%eax, %eax		# where is the NUL byte?
+	sub		%r9, %rsi
+	lea		32(%rsi, %rax, 1), %rax	# return string length
+	ret
+
+4:	sub		$16, %rsi		# undo second advancement
+	add		$16, %rdx		# restore number of remaining bytes
+
+	/* string has ended but buffer has not */
+3:	tzcnt		%eax, %eax		# find length of string tail
+	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
+	add		%rsi, %rdi		# restore destination pointer
+	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
+	sub		%r9, %rsi		# string length to current chunk
+	add		%rsi, %rax		# plus length of current chunk
+	ret
+
+.Lhead_buf_end:
+	pmovmskb	%xmm1, %r8d
+	add		$32, %edx		# restore edx to (len-1) + ecx
+	mov		%r8d, %eax
+	shl		$16, %r8d		# place 2nd chunk NUL mask into bits 16--31
+	bts		%rdx, %r8		# treat end of buffer as end of string
+	tzcnt		%r8, %rdx		# find string/buffer len from alignment boundary
+	sub		%ecx, %edx		# find actual string/buffer len
+	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator
+
+	/* continue to find the end of the string */
+	test		%eax, %eax		# end of string already reached?
+	jnz		1f
+
+	ALIGN_TEXT
+0:	pcmpeqb		32(%rsi), %xmm1
+	pmovmskb	%xmm1, %eax
+	pxor		%xmm1, %xmm1
+	test		%eax, %eax
+	jnz		2f
+
+	pcmpeqb		48(%rsi), %xmm1
+	pmovmskb	%xmm1, %eax
+	add		$32, %rsi
+	pxor		%xmm1, %xmm1
+	test		%eax, %eax
+	jz		0b
+
+1:	sub		$16, %rsi
+2:	tzcnt		%eax, %eax
+	sub		%r9, %rsi
+	lea		32(%rsi, %rax, 1), %rax	# return string length
+	jmp		.L0031
+
+.Lsecond_nul:
+	add		%r8, %rdx		# restore buffer length
+	tzcnt		%eax, %eax		# where is the NUL byte?
+	lea		-16(%rcx), %r8d
+	sub		%r8d, %eax		# string length
+	cmp		%rax, %rdx		# is the string shorter than the buffer?
+	cmova		%rax, %rdx		# copy only min(buflen, srclen) bytes
+	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator
+.L0031:	cmp		$16, %rdx		# at least 16 bytes to copy (not incl NUL)?
+	jb		.L0015
+
+	/* copy 16--32 bytes */
+	movdqu		(%r9), %xmm0		# load first 16 bytes
+	movdqu		-16(%r9, %rdx, 1), %xmm1 # load last 16 bytes
+	movdqu		%xmm0, (%rdi)
+	movdqu		%xmm1, -16(%rdi, %rdx, 1)
+	ret
+
+.Lhead_nul:
+	tzcnt		%eax, %eax		# where is the NUL byte?
+	sub		%ecx, %eax		# ... from the beginning of the string?
+	cmp		%rax, %rdx		# is the string shorter than the buffer?
+	cmova		%rax, %rdx		# copy only min(buflen, srclen) bytes
+	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator
+
+	/* process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen) */
+.L0015:	cmp		$8, %rdx		# at least 8 bytes to copy?
+	jae		.L0815
+
+	cmp		$4, %rdx		# at least 4 bytes to copy?
+	jae		.L0407
+
+	cmp		$2, %rdx		# at least 2 bytes to copy?
+	jae		.L0203
+
+	movzbl		(%r9), %ecx		# load first byte from src
+	mov		%cl, (%rdi)		# deposit into destination
+	movb		$0, (%rdi, %rdx, 1)	# add NUL terminator (again)
+	ret
+
+.L0203:	movzwl		(%r9), %ecx
+	movzwl		-2(%r9, %rdx, 1), %esi
+	mov		%cx, (%rdi)
+	mov		%si, -2(%rdi, %rdx, 1)
+	ret
+
+.L0407:	mov		(%r9), %ecx
+	mov		-4(%r9, %rdx, 1), %esi
+	mov		%ecx, (%rdi)
+	mov		%esi, -4(%rdi, %rdx, 1)
+	ret
+
+.L0815:	mov		(%r9), %rcx
+	mov		-8(%r9, %rdx, 1), %rsi
+	mov		%rcx, (%rdi)
+	mov		%rsi, -8(%rdi, %rdx, 1)
+	ret
+
+	/* length zero destination: just return the string length */
+.L0:	mov		%rsi, %rdi
+	jmp		CNAME(strlen)
+ARCHEND(__strlcpy, baseline)
+
+	.section .note.GNU-stack,"",%progbits