diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index 50c70007e99b..51645ba3b8af 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -1,20 +1,21 @@
 MDSRCS+= \
 	amd64_archlevel.c \
 	bcmp.S \
 	memchr.S \
 	memcmp.S \
 	memcpy.S \
 	memmove.S \
 	memset.S \
 	stpcpy.S \
 	strcat.S \
 	strchrnul.S \
 	strcmp.S \
 	strcpy.c \
 	strcspn.S \
 	strlen.S \
+	strncmp.S \
 	strnlen.c \
 	strpbrk.c \
 	strspn.S \
 	timingsafe_bcmp.S \
 	timingsafe_memcmp.S
diff --git a/lib/libc/amd64/string/strncmp.S b/lib/libc/amd64/string/strncmp.S
new file mode 100644
index 000000000000..932cf078bdfc
--- /dev/null
+++ b/lib/libc/amd64/string/strncmp.S
@@ -0,0 +1,488 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4, 0x90
+
+ARCHFUNCS(strncmp)
+	ARCHFUNC(strncmp, scalar)
+	ARCHFUNC(strncmp, baseline)
+ENDARCHFUNCS(strncmp)
+
+/*
+ * This is just the scalar loop unrolled a bunch of times.
+ */
+ARCHENTRY(strncmp, scalar)
+	xor	%eax, %eax
+	sub	$4, %rdx		# 4 chars left to compare?
+	jbe	1f
+
+	ALIGN_TEXT
+0:	movzbl	(%rdi), %ecx
+	test	%ecx, %ecx		# NUL char in first string?
+	jz	.L0
+	cmpb	(%rsi), %cl		# mismatch between strings?
+	jnz	.L0
+
+	movzbl	1(%rdi), %ecx
+	test	%ecx, %ecx
+	jz	.L1
+	cmpb	1(%rsi), %cl
+	jnz	.L1
+
+	movzbl	2(%rdi), %ecx
+	test	%ecx, %ecx
+	jz	.L2
+	cmpb	2(%rsi), %cl
+	jnz	.L2
+
+	movzbl	3(%rdi), %ecx
+	test	%ecx, %ecx
+	jz	.L3
+	cmpb	3(%rsi), %cl
+	jnz	.L3
+
+	add	$4, %rdi		# advance to next iteration
+	add	$4, %rsi
+	sub	$4, %rdx
+	ja	0b
+
+	/* end of string within the next 4 characters */
+1:	cmp	$-4, %edx		# end of string reached immediately?
+	jz	.Leq
+	movzbl	(%rdi), %ecx
+	test	%ecx, %ecx
+	jz	.L0
+	cmpb	(%rsi), %cl
+	jnz	.L0
+
+	cmp	$-3, %edx		# end of string reached after 1 char?
+	jz	.Leq
+	movzbl	1(%rdi), %ecx
+	test	%ecx, %ecx
+	jz	.L1
+	cmpb	1(%rsi), %cl
+	jnz	.L1
+
+	cmp	$-2, %edx
+	jz	.Leq
+	movzbl	2(%rdi), %ecx
+	test	%ecx, %ecx
+	jz	.L2
+	cmpb	2(%rsi), %cl
+	jnz	.L2
+
+	cmp	$-1, %edx		# either end of string after 3 chars,
+	jz	.Leq			# or it boils down to the last char
+
+.L3:	inc	%eax
+.L2:	inc	%eax
+.L1:	inc	%eax
+.L0:	movzbl	(%rsi, %rax, 1), %ecx
+	movzbl	(%rdi, %rax, 1), %eax
+	sub	%ecx, %eax
+.Leq:	ret
+ARCHEND(strncmp, scalar)
+
+ARCHENTRY(strncmp, baseline)
+	push	%rbx
+	sub	$1, %rdx		# RDX--, so RDX points to the last byte to compare
+	jb	.Lempty			# were there any bytes to compare at all?
+
+	lea	15(%rdi), %r8d		# end of head
+	lea	15(%rsi), %r9d
+	mov	%edi, %eax
+	mov	%esi, %ebx
+	xor	%edi, %r8d		# bits that changed between first and last byte
+	xor	%esi, %r9d
+	and	$~0xf, %rdi		# align heads to 16 bytes
+	and	$~0xf, %rsi
+	or	%r8d, %r9d
+	and	$0xf, %eax		# offset from alignment
+	and	$0xf, %ebx
+	movdqa	(%rdi), %xmm0		# load aligned heads
+	movdqa	(%rsi), %xmm2
+	pxor	%xmm1, %xmm1
+	cmp	$16, %rdx		# end of buffer within the first 32 bytes?
+	jb	.Llt16
+
+	test	$PAGE_SIZE, %r9d	# did the page change?
+	jz	0f			# if not, take fast path
+
+	/* heads may cross page boundary, avoid unmapped loads */
+	movdqa	%xmm0, -32(%rsp)	# stash copies of the heads on the stack
+	movdqa	%xmm2, -16(%rsp)
+	mov	$-1, %r8d
+	mov	$-1, %r9d
+	mov	%eax, %ecx
+	shl	%cl, %r8d		# string head in XMM0
+	mov	%ebx, %ecx
+	shl	%cl, %r9d		# string head in XMM2
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm0, %r10d
+	pmovmskb %xmm2, %r11d
+	test	%r8d, %r10d		# NUL byte present in first string?
+	lea	-32(%rsp), %r8
+	cmovz	%rdi, %r8
+	test	%r9d, %r11d		# NUL byte present in second string?
+	lea	-16(%rsp), %r9
+	cmovz	%rsi, %r9
+	movdqu	(%r8, %rax, 1), %xmm0	# load true (or fake) heads
+	movdqu	(%r9, %rbx, 1), %xmm4
+	jmp	1f
+
+	/* rdx == 0 */
+.Lempty:
+	xor	%eax, %eax		# zero-length buffers compare equal
+	pop	%rbx
+	ret
+
+0:	movdqu	(%rdi, %rax, 1), %xmm0	# load true heads
+	movdqu	(%rsi, %rbx, 1), %xmm4
+1:	pxor	%xmm2, %xmm2
+	pcmpeqb	%xmm0, %xmm2		# NUL byte present?
+	pcmpeqb	%xmm0, %xmm4		# which bytes match?
+	pandn	%xmm4, %xmm2		# match and not NUL byte?
+	pmovmskb %xmm2, %r9d
+	xor	$0xffff, %r9d		# mismatch or NUL byte?
+	jnz	.Lhead_mismatch
+
+	/* load head and second chunk */
+	movdqa	16(%rdi), %xmm2		# load second chunks
+	movdqa	16(%rsi), %xmm3
+	lea	-16(%rdx, %rbx, 1), %rdx # account for length of RSI chunk
+	sub	%rbx, %rax		# is a&0xf >= b&0xf?
+	jb	.Lswapped		# if not, proceed with swapped operands
+	jmp	.Lnormal
+
+	/* buffer ends within the first 16 bytes */
+.Llt16:	test	$PAGE_SIZE, %r9d	# did the page change?
+	jz	0f			# if not, take fast path
+
+	/* heads may cross page boundary */
+	movdqa	%xmm0, -32(%rsp)	# stash copies of the heads on the stack
+	movdqa	%xmm2, -16(%rsp)
+	mov	$-1, %r8d
+	mov	$-1, %r9d
+	mov	%eax, %ecx
+	shl	%cl, %r8d		# string head in XMM0
+	mov	%ebx, %ecx
+	shl	%cl, %r9d		# string head in XMM2
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm0, %r10d
+	pmovmskb %xmm2, %r11d
+	lea	(%rdx, %rax, 1), %ecx	# location of last buffer byte in xmm0
+	bts	%ecx, %r10d		# treat as if NUL byte present
+	lea	(%rdx, %rbx, 1), %ecx
+	bts	%ecx, %r11d
+	test	%r8w, %r10w		# NUL byte present in first string head?
+	lea	-32(%rsp), %r8
+	cmovz	%rdi, %r8
+	test	%r9w, %r11w		# NUL byte present in second string head?
+	lea	-16(%rsp), %r9
+	cmovz	%rsi, %r9
+	movdqu	(%r8, %rax, 1), %xmm0	# load true (or fake) heads
+	movdqu	(%r9, %rbx, 1), %xmm4
+	jmp	1f
+
+0:	movdqu	(%rdi, %rax, 1), %xmm0	# load true heads
+	movdqu	(%rsi, %rbx, 1), %xmm4
+1:	pxor	%xmm2, %xmm2
+	pcmpeqb	%xmm0, %xmm2		# NUL byte present?
+	pcmpeqb	%xmm0, %xmm4		# which bytes match?
+	pandn	%xmm4, %xmm2		# match and not NUL byte?
+	pmovmskb %xmm2, %r9d
+	btr	%edx, %r9d		# induce mismatch in last byte of buffer
+	not	%r9d			# mismatch or NUL byte?
+
+	/* mismatch in true heads */
+	ALIGN_TEXT
+.Lhead_mismatch:
+	tzcnt	%r9d, %r9d		# where is the mismatch?
+	add	%rax, %rdi		# return to true heads
+	add	%rbx, %rsi
+	movzbl	(%rdi, %r9, 1), %eax	# mismatching characters
+	movzbl	(%rsi, %r9, 1), %ecx
+	sub	%ecx, %eax
+	pop	%rbx
+	ret
+
+	/* rax >= 0 */
+	ALIGN_TEXT
+.Lnormal:
+	neg	%rax
+	movdqu	16(%rsi, %rax, 1), %xmm0
+	sub	%rdi, %rsi		# express RSI as distance from RDI
+	lea	(%rsi, %rax, 1), %rbx	# point RBX to offset in second string
+	neg	%rax			# ... corresponding to RDI
+	pcmpeqb	%xmm3, %xmm1		# NUL present?
+	pcmpeqb	%xmm2, %xmm0		# mismatch between chunks?
+	pmovmskb %xmm1, %r8d
+	pmovmskb %xmm0, %r9d
+	mov	$16, %ecx
+	cmp	%rcx, %rdx		# does the buffer end within (RDI,RSI,1)?
+	cmovb	%edx, %ecx		# ECX = min(16, RDX)
+	add	$32, %rdi		# advance to next iteration
+	bts	%ecx, %r8d		# mark end-of-buffer as if there was a NUL byte
+	test	%r8w, %r8w		# NUL or end of buffer found?
+	jnz	.Lnul_found2
+	xor	$0xffff, %r9d
+	jnz	.Lmismatch2
+	sub	$48, %rdx		# end of buffer within first main loop iteration?
+	jb	.Ltail			# if yes, process tail
+
+	/*
+	 * During the main loop, the layout of the two strings is something like:
+	 *
+	 *      v ------1------ v ------2------ v
+	 * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+	 * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+	 *
+	 * where v indicates the alignment boundaries and corresponding chunks
+	 * of the strings have the same letters.  Chunk A has been checked in
+	 * the previous iteration.  This iteration, we first check that string
+	 * RSI doesn't end within region 2, then we compare chunk B between the
+	 * two strings.  As RSI is known not to hold a NUL byte in regions 1
+	 * and 2 at this point, this also ensures that RDI has not ended yet.
+	 */
+	ALIGN_TEXT
+0:	movdqu	(%rdi, %rbx, 1), %xmm0	# chunk of 2nd string corresponding to RDI
+	pxor	%xmm1, %xmm1
+	pcmpeqb	(%rdi, %rsi, 1), %xmm1	# end of string in RSI?
+	pcmpeqb	(%rdi), %xmm0		# where do the chunks match?
+	pmovmskb %xmm1, %r8d
+	pmovmskb %xmm0, %r9d
+	test	%r8d, %r8d
+	jnz	.Lnul_found
+	xor	$0xffff, %r9d		# any mismatches?
+	jnz	.Lmismatch
+
+	/* main loop unrolled twice */
+	movdqu	16(%rdi, %rbx, 1), %xmm0
+	pxor	%xmm1, %xmm1
+	pcmpeqb	16(%rdi, %rsi, 1), %xmm1
+	pcmpeqb	16(%rdi), %xmm0
+	pmovmskb %xmm1, %r8d
+	pmovmskb %xmm0, %r9d
+	add	$32, %rdi
+	test	%r8d, %r8d
+	jnz	.Lnul_found2
+	xor	$0xffff, %r9d
+	jnz	.Lmismatch2
+	sub	$32, %rdx		# end of buffer within next iteration?
+	jae	0b
+
+	/* end of buffer will occur in next 32 bytes */
+.Ltail:	movdqu	(%rdi, %rbx, 1), %xmm0	# chunk of 2nd string corresponding to RDI
+	pxor	%xmm1, %xmm1
+	pcmpeqb	(%rdi, %rsi, 1), %xmm1	# end of string in RSI?
+	pcmpeqb	(%rdi), %xmm0		# where do the chunks match?
+	pmovmskb %xmm1, %r8d
+	pmovmskb %xmm0, %r9d
+	bts	%edx, %r8d		# indicate NUL byte at last byte in buffer
+	test	%r8w, %r8w		# NUL byte in first chunk?
+	jnz	.Lnul_found
+	xor	$0xffff, %r9d		# any mismatches?
+	jnz	.Lmismatch
+
+	/* main loop unrolled twice */
+	movdqu	16(%rdi, %rbx, 1), %xmm0
+	pxor	%xmm1, %xmm1
+	pcmpeqb	16(%rdi, %rsi, 1), %xmm1
+	pcmpeqb	16(%rdi), %xmm0
+	pmovmskb %xmm1, %r8d
+	pmovmskb %xmm0, %r9d
+	sub	$16, %edx		# take first half into account
+	bts	%edx, %r8d		# indicate NUL byte at last byte in buffer
+	add	$32, %rdi
+
+.Lnul_found2:
+	sub	$16, %rdi
+
+.Lnul_found:
+	mov	%eax, %ecx
+	mov	%r8d, %r10d
+	shl	%cl, %r8d		# adjust NUL mask to positions in RDI/RBX
+	not	%r9d			# mask of mismatches
+	or	%r8w, %r9w		# NUL bytes also count as mismatches
+	jnz	.Lmismatch
+
+	/*
+	 * (RDI) == (RSI) and NUL is past the string.
+	 * Compare (RSI) with the corresponding part
+	 * of the other string until the NUL byte.
+	 */
+	movdqu	(%rdi, %rax, 1), %xmm0
+	pcmpeqb	(%rdi, %rsi, 1), %xmm0
+	add	%rdi, %rsi		# restore RSI pointer
+	add	%rax, %rdi		# point RDI to chunk corresponding to (RSI)
+	pmovmskb %xmm0, %ecx		# mask of matches
+	not	%ecx			# mask of mismatches
+	or	%r10d, %ecx		# mask of mismatches or NUL bytes
+	tzcnt	%ecx, %ecx		# location of first mismatch
+	movzbl	(%rdi, %rcx, 1), %eax
+	movzbl	(%rsi, %rcx, 1), %ecx
+	sub	%ecx, %eax
+	pop	%rbx
+	ret
+
+.Lmismatch2:
+	sub	$16, %rdi
+
+	/* a mismatch has been found between RBX and RSI */
+.Lmismatch:
+	tzcnt	%r9d, %r9d		# where is the mismatch?
+	add	%rdi, %rbx		# turn RBX from offset into pointer
+	movzbl	(%rbx, %r9, 1), %ecx
+	movzbl	(%rdi, %r9, 1), %eax
+	sub	%ecx, %eax
+	pop	%rbx
+	ret
+
+	/* rax < 0 */
+	ALIGN_TEXT
+.Lswapped:
+	movdqu	16(%rdi, %rax, 1), %xmm0
+	sub	%rsi, %rdi		# express RDI as distance from RSI
+	lea	(%rdi, %rax, 1), %rbx	# point RBX to offset in first string
+	pcmpeqb	%xmm2, %xmm1		# NUL present?
+	pcmpeqb	%xmm3, %xmm0		# mismatch between chunks?
+	pmovmskb %xmm1, %r8d
+	pmovmskb %xmm0, %r9d
+	add	%rax, %rdx		# RDX points to buffer end in RSI
+	neg	%rax			# ... corresponding to RSI
+	mov	$16, %ecx
+	cmp	%rcx, %rdx		# does the buffer end within (RSI,RDI,1)?
+	cmovb	%edx, %ecx		# ECX = min(16, RDX)
+	add	$32, %rsi
+	bts	%ecx, %r8d		# mark end-of-buffer as if there was a NUL byte
+	test	%r8w, %r8w		# NUL or end of buffer found?
+	jnz	.Lnul_found2s
+	xor	$0xffff, %r9d
+	jnz	.Lmismatch2s
+	sub	$48, %rdx		# end of buffer within first main loop iteration?
+	jb	.Ltails			# if yes, process tail
+
+	ALIGN_TEXT
+0:	movdqu	(%rsi, %rbx, 1), %xmm0	# chunk of 1st string corresponding to RSI
+	pxor	%xmm1, %xmm1
+	pcmpeqb	(%rsi, %rdi, 1), %xmm1	# end of string in RDI?
+	pcmpeqb	(%rsi), %xmm0		# where do the chunks match?
+	pmovmskb %xmm1, %r8d
+	pmovmskb %xmm0, %r9d
+	test	%r8d, %r8d
+	jnz	.Lnul_founds
+	xor	$0xffff, %r9d		# any mismatches?
+	jnz	.Lmismatchs
+
+	/* main loop unrolled twice */
+	movdqu	16(%rsi, %rbx, 1), %xmm0
+	pxor	%xmm1, %xmm1
+	pcmpeqb	16(%rsi, %rdi, 1), %xmm1
+	pcmpeqb	16(%rsi), %xmm0
+	pmovmskb %xmm1, %r8d
+	pmovmskb %xmm0, %r9d
+	add	$32, %rsi
+	test	%r8d, %r8d
+	jnz	.Lnul_found2s
+	xor	$0xffff, %r9d
+	jnz	.Lmismatch2s
+	sub	$32, %rdx		# end of buffer within next iteration?
+	jae	0b
+
+	/* end of buffer will occur in next 32 bytes */
+.Ltails:
+	movdqu	(%rsi, %rbx, 1), %xmm0	# chunk of 1st string corresponding to RSI
+	pxor	%xmm1, %xmm1
+	pcmpeqb	(%rsi, %rdi, 1), %xmm1	# end of string in RDI?
+	pcmpeqb	(%rsi), %xmm0		# where do the chunks match?
+	pmovmskb %xmm1, %r8d
+	pmovmskb %xmm0, %r9d
+	bts	%edx, %r8d		# indicate NUL byte at last byte in buffer
+	test	%r8w, %r8w		# NUL byte in first chunk?
+	jnz	.Lnul_founds
+	xor	$0xffff, %r9d		# any mismatches?
+	jnz	.Lmismatchs
+
+	/* main loop unrolled twice */
+	movdqu	16(%rsi, %rbx, 1), %xmm0
+	pxor	%xmm1, %xmm1
+	pcmpeqb	16(%rsi, %rdi, 1), %xmm1
+	pcmpeqb	16(%rsi), %xmm0
+	pmovmskb %xmm1, %r8d
+	pmovmskb %xmm0, %r9d
+	sub	$16, %edx		# take first half into account
+	bts	%edx, %r8d		# indicate NUL byte at last byte in buffer
+	add	$32, %rsi
+
+.Lnul_found2s:
+	sub	$16, %rsi
+
+.Lnul_founds:
+	mov	%eax, %ecx
+	mov	%r8d, %r10d
+	shl	%cl, %r8d		# adjust NUL mask to positions in RSI/RBX
+	not	%r9d			# mask of mismatches
+	or	%r8w, %r9w		# NUL bytes also count as mismatches
+	jnz	.Lmismatchs
+
+	movdqu	(%rsi, %rax, 1), %xmm0
+	pcmpeqb	(%rsi, %rdi, 1), %xmm0
+	add	%rsi, %rdi		# restore RDI pointer
+	add	%rax, %rsi		# point RSI to chunk corresponding to (RDI)
+	pmovmskb %xmm0, %ecx		# mask of matches
+	not	%ecx			# mask of mismatches
+	or	%r10d, %ecx		# mask of mismatches or NUL bytes
+	tzcnt	%ecx, %ecx		# location of first mismatch
+	movzbl	(%rdi, %rcx, 1), %eax
+	movzbl	(%rsi, %rcx, 1), %ecx
+	sub	%ecx, %eax
+	pop	%rbx
+	ret
+
+.Lmismatch2s:
+	sub	$16, %rsi
+
+.Lmismatchs:
+	tzcnt	%r9d, %r9d		# where is the mismatch?
+	add	%rsi, %rbx		# turn RBX from offset into pointer
+	movzbl	(%rbx, %r9, 1), %eax
+	movzbl	(%rsi, %r9, 1), %ecx
+	sub	%ecx, %eax
+	pop	%rbx
+	ret
+ARCHEND(strncmp, baseline)
+
+	.section .note.GNU-stack,"",%progbits
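
For review convenience, here is the behavior both implementations must preserve, written out in C. This is our reference sketch, not part of the patch, and the name strncmp_ref is ours: strncmp(3) compares at most n bytes, stops at the first NUL, and returns the difference of the first mismatching pair, both bytes taken as unsigned char.

	#include <stddef.h>

	/* reference semantics of strncmp(3); illustration only */
	int
	strncmp_ref(const char *s1, const char *s2, size_t n)
	{
		for (; n > 0; s1++, s2++, n--) {
			if (*s1 != *s2)
				return ((unsigned char)*s1 - (unsigned char)*s2);
			if (*s1 == '\0')
				return (0);
		}

		return (0);		/* length limit hit, no mismatch */
	}

The scalar entry point is this loop unrolled four ways; the movzbl/sub sequence at its .L0 tail computes the same unsigned-char difference as the casts above.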
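
The subtlest part of the baseline version is the head comparison: an unaligned 16-byte load from the string start may cross into the next page, which can be unmapped. The code therefore begins with an aligned (and hence fault-free) load; only when that aligned chunk contains a NUL within the string's own bytes does it redirect the head load to a stashed on-stack copy, because only then may the following page be unmapped. If no NUL is present there, the string demonstrably continues into the next page, so the unaligned load is safe. A sketch of that idea with SSE2 intrinsics follows; load_head and PAGE_MASK are our names for this illustration, and the assembly additionally handles both strings and the length limit at once.

	#include <emmintrin.h>
	#include <stdint.h>

	#define	PAGE_MASK	0xfff		/* assumes 4 KiB base pages */

	/* load 16 bytes starting at s without faulting past the string's page */
	static __m128i
	load_head(const char *s)
	{
		uintptr_t	addr = (uintptr_t)s, off = addr & 0xf;
		char		buf[32] = { 0 };
		__m128i		head;
		unsigned	nul, valid;

		/* fast path: [s, s+15] lies within a single page */
		if (((addr ^ (addr + 15)) & ~(uintptr_t)PAGE_MASK) == 0)
			return (_mm_loadu_si128((const __m128i *)s));

		/* aligned load cannot fault; find NULs among the string's bytes */
		head = _mm_load_si128((const __m128i *)(addr - off));
		nul = _mm_movemask_epi8(_mm_cmpeq_epi8(head, _mm_setzero_si128()));
		valid = 0xffffu << off;		/* mask of bytes at or after s */
		if ((nul & valid) == 0)		/* string crosses into the next */
			return (_mm_loadu_si128((const __m128i *)s)); /* page, so it is mapped */

		/* string ends before the boundary: read the head from a stack copy */
		_mm_storeu_si128((__m128i *)buf, head);
		return (_mm_loadu_si128((const __m128i *)(buf + off)));
	}

The cmovz pair in the assembly picks between the true head and the stack copy branchlessly, which is the same selection this sketch makes with its if/else.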