diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -6,6 +6,7 @@ memccpy.S \ memcpy.S \ memmove.S \ + memrchr.S \ memset.S \ stpcpy.S \ stpncpy.S \ diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S new file mode 100644 --- /dev/null +++ b/lib/libc/amd64/string/memrchr.S @@ -0,0 +1,166 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + +ARCHFUNCS(memrchr) + ARCHFUNC(memrchr, scalar) + ARCHFUNC(memrchr, baseline) +ENDARCHFUNCS(memrchr) + +ARCHENTRY(memrchr, scalar) + xor %eax, %eax # prospective return value + sub $4, %rdx # 4 bytes left to process? + jb 1f + + ALIGN_TEXT +0: xor %r8, %r8 + lea 2(%rdi), %r10 + cmp %sil, 2(%rdi) + cmovne %r8, %r10 # point to null if no match + + cmp %sil, (%rdi) + cmove %rdi, %r8 # point to first char if match + + lea 1(%rdi), %r9 + cmp %sil, 1(%rdi) + cmovne %r8, %r9 # point to first result if no match in second + + lea 3(%rdi), %r11 + cmp %sil, 3(%rdi) + cmovne %r10, %r11 + + test %r11, %r11 + cmovz %r9, %r11 # take first pair match if none in second + + test %r11, %r11 + cmovnz %r11, %rax # take match in current set if any + + add $4, %rdi + sub $4, %rdx + jae 0b + +1: cmp $-3, %edx # a least one character left to process? + jb 2f + + cmp %sil, (%rdi) + cmove %rdi, %rax + + lea 1(%rdi), %rcx + cmp $-2, %edx # at least two characters left to process? + jb 2f + + cmp %sil, 1(%rdi) + cmove %rcx, %rax + + lea 2(%rdi), %rcx + cmp $-1, %edx # at least three character left to process? + jb 2f + + cmp %sil, 2(%rdi) + cmove %rcx, %rax + +2: ret +ARCHEND(memrchr, scalar) + +ARCHENTRY(memrchr, baseline) + movd %esi, %xmm4 + test %rdx, %rdx # empty buffer? + jz .L0 # if yes, return immediately + + punpcklbw %xmm4, %xmm4 # c -> cc + mov %edi, %ecx + punpcklwd %xmm4, %xmm4 # cc -> cccc + and $~0xf, %rdi # align source pointer + pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc + and $0xf, %ecx + movdqa %xmm4, %xmm0 + mov $-1, %r8d + pcmpeqb (%rdi), %xmm0 # compare aligned head + shl %cl, %r8d # mask of bytes in the head of the buffer + pmovmskb %xmm0, %eax + + sub $16, %rcx + and %r8d, %eax # match mask + add %rcx, %rdx # advance past head + cmc + jbe .Lrunt # did the string end in the buffer? + + mov %rdi, %rsi # pointer to matching chunk + add $16, %rdi + sub $16, %rdx # enough left for another round? + jbe 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa %xmm4, %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %r8d + + cmp $16, %rdx # enough left for second chunk? + jbe 2f + + movdqa %xmm4, %xmm0 + pcmpeqb 16(%rdi), %xmm0 + pmovmskb %xmm0, %ecx + + lea 16(%rdi), %r9 + test %ecx, %ecx # match found in second chunk? + cmovz %r8d, %ecx # if not, use match data from first chunk + cmovz %rdi, %r9 + + test %ecx, %ecx # any match found? + cmovnz %ecx, %eax # if yes, overwrite previously found match + cmovnz %r9, %rsi + + add $32, %rdi # advance to next iteration + sub $32, %rdx # advance to next chunks + ja 0b + + /* process remaining 1--16 bytes */ +1: pcmpeqb (%rdi), %xmm4 + mov $0xffff, %r8d + xor %ecx, %ecx + sub %edx, %ecx # number of bytes to be masked out + pmovmskb %xmm4, %r9d + shr %cl, %r8d # mask of bytes to be kept in the buffer + and %r9d, %r8d + cmovnz %r8d, %eax + cmovnz %rdi, %rsi + bsr %eax, %eax + lea (%rsi, %rax, 1), %rsi # pointer to match (or junk) + cmovnz %rsi, %rax # if any match was found, return it + ret + + /* end of chunk reached within first half iteration */ +2: test %r8d, %r8d # match in previous chunk? + cmovnz %r8d, %eax # if yes, overwrite previous chunks + cmovnz %rdi, %rsi + add $16, %rdi # point to tail + sub $16, %edx + jmp 1b # handle tail the same otherwise + + /* runt: string ends within head, edx has negated amount of invalid head bytes */ +.Lrunt: mov $0xffff, %r8d + xor %ecx, %ecx + sub %edx, %ecx + shr %cl, %r8d + and %r8d, %eax + bsr %eax, %eax + lea (%rdi, %rax, 1), %rdi + cmovnz %rdi, %rax + ret + + /* empty buffer: return a null pointer */ +.L0: xor %eax, %eax + ret +ARCHEND(memrchr, baseline) + + .section .note.GNU-stack, "", %progbits diff --git a/lib/libc/tests/string/Makefile b/lib/libc/tests/string/Makefile --- a/lib/libc/tests/string/Makefile +++ b/lib/libc/tests/string/Makefile @@ -11,6 +11,7 @@ ATF_TESTS_C+= flsll_test ATF_TESTS_C+= memccpy_test ATF_TESTS_C+= memcmp_test +ATF_TESTS_C+= memrchr_test ATF_TESTS_C+= memset_s_test ATF_TESTS_C+= strncmp_test ATF_TESTS_C+= stpncpy_test diff --git a/lib/libc/tests/string/memrchr_test.c b/lib/libc/tests/string/memrchr_test.c new file mode 100644 --- /dev/null +++ b/lib/libc/tests/string/memrchr_test.c @@ -0,0 +1,116 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include + +#include +#include +#include + +#include + +static void *(*memrchr_fn)(const void *, int, size_t); + +ATF_TC_WITHOUT_HEAD(null); +ATF_TC_BODY(null, tc) +{ + ATF_CHECK_EQ(memrchr_fn(NULL, 42, 0), NULL); +} + +ATF_TC_WITHOUT_HEAD(not_found); +ATF_TC_BODY(not_found, tc) +{ + size_t i, j; + char buf[1+15+64+1]; /* offset [0..15] + 64 buffer bytes + sentinels */ + + buf[0] = 'X'; + memset(buf + 1, '-', sizeof(buf) - 1); + + for (i = 0; i < 16; i++) + for (j = 0; j < 64; j++) { + buf[i + j + 1] = 'X'; + ATF_CHECK_EQ(memrchr_fn(buf + i + 1, 'X', j), NULL); + buf[i + j + 1] = '-'; + } +} + +static void +do_found_test(char buf[], size_t len, size_t first, size_t second) +{ + /* invariant: first <= second */ + + buf[first] = 'X'; + buf[second] = 'X'; + ATF_CHECK_EQ(memrchr_fn(buf, 'X', len), buf + second); + buf[first] = '-'; + buf[second] = '-'; +} + +ATF_TC_WITHOUT_HEAD(found); +ATF_TC_BODY(found, tc) +{ + size_t i, j, k, l; + char buf[1+15+64+1]; + + buf[0] = 'X'; + memset(buf + 1, '-', sizeof(buf) - 1); + + for (i = 0; i < 16; i++) + for (j = 0; j < 64; j++) + for (k = 0; k < j; k++) + for (l = 0; l <= k; l++) { + buf[i + j + 1] = 'X'; + do_found_test(buf + i + 1, j, l, k); + buf[i + j + 1] = '-'; + } +} + +/* check that the right character is found */ +static void +do_values_test(unsigned char buf[], size_t len, size_t i, int c) +{ + /* sentinels */ + buf[-1] = c; + buf[len] = c; + memset(buf, c + 1, len); + + if (i < len) { + buf[i] = c; + ATF_CHECK_EQ(memrchr_fn(buf, c, len), buf + i); + } else + ATF_CHECK_EQ(memrchr_fn(buf, c, len), NULL); +} + +ATF_TC_WITHOUT_HEAD(values); +ATF_TC_BODY(values, tc) +{ + size_t i, j, k; + int c; + unsigned char buf[1+15+64+1]; + + for (i = 0; i < 16; i++) + for (j = 0; j < 64; j++) + for (k = 0; k <= j; k++) + for (c = 0; c <= UCHAR_MAX; c++) + do_values_test(buf + i + 1, j, k, c); +} + +ATF_TP_ADD_TCS(tp) +{ + void *dl_handle; + + dl_handle = dlopen(NULL, RTLD_LAZY); + memrchr_fn = dlsym(dl_handle, "test_memrchr"); + if (memrchr_fn == NULL) + memrchr_fn = memrchr; + + ATF_TP_ADD_TC(tp, null); + ATF_TP_ADD_TC(tp, not_found); + ATF_TP_ADD_TC(tp, found); + ATF_TP_ADD_TC(tp, values); + + return (atf_no_error()); +} diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd December 4, 2023 +.Dd December 6, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -63,6 +63,7 @@ .It memccpy Ta Ta Ta S1 .It memcpy Ta S Ta S Ta S1 Ta S Ta SV .It memmove Ta S Ta S Ta S1 Ta S Ta SV +.It memrchr Ta Ta Ta S1 .It memset Ta S Ta S Ta S Ta S .It rindex Ta S Ta Ta S1 Ta S .It stpcpy Ta S Ta Ta S1