diff --git a/lib/libc/amd64/string/memcmp.S b/lib/libc/amd64/string/memcmp.S --- a/lib/libc/amd64/string/memcmp.S +++ b/lib/libc/amd64/string/memcmp.S @@ -1,9 +1,12 @@ /*- - * Copyright (c) 2018 The FreeBSD Foundation + * Copyright (c) 2018, 2023 The FreeBSD Foundation * * This software was developed by Mateusz Guzik * under sponsorship from the FreeBSD Foundation. * + * Portions of this software were developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -29,7 +32,9 @@ */ #include -__FBSDID("$FreeBSD$"); +#include + +#include "amd64_archlevel.h" /* * Note: this routine was written with kernel use in mind (read: no simd), @@ -40,10 +45,15 @@ #define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ #ifdef BCMP -ENTRY(bcmp) -#else -ENTRY(memcmp) +#define memcmp bcmp #endif + +ARCHFUNCS(memcmp) + ARCHFUNC(memcmp, scalar) + ARCHFUNC(memcmp, baseline) +ENDARCHFUNCS(memcmp) + +ARCHENTRY(memcmp, scalar) xorl %eax,%eax 10: cmpq $16,%rdx @@ -161,7 +171,6 @@ 1: leal 1(%eax),%eax ret -END(bcmp) #else /* * We need to compute the difference between strings. @@ -234,7 +243,165 @@ 2: subl %r8d,%eax ret -END(memcmp) #endif +ARCHEND(memcmp, scalar) + +ARCHENTRY(memcmp, baseline) + cmp $32, %rdx # enough to permit use of the long kernel? + ja .Llong + + test %rdx, %rdx # zero bytes buffer? + je .L0 + + /* + * Compare strings of 1--32 bytes. We want to do this by + * loading into two xmm registers and then comparing. To avoid + * crossing into unmapped pages, we either load 32 bytes from + * the start of the buffer or 32 bytes before its end, depending + * on whether the overread area would cross a page boundary + * or not. 
+ */ + + /* check for overreads across page boundaries */ + lea 31(%rdi), %eax # end of overread + lea 31(%rsi), %r8d + lea -1(%rdi, %rdx, 1), %ecx # last character in buffer + lea -1(%rsi, %rdx, 1), %r9d + xor %ecx, %eax + xor %r9d, %r8d + test $PAGE_SIZE, %eax # are they on different pages? + jz 0f + + /* fix up rdi */ + movdqu -32(%rdi, %rdx, 1), %xmm0 + movdqu -16(%rdi, %rdx, 1), %xmm1 + lea -8(%rsp), %rdi # end of replacement buffer + sub %rdx, %rdi # start of replacement buffer + movdqa %xmm0, -40(%rsp) # copy to replacement buffer + movdqa %xmm1, -24(%rsp) + +0: test $PAGE_SIZE, %r8d + jz 0f + + /* fix up rsi */ + movdqu -32(%rsi, %rdx, 1), %xmm0 + movdqu -16(%rsi, %rdx, 1), %xmm1 + lea -40(%rsp), %rsi # end of replacement buffer + sub %rdx, %rsi # start of replacement buffer + movdqa %xmm0, -72(%rsp) # copy to replacement buffer + movdqa %xmm1, -56(%rsp) + + /* load data and compare properly */ +0: movdqu 16(%rdi), %xmm1 + movdqu 16(%rsi), %xmm3 + movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm2 + mov %edx, %ecx + mov $-1, %edx + shl %cl, %rdx # ones where the buffer is not + pcmpeqb %xmm3, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm0, %eax + shl $16, %ecx + or %ecx, %eax # ones where the buffers match + or %edx, %eax # including where the buffer is not + not %eax # ones where there is a mismatch +#ifndef BCMP + bsf %eax, %edx # location of the first mismatch + cmovz %eax, %edx # including if there is no mismatch + movzbl (%rdi, %rdx, 1), %eax # mismatching bytes + movzbl (%rsi, %rdx, 1), %edx + sub %edx, %eax +#endif + ret + + /* empty input */ +.L0: xor %eax, %eax + ret + + /* compare 33+ bytes */ + ALIGN_TEXT +.Llong: movdqu (%rdi), %xmm0 # load head + movdqu (%rsi), %xmm2 + mov %rdi, %rcx + sub %rdi, %rsi # express rsi as distance from rdi + and $~0xf, %rdi # align rdi to 16 bytes + movdqu 16(%rsi, %rdi, 1), %xmm1 + pcmpeqb 16(%rdi), %xmm1 # compare second half of this iteration + add %rcx, %rdx # pointer to end of buffer + pcmpeqb 
%xmm2, %xmm0 + pmovmskb %xmm0, %eax + xor $0xffff, %eax # any mismatch? + jne .Lmismatch_head + add $64, %rdi # advance to next iteration + jmp 1f # and get going with the loop + + /* process buffer 32 bytes at a time */ + ALIGN_TEXT +0: movdqu -32(%rsi, %rdi, 1), %xmm0 + movdqu -16(%rsi, %rdi, 1), %xmm1 + pcmpeqb -32(%rdi), %xmm0 + pcmpeqb -16(%rdi), %xmm1 + add $32, %rdi # advance to next iteration +1: pand %xmm0, %xmm1 # 0xff where both halves matched + pmovmskb %xmm1, %eax + cmp $0xffff, %eax # all bytes matched? + jne .Lmismatch + cmp %rdx, %rdi # end of buffer reached? + jb 0b + + /* less than 32 bytes left to compare */ + movdqu -16(%rdx), %xmm1 # load 32 byte tail through end pointer + movdqu -16(%rdx, %rsi, 1), %xmm3 + movdqu -32(%rdx), %xmm0 + movdqu -32(%rdx, %rsi, 1), %xmm2 + pcmpeqb %xmm3, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm0, %eax + shl $16, %ecx + or %ecx, %eax # ones where the buffers match + not %eax # ones where there is a mismatch +#ifndef BCMP + bsf %eax, %ecx # location of the first mismatch + cmovz %eax, %ecx # including if there is no mismatch + add %rcx, %rdx # pointer to potential mismatch + movzbl -32(%rdx), %eax # mismatching bytes + movzbl -32(%rdx, %rsi, 1), %edx + sub %edx, %eax +#endif + ret + +#ifdef BCMP +.Lmismatch: + mov $1, %eax +.Lmismatch_head: + ret +#else /* memcmp */ +.Lmismatch_head: + tzcnt %eax, %eax # location of mismatch + add %rax, %rcx # pointer to mismatch + movzbl (%rcx), %eax # mismatching bytes + movzbl (%rcx, %rsi, 1), %ecx + sub %ecx, %eax + ret + +.Lmismatch: + movdqu -48(%rsi, %rdi, 1), %xmm1 + pcmpeqb -48(%rdi), %xmm1 # reconstruct xmm1 before PAND + pmovmskb %xmm0, %eax # matches in first 16 bytes + pmovmskb %xmm1, %edx # matches in second 16 bytes + shl $16, %edx + or %edx, %eax # matches in both + not %eax # mismatches in both + tzcnt %eax, %eax # location of mismatch + add %rax, %rdi # pointer to mismatch + movzbl -64(%rdi), %eax # mismatching bytes + movzbl -64(%rdi, 
%rsi, 1), %ecx + sub %ecx, %eax + ret +#endif +ARCHEND(memcmp, baseline) .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/tests/string/memcmp_test.c b/lib/libc/tests/string/memcmp_test.c --- a/lib/libc/tests/string/memcmp_test.c +++ b/lib/libc/tests/string/memcmp_test.c @@ -28,18 +28,21 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include +static int (*memcmp_fn)(const void *, const void *, size_t); + ATF_TC_WITHOUT_HEAD(zero); ATF_TC_BODY(zero, tc) { - assert(memcmp("a", "b", 0) == 0); - assert(memcmp("", "", 0) == 0); + assert(memcmp_fn("a", "b", 0) == 0); + assert(memcmp_fn("", "", 0) == 0); } ATF_TC_WITHOUT_HEAD(eq); @@ -51,9 +54,9 @@ for (i = 0; i < 256; i++) data1[i] = data2[i] = i ^ 0x55; for (i = 1; i < 256; i++) - assert(memcmp(data1, data2, i) == 0); + assert(memcmp_fn(data1, data2, i) == 0); for (i = 1; i < 256; i++) - assert(memcmp(data1 + i, data2 + i, 256 - i) == 0); + assert(memcmp_fn(data1 + i, data2 + i, 256 - i) == 0); } ATF_TC_WITHOUT_HEAD(neq); @@ -67,9 +70,9 @@ data2[i] = i ^ 0x55; } for (i = 1; i < 256; i++) - assert(memcmp(data1, data2, i) != 0); + assert(memcmp_fn(data1, data2, i) != 0); for (i = 1; i < 256; i++) - assert(memcmp(data1 + i, data2 + i, 256 - i) != 0); + assert(memcmp_fn(data1 + i, data2 + i, 256 - i) != 0); } ATF_TC_WITHOUT_HEAD(diff); @@ -83,37 +86,43 @@ data1[128] = 255; data2[128] = 0; for (i = 1; i < 66; i++) { - assert(memcmp(data1 + 128, data2 + 128, i) == 255); - assert(memcmp(data2 + 128, data1 + 128, i) == -255); - assert(memcmp(data1 + 129 - i, data2 + 129 - i, i) == 255); - assert(memcmp(data2 + 129 - i, data1 + 129 - i, i) == -255); - assert(memcmp(data1 + 129 - i, data2 + 129 - i, i * 2) == 255); - assert(memcmp(data2 + 129 - i, data1 + 129 - i, i * 2) == -255); + assert(memcmp_fn(data1 + 128, data2 + 128, i) == 255); + assert(memcmp_fn(data2 + 128, data1 + 128, i) == -255); + assert(memcmp_fn(data1 + 129 - i, data2 + 129 - i, i) == 255); + assert(memcmp_fn(data2 + 129 - i, 
data1 + 129 - i, i) == -255); + assert(memcmp_fn(data1 + 129 - i, data2 + 129 - i, i * 2) == 255); + assert(memcmp_fn(data2 + 129 - i, data1 + 129 - i, i * 2) == -255); } data1[128] = 'c'; data2[128] = 'e'; for (i = 1; i < 66; i++) { - assert(memcmp(data1 + 128, data2 + 128, i) == -2); - assert(memcmp(data2 + 128, data1 + 128, i) == 2); - assert(memcmp(data1 + 129 - i, data2 + 129 - i, i) == -2); - assert(memcmp(data2 + 129 - i, data1 + 129 - i, i) == 2); - assert(memcmp(data1 + 129 - i, data2 + 129 - i, i * 2) == -2); - assert(memcmp(data2 + 129 - i, data1 + 129 - i, i * 2) == 2); + assert(memcmp_fn(data1 + 128, data2 + 128, i) == -2); + assert(memcmp_fn(data2 + 128, data1 + 128, i) == 2); + assert(memcmp_fn(data1 + 129 - i, data2 + 129 - i, i) == -2); + assert(memcmp_fn(data2 + 129 - i, data1 + 129 - i, i) == 2); + assert(memcmp_fn(data1 + 129 - i, data2 + 129 - i, i * 2) == -2); + assert(memcmp_fn(data2 + 129 - i, data1 + 129 - i, i * 2) == 2); } memset(data1 + 129, 'A', sizeof(data1) - 129); memset(data2 + 129, 'Z', sizeof(data2) - 129); for (i = 1; i < 66; i++) { - assert(memcmp(data1 + 128, data2 + 128, i) == -2); - assert(memcmp(data2 + 128, data1 + 128, i) == 2); - assert(memcmp(data1 + 129 - i, data2 + 129 - i, i) == -2); - assert(memcmp(data2 + 129 - i, data1 + 129 - i, i) == 2); - assert(memcmp(data1 + 129 - i, data2 + 129 - i, i * 2) == -2); - assert(memcmp(data2 + 129 - i, data1 + 129 - i, i * 2) == 2); + assert(memcmp_fn(data1 + 128, data2 + 128, i) == -2); + assert(memcmp_fn(data2 + 128, data1 + 128, i) == 2); + assert(memcmp_fn(data1 + 129 - i, data2 + 129 - i, i) == -2); + assert(memcmp_fn(data2 + 129 - i, data1 + 129 - i, i) == 2); + assert(memcmp_fn(data1 + 129 - i, data2 + 129 - i, i * 2) == -2); + assert(memcmp_fn(data2 + 129 - i, data1 + 129 - i, i * 2) == 2); } } ATF_TP_ADD_TCS(tp) { + void *dl_handle; + + dl_handle = dlopen(NULL, RTLD_LAZY); + memcmp_fn = dlsym(dl_handle, "test_memcmp"); + if (memcmp_fn == NULL) + memcmp_fn = memcmp; 
ATF_TP_ADD_TC(tp, zero); ATF_TP_ADD_TC(tp, eq); diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd August 5, 2023 +.Dd August 13, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -51,14 +51,14 @@ Enhanced functions are present in the following architectures: .Bl -column FUNCTION__ aarch64_ arm_ amd64_ i386_ ppc64_ -offset indent .It Em FUNCTION Ta Em AARCH64 Ta Em ARM Ta Em AMD64 Ta Em I386 Ta Em PPC64 -.It bcmp Ta Ta Ta S Ta S +.It bcmp Ta Ta Ta S1 Ta S .It bcopy Ta Ta S Ta S Ta S Ta SV .It bzero Ta Ta S Ta S Ta S .It div Ta Ta Ta S Ta S .It index Ta S Ta Ta S1 .It ldiv Ta Ta Ta S Ta S .It lldiv Ta Ta Ta S -.It memcmp Ta Ta S Ta S Ta S +.It memcmp Ta Ta S Ta S1 Ta S .It memcpy Ta S Ta S Ta S Ta S Ta SV .It memmove Ta S Ta S Ta S Ta S Ta SV .It memset Ta Ta S Ta S Ta S