diff --git a/lib/libc/amd64/amd64_archlevel.h b/lib/libc/amd64/amd64_archlevel.h deleted file mode 100644 --- a/lib/libc/amd64/amd64_archlevel.h +++ /dev/null @@ -1,90 +0,0 @@ -/*- - * Copyright (c) 2023 The FreeBSD Foundation - * - * This software was developed by Robert Clausecker - * under sponsorship from the FreeBSD Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE - */ - -/* must be macros so they can be accessed from assembly */ -#define X86_64_SCALAR 0 /* disable SIMD optimisations */ -#define X86_64_BASELINE 1 /* CMOV, CX8, FPU, FXSR, MMX, OSFXSR, SSE, SSE2 */ -#define X86_64_V2 2 /* CMPXCHG16B, LAHF-SAHF, POPCNT, SSE3, SSSE3, SSE4_1, SSE4_2 */ -#define X86_64_V3 3 /* AVX, AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE, OSXSAVE */ -#define X86_64_V4 4 /* AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL */ - -#define X86_64_MAX X86_64_V4 /* highest supported architecture level */ -#define X86_64_UNDEFINED -1 /* architecture level not set yet */ - -#ifndef __ASSEMBLER__ -#include - -dlfunc_t __archlevel_resolve(u_int, u_int, u_int, u_int, - int32_t[X86_64_MAX + 1]) __hidden; -#else -#include - -#define ARCHRESOLVE(func) \ - .globl CNAME(func); \ - .type CNAME(func), @gnu_indirect_function; \ - .set CNAME(func), __CONCAT(func,_resolver); \ - ARCHENTRY(func, resolver); \ - lea __CONCAT(func,_funcs)(%rip), %r8; \ - jmp CNAME(__archlevel_resolve); \ - ARCHEND(func, resolver) - -/* - * The func_funcs array stores the location of the implementations - * as the distance from the func_funcs array to the function. Due - * to compiling for the medium code model, a 32 bit integer suffices - * to hold the distance. - * - * Doing it this way both saves storage and avoids giving rtld - * relocations to process at load time. - */ -#define ARCHFUNCS(func) \ - ARCHRESOLVE(func); \ - .section .rodata; \ - .align 4; \ - __CONCAT(func,_funcs): - -#define NOARCHFUNC \ - .4byte 0 - -#define ARCHFUNC(func, level) \ - .4byte __CONCAT(__CONCAT(func,_),level) - __CONCAT(func,_funcs) - -#define ENDARCHFUNCS(func) \ - .zero 4*(X86_64_MAX+1)-(.-__CONCAT(func,_funcs)); \ - .size __CONCAT(func,_funcs), .-__CONCAT(func,_funcs) - -#define ARCHENTRY(func, level) \ - _START_ENTRY; \ - .type __CONCAT(__CONCAT(func,_),level), @function; \ - __CONCAT(__CONCAT(func,_),level):; \ - .cfi_startproc - -#define ARCHEND(func, level) \ - END(__CONCAT(__CONCAT(func,_),level)) - -#endif /* __ASSEMBLER__ */ diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -1,14 +1,11 @@ MDSRCS+= \ - amd64_archlevel.c \ bcmp.S \ memcmp.S \ memcpy.S \ memmove.S \ memset.S \ - stpcpy.S \ strcat.S \ - strchrnul.S \ strcmp.S \ strlen.S \ - strcpy.c + stpcpy.S diff --git a/lib/libc/amd64/string/amd64_archlevel.c b/lib/libc/amd64/string/amd64_archlevel.c deleted file mode 100644 --- a/lib/libc/amd64/string/amd64_archlevel.c +++ /dev/null @@ -1,241 +0,0 @@ -/*- - * Copyright (c) 2023 The FreeBSD Foundation - * - * This software was developed by Robert Clausecker - * under sponsorship from the FreeBSD Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE - */ - -#include - -#include -#include -#include - -#include -#include - -#include "amd64_archlevel.h" -#include "libc_private.h" - -#define ARCHLEVEL_ENV "ARCHLEVEL" - -static volatile int amd64_archlevel = X86_64_UNDEFINED; - -static const struct archlevel { - char name[10]; - /* CPUID feature bits that need to be present */ - u_int feat_edx, feat_ecx, amd_ecx, ext_ebx; -} levels[] = { - { - .name = "scalar", - .feat_edx = 0, - .feat_ecx = 0, - .amd_ecx = 0, - .ext_ebx = 0, - }, { -#define FEAT_EDX_BASELINE (CPUID_FPU | CPUID_CX8 | CPUID_CMOV | CPUID_MMX | \ - CPUID_FXSR | CPUID_SSE | CPUID_SSE2) - .name = "baseline", - .feat_edx = FEAT_EDX_BASELINE, - .feat_ecx = 0, - .amd_ecx = 0, - .ext_ebx = 0, - }, { -#define FEAT_ECX_V2 (CPUID2_SSE3 | CPUID2_SSSE3 | CPUID2_CX16 | CPUID2_SSE41 | \ - CPUID2_SSE42 | CPUID2_POPCNT) -#define AMD_ECX_V2 AMDID2_LAHF - .name = "x86-64-v2", - .feat_edx = FEAT_EDX_BASELINE, - .feat_ecx = FEAT_ECX_V2, - .amd_ecx = AMD_ECX_V2, - .ext_ebx = 0, - }, { -#define FEAT_ECX_V3 (FEAT_ECX_V2 | CPUID2_FMA | CPUID2_MOVBE | \ - CPUID2_OSXSAVE | CPUID2_AVX | CPUID2_F16C) -#define AMD_ECX_V3 (AMD_ECX_V2 | AMDID2_ABM) -#define EXT_EBX_V3 (CPUID_STDEXT_BMI1 | CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2) - .name = "x86-64-v3", - .feat_edx = FEAT_EDX_BASELINE, - .feat_ecx = FEAT_ECX_V3, - .amd_ecx = AMD_ECX_V3, - .ext_ebx = EXT_EBX_V3, - }, { -#define EXT_EBX_V4 (EXT_EBX_V3 | CPUID_STDEXT_AVX512F | \ - CPUID_STDEXT_AVX512DQ | CPUID_STDEXT_AVX512CD | \ - CPUID_STDEXT_AVX512BW | CPUID_STDEXT_AVX512VL) - .name = "x86-64-v4", - .feat_edx = FEAT_EDX_BASELINE, - .feat_ecx = FEAT_ECX_V3, - .amd_ecx = AMD_ECX_V3, - .ext_ebx = EXT_EBX_V4, - } -}; - -static int -supported_archlevel(u_int feat_edx, u_int feat_ecx, u_int ext_ebx, u_int ext_ecx) -{ - int level; - u_int p[4], max_leaf; - u_int amd_ecx = 0; - - (void)ext_ecx; - - do_cpuid(0x80000000, p); - max_leaf = p[0]; - - if (max_leaf >= 0x80000001) { - do_cpuid(0x80000001, p); - amd_ecx = p[2]; - } - - for (level = X86_64_BASELINE; level <= X86_64_MAX; level++) { - const struct archlevel *lvl = &levels[level]; - - if ((lvl->feat_edx & feat_edx) != lvl->feat_edx || - (lvl->feat_ecx & feat_ecx) != lvl->feat_ecx || - (lvl->amd_ecx & amd_ecx) != lvl->amd_ecx || - (lvl->ext_ebx & ext_ebx) != lvl->ext_ebx) - return (level - 1); - } - - return (X86_64_MAX); -} - -static int -match_archlevel(const char *str, int *force) -{ - int level, want_force = 0; - - *force = 0; - - if (str[0] == '!') { - str++; - want_force = 1; - } - - for (level = 0; level <= X86_64_MAX; level++) { - size_t i; - const char *candidate = levels[level].name; - - /* can't use strcmp here: would recurse during ifunc resolution */ - for (i = 0; str[i] == candidate[i]; i++) - /* suffixes starting with : or + are ignored for future extensions */ - if (str[i] == '\0' || str[i] == ':' || str[i] == '+') { - if (want_force) - *force = 1; - - return (level); - } - } - - return (X86_64_UNDEFINED); -} - -/* - * We can't use getenv(), strcmp(), and a bunch of other functions here as - * they may in turn call SIMD-optimised string functions. - * - * *force is set to 1 if the architecture level is valid and begins with a ! - * and to 0 otherwise. - */ -static int -env_archlevel(int *force) -{ - size_t i; - - if (environ == NULL) - return (X86_64_UNDEFINED); - - for (i = 0; environ[i] != NULL; i++) { - size_t j; - - for (j = 0; environ[i][j] == ARCHLEVEL_ENV "="[j]; j++) - if (environ[i][j] == '=') - return (match_archlevel(&environ[i][j + 1], force)); - } - - *force = 0; - - return (X86_64_UNDEFINED); - -} - -/* - * Determine the architecture level by checking the CPU capabilities - * and the environment: - * - * 1. If environment variable ARCHLEVEL starts with a ! and is followed - * by a valid architecture level, that level is returned. - * 2. Else if ARCHLEVEL is set to a valid architecture level that is - * supported by the CPU, that level is returned. - * 3. Else the highest architecture level supported by the CPU is - * returned. - * - * Valid architecture levels are those defined in the levels array. - * The architecture level "scalar" indicates that SIMD enhancements - * shall not be used. - */ -static int -archlevel(u_int feat_edx, u_int feat_ecx, u_int ext_ebx, u_int ext_ecx) -{ - int islevel, wantlevel, hwlevel, force; - - islevel = atomic_load_int(&amd64_archlevel); - if (islevel != X86_64_UNDEFINED) - return (islevel); - - wantlevel = env_archlevel(&force); - if (!force) { - hwlevel = supported_archlevel(feat_edx, feat_ecx, ext_ebx, ext_ecx); - if (wantlevel == X86_64_UNDEFINED || wantlevel > hwlevel) - wantlevel = hwlevel; - } - - /* - * Ensure amd64_archlevel is set only once and - * all calls agree on what it was set to. - */ - if (atomic_cmpset_int(&amd64_archlevel, islevel, wantlevel)) - return (wantlevel); - else - return (atomic_load_int(&amd64_archlevel)); -} - -/* - * Helper function for SIMD ifunc dispatch: select the highest level - * implementation up to the current architecture level. - */ -dlfunc_t -__archlevel_resolve(u_int feat_edx, u_int feat_ecx, u_int ext_ebx, - u_int ext_ecx, int32_t funcs[static X86_64_MAX + 1]) -{ - int level; - - for (level = archlevel(feat_edx, feat_ecx, ext_ebx, ext_ecx); level >= 0; level--) - if (funcs[level] != 0) - return (dlfunc_t)((uintptr_t)funcs + (ptrdiff_t)funcs[level]); - - /* no function is present -- what now? */ - __builtin_trap(); -} diff --git a/lib/libc/amd64/string/memcmp.S b/lib/libc/amd64/string/memcmp.S --- a/lib/libc/amd64/string/memcmp.S +++ b/lib/libc/amd64/string/memcmp.S @@ -1,12 +1,9 @@ /*- - * Copyright (c) 2018, 2023 The FreeBSD Foundation + * Copyright (c) 2018 The FreeBSD Foundation * * This software was developed by Mateusz Guzik * under sponsorship from the FreeBSD Foundation. * - * Portions of this software were developed by Robert Clausecker - * under sponsorship from the FreeBSD Foundation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -30,10 +27,6 @@ */ #include -#include - -#include "amd64_archlevel.h" - /* * Note: this routine was written with kernel use in mind (read: no simd), * it is only present in userspace as a temporary measure until something @@ -43,15 +36,10 @@ #define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ #ifdef BCMP -#define memcmp bcmp +ENTRY(bcmp) +#else +ENTRY(memcmp) #endif - -ARCHFUNCS(memcmp) - ARCHFUNC(memcmp, scalar) - ARCHFUNC(memcmp, baseline) -ENDARCHFUNCS(memcmp) - -ARCHENTRY(memcmp, scalar) xorl %eax,%eax 10: cmpq $16,%rdx @@ -169,6 +157,7 @@ 1: leal 1(%eax),%eax ret +END(bcmp) #else /* * We need to compute the difference between strings. @@ -241,165 +230,7 @@ 2: subl %r8d,%eax ret +END(memcmp) #endif -ARCHEND(memcmp, scalar) - -ARCHENTRY(memcmp, baseline) - cmp $32, %rdx # enough to permit use of the long kernel? - ja .Llong - - test %rdx, %rdx # zero bytes buffer? - je .L0 - - /* - * Compare strings of 1--32 bytes. We want to do this by - * loading into two xmm registers and then comparing. To avoid - * crossing into unmapped pages, we either load 32 bytes from - * the start of the buffer or 32 bytes before its end, depending - * on whether there is a page boundary between the overread area - * or not. - */ - - /* check for page boundaries overreads */ - lea 31(%rdi), %eax # end of overread - lea 31(%rsi), %r8d - lea -1(%rdi, %rdx, 1), %ecx # last character in buffer - lea -1(%rsi, %rdx, 1), %r9d - xor %ecx, %eax - xor %r9d, %r8d - test $PAGE_SIZE, %eax # are they on different pages? - jz 0f - - /* fix up rdi */ - movdqu -32(%rdi, %rdx, 1), %xmm0 - movdqu -16(%rdi, %rdx, 1), %xmm1 - lea -8(%rsp), %rdi # end of replacement buffer - sub %rdx, %rdi # start of replacement buffer - movdqa %xmm0, -40(%rsp) # copy to replacement buffer - movdqa %xmm1, -24(%rsp) - -0: test $PAGE_SIZE, %r8d - jz 0f - - /* fix up rsi */ - movdqu -32(%rsi, %rdx, 1), %xmm0 - movdqu -16(%rsi, %rdx, 1), %xmm1 - lea -40(%rsp), %rsi # end of replacement buffer - sub %rdx, %rsi # start of replacement buffer - movdqa %xmm0, -72(%rsp) # copy to replacement buffer - movdqa %xmm1, -56(%rsp) - - /* load data and compare properly */ -0: movdqu 16(%rdi), %xmm1 - movdqu 16(%rsi), %xmm3 - movdqu (%rdi), %xmm0 - movdqu (%rsi), %xmm2 - mov %edx, %ecx - mov $-1, %edx - shl %cl, %rdx # ones where the buffer is not - pcmpeqb %xmm3, %xmm1 - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm1, %ecx - pmovmskb %xmm0, %eax - shl $16, %ecx - or %ecx, %eax # ones where the buffers match - or %edx, %eax # including where the buffer is not - not %eax # ones where there is a mismatch -#ifndef BCMP - bsf %eax, %edx # location of the first mismatch - cmovz %eax, %edx # including if there is no mismatch - movzbl (%rdi, %rdx, 1), %eax # mismatching bytes - movzbl (%rsi, %rdx, 1), %edx - sub %edx, %eax -#endif - ret - - /* empty input */ -.L0: xor %eax, %eax - ret - - /* compare 33+ bytes */ - ALIGN_TEXT -.Llong: movdqu (%rdi), %xmm0 # load head - movdqu (%rsi), %xmm2 - mov %rdi, %rcx - sub %rdi, %rsi # express rsi as distance from rdi - and $~0xf, %rdi # align rdi to 16 bytes - movdqu 16(%rsi, %rdi, 1), %xmm1 - pcmpeqb 16(%rdi), %xmm1 # compare second half of this iteration - add %rcx, %rdx # pointer to last byte in buffer - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %eax - xor $0xffff, %eax # any mismatch? - jne .Lmismatch_head - add $64, %rdi # advance to next iteration - jmp 1f # and get going with the loop - - /* process buffer 32 bytes at a time */ - ALIGN_TEXT -0: movdqu -32(%rsi, %rdi, 1), %xmm0 - movdqu -16(%rsi, %rdi, 1), %xmm1 - pcmpeqb -32(%rdi), %xmm0 - pcmpeqb -16(%rdi), %xmm1 - add $32, %rdi # advance to next iteration -1: pand %xmm0, %xmm1 # 0xff where both halves matched - pmovmskb %xmm1, %eax - cmp $0xffff, %eax # all bytes matched? - jne .Lmismatch - cmp %rdx, %rdi # end of buffer reached? - jb 0b - - /* less than 32 bytes left to compare */ - movdqu -16(%rdx), %xmm1 # load 32 byte tail through end pointer - movdqu -16(%rdx, %rsi, 1), %xmm3 - movdqu -32(%rdx), %xmm0 - movdqu -32(%rdx, %rsi, 1), %xmm2 - pcmpeqb %xmm3, %xmm1 - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm1, %ecx - pmovmskb %xmm0, %eax - shl $16, %ecx - or %ecx, %eax # ones where the buffers match - not %eax # ones where there is a mismatch -#ifndef BCMP - bsf %eax, %ecx # location of the first mismatch - cmovz %eax, %ecx # including if there is no mismatch - add %rcx, %rdx # pointer to potential mismatch - movzbl -32(%rdx), %eax # mismatching bytes - movzbl -32(%rdx, %rsi, 1), %edx - sub %edx, %eax -#endif - ret - -#ifdef BCMP -.Lmismatch: - mov $1, %eax -.Lmismatch_head: - ret -#else /* memcmp */ -.Lmismatch_head: - tzcnt %eax, %eax # location of mismatch - add %rax, %rcx # pointer to mismatch - movzbl (%rcx), %eax # mismatching bytes - movzbl (%rcx, %rsi, 1), %ecx - sub %ecx, %eax - ret - -.Lmismatch: - movdqu -48(%rsi, %rdi, 1), %xmm1 - pcmpeqb -48(%rdi), %xmm1 # reconstruct xmm1 before PAND - pmovmskb %xmm0, %eax # mismatches in first 16 bytes - pmovmskb %xmm1, %edx # mismatches in second 16 bytes - shl $16, %edx - or %edx, %eax # mismatches in both - not %eax # matches in both - tzcnt %eax, %eax # location of mismatch - add %rax, %rdi # pointer to mismatch - movzbl -64(%rdi), %eax # mismatching bytes - movzbl -64(%rdi, %rsi, 1), %ecx - sub %ecx, %eax - ret -#endif -ARCHEND(memcmp, baseline) .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S --- a/lib/libc/amd64/string/stpcpy.S +++ b/lib/libc/amd64/string/stpcpy.S @@ -1,30 +1,10 @@ -/*- - * Copyright (c) 2023, The FreeBSD Foundation - * - * SPDX-License-Expression: BSD-2-Clause - * - * Portions of this software were developed by Robert Clausecker - * under sponsorship from the FreeBSD Foundation. - * - * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S - * written by J.T. Conklin and - * adapted by Guillaume Morin to implement stpcpy - * that was originally dedicated to the public domain +/* + * Adapted by Guillaume Morin from strcpy.S + * written by J.T. Conklin + * Public domain. */ #include - -#include "amd64_archlevel.h" - -#define ALIGN_TEXT .p2align 4, 0x90 - - .weak stpcpy - .set stpcpy, __stpcpy -ARCHFUNCS(__stpcpy) - ARCHFUNC(__stpcpy, scalar) - ARCHFUNC(__stpcpy, baseline) -ENDARCHFUNCS(__stpcpy) - /* * This stpcpy implementation copies a byte at a time until the * source pointer is aligned to a word boundary, it then copies by @@ -38,7 +18,9 @@ * requirements. */ -ARCHENTRY(__stpcpy, scalar) + .globl stpcpy,__stpcpy +ENTRY(stpcpy) +__stpcpy: movabsq $0x0101010101010101,%r8 movabsq $0x8080808080808080,%r9 @@ -59,7 +41,7 @@ dec %rax ret - ALIGN_TEXT + .p2align 4 .Lloop: movq %rdx,(%rdi) addq $8,%rdi @@ -127,111 +109,6 @@ .Ldone: movq %rdi,%rax ret -ARCHEND(__stpcpy, scalar) - -ARCHENTRY(__stpcpy, baseline) - mov %esi, %ecx - mov %rdi, %rdx - sub %rsi, %rdi # express destination as distance to surce - and $~0xf, %rsi # align source to 16 byte - movdqa (%rsi), %xmm0 # head of string with junk before - pxor %xmm1, %xmm1 - and $0xf, %ecx # misalignment in bytes - pcmpeqb %xmm1, %xmm0 # NUL byte present? - pmovmskb %xmm0, %eax - shr %cl, %eax # clear out matches in junk bytes - bsf %eax, %eax # find match if any - jnz .Lrunt - - /* first normal iteration: write head back if it succeeds */ - movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration - movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string - pcmpeqb %xmm0, %xmm1 # NUL byte present? - pmovmskb %xmm1, %eax - test %eax, %eax # find match if any - jnz .Lshorty - - movdqu %xmm2, (%rdx) # store beginning of string - - /* main loop, unrolled twice */ - ALIGN_TEXT -0: movdqa 32(%rsi), %xmm2 # load current iteraion - movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteraion - pxor %xmm1, %xmm1 - add $32, %rsi - pcmpeqb %xmm2, %xmm1 # NUL byte present? - pmovmskb %xmm1, %eax - test %eax, %eax - jnz 1f - - movdqa 16(%rsi), %xmm0 # load current iteraion - movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteraion - pxor %xmm1, %xmm1 - pcmpeqb %xmm0, %xmm1 # NUL byte present? - pmovmskb %xmm1, %eax - test %eax, %eax - jz 0b - - /* end of string after main loop has iterated */ - add $16, %rsi # advance rsi to second unrolled half -1: tzcnt %eax, %eax # find location of match - # (behaves as bsf on pre-x86-64-v3 CPUs) - add %rsi, %rax # point to NUL byte - movdqu -15(%rax), %xmm0 # last 16 bytes of string - movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination - add %rdi, %rax # point to destination's NUL byte - ret - - /* NUL encountered in second iteration */ -.Lshorty: - tzcnt %eax, %eax - add $16, %eax # account for length of first iteration - sub %ecx, %eax # but not the parts before the string - - /* NUL encountered in first iteration */ -.Lrunt: lea 1(%rax), %edi # string length including NUL byte - add %rcx, %rsi # point to beginning of string - add %rdx, %rax # point to NUL byte - - /* transfer 16--32 bytes */ -.L1632: cmp $16, %edi - jb .L0815 - - movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes - movdqu %xmm2, (%rdx) # store first 16 bytes - movdqu %xmm0, -15(%rax) # store last 16 bytes - ret - - /* transfer 8--15 bytes */ -.L0815: cmp $8, %edi - jb .L0407 - - mov (%rsi), %rcx # load first 8 bytes - mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes - mov %rcx, (%rdx) # store to dst - mov %rdi, -7(%rax) # dito - ret - - /* transfer 4--7 bytes */ -.L0407: cmp $4, %edi - jb .L0203 - - mov (%rsi), %ecx - mov -4(%rsi, %rdi, 1), %edi - mov %ecx, (%rdx) - mov %edi, -3(%rax) - ret - - /* transfer 2--3 bytes */ -.L0203: cmp $2, %edi - jb .L0101 - - movzwl (%rsi), %ecx - mov %cx, (%rdx) # store first two bytes - - /* transfer 0 bytes (last byte is always NUL) */ -.L0101: movb $0, (%rax) # store terminating NUL byte - ret -ARCHEND(__stpcpy, baseline) - +END(stpcpy) + .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strchrnul.S b/lib/libc/amd64/string/strchrnul.S deleted file mode 100644 --- a/lib/libc/amd64/string/strchrnul.S +++ /dev/null @@ -1,170 +0,0 @@ -/*- - * Copyright (c) 2023 The FreeBSD Foundation - * - * This software was developed by Robert Clausecker - * under sponsorship from the FreeBSD Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE - */ - -#include - -#include "amd64_archlevel.h" - -#define ALIGN_TEXT .p2align 4,0x90 # 16-byte alignment, nop-filled - - .weak strchrnul - .set strchrnul, __strchrnul - -ARCHFUNCS(__strchrnul) - ARCHFUNC(__strchrnul, scalar) - ARCHFUNC(__strchrnul, baseline) -ENDARCHFUNCS(__strchrnul) - -/* - * strchrnul(str, c) - * This is implemented like strlen(str), but we check for the - * presence of both NUL and c in each iteration. - */ -ARCHENTRY(__strchrnul, scalar) - mov %edi, %ecx - and $~7, %rdi # align to 8 byte - movzbl %sil, %esi # clear stray high bits - movabs $0x0101010101010101, %r8 - mov (%rdi), %rax # load first word - imul %r8, %rsi # replicate char 8 times - - /* - * Unaligned input: align to 8 bytes. Then proceed the same - * way as with aligned input, but prevent matches before the - * beginning of the string. This is achieved by oring 0x01 - * into each byte of the buffer before the string - */ - shl $3, %ecx - mov %r8, %r10 - add $8, %rdi - shl %cl, %r10 # 0x01 where the string is - xor %r8, %r10 # 0x01 where it is not - neg %r8 # negate 01..01 so we can use lea - movabs $0x8080808080808080, %r9 - - mov %rsi, %rcx - xor %rax, %rcx # str ^ c - or %r10, %rax # str without NUL bytes before it - or %r10, %rcx # (str ^ c) without matches before it - lea (%rax, %r8, 1), %rdx # str - 0x01..01 - lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01 - not %rax # ~str - not %rcx # ~(str ^ c) - and %rdx, %rax # (str - 0x01..01) & ~str - and %r11, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) - or %rcx, %rax # matches for both - and %r9, %rax # not including junk bytes - jnz 1f - - /* main loop unrolled twice */ - ALIGN_TEXT -0: mov (%rdi), %rax # str - mov %rsi, %rcx - xor %rax, %rcx # str ^ c - lea (%rax, %r8, 1), %rdx # str - 0x01..01 - lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01 - not %rax # ~str - not %rcx # ~(str ^ c) - and %rdx, %rax # (str - 0x01..01) & ~str - and %r11, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) - or %rcx, %rax # matches for both - and %r9, %rax # not including junk bits - jnz 2f - - mov 8(%rdi), %rax # str - add $16, %rdi - mov %rsi, %rcx - xor %rax, %rcx # str ^ c - lea (%rax, %r8, 1), %rdx # str - 0x01..01 - lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01 - not %rax # ~str - not %rcx # ~(str ^ c) - and %rdx, %rax # (str - 0x01..01) & ~str - and %r11, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) - or %rcx, %rax # matches for both - and %r9, %rax # not including junk bits - jz 0b - - /* NUL or c found */ -1: sub $8, %rdi # undo advance past buffer -2: tzcnt %rax, %rax # first NUL or c byte match - shr $3, %eax # scale from bit to byte index - add %rdi, %rax # pointer to found c or NUL - ret -ARCHEND(__strchrnul, scalar) - -ARCHENTRY(__strchrnul, baseline) - mov %edi, %ecx - and $~0xf, %rdi # align to 16 byte - movdqa (%rdi), %xmm1 - movd %esi, %xmm0 - and $0xf, %ecx # distance from (%rdi) to start of string - pxor %xmm2, %xmm2 - mov $-1, %edx - punpcklbw %xmm0, %xmm0 # c -> cc - shl %cl, %edx # bits corresponding to bytes in the string - punpcklwd %xmm0, %xmm0 # cc -> cccc - add $16, %rdi - - /* check for match in head */ - pcmpeqb %xmm1, %xmm2 # NUL bytes present? - pshufd $0, %xmm0, %xmm0 # cccc -> cccccccccccccccc - pcmpeqb %xmm0, %xmm1 # c present? - por %xmm2, %xmm1 # either present? - pmovmskb %xmm1, %eax - and %edx, %eax # match in the string? - jnz 1f - - /* main loop unrolled twice */ - ALIGN_TEXT -0: movdqa (%rdi), %xmm1 - pxor %xmm2, %xmm2 - pcmpeqb %xmm1, %xmm2 # NUL bytes present? - pcmpeqb %xmm0, %xmm1 # c present? - por %xmm2, %xmm1 # either present? - pmovmskb %xmm1, %eax - test %eax, %eax # match in the string? - jnz 2f - - movdqa 16(%rdi), %xmm1 - add $32, %rdi - pxor %xmm2, %xmm2 - pcmpeqb %xmm1, %xmm2 # NUL bytes present? - pcmpeqb %xmm0, %xmm1 # c present? - por %xmm2, %xmm1 # either present? - pmovmskb %xmm1, %eax - test %eax, %eax # match in the string? - jz 0b - -1: sub $16, %rdi # undo advance past buffer -2: tzcnt %eax, %eax # where is the match? - add %rdi, %rax # pointer to found c or NUL - ret -ARCHEND(__strchrnul, baseline) - - .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strlen.S b/lib/libc/amd64/string/strlen.S --- a/lib/libc/amd64/string/strlen.S +++ b/lib/libc/amd64/string/strlen.S @@ -1,15 +1,9 @@ -/*- +/* * Written by Mateusz Guzik - * Copyright (c) 2023 The FreeBSD Foundation - * - * Portions of this software were developed by Robert Clausecker - * under sponsorship from the FreeBSD Foundation. - * * Public domain. */ #include -#include "amd64_archlevel.h" /* * Note: this routine was written with kernel use in mind (read: no simd), @@ -19,11 +13,6 @@ #define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ -ARCHFUNCS(strlen) - ARCHFUNC(strlen, scalar) - ARCHFUNC(strlen, baseline) -ENDARCHFUNCS(strlen) - /* * strlen(string) * %rdi @@ -40,7 +29,7 @@ * * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386. */ -ARCHENTRY(strlen, scalar) +ENTRY(strlen) movabsq $0xfefefefefefefeff,%r8 movabsq $0x8080808080808080,%r9 @@ -86,46 +75,6 @@ leaq (%rcx,%rdi),%rax subq %r10,%rax ret -ARCHEND(strlen, scalar) - -ARCHENTRY(strlen, baseline) - mov %rdi, %rcx - pxor %xmm1, %xmm1 - and $~0xf, %rdi # align string - pcmpeqb (%rdi), %xmm1 # compare head (with junk before string) - mov %rcx, %rsi # string pointer copy for later - and $0xf, %ecx # amount of bytes rdi is past 16 byte alignment - pmovmskb %xmm1, %eax - add $32, %rdi # advance to next iteration - shr %cl, %eax # clear out matches in junk bytes - test %eax, %eax # any match? (can't use ZF from SHR as CL=0 is possible) - jnz 2f - - ALIGN_TEXT -1: pxor %xmm1, %xmm1 - pcmpeqb -16(%rdi), %xmm1 # find NUL bytes - pmovmskb %xmm1, %eax - test %eax, %eax # were any NUL bytes present? - jnz 3f - - /* the same unrolled once more */ - pxor %xmm1, %xmm1 - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - add $32, %rdi # advance to next iteration - test %eax, %eax - jz 1b - - /* match found in loop body */ - sub $16, %rdi # undo half the advancement -3: tzcnt %eax, %eax # find the first NUL byte - sub %rsi, %rdi # string length until beginning of (%rdi) - lea -16(%rdi, %rax, 1), %rax # that plus loc. of NUL byte: full string length - ret - - /* match found in head */ -2: tzcnt %eax, %eax # compute string length - ret -ARCHEND(strlen, baseline) +END(strlen) .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/string/string.3 b/lib/libc/string/string.3 --- a/lib/libc/string/string.3 +++ b/lib/libc/string/string.3 @@ -29,7 +29,7 @@ .\" .\" @(#)string.3 8.2 (Berkeley) 12/11/93 .\" -.Dd September 2, 2023 +.Dd December 11, 1993 .Dt STRING 3 .Os .Sh NAME @@ -132,8 +132,7 @@ .Xr strsep 3 , .Xr strspn 3 , .Xr strstr 3 , -.Xr strtok 3 , -.Xr simd 7 +.Xr strtok 3 .Sh STANDARDS The .Fn strcat , diff --git a/share/man/man7/Makefile b/share/man/man7/Makefile --- a/share/man/man7/Makefile +++ b/share/man/man7/Makefile @@ -25,7 +25,6 @@ release.7 \ sdoc.7 \ security.7 \ - simd.7 \ sizeof.7 \ sprog.7 \ stats.7 \ diff --git a/share/man/man7/arch.7 b/share/man/man7/arch.7 --- a/share/man/man7/arch.7 +++ b/share/man/man7/arch.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd September 2, 2023 +.Dd April 12, 2023 .Dt ARCH 7 .Os .Sh NAME @@ -431,8 +431,7 @@ .El .Sh SEE ALSO .Xr src.conf 5 , -.Xr build 7 , -.Xr simd 7 +.Xr build 7 .Sh HISTORY An .Nm diff --git a/share/man/man7/environ.7 b/share/man/man7/environ.7 --- a/share/man/man7/environ.7 +++ b/share/man/man7/environ.7 @@ -27,7 +27,7 @@ .\" .\" @(#)environ.7 8.3 (Berkeley) 4/19/94 .\" -.Dd September 3, 2023 +.Dd August 5, 2020 .Dt ENVIRON 7 .Os .Sh NAME @@ -66,13 +66,6 @@ section of the appropriate manual page. .Sh ENVIRONMENT .Bl -tag -width LD_LIBRARY_PATH -.It Ev ARCHLEVEL -On -.Em amd64 , -controls the level of SIMD enhancements used. -See -.Xr simd 7 -for details. .It Ev BLOCKSIZE The size of the block units used by several disk-related commands, most notably @@ -312,8 +305,7 @@ .Xr setlocale 3 , .Xr system 3 , .Xr termcap 3 , -.Xr termcap 5 , -.Xr simd 7 +.Xr termcap 5 .Sh HISTORY The .Nm diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 deleted file mode 100644 --- a/share/man/man7/simd.7 +++ /dev/null @@ -1,227 +0,0 @@ -.\" Copyright (c) 2023 The FreeBSD Foundation -. -.\" This documentation was written by Robert Clausecker -.\" under sponsorship from the FreeBSD Foundation. -. -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -. -.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE -. -.Dd August 13, 2023 -.Dt SIMD 7 -.Os -.Sh NAME -.Nm simd -.Nd SIMD enhancements -. -.Sh DESCRIPTION -On some architectures, the -.Fx -.Em libc -provides enhanced implementations of commonly used functions, replacing -the architecture-independent implementations used otherwise. -Depending on architecture and function, an enhanced -implementation of a function may either always be used or the -.Em libc -detects at runtime which SIMD instruction set extensions are -supported and picks the most suitable implementation automatically. -On -.Cm amd64 , -the environment variable -.Ev ARCHLEVEL -can be used to override this mechanism. -.Pp -Enhanced functions are present in the following architectures: -.Bl -column FUNCTION__ aarch64_ arm_ amd64_ i386_ ppc64_ -offset indent -.It Em FUNCTION Ta Em AARCH64 Ta Em ARM Ta Em AMD64 Ta Em I386 Ta Em PPC64 -.It bcmp Ta Ta Ta S1 Ta S -.It bcopy Ta Ta S Ta S Ta S Ta SV -.It bzero Ta Ta S Ta S Ta S -.It div Ta Ta Ta S Ta S -.It index Ta S Ta Ta S1 -.It ldiv Ta Ta Ta S Ta S -.It lldiv Ta Ta Ta S -.It memcmp Ta Ta S Ta S1 Ta S -.It memcpy Ta S Ta S Ta S Ta S Ta SV -.It memmove Ta S Ta S Ta S Ta S Ta SV -.It memset Ta Ta S Ta S Ta S -.It rindex Ta S -.It stpcpy Ta Ta Ta S1 -.It strcat Ta Ta Ta S Ta S -.It strchr Ta S Ta Ta S1 Ta S -.It strchrnul Ta Ta Ta S1 -.It strcmp Ta Ta S Ta S Ta S -.It strcpy Ta Ta Ta S1 Ta S Ta S2 -.It strlen Ta Ta S Ta S1 -.It strncmp Ta Ta S Ta Ta S -.It strncpy Ta Ta Ta Ta Ta S2 -.It strrchr Ta S Ta Ta Ta S -.It swab Ta Ta Ta Ta S -.It wcschr Ta Ta Ta Ta S -.It wcscmp Ta Ta Ta Ta S -.It wcslen Ta Ta Ta Ta S -.It wmemchr Ta Ta Ta Ta S -.El -.Pp -.Sy S Ns :\ scalar (non-SIMD), -.Sy 1 Ns :\ amd64 baseline, -.Sy 2 Ns :\ x86-64-v2 -or PowerPC\ 2.05, -.Sy 3 Ns :\ x86-64-v3, -.Sy 4 Ns :\ x86-64-v4, -.Sy V Ns :\ PowerPC\ VSX. -. -.Sh ENVIRONMENT -.Bl -tag -.It Ev ARCHLEVEL -On -.Em amd64 , -controls the level of SIMD enhancements used. -If this variable is set to an architecture level from the list below -and that architecture level is supported by the processor, SIMD -enhancements up to -.Ev ARCHLEVEL -are used. -If -.Ev ARCHLEVEL -is unset, not recognised, or not supported by the processor, the highest -level of SIMD enhancements supported by the processor is used. -.Pp -A suffix beginning with -.Sq ":" -or -.Sq "+" -in -.Ev ARCHLEVEL -is ignored and may be used for future extensions. -The architecture level can be prefixed with a -.Sq "!" -character to force use of the requested architecture level, even if the -processor does not advertise that it is supported. -This usually causes applications to crash and should only be used for -testing purposes or if architecture level detection yields incorrect -results. -.Pp -The architecture levels follow the AMD64 SysV ABI supplement: -.Bl -tag -width x86-64-v2 -.It Cm scalar -scalar enhancements only (no SIMD) -.It Cm baseline -cmov, cx8, x87 FPU, fxsr, MMX, osfxsr, SSE, SSE2 -.It Cm x86-64-v2 -cx16, lahf/sahf, popcnt, SSE3, SSSE3, SSE4.1, SSE4.2 -.It Cm x86-64-v3 -AVX, AVX2, BMI1, BMI2, F16C, FMA, lzcnt, movbe, osxsave -.It Cm x86-64-v4 -AVX-512F/BW/CD/DQ/VL -.El -.El -. -.Sh DIAGNOSTICS -.Bl -diag -.It "Illegal Instruction" -Printed by -.Xr sh 1 -if a command is terminated through delivery of a -.Dv SIGILL -signal, see -.Xr signal 3 . -.Pp -Use of an unsupported architecture level was forced by setting -.Ev ARCHLEVEL -to a string beginning with a -.Sq "!" -character, causing a process to crash due to use of an unsupported -instruction. -Unset -.Ev ARCHLEVEL , -remove the -.Sq "!" -prefix or select a supported architecture level. -.Pp -Message may also appear for unrelated reasons. -.El -. -.Sh SEE ALSO -.Xr string 3 , -.Xr arch 7 -.Rs -.%A H. J. Lu -.%A Michael Matz -.%A Milind Girkar -.%A Jan Hubi\[u010D]ka \" \(vc -.%A Andreas Jaeger -.%A Mark Mitchell -.%B System V Application Binary Interface -.%D May 23, 2023 -.%T AMD64 Architecture Processor Supplement -.%O Version 1.0 -.Re -. -.Sh HISTORY -Architecture-specific enhanced -.Em libc -functions were added starting -with -.Fx 2.0 -for -.Cm i386 , -.Fx 6.0 -for -.Cm arm , -.Fx 6.1 -for -.Cm amd64 , -.Fx 11.0 -for -.Cm aarch64 , -and -.Fx 12.0 -for -.Cm powerpc64 . -SIMD-enhanced functions were first added with -.Fx 13.0 -for -.Cm powerpc64 -and with -.Fx 14.0 -for -.Cm amd64 . -.Pp -A -.Nm -manual page appeared in -.Fx 14.0 . -. -.Sh AUTHOR -.An Robert Clausecker Aq Mt fuz@FreeBSD.org -. -.Sh CAVEATS -Other parts of -.Fx -such as cryptographic routines in the kernel or in -OpenSSL may also use SIMD enhancements. -These enhancements are not subject to the -.Ev ARCHLEVEL -variable and may have their own configuration -mechanism. -. -.Sh BUGS -Use of SIMD enhancements cannot be configured on powerpc64.