diff --git a/lib/libc/amd64/amd64_archlevel.h b/lib/libc/amd64/amd64_archlevel.h
new file mode 100644
--- /dev/null
+++ b/lib/libc/amd64/amd64_archlevel.h
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* must be macros so they can be accessed from assembly */
+#define	X86_64_SCALAR	0	/* disable SIMD optimisations */
+#define	X86_64_BASELINE	1	/* CMOV, CX8, FPU, FXSR, MMX, OSFXSR, SSE, SSE2 */
+#define	X86_64_V2	2	/* CMPXCHG16B, LAHF-SAHF, POPCNT, SSE3, SSSE3, SSE4_1, SSE4_2 */
+#define	X86_64_V3	3	/* AVX, AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE, OSXSAVE */
+#define	X86_64_V4	4	/* AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL */
+
+#define	X86_64_MAX		X86_64_V4	/* highest supported architecture level */
+#define	X86_64_UNDEFINED	-1		/* architecture level not set yet */
+
+#ifndef __ASSEMBLER__
+extern int	__archlevel(void) __hidden;
+extern void	(*__archlevel_resolve(int32_t[X86_64_MAX + 1]))() __hidden;
+#else
+# include <machine/asm.h>
+
+# define ARCHRESOLVE(func) \
+	.globl CNAME(func); \
+	.type CNAME(func), @gnu_indirect_function; \
+	.set CNAME(func), func ## _resolver; \
+	ARCHENTRY(func, resolver); \
+	lea func ## _funcs(%rip), %rdi; \
+	jmp CNAME(__archlevel_resolve); \
+	ARCHEND(func, resolver)
+
+/*
+ * The func_funcs array stores the location of the implementations
+ * as the distance from the func_funcs array to the function.  Due
+ * to compiling for the medium code model, a 32 bit integer suffices
+ * to hold the distance.
+ *
+ * Doing it this way both saves storage and avoids giving rtld
+ * relocations to process at load time.
+ */
+# define ARCHFUNCS(func) \
+	ARCHRESOLVE(func); \
+	.section .rodata; \
+	.align 4; \
+	func ## _funcs:
+
+# define NOARCHFUNC \
+	.int 0
+
+# define ARCHFUNC(func, level) \
+	.int func ## _ ## level - func ## _funcs
+
+# define ENDARCHFUNCS(func) \
+	.zero 4*(X86_64_MAX+1)-(.-func ## _funcs); \
+	.size func ## _funcs, .-func ## _funcs
+
+# define ARCHENTRY(func, level) \
+	_START_ENTRY; \
+	.type func ## _ ## level, @function; \
+	func ## _ ## level:; \
+	.cfi_startproc
+
+# define ARCHEND(func, level) \
+	END(func ## _ ## level)
+
+#endif /* __ASSEMBLER__ */
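The offset-table scheme above is compact but indirect, so here is a self-contained C model of it. The names dispatch_table, pick_impl, and the impl_* functions are invented for this sketch; in the real thing the table is laid out by ARCHFUNC/NOARCHFUNC at assembly time and the distances are computed by the linker, not at run time as done here:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define LEVELS 5                        /* X86_64_MAX + 1 */

    static void impl_scalar(void)   { puts("scalar"); }
    static void impl_baseline(void) { puts("baseline"); }
    static void impl_v3(void)       { puts("x86-64-v3"); }

    static int32_t dispatch_table[LEVELS];  /* all zeros = all NOARCHFUNC */

    /* mirrors __archlevel_resolve(): highest non-empty slot at or below level */
    static void (*pick_impl(int32_t funcs[static LEVELS], int level))(void)
    {
        for (; level >= 0; level--)
            if (funcs[level] != 0)
                return ((void (*)(void))
                    ((uintptr_t)funcs + (ptrdiff_t)funcs[level]));

        return (NULL);                      /* the real resolver traps instead */
    }

    int main(void)
    {
        /*
         * What ARCHFUNC(func, level) encodes: the 32-bit distance from
         * the table to the implementation.  This assumes code and data
         * sit within 2 GiB of each other, which the medium code model
         * guarantees for libc itself.
         */
        dispatch_table[0] = (int32_t)((uintptr_t)impl_scalar -
            (uintptr_t)dispatch_table);
        dispatch_table[1] = (int32_t)((uintptr_t)impl_baseline -
            (uintptr_t)dispatch_table);
        /* slot 2 stays 0: NOARCHFUNC, no x86-64-v2 version */
        dispatch_table[3] = (int32_t)((uintptr_t)impl_v3 -
            (uintptr_t)dispatch_table);

        pick_impl(dispatch_table, 3)();     /* prints "x86-64-v3" */
        pick_impl(dispatch_table, 2)();     /* falls back, prints "baseline" */

        return (0);
    }

Note how the empty slot makes the resolver fall through to the next lower level, exactly what the NOARCHFUNC entry in the strlen dispatch table later in this patch relies on.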
diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -1,6 +1,7 @@
 # $FreeBSD$
 
 MDSRCS+= \
+	amd64_archlevel.c \
 	bcmp.S \
 	memcmp.S \
 	memcpy.S \
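The ARCHRESOLVE macro above turns each dispatched function into a GNU indirect function (ifunc), a mechanism the compiler can also express directly in C. A minimal sketch, assuming GCC or Clang on an ELF target; which_impl and resolve_which are illustrative names, and __builtin_cpu_supports() stands in for the hand-rolled CPUID probing that amd64_archlevel.c (next file) has to do because libc's resolver may not call into libc:

    #include <stdio.h>

    static int impl_generic(void) { return (0); }
    static int impl_avx2(void)    { return (1); }

    /* runs once, when rtld processes the ifunc relocation */
    static int (*resolve_which(void))(void)
    {
        /* resolvers may run before constructors, so initialise by hand */
        __builtin_cpu_init();
        return (__builtin_cpu_supports("avx2") ? impl_avx2 : impl_generic);
    }

    int which_impl(void) __attribute__((ifunc("resolve_which")));

    int main(void)
    {
        printf("AVX2 implementation selected: %d\n", which_impl());
        return (0);
    }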
diff --git a/lib/libc/amd64/string/amd64_archlevel.c b/lib/libc/amd64/string/amd64_archlevel.c
new file mode 100644
--- /dev/null
+++ b/lib/libc/amd64/string/amd64_archlevel.c
@@ -0,0 +1,252 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+
+#include <stdatomic.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "amd64_archlevel.h"
+
+#define ARCHLEVEL_ENV "ARCHLEVEL"
+
+static atomic_int amd64_archlevel = X86_64_UNDEFINED;
+
+static const struct archlevel {
+	char name[10];
+	/* CPUID feature bits that need to be present */
+	u_int feat_edx, feat_ecx, amd_ecx, ext_ebx;
+} levels[] = {
+	{
+		.name = "scalar",
+		.feat_edx = 0,
+		.feat_ecx = 0,
+		.amd_ecx = 0,
+		.ext_ebx = 0,
+	}, {
+		.name = "baseline",
+		.feat_edx = CPUID_FPU | CPUID_CX8 | CPUID_CMOV | CPUID_MMX |
+		    CPUID_FXSR | CPUID_SSE | CPUID_SSE2,
+		.feat_ecx = 0,
+		.amd_ecx = 0,
+		.ext_ebx = 0,
+	}, {
+		.name = "x86-64-v2",
+		.feat_edx = CPUID_FPU | CPUID_CX8 | CPUID_CMOV | CPUID_MMX |
+		    CPUID_FXSR | CPUID_SSE | CPUID_SSE2,
+		.feat_ecx = CPUID2_SSE3 | CPUID2_SSSE3 | CPUID2_CX16 |
+		    CPUID2_SSE41 | CPUID2_SSE42 | CPUID2_POPCNT,
+		.amd_ecx = AMDID2_LAHF,
+		.ext_ebx = 0,
+	}, {
+		.name = "x86-64-v3",
+		.feat_edx = CPUID_FPU | CPUID_CX8 | CPUID_CMOV | CPUID_MMX |
+		    CPUID_FXSR | CPUID_SSE | CPUID_SSE2,
+		.feat_ecx = CPUID2_SSE3 | CPUID2_SSSE3 | CPUID2_FMA |
+		    CPUID2_CX16 | CPUID2_SSE41 | CPUID2_SSE42 | CPUID2_MOVBE |
+		    CPUID2_POPCNT | CPUID2_OSXSAVE | CPUID2_AVX | CPUID2_F16C,
+		.amd_ecx = AMDID2_LAHF | AMDID2_ABM,
+		.ext_ebx = CPUID_STDEXT_BMI1 | CPUID_STDEXT_AVX2 |
+		    CPUID_STDEXT_BMI2,
+	}, {
+		.name = "x86-64-v4",
+		.feat_edx = CPUID_FPU | CPUID_CX8 | CPUID_CMOV | CPUID_MMX |
+		    CPUID_FXSR | CPUID_SSE | CPUID_SSE2,
+		.feat_ecx = CPUID2_SSE3 | CPUID2_SSSE3 | CPUID2_FMA |
+		    CPUID2_CX16 | CPUID2_SSE41 | CPUID2_SSE42 | CPUID2_MOVBE |
+		    CPUID2_POPCNT | CPUID2_OSXSAVE | CPUID2_AVX | CPUID2_F16C,
+		.amd_ecx = AMDID2_LAHF | AMDID2_ABM,
+		.ext_ebx = CPUID_STDEXT_BMI1 | CPUID_STDEXT_AVX2 |
+		    CPUID_STDEXT_BMI2 | CPUID_STDEXT_AVX512F |
+		    CPUID_STDEXT_AVX512DQ | CPUID_STDEXT_AVX512CD |
+		    CPUID_STDEXT_AVX512BW | CPUID_STDEXT_AVX512VL,
+	}
+};
+
+static int
+supported_archlevel(void)
+{
+	int level;
+	u_int p[4], max_leaf;
+	u_int feat_edx = 0, feat_ecx = 0, amd_ecx = 0, ext_ebx = 0;
+
+	do_cpuid(0, p);
+	max_leaf = p[0];
+
+	if (max_leaf >= 1) {
+		do_cpuid(1, p);
+		feat_edx = p[3];
+		feat_ecx = p[2];
+	}
+
+	if (max_leaf >= 7) {
+		cpuid_count(7, 0, p);
+		ext_ebx = p[1];
+	}
+
+	do_cpuid(0x80000000, p);
+	max_leaf = p[0];
+
+	if (max_leaf >= 0x80000001) {
+		do_cpuid(0x80000001, p);
+		amd_ecx = p[2];
+	}
+
+	for (level = X86_64_BASELINE; level <= X86_64_MAX; level++) {
+		const struct archlevel *lvl = &levels[level];
+
+		if ((lvl->feat_edx & feat_edx) != lvl->feat_edx ||
+		    (lvl->feat_ecx & feat_ecx) != lvl->feat_ecx ||
+		    (lvl->amd_ecx & amd_ecx) != lvl->amd_ecx ||
+		    (lvl->ext_ebx & ext_ebx) != lvl->ext_ebx)
+			return (level - 1);
+	}
+
+	return (X86_64_MAX);
+}
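The same leaf 1 / leaf 7 / extended-leaf probing can be reproduced outside libc with the <cpuid.h> helpers shipped by GCC and Clang, instead of the do_cpuid()/cpuid_count() wrappers used above. A minimal sketch, checking only two representative bits and assuming __get_cpuid() and __get_cpuid_count() are available (GCC 7 or later, recent Clang):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned eax, ebx, ecx, edx;
        unsigned v2_ecx = bit_SSE4_2 | bit_POPCNT;  /* two x86-64-v2 bits */

        /* leaf 1: basic feature flags in ecx/edx */
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return (1);
        printf("SSE4.2+POPCNT: %s\n",
            (ecx & v2_ecx) == v2_ecx ? "yes" : "no");

        /* leaf 7, subleaf 0: structured extended features in ebx */
        if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            printf("AVX2: %s\n", (ebx & bit_AVX2) != 0 ? "yes" : "no");

        return (0);
    }

The subset test at the end of supported_archlevel() works the same way: a level is supported only if every required bit in every register is present, and the scan stops at the first level that falls short.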
+
+static int
+match_archlevel(const char *str, int *force)
+{
+	int level, want_force = 0;
+
+	*force = 0;
+
+	if (str[0] == '!') {
+		str++;
+		want_force = 1;
+	}
+
+	for (level = 0; level <= X86_64_MAX; level++) {
+		size_t i;
+		const char *candidate = levels[level].name;
+
+		/* can't use strcmp here: would recurse during ifunc resolution */
+		for (i = 0; str[i] == candidate[i]; i++)
+			if (str[i] == '\0') {
+				if (want_force)
+					*force = 1;
+
+				return (level);
+			}
+
+		/* suffixes starting with : or + are ignored for future extensions */
+		if (candidate[i] == '\0' && (str[i] == ':' || str[i] == '+')) {
+			if (want_force)
+				*force = 1;
+
+			return (level);
+		}
+	}
+
+	return (X86_64_UNDEFINED);
+}
+
+/*
+ * We can't use getenv(), strcmp(), and a bunch of other functions here
+ * as they may in turn call SIMD-optimised string functions.
+ *
+ * *force is set to 1 if the architecture level is valid and begins
+ * with a ! and to 0 otherwise.
+ */
+static int
+env_archlevel(int *force)
+{
+	size_t i;
+	extern char **environ;
+
+	if (environ == NULL)
+		return (X86_64_UNDEFINED);
+
+	for (i = 0; environ[i] != NULL; i++) {
+		size_t j;
+
+		for (j = 0; environ[i][j] == ARCHLEVEL_ENV "="[j]; j++)
+			if (environ[i][j] == '=')
+				return (match_archlevel(&environ[i][j + 1], force));
+	}
+
+	*force = 0;
+
+	return (X86_64_UNDEFINED);
+}
+
+/*
+ * Determine the architecture level by checking the CPU capabilities
+ * and the environment:
+ *
+ * 1. If the environment variable ARCHLEVEL starts with a ! and is
+ *    followed by a valid architecture level, that level is returned.
+ * 2. Else if ARCHLEVEL is set to a valid architecture level that is
+ *    supported by the CPU, that level is returned.
+ * 3. Else the highest architecture level supported by the CPU is
+ *    returned.
+ *
+ * Valid architecture levels are those defined in the levels array.
+ * The architecture level "scalar" indicates that SIMD enhancements
+ * shall not be used.
+ */
+int __hidden
+__archlevel(void)
+{
+	int wantlevel, hwlevel, force;
+	int islevel = amd64_archlevel;
+
+	if (islevel != X86_64_UNDEFINED)
+		return (islevel);
+
+	wantlevel = env_archlevel(&force);
+	if (!force) {
+		hwlevel = supported_archlevel();
+		if (wantlevel == X86_64_UNDEFINED || wantlevel > hwlevel)
+			wantlevel = hwlevel;
+	}
+
+	/*
+	 * Ensure amd64_archlevel is set only once and
+	 * all calls agree on what it was set to.
+	 */
+	if (atomic_compare_exchange_strong(&amd64_archlevel, &islevel, wantlevel))
+		return (wantlevel);
+	else
+		return (islevel);
+}
+
+/*
+ * Helper function for SIMD ifunc dispatch: select the highest level
+ * implementation up to the current architecture level.
+ */
+void __hidden (*
+__archlevel_resolve(int32_t funcs[static X86_64_MAX + 1]))()
+{
+	int level;
+
+	for (level = __archlevel(); level >= 0; level--)
+		if (funcs[level] != 0)
+			return ((void (*)())((uintptr_t)funcs +
+			    (ptrdiff_t)funcs[level]));
+
+	/* no function is present -- what now? */
+	__builtin_trap();
+}
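The compare-exchange at the end of __archlevel() is a set-once cell: the first caller to complete publishes its value, and every other caller adopts that value, so concurrent resolvers can never disagree on the chosen level. A standalone sketch of the idiom, with cell and set_once as invented names:

    #include <stdatomic.h>
    #include <stdio.h>

    #define UNSET -1

    static atomic_int cell = UNSET;

    static int
    set_once(int wanted)
    {
        int old = UNSET;

        /* either we install wanted, or old receives the winner's value */
        if (atomic_compare_exchange_strong(&cell, &old, wanted))
            return (wanted);
        else
            return (old);
    }

    int main(void)
    {
        printf("%d\n", set_once(3));    /* first caller wins: prints 3 */
        printf("%d\n", set_once(7));    /* later callers still see 3 */
        return (0);
    }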
diff --git a/lib/libc/amd64/string/strlen.S b/lib/libc/amd64/string/strlen.S
--- a/lib/libc/amd64/string/strlen.S
+++ b/lib/libc/amd64/string/strlen.S
@@ -1,11 +1,18 @@
-/*
+/*-
  * Written by Mateusz Guzik <mjg@freebsd.org>
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * under sponsorship from the FreeBSD Foundation.
+ *
  * Public domain.
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
+#include "amd64_archlevel.h"
+
 /*
  * Note: this routine was written with kernel use in mind (read: no simd),
  * it is only present in userspace as a temporary measure until something
@@ -14,6 +21,14 @@
 
 #define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */
 
+ARCHFUNCS(strlen)
+	ARCHFUNC(strlen, scalar)
+	ARCHFUNC(strlen, baseline)
+	NOARCHFUNC
+	ARCHFUNC(strlen, x86_64_v3)
+	ARCHFUNC(strlen, x86_64_v4)
+ENDARCHFUNCS(strlen)
+
 /*
  * strlen(string)
  *	%rdi
@@ -30,7 +45,7 @@
  *
  * The latter contains a 32-bit variant of the same algorithm coded in
  * assembly for i386.
  */
-ENTRY(strlen)
+ARCHENTRY(strlen, scalar)
 	movabsq	$0xfefefefefefefeff,%r8
 	movabsq	$0x8080808080808080,%r9
@@ -76,6 +91,140 @@
 	leaq	(%rcx,%rdi),%rax
 	subq	%r10,%rax
 	ret
-END(strlen)
+ARCHEND(strlen, scalar)
+
+ARCHENTRY(strlen, baseline)
+	mov	%edi, %ecx
+	mov	%rdi, %rsi		# string pointer copy for later
+	pxor	%xmm1, %xmm1
+	and	$~0xf, %rdi		# align string
+	pcmpeqb	(%rdi), %xmm1		# compare head (with junk before string)
+	and	$0xf, %ecx		# amount of bytes rdi is past 16 byte alignment
+	pmovmskb %xmm1, %eax
+	shr	%cl, %eax		# clear out matches in junk bytes
+	test	%eax, %eax		# any match? (can't use ZF from SHR as CL=0 is possible)
+	jnz	2f
+
+	ALIGN_TEXT
+1:	pxor	%xmm1, %xmm1
+	pcmpeqb	16(%rdi), %xmm1		# find NUL bytes
+	add	$32, %rdi		# advance to next iteration
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax		# were any NUL bytes present?
+	jnz	3f
+
+	/* the same unrolled once more */
+	pxor	%xmm1, %xmm1
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jz	1b
+
+	/* match found in second unrolled loop body */
+	tzcnt	%eax, %eax		# find the first NUL byte
+	sub	%rsi, %rdi		# string length until beginning of (%rdi)
+	add	%rdi, %rax		# that plus loc. of NUL byte: full string length
+	ret
+
+	/* match found in head */
+2:	tzcnt	%eax, %eax		# compute string length
+	ret
+
+	/* match found in first unrolled loop body */
+3:	tzcnt	%eax, %eax
+	sub	%rsi, %rdi
+	lea	-16(%rdi,%rax,1), %rax	# as above, but undo advancement to next iteration
+	ret
+ARCHEND(strlen, baseline)
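For readers more comfortable with intrinsics than with assembly, the baseline routine above translates roughly to the following C, minus the 2x unrolling. strlen_sse2 is an invented name and the sketch is illustrative only; the trick is the same: round the pointer down to 16 bytes so every load is aligned (and thus cannot cross a page boundary), then shift out the match bits that belong to junk bytes before the start of the string. __builtin_ctz() is a GCC/Clang builtin:

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static size_t
    strlen_sse2(const char *s)
    {
        const __m128i zero = _mm_setzero_si128();
        unsigned off = (unsigned)(uintptr_t)s & 0xf; /* bytes past 16-byte alignment */
        const char *p = s - off;                     /* aligned, may precede s */
        __m128i v = _mm_load_si128((const __m128i *)p);
        /* one bit per byte; junk matches shifted out */
        unsigned m = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) >> off;

        if (m != 0)                     /* NUL within the head chunk */
            return (__builtin_ctz(m));

        do {                            /* aligned loads never cross pages */
            p += 16;
            v = _mm_load_si128((const __m128i *)p);
            m = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero));
        } while (m == 0);

        return ((size_t)(p - s) + __builtin_ctz(m));
    }

    int main(void)
    {
        const char *str = "The quick brown fox";

        printf("%zu %zu\n", strlen_sse2(str), strlen(str)); /* 19 19 */
        return (0);
    }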
+
+ARCHENTRY(strlen, x86_64_v3)
+	mov	%edi, %ecx
+	mov	%rdi, %rsi		# string pointer copy for later
+	vpxor	%ymm1, %ymm1, %ymm1
+	and	$~0x1f, %rdi		# align string
+	vpcmpeqb (%rdi), %ymm1, %ymm0	# compare head (with junk before string)
+	and	$0x1f, %ecx		# amount of bytes rdi is past 32 byte alignment
+	vpmovmskb %ymm0, %eax
+	shr	%cl, %eax		# clear out matches in junk bytes
+	test	%eax, %eax		# any match? (can't use ZF from SHR as CL=0 is possible)
+	jnz	2f
+
+	ALIGN_TEXT
+1:	vpcmpeqb 32(%rdi), %ymm1, %ymm0	# find NUL bytes
+	add	$64, %rdi		# advance to next iteration
+	vptest	%ymm0, %ymm0		# were any NUL bytes present?
+	jnz	3f
+
+	/* the same unrolled once more */
+	vpcmpeqb (%rdi), %ymm1, %ymm0
+	vptest	%ymm0, %ymm0
+	jz	1b
+
+	/* match found in second unrolled loop body */
+	vpmovmskb %ymm0, %eax
+	sub	%rsi, %rdi		# string length until beginning of (%rdi)
+	tzcnt	%eax, %eax		# find the first NUL byte
+	add	%rdi, %rax		# that plus loc. of NUL byte: full string length
+	vzeroupper
+	ret
+
+	/* match found in head */
+2:	tzcnt	%eax, %eax		# compute string length
+	vzeroupper
+	ret
+
+	/* match found in first unrolled loop body */
+3:	vpmovmskb %ymm0, %eax
+	sub	%rsi, %rdi
+	tzcnt	%eax, %eax
+	lea	-32(%rdi,%rax,1), %rax	# as above, but undo advancement to next iteration
+	vzeroupper
+	ret
+ARCHEND(strlen, x86_64_v3)
+
+ARCHENTRY(strlen, x86_64_v4)
+	mov	%edi, %ecx
+	mov	$-1, %rax
+	and	$0x3f, %ecx		# amount of bytes rdi is past 64 byte alignment
+	mov	%rdi, %rsi		# string pointer copy for later
+	shl	%cl, %rax		# one bits for non-junk bytes in (%rdi)
+	and	$~0x3f, %rdi		# align string
+	kmovq	%rax, %k2
+	vpxord	%zmm16, %zmm16, %zmm16
+	vpcmpeqb (%rdi), %zmm16, %k1	# compare head (with junk before string)
+	ktestq	%k1, %k2		# any NUL bytes in non-junk locations?
+	jnz	2f
+
+	ALIGN_TEXT
+1:	vpcmpeqb 64(%rdi), %zmm16, %k1	# find NUL bytes
+	sub	$-128, %rdi		# advance to next iteration
+	ktestq	%k1, %k1		# were any NUL bytes present?
+	jnz	3f
+
+	/* the same unrolled once more */
+	vpcmpeqb (%rdi), %zmm16, %k1
+	ktestq	%k1, %k1
+	jz	1b
+
+	/* match found in second unrolled loop body */
+	kmovq	%k1, %rax
+	sub	%rsi, %rdi		# string length until beginning of (%rdi)
+	tzcnt	%rax, %rax		# find the first NUL byte
+	add	%rdi, %rax		# that plus loc. of NUL byte: full string length
+	ret
+
+	/* match found in head */
+2:	kmovq	%k1, %rax		# mask of matches (including junk)
+	shr	%cl, %rax		# mask out junk matches
+	tzcnt	%rax, %rax		# compute string length
+	ret
+
+	/* match found in first unrolled loop body */
+3:	kmovq	%k1, %rax
+	sub	%rsi, %rdi
+	tzcnt	%rax, %rax
+	lea	-64(%rdi,%rax,1), %rax	# as above, but undo advancement to next iteration
+	ret
+ARCHEND(strlen, x86_64_v4)
 
 	.section .note.GNU-stack,"",%progbits
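Because the resolver runs once per process, comparing architecture levels means re-running a program under different ARCHLEVEL settings, for example env ARCHLEVEL=scalar ./strlen_bench versus env ARCHLEVEL=baseline ./strlen_bench. A rough timing harness along those lines; the file name, buffer size, and iteration count are arbitrary choices:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    #define LEN  (1 << 20)      /* 1 MiB string */
    #define ITER 1000

    int main(void)
    {
        char *buf = malloc(LEN + 1);
        struct timespec t0, t1;
        size_t n = 0;
        double secs;
        int i;

        if (buf == NULL)
            return (1);
        memset(buf, 'x', LEN);
        buf[LEN] = '\0';

        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (i = 0; i < ITER; i++) {
            n += strlen(buf);
            buf[0] = 'x';       /* keep the compiler from hoisting strlen() */
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);

        secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
        printf("%.2f GB/s (checksum %zu)\n", n / secs / 1e9, n);
        free(buf);

        return (0);
    }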
diff --git a/libexec/rtld-elf/rtld-libc/Makefile.inc b/libexec/rtld-elf/rtld-libc/Makefile.inc
--- a/libexec/rtld-elf/rtld-libc/Makefile.inc
+++ b/libexec/rtld-elf/rtld-libc/Makefile.inc
@@ -38,11 +38,28 @@
 # errlst.c needs the errlst.h header from libc:
 CFLAGS.errlst.c+=-I${LIBC_SRCTOP}/include
 
+.if ${LIBC_ARCH} == "amd64"
+# On amd64, string functions use ifunc dispatch, so we have to use the
+# generic versions in rtld.
+.PATH: ${LIBC_SRCTOP}/string
+SRCS+=	bcopy.c bzero.c memchr.c memcmp.c memcpy.c memmove.c memset.c strcat.c \
+	strchr.c strchrnul.c strcmp.c strcpy.c strcspn.c strdup.c strlcat.c \
+	strlcpy.c strlen.c strncmp.c strncpy.c strrchr.c strsep.c strspn.c \
+	strstr.c strtok.c
+CFLAGS.memchr.c+=-Wno-cast-qual
+CFLAGS.strchr.c+=-Wno-cast-qual
+CFLAGS.strchrnul.c+=-Wno-cast-qual
+CFLAGS.strrchr.c+=-Wno-cast-qual
+CFLAGS.strstr.c+=-Wno-cast-qual -Wno-sign-compare
+CFLAGS.strtok.c+=-Wno-cast-qual
+.else # !amd64
 # Use the string and memory .o files from libc instead of rebuilding them (they
 # might be using optimized assembly and duplicating that logic here is awkward).
 _libc_string_objects=	bcmp bcopy bzero memset memchr memcmp memcpy memmove \
 	stpncpy strcat strchr strchrnul strcmp stpcpy strcpy strcspn strdup \
 	strlcat strlcpy strlen strncmp strncpy strrchr strsep strspn strstr strtok
+.endif
+
 # Also use all the syscall .o files from libc_nossp_pic:
 _libc_other_objects= sigsetjmp lstat stat fstat fstatat fstatfs syscall \
 	cerror geteuid getegid sigfastblock munmap mprotect \
diff --git a/share/man/man7/Makefile b/share/man/man7/Makefile
--- a/share/man/man7/Makefile
+++ b/share/man/man7/Makefile
@@ -26,6 +26,7 @@
 	release.7 \
 	sdoc.7 \
 	security.7 \
+	simd.7 \
 	sizeof.7 \
 	sprog.7 \
 	stats.7 \
diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7
new file mode 100644
--- /dev/null
+++ b/share/man/man7/simd.7
@@ -0,0 +1,227 @@
+.\" Copyright (c) 2023 The FreeBSD Foundation
+.
+.\" This documentation was written by Robert Clausecker
+.\" under sponsorship from the FreeBSD Foundation.
+.
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.
+.Dd June 19, 2023
+.Dt SIMD 7
+.Os
+.Sh NAME
+.Nm simd
+.Nd SIMD enhancements
+.
+.Sh DESCRIPTION
+On some architectures, the
+.Fx
+.Em libc
+provides enhanced implementations of commonly used functions, replacing
+the architecture-independent implementations used otherwise.
+Depending on architecture and function, an enhanced implementation of a
+function may either be used unconditionally, or
+.Em libc
+may detect at runtime which SIMD instruction set extensions are
+supported and pick the most suitable implementation automatically.
+On
+.Cm amd64 ,
+the environment variable
+.Ev ARCHLEVEL
+can be used to override this mechanism.
+.Pp
+Enhanced functions are present in the following architectures:
+.Bl -column FUNCTION_ aarch64_ arm_ amd64_ i386_ ppc64_ -offset indent
+.It Em FUNCTION Ta Em AARCH64 Ta Em ARM Ta Em AMD64 Ta Em I386 Ta Em PPC64
+.It bcmp Ta Ta Ta S Ta S
+.It bcopy Ta Ta S Ta S Ta S Ta SV
+.It bzero Ta Ta S Ta S Ta S
+.It div Ta Ta Ta S Ta S
+.It ffs Ta Ta S Ta Ta S
+.It index Ta S
+.It ldiv Ta Ta Ta S Ta S
+.It lldiv Ta Ta Ta S
+.It memcmp Ta Ta S Ta S Ta S
+.It memcpy Ta S Ta S Ta S Ta S Ta SV
+.It memmove Ta S Ta S Ta S Ta S Ta SV
+.It memset Ta Ta S Ta S Ta S
+.It rindex Ta S
+.It stpcpy Ta Ta Ta S
+.It strcat Ta Ta Ta S Ta S
+.It strchr Ta S Ta Ta Ta S
+.It strcmp Ta Ta S Ta S Ta S
+.It strcpy Ta Ta Ta S Ta S Ta S2
+.It strlen Ta Ta S Ta S134
+.It strncmp Ta Ta S Ta Ta S
+.It strncpy Ta Ta Ta Ta Ta S2
+.It strrchr Ta S Ta Ta Ta S
+.It swab Ta Ta Ta Ta S
+.It wcschr Ta Ta Ta Ta S
+.It wcscmp Ta Ta Ta Ta S
+.It wcslen Ta Ta Ta Ta S
+.It wmemchr Ta Ta Ta Ta S
+.El
+.Pp
+.Sy S Ns :\ scalar (non-SIMD),
+.Sy 1 Ns :\ amd64 baseline,
+.Sy 2 Ns :\ x86-64-v2
+or PowerPC\ 2.05,
+.Sy 3 Ns :\ x86-64-v3,
+.Sy 4 Ns :\ x86-64-v4,
+.Sy V Ns :\ PowerPC\ VSX.
+.
+.Sh ENVIRONMENT
+.Bl -tag
+.It Ev ARCHLEVEL
+On
+.Cm amd64 ,
+controls the level of SIMD enhancements used.
+If this variable is set to an architecture level from the list below
+and that architecture level is supported by the processor, SIMD
+enhancements up to
+.Ev ARCHLEVEL
+are used.
+If
+.Ev ARCHLEVEL
+is unset, not recognised, or not supported by the processor, the highest
+level of SIMD enhancements supported by the processor is used.
+.Pp
+A suffix beginning with
+.Sq ":"
+or
+.Sq "+"
+in
+.Ev ARCHLEVEL
+is ignored and may be used for future extensions.
+The architecture level can be prefixed with a
+.Sq "!"
+character to force use of the requested architecture level, even if the
+processor does not advertise that it is supported.
+This usually causes applications to crash and should only be used for
+testing purposes or if architecture level detection yields incorrect
+results.
+.Pp
+The architecture levels follow the AMD64 SysV ABI supplement:
+.Bl -tag -width x86-64-v2
+.It Cm scalar
+scalar enhancements only (no SIMD)
+.It Cm baseline
+cmov, cx8, x87 FPU, fxsr, MMX, osfxsr, SSE, SSE2
+.It Cm x86-64-v2
+cx16, lahf/sahf, popcnt, SSE3, SSSE3, SSE4.1, SSE4.2
+.It Cm x86-64-v3
+AVX, AVX2, BMI1, BMI2, F16C, FMA, lzcnt, movbe, osxsave
+.It Cm x86-64-v4
+AVX-512F/BW/CD/DQ/VL
+.El
+.El
+.
+.Sh DIAGNOSTICS
+.Bl -diag
+.It "Illegal Instruction"
+Printed by
+.Xr sh 1
+if a command is terminated through delivery of a
+.Dv SIGILL
+signal; see
+.Xr signal 3 .
+.Pp
+Use of an unsupported architecture level was forced by setting
+.Ev ARCHLEVEL
+to a string beginning with a
+.Sq "!"
+character, causing a process to crash due to use of an unsupported
+instruction.
+Unset
+.Ev ARCHLEVEL ,
+remove the
+.Sq "!"
+prefix, or select a supported architecture level.
+.Pp
+This message may also appear for unrelated reasons.
+.El
+.
+.Sh SEE ALSO
+.Xr string 3 ,
+.Xr arch 7
+.Rs
+.%A H. J. Lu
+.%A Michael Matz
+.%A Milind Girkar
+.%A Jan Hubi\[u010D]ka \" \(vc
+.%A Andreas Jaeger
+.%A Mark Mitchell
+.%B System V Application Binary Interface
+.%D May 23, 2023
+.%T AMD64 Architecture Processor Supplement
+.%O Version 1.0
+.Re
+.
+.Sh HISTORY
+Architecture-specific enhanced
+.Em libc
+functions were added starting with
+.Fx 2.0
+for
+.Cm i386 ,
+.Fx 6.0
+for
+.Cm arm ,
+.Fx 6.1
+for
+.Cm amd64 ,
+.Fx 11.0
+for
+.Cm aarch64 ,
+and
+.Fx 12.0
+for
+.Cm powerpc64 .
+SIMD-enhanced functions were first added with
+.Fx 13.0
+for
+.Cm powerpc64
+and with
+.Fx 14.0
+for
+.Cm amd64 .
+.Pp
+A
+.Nm
+manual page appeared in
+.Fx 14.0 .
+.
+.Sh AUTHORS
+.An Robert Clausecker Aq Mt fuz@FreeBSD.org
+.
+.Sh CAVEATS
+Other parts of
+.Fx ,
+such as cryptographic routines in the kernel or in OpenSSL,
+may also use SIMD enhancements.
+These enhancements are not subject to the
+.Ev ARCHLEVEL
+variable and may have their own configuration mechanism.
+.
+.Sh BUGS
+Use of SIMD enhancements cannot be configured on
+.Cm powerpc64 .
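The ARCHLEVEL matching rules documented above can be exercised in isolation. The sketch below mirrors match_archlevel() from amd64_archlevel.c, but since it is an ordinary program it is free to use libc string functions; parse_archlevel and the test strings are invented for this example:

    #include <stdio.h>
    #include <string.h>

    static const char *levels[] = {
        "scalar", "baseline", "x86-64-v2", "x86-64-v3", "x86-64-v4",
    };

    static int
    parse_archlevel(const char *str, int *force)
    {
        size_t i, n;

        *force = 0;
        if (str[0] == '!') {            /* leading ! forces the level */
            str++;
            *force = 1;
        }

        for (i = 0; i < sizeof(levels) / sizeof(levels[0]); i++) {
            n = strlen(levels[i]);
            /* a suffix starting with : or + is ignored */
            if (strncmp(str, levels[i], n) == 0 &&
                (str[n] == '\0' || str[n] == ':' || str[n] == '+'))
                return ((int)i);
        }

        *force = 0;                     /* invalid levels do not force */
        return (-1);
    }

    int main(void)
    {
        const char *tests[] = { "baseline", "!scalar", "x86-64-v2:foo", "bogus" };
        size_t i;

        for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
            int force, lvl = parse_archlevel(tests[i], &force);

            printf("%-14s -> level %d, force %d\n", tests[i], lvl, force);
        }
        return (0);
    }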