Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F140725529
D40693.id123906.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
25 KB
Referenced Files
None
Subscribers
None
D40693.id123906.diff
View Options
diff --git a/lib/libc/amd64/amd64_archlevel.h b/lib/libc/amd64/amd64_archlevel.h
new file mode 100644
--- /dev/null
+++ b/lib/libc/amd64/amd64_archlevel.h
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Architecture levels.  These must be macros (not an enum) so they can
+ * be accessed from assembly.  The values of X86_64_SCALAR through
+ * X86_64_MAX are used as array indices by the dispatch code, so they
+ * have to stay dense and start at 0.
+ */
+#define	X86_64_SCALAR		0 /* disable SIMD optimisations */
+#define	X86_64_BASELINE		1 /* CMOV, CX8, FPU, FXSR, MMX, OSFXSR, SSE, SSE2 */
+#define	X86_64_V2		2 /* CMPXCHG16B, LAHF-SAHF, POPCNT, SSE3, SSSE3, SSE4_1, SSE4_2 */
+#define	X86_64_V3		3 /* AVX, AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE, OSXSAVE */
+#define	X86_64_V4		4 /* AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL */
+
+#define	X86_64_MAX		X86_64_V4 /* highest supported architecture level */
+/* parenthesised so the macro stays a single operand inside any expression */
+#define	X86_64_UNDEFINED	(-1) /* architecture level not set yet */
+
+#ifndef __ASSEMBLER__
+/*
+ * Return the architecture level (one of the X86_64_* constants above)
+ * selected for this process.
+ */
+extern int __archlevel(void) __hidden;
+/*
+ * ifunc helper: given a table of X86_64_MAX + 1 32 bit entries, one per
+ * architecture level, return the implementation to use.  Each entry is
+ * the distance from the table to the implementation; 0 means that no
+ * implementation exists for that level.
+ */
+extern void (*__archlevel_resolve(int32_t[X86_64_MAX + 1]))() __hidden;
+#else
+# include <machine/asm.h>
+
+/*
+ * ARCHRESOLVE(func): export func as a GNU indirect function whose
+ * resolver is func_resolver.  The resolver loads the address of the
+ * func_funcs table into %rdi and jumps to __archlevel_resolve(), so
+ * whatever that function returns is what func_resolver returns.
+ */
+# define ARCHRESOLVE(func) \
+ .globl CNAME(func); \
+ .type CNAME(func), @gnu_indirect_function; \
+ .set CNAME(func), func ## _resolver; \
+ ARCHENTRY(func, resolver); \
+ lea func ## _funcs(%rip), %rdi; \
+ jmp CNAME(__archlevel_resolve); \
+ ARCHEND(func, resolver)
+
+/*
+ * The func_funcs array stores the location of the implementations
+ * as the distance from the func_funcs array to the function. Due
+ * to compiling for the medium code model, a 32 bit integer suffices
+ * to hold the distance.
+ *
+ * Doing it this way both saves storage and avoids giving rtld
+ * relocations to process at load time.
+ */
+# define ARCHFUNCS(func) \
+ ARCHRESOLVE(func); \
+ .section .rodata; \
+ .align 4; \
+ func ## _funcs:
+
+/* Table entry for a level that has no implementation of its own. */
+# define NOARCHFUNC \
+ .int 0
+
+/* Table entry for func_level: its distance from the start of func_funcs. */
+# define ARCHFUNC(func, level) \
+ .int func ## _ ## level - func ## _funcs
+
+/*
+ * Terminate the table: zero-fill whatever slots were not written so the
+ * table always spans X86_64_MAX + 1 entries, then record its size.
+ */
+# define ENDARCHFUNCS(func) \
+ .zero 4*(X86_64_MAX+1)-(.-func ## _funcs); \
+ .size func ## _funcs, .-func ## _funcs
+
+/*
+ * ARCHENTRY(func, level) and ARCHEND(func, level) bracket the
+ * implementation of func for one architecture level.  The code is
+ * placed under the label func_level (e.g. strlen_baseline); the label
+ * is typed as a function but is not declared global here.
+ */
+# define ARCHENTRY(func, level) \
+ _START_ENTRY; \
+ .type func ## _ ## level, @function; \
+ func ## _ ## level:; \
+ .cfi_startproc
+
+/* Closes the definition opened by ARCHENTRY(func, level). */
+# define ARCHEND(func, level) \
+ END(func ## _ ## level)
+
+#endif /* __ASSEMBLER__ */
diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -1,6 +1,7 @@
# $FreeBSD$
MDSRCS+= \
+ amd64_archlevel.c \
bcmp.S \
memcmp.S \
memcpy.S \
diff --git a/lib/libc/amd64/string/amd64_archlevel.c b/lib/libc/amd64/string/amd64_archlevel.c
new file mode 100644
--- /dev/null
+++ b/lib/libc/amd64/string/amd64_archlevel.c
@@ -0,0 +1,247 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+
+#include <stdatomic.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "amd64_archlevel.h"
+
+#define ARCHLEVEL_ENV "ARCHLEVEL"
+
+static atomic_int amd64_archlevel = X86_64_UNDEFINED;
+
+static const struct archlevel {
+ char name[12];
+ u_int feat_edx, feat_ecx, amd_ecx, ext_ebx; /* CPUID feature bits that need to be present */
+} levels[] = {
+ {
+ .name = "scalar",
+ .feat_edx = 0,
+ .feat_ecx = 0,
+ .amd_ecx = 0,
+ .ext_ebx = 0,
+ }, {
+ .name = "baseline",
+ .feat_edx = CPUID_FPU | CPUID_CX8 | CPUID_CMOV | CPUID_MMX |
+ CPUID_FXSR | CPUID_SSE | CPUID_SSE2,
+ .feat_ecx = 0,
+ .amd_ecx = 0,
+ .ext_ebx = 0,
+ }, {
+ .name = "x86-64-v2",
+ .feat_edx = CPUID_FPU | CPUID_CX8 | CPUID_CMOV | CPUID_MMX |
+ CPUID_FXSR | CPUID_SSE | CPUID_SSE2,
+ .feat_ecx = CPUID2_SSE3 | CPUID2_SSSE3 | CPUID2_CX16 |
+ CPUID2_SSE41 | CPUID2_SSE42 | CPUID2_POPCNT,
+ .amd_ecx = AMDID2_LAHF,
+ .ext_ebx = 0,
+ }, {
+ .name = "x86-64-v3",
+ .feat_edx = CPUID_FPU | CPUID_CX8 | CPUID_CMOV | CPUID_MMX |
+ CPUID_FXSR | CPUID_SSE | CPUID_SSE2,
+ .feat_ecx = CPUID2_SSE3 | CPUID2_SSSE3 | CPUID2_FMA |
+ CPUID2_CX16 | CPUID2_SSE41 | CPUID2_SSE42 | CPUID2_MOVBE |
+ CPUID2_POPCNT | CPUID2_OSXSAVE | CPUID2_AVX | CPUID2_F16C,
+ .amd_ecx = AMDID2_LAHF | AMDID2_ABM,
+ .ext_ebx = CPUID_STDEXT_BMI1 | CPUID_STDEXT_AVX2 |
+ CPUID_STDEXT_BMI2,
+ }, {
+ .name = "x86-64-v4",
+ .feat_edx = CPUID_FPU | CPUID_CX8 | CPUID_CMOV | CPUID_MMX |
+ CPUID_FXSR | CPUID_SSE | CPUID_SSE2,
+ .feat_ecx = CPUID2_SSE3 | CPUID2_SSSE3 | CPUID2_FMA |
+ CPUID2_CX16 | CPUID2_SSE41 | CPUID2_SSE42 | CPUID2_MOVBE |
+ CPUID2_POPCNT | CPUID2_OSXSAVE | CPUID2_AVX | CPUID2_F16C,
+ .amd_ecx = AMDID2_LAHF | AMDID2_ABM,
+ .ext_ebx = CPUID_STDEXT_BMI1 | CPUID_STDEXT_AVX2 |
+ CPUID_STDEXT_BMI2 | CPUID_STDEXT_AVX512F |
+ CPUID_STDEXT_AVX512DQ | CPUID_STDEXT_AVX512CD |
+ CPUID_STDEXT_AVX512BW | CPUID_STDEXT_AVX512VL,
+ }
+};
+
+/*
+ * Determine the highest architecture level usable on this machine.
+ *
+ * A level is usable if the CPU advertises every CPUID feature bit listed
+ * for it in levels[] and, for the levels whose code uses AVX or AVX-512
+ * registers, the operating system has enabled the matching register
+ * state in XCR0.  CPUID alone is not sufficient: the AVX/AVX-512 feature
+ * bits describe the processor, not whether the OS saves and restores the
+ * wider registers, and the instructions fault if it does not.
+ *
+ * Returns a level between X86_64_SCALAR and X86_64_MAX.
+ */
+static int
+supported_archlevel(void)
+{
+	int level;
+	u_int p[4], max_leaf;
+	u_int feat_edx = 0, feat_ecx = 0, amd_ecx = 0, ext_ebx = 0;
+	u_long xcr0 = 0, need_xcr0;
+
+	/* standard leaves; p[] receives %eax, %ebx, %ecx, %edx in that order */
+	do_cpuid(0, p);
+	max_leaf = p[0];
+
+	if (max_leaf >= 1) {
+		do_cpuid(1, p);
+		feat_edx = p[3];
+		feat_ecx = p[2];
+	}
+
+	if (max_leaf >= 7) {
+		cpuid_count(7, 0, p);
+		ext_ebx = p[1];
+	}
+
+	/* extended leaves */
+	do_cpuid(0x80000000, p);
+	max_leaf = p[0];
+
+	if (max_leaf >= 0x80000001) {
+		do_cpuid(0x80000001, p);
+		amd_ecx = p[2];
+	}
+
+	/* XGETBV may only be executed once the OS has enabled XSAVE */
+	if ((feat_ecx & CPUID2_OSXSAVE) != 0)
+		xcr0 = rxcr(0);
+
+	for (level = X86_64_BASELINE; level <= X86_64_MAX; level++) {
+		const struct archlevel *lvl = &levels[level];
+
+		/* register state the OS must have enabled for this level */
+		need_xcr0 = 0;
+		if (level >= X86_64_V3)
+			need_xcr0 |= XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX;
+		if (level >= X86_64_V4)
+			need_xcr0 |= XFEATURE_AVX512;
+
+		/* first level with a missing requirement: the one below wins */
+		if ((lvl->feat_edx & feat_edx) != lvl->feat_edx ||
+		    (lvl->feat_ecx & feat_ecx) != lvl->feat_ecx ||
+		    (lvl->amd_ecx & amd_ecx) != lvl->amd_ecx ||
+		    (lvl->ext_ebx & ext_ebx) != lvl->ext_ebx ||
+		    (need_xcr0 & xcr0) != need_xcr0)
+			return (level - 1);
+	}
+
+	return (X86_64_MAX);
+}
+
+/*
+ * Translate an architecture level name into its X86_64_* number.
+ *
+ * A leading '!' is skipped; interpreting it is left to the caller.  The
+ * name has to match one of the names in levels[] in full.  It may be
+ * followed by a suffix starting with ':' or '+', which is ignored so it
+ * can be used for future extensions.
+ *
+ * Returns X86_64_UNDEFINED if str does not name a known level.
+ */
+static int
+match_archlevel(const char *str)
+{
+	int level;
+
+	if (str[0] == '!')
+		str++;
+
+	for (level = 0; level <= X86_64_MAX; level++) {
+		size_t i;
+		const char *candidate = levels[level].name;
+
+		/* can't use strcmp here: would recurse during ifunc resolution */
+		for (i = 0; candidate[i] != '\0' && str[i] == candidate[i]; i++)
+			;
+
+		/*
+		 * The whole candidate must have been consumed.  The
+		 * terminator test has to happen after the comparison loop:
+		 * no level name contains ':' or '+', so a test made while
+		 * the characters still compare equal can never see a suffix.
+		 */
+		if (candidate[i] == '\0' &&
+		    (str[i] == '\0' || str[i] == ':' || str[i] == '+'))
+			return (level);
+	}
+
+	return (X86_64_UNDEFINED);
+}
+
+/*
+ * Look up ARCHLEVEL in the environment and return the level it names,
+ * or X86_64_UNDEFINED if it is absent or names no known level.  Only
+ * the first ARCHLEVEL entry is considered.
+ *
+ * getenv(), strcmp() and friends are off limits here because they may
+ * themselves end up in SIMD-optimised string functions, so the search
+ * is spelled out by hand.
+ *
+ * *force is set to 1 if the value names a valid level and begins with
+ * a '!'; otherwise it is left untouched.
+ */
+static int
+env_archlevel(int *force)
+{
+	extern char **environ;
+	static const char prefix[] = ARCHLEVEL_ENV "=";
+	const size_t prefixlen = sizeof(prefix) - 1;
+	char **envp;
+
+	if (environ == NULL)
+		return (X86_64_UNDEFINED);
+
+	for (envp = environ; *envp != NULL; envp++) {
+		const char *entry = *envp, *value;
+		size_t n = 0;
+		int level;
+
+		/* does the entry start with "ARCHLEVEL="? */
+		while (n < prefixlen && entry[n] == prefix[n])
+			n++;
+
+		if (n != prefixlen)
+			continue;
+
+		value = entry + prefixlen;
+		level = match_archlevel(value);
+		if (level != X86_64_UNDEFINED && value[0] == '!')
+			*force = 1;
+
+		return (level);
+	}
+
+	return (X86_64_UNDEFINED);
+}
+
+/*
+ * Return the architecture level to use, computing it on first use from
+ * the CPU capabilities and the environment:
+ *
+ *  1. If environment variable ARCHLEVEL starts with a ! and is followed
+ *     by a valid architecture level, that level is returned.
+ *  2. Else if ARCHLEVEL is set to a valid architecture level that is
+ *     supported by the CPU, that level is returned.
+ *  3. Else the highest architecture level supported by the CPU is
+ *     returned.
+ *
+ * Valid architecture levels are those defined in the levels array.
+ * The architecture level "scalar" indicates that SIMD enhancements
+ * shall not be used.
+ */
+int __hidden
+__archlevel(void)
+{
+	int cached, chosen, hw, forced;
+
+	/* fast path: the level has been determined before */
+	cached = amd64_archlevel;
+	if (cached != X86_64_UNDEFINED)
+		return (cached);
+
+	forced = 0;
+	chosen = env_archlevel(&forced);
+	if (!forced) {
+		/* without '!', the request is capped by what the CPU can do */
+		hw = supported_archlevel();
+		if (chosen == X86_64_UNDEFINED || chosen > hw)
+			chosen = hw;
+	}
+
+	/*
+	 * Store the result only if nothing has been stored yet.  Should
+	 * another call have stored a level in the meantime, the exchange
+	 * fails and leaves that level in cached; return it so that all
+	 * calls agree on the outcome.
+	 */
+	if (!atomic_compare_exchange_strong(&amd64_archlevel, &cached, chosen))
+		return (cached);
+
+	return (chosen);
+}
+
+/*
+ * Helper for SIMD ifunc dispatch.  Starting at the current architecture
+ * level and walking down, pick the first level for which funcs[] holds
+ * a non-zero entry and return the function it refers to.  Entries are
+ * byte offsets relative to the funcs array itself.
+ */
+void __hidden (*
+__archlevel_resolve(int32_t funcs[static X86_64_MAX + 1]))()
+{
+	int32_t offset;
+	int level = __archlevel();
+
+	while (level >= 0) {
+		offset = funcs[level];
+		if (offset != 0)
+			return ((void (*)())((uintptr_t)funcs + (ptrdiff_t)offset));
+
+		level--;
+	}
+
+	/* no function is present -- what now? */
+	__builtin_trap();
+}
diff --git a/lib/libc/amd64/string/strlen.S b/lib/libc/amd64/string/strlen.S
--- a/lib/libc/amd64/string/strlen.S
+++ b/lib/libc/amd64/string/strlen.S
@@ -1,11 +1,18 @@
-/*
+/*-
* Written by Mateusz Guzik <mjg@freebsd.org>
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
* Public domain.
*/
#include <machine/asm.h>
__FBSDID("$FreeBSD$");
+#include "amd64_archlevel.h"
+
/*
* Note: this routine was written with kernel use in mind (read: no simd),
* it is only present in userspace as a temporary measure until something
@@ -14,6 +21,14 @@
#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
+/*
+ * Dispatch table for strlen, one slot per architecture level in
+ * ascending order (scalar, baseline, x86-64-v2, x86-64-v3, x86-64-v4).
+ * The x86-64-v2 slot is empty as this file has no implementation for
+ * that level.
+ */
+ARCHFUNCS(strlen)
+ ARCHFUNC(strlen, scalar)
+ ARCHFUNC(strlen, baseline)
+ NOARCHFUNC
+ ARCHFUNC(strlen, x86_64_v3)
+ ARCHFUNC(strlen, x86_64_v4)
+ENDARCHFUNCS(strlen)
+
/*
* strlen(string)
* %rdi
@@ -30,7 +45,7 @@
*
* The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
*/
-ENTRY(strlen)
+ARCHENTRY(strlen, scalar)
movabsq $0xfefefefefefefeff,%r8
movabsq $0x8080808080808080,%r9
@@ -76,6 +91,140 @@
leaq (%rcx,%rdi),%rax
subq %r10,%rax
ret
-END(strlen)
+ARCHEND(strlen, scalar)
+
+/*
+ * strlen for the baseline level, using 16 byte SSE2 compares.
+ *
+ * in:  %rdi = string
+ * out: %rax = length of the string
+ * Writes %rcx, %rsi, %rdi, %xmm1 and the flags; uses no stack.
+ *
+ * The string is only ever read in 16 byte blocks at 16 byte aligned
+ * addresses.  The first block may therefore start before the string;
+ * NUL bytes found in those leading junk bytes are shifted out of the
+ * match mask.  The main loop is unrolled twice, i.e. it advances by 32
+ * bytes per iteration.
+ *
+ * NOTE(review): tzcnt is a BMI1 instruction, which the baseline level
+ * does not guarantee.  Presumably this relies on tzcnt being decoded as
+ * bsf on older processors, which yields the same result for a non-zero
+ * operand; every tzcnt below is only reached with a non-zero mask.
+ * Confirm that this is intended.
+ */
+ARCHENTRY(strlen, baseline)
+ mov %edi, %ecx // low pointer bits, reduced to the misalignment below
+ mov %rdi, %rsi // string pointer copy for later
+ pxor %xmm1, %xmm1 // xmm1 = 0, the value to search for
+ and $~0xf, %rdi // align string
+ pcmpeqb (%rdi), %xmm1 // compare head (with junk before string)
+ and $0xf, %ecx // amount of bytes rdi is past 16 byte alignment
+ pmovmskb %xmm1, %eax // one mask bit per byte, set where a NUL was found
+ shr %cl, %eax // clear out matches in junk bytes
+ test %eax, %eax // any match? (can't use ZF from SHR as CL=0 is possible)
+ jnz 2f
+
+ ALIGN_TEXT
+1: pxor %xmm1, %xmm1 // pcmpeqb overwrote xmm1 with its result: clear it again
+ pcmpeqb 16(%rdi), %xmm1 // find NUL bytes
+ add $32, %rdi // advance to next iteration
+ pmovmskb %xmm1, %eax
+ test %eax, %eax // were any NUL bytes present?
+ jnz 3f
+
+ /* the same unrolled once more */
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rdi), %xmm1 // rdi was advanced already, so no displacement
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jz 1b
+
+ /* match found in second unrolled loop body */
+ tzcnt %eax, %eax // find the first NUL byte
+ sub %rsi, %rdi // string length until beginning of (%rdi)
+ add %rdi, %rax // that plus loc. of NUL byte: full string length
+ ret
+
+ /* match found in head */
+2: tzcnt %eax, %eax // compute string length
+ ret
+
+ /* match found in first unrolled loop body */
+3: tzcnt %eax, %eax // find the first NUL byte
+ sub %rsi, %rdi // distance from string start to (%rdi)
+ lea -16(%rdi,%rax,1), %rax // as above, but undo advancement to next iteration
+ ret
+ARCHEND(strlen, baseline)
+
+/*
+ * strlen for the x86-64-v3 level, using 32 byte AVX2 compares.
+ *
+ * in:  %rdi = string
+ * out: %rax = length of the string
+ * Writes %rcx, %rsi, %rdi, %ymm0, %ymm1 and the flags; uses no stack.
+ *
+ * Same structure as the baseline variant with twice the block size: the
+ * string is read in 32 byte blocks at 32 byte aligned addresses, matches
+ * in the junk bytes before the string are shifted out of the head mask,
+ * and the main loop is unrolled twice (64 bytes per iteration).  %ymm1
+ * stays zero throughout; compare results go to %ymm0.  vzeroupper is
+ * issued on every return path.
+ */
+ARCHENTRY(strlen, x86_64_v3)
+ mov %edi, %ecx // low pointer bits, reduced to the misalignment below
+ mov %rdi, %rsi // string pointer copy for later
+ vpxor %ymm1, %ymm1, %ymm1 // ymm1 = 0, the value to search for
+ and $~0x1f, %rdi // align string
+ vpcmpeqb (%rdi), %ymm1, %ymm0 // compare head (with junk before string)
+ and $0x1f, %ecx // amount of bytes rdi is past 32 byte alignment
+ vpmovmskb %ymm0, %eax // one mask bit per byte, set where a NUL was found
+ shr %cl, %eax // clear out matches in junk bytes
+ test %eax, %eax // any match? (can't use ZF from SHR as CL=0 is possible)
+ jnz 2f
+
+ ALIGN_TEXT
+1: vpcmpeqb 32(%rdi), %ymm1, %ymm0 // find NUL bytes
+ add $64, %rdi // advance to next iteration
+ vptest %ymm0, %ymm0 // were any NUL bytes present?
+ jnz 3f
+
+ /* the same unrolled once more */
+ vpcmpeqb (%rdi), %ymm1, %ymm0 // rdi was advanced already, so no displacement
+ vptest %ymm0, %ymm0
+ jz 1b
+
+ /* match found in second unrolled loop body */
+ vpmovmskb %ymm0, %eax // the loop only tested for zero: fetch the mask now
+ sub %rsi, %rdi // string length until beginning of (%rdi)
+ tzcnt %eax, %eax // find the first NUL byte
+ add %rdi, %rax // that plus loc. of NUL byte: full string length
+ vzeroupper
+ ret
+
+ /* match found in head */
+2: tzcnt %eax, %eax // compute string length
+ vzeroupper
+ ret
+
+ /* match found in first unrolled loop body */
+3: vpmovmskb %ymm0, %eax // the loop only tested for zero: fetch the mask now
+ sub %rsi, %rdi // distance from string start to (%rdi)
+ tzcnt %eax, %eax // find the first NUL byte
+ lea -32(%rdi,%rax,1), %rax // as above, but undo advancement to next iteration
+ vzeroupper
+ ret
+ARCHEND(strlen, x86_64_v3)
+
+/*
+ * strlen for the x86-64-v4 level, using 64 byte AVX-512 compares.
+ *
+ * in:  %rdi = string
+ * out: %rax = length of the string
+ * Writes %rcx, %rsi, %rdi, %zmm16, %k1, %k2 and the flags; uses no stack.
+ *
+ * The string is read in 64 byte blocks at 64 byte aligned addresses and
+ * the main loop is unrolled twice (128 bytes per iteration).  Compare
+ * results land in mask register %k1.  For the head block, %k2 holds a
+ * mask of the bytes that belong to the string, so matches in the junk
+ * bytes before it are ignored.
+ *
+ * NOTE(review): unlike the x86-64-v3 variant, no vzeroupper is issued.
+ * Presumably that is unnecessary because %zmm16 is the only vector
+ * register written -- confirm.
+ */
+ARCHENTRY(strlen, x86_64_v4)
+ mov %edi, %ecx // low pointer bits, reduced to the misalignment below
+ mov $-1, %rax // all ones, turned into the non-junk mask below
+ and $0x3f, %ecx // amount of bytes rdi is past 64 byte alignment
+ mov %rdi, %rsi // string pointer copy for later
+ shl %cl, %rax // one bits for non-junk bytes in (%rdi)
+ and $~0x3f, %rdi // align string
+ kmovq %rax, %k2 // k2 = mask of non-junk bytes
+ vpxord %zmm16, %zmm16, %zmm16 // zmm16 = 0, the value to search for
+ vpcmpeqb (%rdi), %zmm16, %k1 // compare head (with junk before string)
+ ktestq %k1, %k2 // any NUL bytes in non-junk locations?
+ jnz 2f
+
+ ALIGN_TEXT
+1: vpcmpeqb 64(%rdi), %zmm16, %k1 // find NUL bytes
+ sub $-128, %rdi // advance to next iteration (-128 fits a one byte immediate, 128 does not)
+ ktestq %k1, %k1 // were any NUL bytes present?
+ jnz 3f
+
+ /* the same unrolled once more */
+ vpcmpeqb (%rdi), %zmm16, %k1 // rdi was advanced already, so no displacement
+ ktestq %k1, %k1
+ jz 1b
+
+ /* match found in second unrolled loop body */
+ kmovq %k1, %rax // match mask to a general purpose register for tzcnt
+ sub %rsi, %rdi // string length until beginning of (%rdi)
+ tzcnt %rax, %rax // find the first NUL byte
+ add %rdi, %rax // that plus loc. of NUL byte: full string length
+ ret
+
+ /* match found in head */
+2: kmovq %k1, %rax // mask of matches (including junk)
+ shr %cl, %rax // mask out junk matches
+ tzcnt %rax, %rax // compute string length
+ ret
+
+ /* match found in first unrolled loop body */
+3: kmovq %k1, %rax // match mask to a general purpose register for tzcnt
+ sub %rsi, %rdi // distance from string start to (%rdi)
+ tzcnt %rax, %rax // find the first NUL byte
+ lea -64(%rdi,%rax,1), %rax // as above, but undo advancement to next iteration
+ ret
+ARCHEND(strlen, x86_64_v4)
.section .note.GNU-stack,"",%progbits
diff --git a/libexec/rtld-elf/rtld-libc/Makefile.inc b/libexec/rtld-elf/rtld-libc/Makefile.inc
--- a/libexec/rtld-elf/rtld-libc/Makefile.inc
+++ b/libexec/rtld-elf/rtld-libc/Makefile.inc
@@ -38,11 +38,28 @@
# errlst.c needs the errlst.h header from libc:
CFLAGS.errlst.c+=-I${LIBC_SRCTOP}/include
+.if ${LIBC_ARCH} == "amd64"
+# on amd64, string functions use ifunc dispatch so we have to use the
+# generic versions in rtld.
+.PATH: ${LIBC_SRCTOP}/string
+SRCS+= bcopy.c bzero.c memchr.c memcmp.c memcpy.c memmove.c memset.c strcat.c \
+ strchr.c strchrnul.c strcmp.c strcpy.c strcspn.c strdup.c strlcat.c \
+ strlcpy.c strlen.c strncmp.c strncpy.c strrchr.c strsep.c strspn.c \
+ strstr.c strtok.c
+CFLAGS.memchr.c+=-Wno-cast-qual
+CFLAGS.strchr.c+=-Wno-cast-qual
+CFLAGS.strchrnul.c+=-Wno-cast-qual
+CFLAGS.strrchr.c+=-Wno-cast-qual
+CFLAGS.strstr.c+=-Wno-cast-qual -Wno-sign-compare
+CFLAGS.strtok.c+=-Wno-cast-qual
+.else # !amd64
# Use the string and memory .o files from libc instead of rebuilding them (they
# might be using optimized assembly and duplicating that logic here is awkward).
_libc_string_objects= bcmp bcopy bzero memset memchr memcmp memcpy memmove \
stpncpy strcat strchr strchrnul strcmp stpcpy strcpy strcspn strdup \
strlcat strlcpy strlen strncmp strncpy strrchr strsep strspn strstr strtok
+.endif
+
# Also use all the syscall .o files from libc_nossp_pic:
_libc_other_objects= sigsetjmp lstat stat fstat fstatat fstatfs syscall \
cerror geteuid getegid sigfastblock munmap mprotect \
diff --git a/share/man/man7/Makefile b/share/man/man7/Makefile
--- a/share/man/man7/Makefile
+++ b/share/man/man7/Makefile
@@ -26,6 +26,7 @@
release.7 \
sdoc.7 \
security.7 \
+ simd.7 \
sizeof.7 \
sprog.7 \
stats.7 \
diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7
new file mode 100644
--- /dev/null
+++ b/share/man/man7/simd.7
@@ -0,0 +1,227 @@
+.\" Copyright (c) 2023 The FreeBSD Foundation
+.
+.\" This documentation was written by Robert Clausecker <fuz@FreeBSD.org>
+.\" under sponsorship from the FreeBSD Foundation.
+.
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.
+.Dd June 19, 2023
+.Dt SIMD 7
+.Os
+.Sh NAME
+.Nm simd
+.Nd SIMD enhancements
+.
+.Sh DESCRIPTION
+On some architectures, the
+.Fx
+.Em libc
+provides enhanced implementations of commonly used functions, replacing
+the architecture-independent implementations used otherwise.
+Depending on architecture and function, an enhanced
+implementation of a function may either always be used or the
+.Em libc
+detects at runtime which SIMD instruction set extensions are
+supported and picks the most suitable implementation automatically.
+On
+.Cm amd64 ,
+the environment variable
+.Ev ARCHLEVEL
+can be used to override this mechanism.
+.Pp
+Enhanced functions are present in the following architectures:
+.Bl -column FUNCTION_ aarch64_ arm_ amd64_ i386_ ppc64_ -offset indent
+.It Em FUNCTION Ta Em AARCH64 Ta Em ARM Ta Em AMD64 Ta Em I386 Ta Em PPC64
+.It bcmp Ta Ta Ta S Ta S
+.It bcopy Ta Ta S Ta S Ta S Ta SV
+.It bzero Ta Ta S Ta S Ta S
+.It div Ta Ta Ta S Ta S
+.It ffs Ta Ta S Ta Ta S
+.It index Ta S
+.It ldiv Ta Ta Ta S Ta S
+.It lldiv Ta Ta Ta S
+.It memcmp Ta Ta S Ta S Ta S
+.It memcpy Ta S Ta S Ta S Ta S Ta SV
+.It memmove Ta S Ta S Ta S Ta S Ta SV
+.It memset Ta Ta S Ta S Ta S
+.It rindex Ta S
+.It stpcpy Ta Ta Ta S
+.It strcat Ta Ta Ta S Ta S
+.It strchr Ta S Ta Ta Ta S
+.It strcmp Ta Ta S Ta S Ta S
+.It strcpy Ta Ta Ta S Ta S Ta S2
+.It strlen Ta Ta S Ta S134
+.It strncmp Ta Ta S Ta Ta S
+.It strncpy Ta Ta Ta Ta Ta S2
+.It strrchr Ta S Ta Ta Ta S
+.It swab Ta Ta Ta Ta S
+.It wcschr Ta Ta Ta Ta S
+.It wcscmp Ta Ta Ta Ta S
+.It wcslen Ta Ta Ta Ta S
+.It wmemchr Ta Ta Ta Ta S
+.El
+.Pp
+.Sy S Ns :\ scalar (non-SIMD),
+.Sy 1 Ns :\ amd64 baseline,
+.Sy 2 Ns :\ x86-64-v2
+or PowerPC\ 2.05,
+.Sy 3 Ns :\ x86-64-v3,
+.Sy 4 Ns :\ x86-64-v4,
+.Sy V Ns :\ PowerPC\ VSX.
+.
+.Sh ENVIRONMENT
+.Bl -tag
+.It Ev ARCHLEVEL
+On
+.Em amd64 ,
+controls the level of SIMD enhancements used.
+If this variable is set to an architecture level from the list below
+and that architecture level is supported by the processor, SIMD
+enhancements up to
+.Ev ARCHLEVEL
+are used.
+If
+.Ev ARCHLEVEL
+is unset, not recognised, or not supported by the processor, the highest
+level of SIMD enhancements supported by the processor is used.
+.Pp
+A suffix beginning with
+.Sq ":"
+or
+.Sq "+"
+in
+.Ev ARCHLEVEL
+is ignored and may be used for future extensions.
+The architecture level can be prefixed with a
+.Sq "!"
+character to force use of the requested architecture level, even if the
+processor does not advertise that it is supported.
+This usually causes applications to crash and should only be used for
+testing purposes or if architecture level detection yields incorrect
+results.
+.Pp
+The architecture levels follow the AMD64 SysV ABI supplement:
+.Bl -tag -width x86-64-v2
+.It Cm scalar
+scalar enhancements only (no SIMD)
+.It Cm baseline
+cmov, cx8, x87 FPU, fxsr, MMX, osfxsr, SSE, SSE2
+.It Cm x86-64-v2
+cx16, lahf/sahf, popcnt, SSE3, SSSE3, SSE4.1, SSE4.2
+.It Cm x86-64-v3
+AVX, AVX2, BMI1, BMI2, F16C, FMA, lzcnt, movbe, osxsave
+.It Cm x86-64-v4
+AVX-512F/BW/CD/DQ/VL
+.El
+.El
+.
+.Sh DIAGNOSTICS
+.Bl -diag
+.It "Illegal Instruction"
+Printed by
+.Xr sh 1
+if a command is terminated through delivery of a
+.Dv SIGILL
+signal, see
+.Xr signal 3 .
+.Pp
+Use of an unsupported architecture level was forced by setting
+.Ev ARCHLEVEL
+to a string beginning with a
+.Sq "!"
+character, causing a process to crash due to use of an unsupported
+instruction.
+Unset
+.Ev ARCHLEVEL ,
+remove the
+.Sq "!"
+prefix or select a supported architecture level.
+.Pp
+This message may also appear for unrelated reasons.
+.El
+.
+.Sh SEE ALSO
+.Xr string 3 ,
+.Xr arch 7
+.Rs
+.%A H. J. Lu
+.%A Michael Matz
+.%A Milind Girkar
+.%A Jan Hubi\[u010D]ka \" \(vc
+.%A Andreas Jaeger
+.%A Mark Mitchell
+.%B System V Application Binary Interface
+.%D May 23, 2023
+.%T AMD64 Architecture Processor Supplement
+.%O Version 1.0
+.Re
+.
+.Sh HISTORY
+Architecture-specific enhanced
+.Em libc
+functions were added starting
+with
+.Fx 2.0
+for
+.Cm i386 ,
+.Fx 6.0
+for
+.Cm arm ,
+.Fx 6.1
+for
+.Cm amd64 ,
+.Fx 11.0
+for
+.Cm aarch64 ,
+and
+.Fx 12.0
+for
+.Cm powerpc64 .
+SIMD-enhanced functions were first added with
+.Fx 13.0
+for
+.Cm powerpc64
+and with
+.Fx 14.0
+for
+.Cm amd64 .
+.Pp
+A
+.Nm
+manual page appeared in
+.Fx 14.0 .
+.
+.Sh AUTHORS
+.An Robert Clausecker Aq Mt fuz@FreeBSD.org
+.
+.Sh CAVEATS
+Other parts of
+.Fx
+such as cryptographic routines in the kernel or in
+OpenSSL may also use SIMD enhancements.
+These enhancements are not subject to the
+.Ev ARCHLEVEL
+variable and may have their own configuration
+mechanism.
+.
+.Sh BUGS
+Use of SIMD enhancements cannot be configured on powerpc64.
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Dec 28, 8:19 AM (3 h, 43 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
27327224
Default Alt Text
D40693.id123906.diff (25 KB)
Attached To
Mode
D40693: lib/libc/amd64: add archlevel-based simd dispatch framework
Attached
Detach File
Event Timeline
Log In to Comment